def test_precompute_gencode(self):
    """Check that a k=1 normalization pass yields one mean per possible k-mer.

    Opens the unzipped file named by ``skr_config.GENCODE_HUMAN`` from the
    relative ``../cache/`` directory, runs
    ``compute_normalization_and_frequency`` on it, and asserts that the
    returned mean vector has ``4**k`` entries.
    """
    kmer_length = 1
    fasta_name = get_unzipped_file_name(skr_config.GENCODE_HUMAN)
    cache_path = '../cache/' + fasta_name
    with open(cache_path, mode='r') as infasta:
        # Only the mean vector is inspected; the other three results are
        # unpacked so a wrong-arity return still fails here.
        mean, _std, _frequency, _names = compute_normalization_and_frequency(
            infasta, kmer_length, False)
        print('For k=', kmer_length, 'length is', len(mean))
        expected_length = 4**kmer_length
        assert len(mean) == expected_length
def build_cache_files():
    """Precompute and store normalization data for every precomputed FASTA set.

    For each set returned by ``get_precomputed_fasta_sets()``:

    * a sub-directory of ``CACHE_DIR`` named after the stem of the set's
      unzipped file name is created if missing;
    * for every k-mer length from 1 through
      ``skr_config.MAX_KMER_LENGTH_PRECOMPUTE`` the FASTA file at
      ``CACHE_DIR/<unzipped file name>`` is read and passed to
      ``compute_normalization_and_frequency`` (``return_normalized=False``);
    * the resulting mean, std and unnormalized frequency arrays are written
      with ``np.save`` to the paths given by ``get_file_path_for``;
    * the sequence names are pickled once per set, using the path for the
      first k-mer length processed.

    When ``VERBOSE`` is truthy, per-k compute times and the aggregate save
    time per set are printed.

    The FASTA file itself is only opened here, not created, so it must
    already be present under ``CACHE_DIR``.
    """
    # makedirs(exist_ok=True) replaces the exists()-then-mkdir() pair: it is
    # not subject to a check/create race and also creates missing parents.
    os.makedirs(CACHE_DIR, exist_ok=True)

    for fasta_set in get_precomputed_fasta_sets():
        if VERBOSE:
            print('Getting ', fasta_set)
        fasta_file = getGenCode.get_unzipped_file_name(fasta_set)
        dir_name = pathlib.PurePath(fasta_file).stem
        os.makedirs(os.path.join(CACHE_DIR, dir_name), exist_ok=True)

        # The input path does not depend on the k-mer length, so build it once.
        fasta_path = os.path.join(CACHE_DIR, fasta_file)

        names_written = False
        tsave = 0
        if VERBOSE:
            print(dir_name + ' computing normalization took\t', end='')

        for kmer_length in range(1, skr_config.MAX_KMER_LENGTH_PRECOMPUTE + 1):
            # The file is reopened for every k so each pass starts at the
            # beginning of the FASTA data.
            with open(fasta_path, mode='r') as infasta:
                t1 = time.perf_counter()
                (mean, std, unnormalized_frequency, names) = compute_normalization_and_frequency(
                    infasta, kmer_length, return_normalized=False)
                t2 = time.perf_counter()
            if VERBOSE:
                print('k=' + str(kmer_length) + ',%.3fs;\t' % (t2 - t1), end='')

            t1 = time.perf_counter()
            arrays_to_save = (
                ('mean', mean),
                ('std', std),
                ('unnormalized_frequency', unnormalized_frequency),
            )
            for file_type, array in arrays_to_save:
                np.save(
                    get_file_path_for(fasta_file, kmer_length,
                                      CACHE_FILE_TYPES.get(file_type)),
                    array)

            # Names are written only on the first pass through this loop.
            if not names_written:
                names_path = get_file_path_for(fasta_file, kmer_length,
                                               CACHE_FILE_TYPES.get('names'))
                with open(names_path, 'wb') as names_file:
                    pickle.dump(names, names_file)
                names_written = True
            t2 = time.perf_counter()
            tsave += (t2 - t1)

        if VERBOSE:
            print('\nAggregate save time for ' + dir_name + ' was %.3fs' % tsave)
def get_precomputed_frequency_path(comparison_set, kmer_length):
    """Find the stored frequency and names files for a comparison set.

    Args:
        comparison_set: Value compared against each precomputed set's
            ``server_name``; ``None`` or an empty value yields ``None``.
        kmer_length: K-mer length passed through to ``get_file_path_for``.

    Returns:
        ``(unnormalized_frequency_path, names_path)`` for the first matching
        set whose two files both exist on disk, otherwise ``None``.
    """
    if comparison_set is None or len(comparison_set) == 0:
        return None

    for candidate in get_precomputed_fasta_sets():
        if comparison_set != candidate.server_name:
            continue

        unzipped_name = getGenCode.get_unzipped_file_name(candidate)
        frequency_path = get_file_path_for(
            unzipped_name, kmer_length,
            CACHE_FILE_TYPES.get('unnormalized_frequency'))
        names_path = get_file_path_for(
            unzipped_name, kmer_length, CACHE_FILE_TYPES.get('names'))

        # A match with missing files is not an error here; keep scanning.
        both_present = os.path.exists(frequency_path) and os.path.exists(names_path)
        if both_present:
            return frequency_path, names_path

    return None
def get_precomputed_normalization_path(parameters):
    """Find the stored mean and std files for the requested normalization set.

    Args:
        parameters: Mapping that provides ``'normal_set'`` (compared against
            each precomputed set's ``server_name``) and ``'kmer_length'``
            (read only once a matching set is found).

    Returns:
        ``(mean_path, std_path)`` when a set matches and both files exist.
        ``None`` when ``normal_set`` is ``None``/empty or matches no
        precomputed set.

    Raises:
        SeekrServerError: If a set matches but its mean or std file is
            missing on disk.
    """
    normal_set = parameters['normal_set']
    if normal_set is None or len(normal_set) <= 0:
        return None

    for fasta_set in get_precomputed_fasta_sets():
        if normal_set != fasta_set.server_name:
            continue

        kmer_length = parameters['kmer_length']
        fasta_file = getGenCode.get_unzipped_file_name(fasta_set)
        mean_path = get_file_path_for(fasta_file, kmer_length,
                                      CACHE_FILE_TYPES.get('mean'))
        std_path = get_file_path_for(fasta_file, kmer_length,
                                     CACHE_FILE_TYPES.get('std'))

        if os.path.exists(mean_path) and os.path.exists(std_path):
            return (mean_path, std_path)
        # A named set without its precomputed files is a server-side error,
        # unlike an unknown or empty set name, which simply yields None.
        raise SeekrServerError('Fasta file <' + fasta_file
                               + '> not found for kmer_length='
                               + str(kmer_length))

    return None