def kmer_sample_db(data, kim_file, contains_sample_kmer_file, lock, truths=None):
    """Map each kmer in *data* to its integer id and record which samples
    contain it, appending results to *contains_sample_kmer_file*.

    Args:
        data: iterable of tab-separated lines — a kmer followed by one or
            more comma-separated sample entries (sample id first).
        kim_file: path to a pickled kmer -> id mapping (read via load_pickle).
        contains_sample_kmer_file: output file path.
        lock: lock serializing writers of the output file; must support the
            context-manager protocol (multiprocessing/threading locks do).
        truths: unused here — kept so the signature matches the sibling
            *_db worker functions.
    """
    kim = load_pickle(kim_file)
    kmer_sample_chunk = []
    for line in data:
        linelist = line.split('\t')
        kmer = linelist[0]
        kmer_sample_lines = []
        for sample_ in linelist[1:]:
            sample = sample_.split(',')
            kmer_sample_lines.append(f'{sample[0]}\t{kim[kmer]}')
            # add some perturbation of sample[1] to the end to get different
            # truth values for different num CNVs
        kmer_sample_chunk.append('\n'.join(kmer_sample_lines))
        # write every 500k to limit memory usage
        if len(kmer_sample_chunk) >= 500000:
            # Use the lock as a context manager so it is released even if
            # write_list raises; the original bare acquire()/release() pair
            # would leave the lock held on an exception and deadlock peers.
            with lock:
                write_list(kmer_sample_chunk, contains_sample_kmer_file)
            kmer_sample_chunk = []
    write_files(lock, (kmer_sample_chunk, contains_sample_kmer_file))
def process(data, lock, pim, kim_file, fsa_file, scored_kmers_file):
    """Translate raw scored lines into id-mapped records and write them out.

    Each whitespace-separated input line carries a kmer index, a pheno
    index, and a score. Indices are resolved through the kim (loaded from
    *kim_file*) and *pim* mappings; the resulting triples go to
    *scored_kmers_file* as tab-separated rows, and a FASTA-style listing
    of the mapped kmers goes to *fsa_file*.
    """
    kim = load_pickle(kim_file)
    chunk = []
    for raw in data:
        fields = raw.split()
        chunk.append((kim[int(fields[0])], pim[int(fields[1])], fields[2]))
    kmers = []
    values = []
    for idx, record in enumerate(chunk):
        kmers.append(f'>{idx}\n{record[0]}')
        values.append('\t'.join(record))
    write_files(lock, (values, scored_kmers_file), (kmers, fsa_file))
def kmer_pheno_db(data, kim_file, value_kmer_pheno_file, truth_kmer_pheno_file, baseline_kmer_pheno_file, lock, truths=None, baseline=None):
    """Expand kmer -> pheno associations into id-mapped rows and append
    them to the value / truth / baseline output files in 500k-row chunks.

    Args:
        data: iterable of tab-separated lines — a kmer followed by one or
            more pheno identifiers.
        kim_file: path to a pickled kmer -> id mapping (read via load_pickle).
        value_kmer_pheno_file: output for every (kmer id, pheno) pair.
        truth_kmer_pheno_file: output for pairs confirmed by *truths*.
        baseline_kmer_pheno_file: output for pairs scored against *baseline*.
        lock: lock serializing writers (passed through to write_files).
        truths: optional truth set consulted via kmer_in_truths.
        baseline: optional baseline set; kmer_in_truths may return a bool
            or a fractional score in (0, 1).
    """
    kim = load_pickle(kim_file)
    kmer_pheno_chunk = []
    # A None chunk tells write_files there is nothing to write for that file.
    if truths:
        truths_chunk = []
    else:
        truths_chunk = None
    if baseline:
        baseline_chunk = []
    else:
        baseline_chunk = None
    for line in data:
        linelist = line.split('\t')
        kmer = linelist[0]
        for pheno in linelist[1:]:
            kmer_pheno_chunk.append(f'{kim[kmer]}\t{pheno}')
            if truths and kmer_in_truths(kmer, truths, pheno):
                truths_chunk.append(f'{kim[kmer]}\t{pheno}')
            if baseline:
                score = kmer_in_truths(kmer, baseline, pheno)
                if score is True:
                    baseline_chunk.append(f'{kim[kmer]}\t{pheno}')
                elif score is not False and score > 0.0 and score < 1.0:
                    # partial match: keep the fractional score alongside
                    baseline_chunk.append(f'{kim[kmer]}\t{pheno}\t{score}')
        # flush every 500k rows to limit memory usage
        if len(kmer_pheno_chunk) >= 500000:
            write_files(lock,
                        (kmer_pheno_chunk, value_kmer_pheno_file),
                        (truths_chunk, truth_kmer_pheno_file),
                        (baseline_chunk, baseline_kmer_pheno_file))
            kmer_pheno_chunk = []
            if truths_chunk is not None:
                truths_chunk = []
            # BUGFIX: baseline_chunk was never reset after a flush, so every
            # subsequent flush re-wrote all previously written baseline rows
            # (duplicate output and unbounded memory growth).
            if baseline_chunk is not None:
                baseline_chunk = []
    write_files(lock,
                (kmer_pheno_chunk, value_kmer_pheno_file),
                (truths_chunk, truth_kmer_pheno_file),
                (baseline_chunk, baseline_kmer_pheno_file))
def filter_kmers(data, thresh, dfdisp, dfnodisp, kmer_sample_file, kmer_pheno_file, lock):
    """Filter kmers by per-antibiotic dispensability frequency and write the
    survivors to the sample and pheno output files.

    Consumes *data* destructively (pop from the end) so processed lines can
    be garbage-collected as it goes.

    Args:
        data: list of tuples — a kmer followed by sample-id tuples whose
            first element indexes into dfdisp / dfnodisp.
        thresh: minimum disp / (disp + nodisp) ratio to keep a kmer.
        dfdisp: per-sample resistant-frequency rows (one value per pheno).
        dfnodisp: per-sample vulnerable-frequency rows.
        kmer_sample_file: output for surviving kmer/sample rows.
        kmer_pheno_file: output for surviving kmer/pheno rows.
        lock: lock serializing writers (passed through to write_files).
    """
    printd('Filtering kmers...')
    kmer_samples = []
    kmer_phenos = []
    # Minimum number of samples a kmer must appear in before it is tested.
    # Loop-invariant, so hoisted out of the loop (was reassigned every
    # iteration); the unused local `nphenos` was also dropped.
    samples_thresh = 5
    while data:
        line = data.pop()
        kmer = line[0]
        # collect resistant/vulnerable frequencies for each antibiotic for
        # this unitig
        disp = sum(dfdisp[sample_id[0]] for sample_id in line[1:])
        nodisp = sum(dfnodisp[sample_id[0]] for sample_id in line[1:])
        # 1 test per antibiotic; unitig needs to pass only 1 to avoid
        # getting filtered out (+.01 guards against division by zero)
        a = np.where((disp + nodisp >= samples_thresh)
                     & (disp / (disp + nodisp + .01) > thresh))[0]
        if a.size == 0:
            continue
        kmer_phenos.append('\t'.join([kmer] + [str(pheno) for pheno in a]))
        kmer_samples.append('\t'.join(map(format_tuple, line)))
        # write every 500K kmers to keep memory consumption under control
        if len(kmer_phenos) >= 500000:
            write_files(lock, (kmer_samples, kmer_sample_file),
                        (kmer_phenos, kmer_pheno_file))
            kmer_samples = []
            kmer_phenos = []
    write_files(lock, (kmer_samples, kmer_sample_file),
                (kmer_phenos, kmer_pheno_file))
    printd('Finished filtering kmers.')