def create_pheno_int_map(phenos, pim_file):
    printd('Creating pheno int map...')
    with open(phenos, 'r') as f:
        line = f.readline()  # only need first line, which contains the headers
    pim = {}
    for i, p in enumerate(line.split()[1:]):  # skip id column
        pim[i] = p
        pim[p] = i
    with open(pim_file, 'wb') as f:
        pickle.dump(pim, f)
    printd('Successfully created pheno int map.')
def create_kmer_int_map(kmers, kim_file):
    printd('Creating kmer int map...')
    kim = {}
    with open(kmers, 'r') as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        u = line.split('\t')[0]
        kim[u] = i
        kim[i] = u
    with open(kim_file, 'wb') as f:
        pickle.dump(kim, f)
    printd('Successfully created kmer int map.')
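# Illustrative only: the pickled int maps above are bidirectional (name -> index
# and index -> name). A minimal sketch of how a downstream step might consume
# one; the path 'pheno_int_map.pkl' and the lookups are hypothetical.
def _load_pheno_int_map_example(pim_file='pheno_int_map.pkl'):
    import pickle
    with open(pim_file, 'rb') as f:
        pim = pickle.load(f)
    name = pim[0]        # column index -> phenotype name
    index = pim[name]    # phenotype name -> column index
    return name, index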
def sample_kmers(data, n, seed=None):
    printd('Sampling kmers...')
    # pick the default seed at call time rather than once at definition time
    # (the original default argument was evaluated only when the module loaded)
    if seed is None:
        seed = randint(1, 100000)
    sample_matrix = np.zeros((n, n))
    rng = Random(seed)
    # sample 5% of the kmers in this chunk
    num_kmers = int(len(data) * 0.05)
    sampled = rng.sample(data, num_kmers)
    for line in sampled:
        samplelist = line[1:]
        for i, s1 in enumerate(samplelist):
            for s2 in samplelist[i:]:
                sample_matrix[s1[0]][s2[0]] += 1
                sample_matrix[s2[0]][s1[0]] += 1
    printd('Finished sampling kmers.')
    return num_kmers, sample_matrix
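# Illustrative only: sample_kmers expects `data` in the shape produced by
# create_kmer_sample_map below, i.e. a list of tuples
# (kmer, (sample_id, count), (sample_id, count), ...), and `n` equal to the
# number of samples. A minimal synthetic call with made-up kmers and ids:
def _sample_kmers_example():
    data = [
        ('ACGTACGTACGT', (0, 2), (1, 1)),   # kmer seen in samples 0 and 1
        ('TTGACCTTGACC', (1, 3), (2, 1)),   # kmer seen in samples 1 and 2
    ] * 20  # repeat so the 5% subsample is non-empty
    num_kmers, sample_matrix = sample_kmers(data, n=3, seed=42)
    return num_kmers, sample_matrix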
def main():
    # load params
    params = get_params()
    project = params['project']
    k = params['k']

    # define file paths
    samples_file = join(project, 'data', 'raw', params['sample'])
    outfile = join(project, 'data', 'preprocessed', 'unique_kmers.txt')
    catted_samples = join(project, 'data', 'preprocessed', 'samples.fa')

    # check if output file exists; if so, do nothing
    if file_exists(outfile):
        exit(0)

    # create catted samples file if it does not exist
    if not file_exists(catted_samples):
        cat_samples(samples_file, catted_samples)

    # multiprocessing queue for transferring data to the main thread
    q = Manager().Queue()

    # invoke process(...) on the catted samples file with kwargs, once per thread
    process_file(process, catted_samples, q=q, k=k)

    # consolidate all threads' counters into a single counter holding all kmers
    counter = Counter()
    while not q.empty():
        counter.update(q.get())
    # fold each kmer's complement into the kmer so only one of the pair is kept
    for kmer in counter.keys():
        comp = complement(kmer)
        if comp in counter:
            comp_count = counter[comp]
            counter[comp] = 0
            counter[kmer] += comp_count
    counter = +counter  # unary plus drops the zeroed-out complements
    printd('Finished consolidating counters.')

    # write counter to file
    write_dict(counter, outfile, sep='\t')

    # remove catted samples file
    if file_exists(catted_samples):
        remove(catted_samples)
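# complement() is referenced above but not defined in this section. In kmer
# pipelines the conventional choice is the reverse complement; the sketch below
# assumes that convention and may differ from the project's real helper.
_COMP = str.maketrans('ACGTacgt', 'TGCAtgca')

def complement(kmer):
    # reverse complement of a DNA kmer, e.g. 'ACCG' -> 'CGGT'
    return kmer.translate(_COMP)[::-1]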
def consolidate(data, k):
    printd('Consolidating chunk...')
    prev_line = data.pop()
    prev_unitig = prev_line[0]
    unitigs = []
    while data:
        line = data.pop()
        this_unitig = line[0]
        # kmers are sequential and the same set of samples contains both kmers
        if prev_unitig[0:k - 1] == this_unitig[-(k - 1):] \
                and len(line) == len(prev_line) \
                and set(line[1:]) == set(prev_line[1:]):
            # extend the growing unitig to the left by one base
            this_unitig = this_unitig[0] + prev_unitig
            line = (this_unitig, *line[1:])
        else:
            unitigs.append(prev_line)
        prev_line = line
        prev_unitig = this_unitig
    # emit the final unitig, which the loop above never appends
    unitigs.append(prev_line)
    printd('Finished consolidating chunk.')
    return unitigs
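# Illustrative only: a worked trace of the consolidation rule above with k=4.
# Because data is popped from the end, 'GTAC' is seen first; 'CGTA' then merges
# with it since 'GTAC'[0:3] == 'GTA' == 'CGTA'[-3:] and both kmers are carried
# by the same samples, giving the unitig 'CGTAC'.
def _consolidate_example():
    data = [
        ('CGTA', (0, 1), (1, 1)),
        ('GTAC', (0, 1), (1, 1)),
    ]
    return consolidate(data, k=4)   # -> [('CGTAC', (0, 1), (1, 1))]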
def filter_kmers(data, thresh, dfdisp, dfnodisp, kmer_sample_file,
                 kmer_pheno_file, lock):
    printd('Filtering kmers...')
    nphenos = dfdisp.shape[1]
    samples_thresh = 5
    kmer_samples = []
    kmer_phenos = []
    while data:
        line = data.pop()
        kmer = line[0]
        # collect resistant/vulnerable frequencies for each antibiotic for
        # this unitig
        disp = sum(dfdisp[sample_id[0]] for sample_id in line[1:])
        nodisp = sum(dfnodisp[sample_id[0]] for sample_id in line[1:])
        # 1 test per antibiotic; unitig needs to pass only 1 to avoid
        # getting filtered out
        a = np.where((disp + nodisp >= samples_thresh)
                     & (disp / (disp + nodisp + .01) > thresh))[0]
        if a.size == 0:
            continue
        kmer_pheno_chunk = [kmer]
        for pheno in a:
            kmer_pheno_chunk.append(str(pheno))
        kmer_phenos.append('\t'.join(kmer_pheno_chunk))
        kmer_samples.append('\t'.join(map(format_tuple, line)))
        # write every 500K kmers to keep memory consumption under control
        if len(kmer_phenos) >= 500000:
            write_files(lock, (kmer_samples, kmer_sample_file),
                        (kmer_phenos, kmer_pheno_file))
            kmer_samples = []
            kmer_phenos = []
    write_files(lock, (kmer_samples, kmer_sample_file),
                (kmer_phenos, kmer_pheno_file))
    printd('Finished filtering kmers.')
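# format_tuple() and write_files() are called above but defined elsewhere. The
# sketches below are assumptions inferred from the call sites only, not the
# project's actual implementations: format_tuple is mapped over a line whose
# first element is the unitig string and whose remaining elements are
# (sample_id, count) pairs, and write_files appends batches of lines to shared
# output files under a lock.
def format_tuple(item):
    # pass strings through; render (sample_id, count) pairs as 'id,count'
    if isinstance(item, str):
        return item
    sample_id, count = item
    return f'{sample_id},{count}'

def write_files(lock, *batches):
    # each batch is (list_of_lines, path); the lock serializes appends across
    # worker processes
    with lock:
        for lines, path in batches:
            if lines:
                with open(path, 'a') as f:
                    f.write('\n'.join(lines) + '\n')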
def create_kmer_sample_map(data, raw, q, k, upper, lower, thresh, dfdisp,
                           dfnodisp, sim, n, lock, kmer_sample_file,
                           kmer_pheno_file):
    printd('Creating kmer sample map...')
    # get all kmers in chunk and complement them
    kmers = {}
    for line in data:
        kmer, count = line.split('\t')
        if kmer_frequency_fails(count, upper, lower):
            continue
        kmers[kmer] = Counter()
    # map all kmers in chunk to samples containing them
    for raw_id, seq in raw.items():
        sample_id = sim.get(raw_id, None)
        if sample_id is None:
            continue
        for contig in seq:
            length = len(contig)
            if length >= k:  # ensure this contig is long enough to sample
                for i in range(length - k + 1):
                    kmer = contig[i:i + k]
                    kmerlist = kmers.get(kmer, None)
                    if kmerlist is not None:
                        kmerlist[sample_id] += 1
                    else:
                        complist = kmers.get(complement(kmer), None)
                        if complist is not None:
                            complist[sample_id] += 1
    kmers = [(key, *v.items()) for key, v in kmers.items()]
    printd('Finished creating kmer sample map.')

    num_kmers, sample_matrix = sample_kmers(kmers, n)
    printd('Putting data in queue')
    q.put((num_kmers, sample_matrix))
    printd('Finished putting data in queue')

    if kmer_sample_file is None and kmer_pheno_file is None:
        return

    # consolidate() will clear the kmers list as it builds the unitigs list,
    # with net 0 memory gain
    kmers = consolidate(kmers, k)
    # filter_kmers() will clear the unitigs list as it builds its output lists,
    # with net 0 memory gain
    filter_kmers(kmers, thresh, dfdisp, dfnodisp, kmer_sample_file,
                 kmer_pheno_file, lock)
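# kmer_frequency_fails() is referenced above but not defined in this section.
# A minimal sketch, assuming it simply drops kmers whose total count falls
# outside the [lower, upper] bounds; the inclusive comparison and the int
# conversion of the raw string count are assumptions.
def kmer_frequency_fails(count, upper, lower):
    count = int(count)
    return count < lower or count > upper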
def consolidate(name, unitigs, outdir):
    unitigs = list(unitigs.iloc[:, 0])
    # attach a reciprocal-rank score to each unitig (rank 1 -> 1.0, rank 2 -> 0.5, ...)
    unitigs = [(u, [1 / (i + 1)]) for i, u in enumerate(unitigs)]
    printd('Original num kmers:', len(unitigs))
    for k in range(30, 20, -1):
        unitigs = consolidate_model(unitigs, k, p=False)
        printd(f'K: {k}, num kmers: {len(unitigs)}')
    printd('Final num kmers:', len(unitigs))
    # score each consolidated unitig by the harmonic mean of its constituent
    # kmers' original ranks, then sort best (lowest) first
    unitigs = [(unitig, 1 / (sum(ranks) / len(ranks)))
               for unitig, ranks in unitigs]
    unitigs = sorted(unitigs, key=lambda x: x[1])
    unitigs = [u[0] for u in unitigs]
    write_list(unitigs, join(outdir, name))
def create_sample_int_map(samples, phenos, sim_file):
    printd('Creating sample int map...')
    sim = {}
    with open(samples, 'r') as f:
        lines = f.readlines()
    phenosdf = pd.read_csv(phenos, sep='\t', index_col=0)
    phenosdf.dropna(how='all', inplace=True)
    droppedsamples = []
    i = 0
    for line in lines[1:]:  # ignore header
        name = line.split('\t')[0]
        if name in phenosdf.index:
            sim[name] = i
            sim[i] = name
            i += 1
        else:
            droppedsamples.append(name)
    with open(sim_file, 'wb') as f:
        pickle.dump(sim, f)
    if len(droppedsamples) > 0:
        printd(('Ignoring samples not present in both'
                f' sample and pheno files: {droppedsamples}'))
    printd('Successfully created sample int map.')
def similar_sample(sample_matrix, num_kmers, similarities_tsv, hist_orig_file,
                   hist_sim_scaled_file, hist_dissim_scaled_file,
                   similarities_file, dissimilarities_file):
    if not file_exists(similarities_tsv):
        # scale the similarities matrix by the mean number of sampled kmers
        # each sample shares with itself, then normalize to [0, 1]. Then remove
        # the diagonal and the lower triangle of the array (since it is
        # symmetric about the major diagonal), and finally round values to 4
        # decimal places.
        mean_shared_w_self = sample_matrix.diagonal().mean()
        sample_matrix /= mean_shared_w_self
        sample_matrix += 0.001  # ensure all values are nonzero
        sample_matrix *= 1.0 / sample_matrix.max()
        np.fill_diagonal(sample_matrix, np.nan)
        sample_matrix = np.triu(sample_matrix)
        sample_matrix = np.round(sample_matrix, 4)  # np.round is not in-place
        df = pd.DataFrame(sample_matrix)
        # dump to tsv file for ease of restoring, and because a tsv file of
        # similarities is a common input to other mGWAS programs
        df.to_csv(similarities_tsv, sep='\t')
    else:
        df = pd.read_csv(similarities_tsv, sep='\t', index_col=0)

    # create similarity histogram and save it
    plt.hist(df.values, facecolor='green')
    plt.savefig(hist_orig_file, dpi=150)
    plt.clf()

    df = df.stack()
    df = df.reset_index()
    df = df[df[0] > 0]  # remove the lower half of the triangle

    # set quantile thresholds; keep only the top 10% (most similar) and the
    # bottom 10% (most dissimilar) of pairs
    highthresh = 0.9
    lowthresh = 0.1
    # find the numeric cutoffs corresponding to those quantiles
    highcutoff = df[0].quantile(highthresh)
    lowcutoff = df[0].quantile(lowthresh)
    # cut off everything in the middle; only keep the very similar and the
    # very dissimilar pairs
    simdf = df[df[0] >= highcutoff].copy(deep=True)
    dissimdf = df[df[0] <= lowcutoff].copy(deep=True)
    dissimdf[0] = 1 - dissimdf[0]
    dfs = (simdf, dissimdf)

    files = ((hist_sim_scaled_file, similarities_file),
             (hist_dissim_scaled_file, dissimilarities_file))
    for i, (pngfile, outfile) in enumerate(files):
        df = dfs[i]
        # determine new min, max, range
        min_ = df[0].min()
        max_ = df[0].max()
        range_ = max_ - min_
        # shift df left by the min so the new min is 0
        df[0] -= min_
        # both branches currently rescale to [0, 0.5] and then shift to [0.5, 1]
        if i == 0:  # high (similar) pairs
            scale_factor = 2
            intercept = 0.5
        else:  # low (dissimilar) pairs
            scale_factor = 2
            intercept = 0.5
        df[0] /= range_ * scale_factor
        df[0] += intercept

        # create histogram of the scaled data and save it
        try:
            plt.hist(df[0], bins=50, facecolor='green')
            plt.savefig(pngfile, dpi=150)
            plt.clf()
        except ValueError:
            printd('Unable to generate histogram of scaled data')

        # write to tsv
        df.to_csv(outfile, sep='\t', index=False, header=False)
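# Illustrative only: how sample_kmers() and similar_sample() fit together. The
# file names below are hypothetical placeholders, not the pipeline's paths.
def _similarity_example(kmers, n_samples):
    num_kmers, sample_matrix = sample_kmers(kmers, n_samples, seed=42)
    similar_sample(sample_matrix, num_kmers,
                   similarities_tsv='similarities.tsv',
                   hist_orig_file='hist_orig.png',
                   hist_sim_scaled_file='hist_sim_scaled.png',
                   hist_dissim_scaled_file='hist_dissim_scaled.png',
                   similarities_file='similarities_scaled.tsv',
                   dissimilarities_file='dissimilarities_scaled.tsv')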