import time

import numpy as np
import six
from gensim.models import Word2Vec

import emb  # project-local helpers (open_file_method, etc.)


def gen_kmer_profile(path_reads, path_model, path_kmers, k, verbose=True, v=10000):
    """Count model-vocabulary k-mers per sample in a FASTA file and pickle the profiles."""
    model = Word2Vec.load(path_model)
    model_wv = model.wv
    del model  # free the full model; only the vectors and vocab are needed
    alphabet = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T'}  # valid nucleotides
    open_file = emb.open_file_method(path_reads)
    in_file = open_file(path_reads)
    kmer_counter = {kmer: 0 for kmer in model_wv.vocab}
    read_counter = {}
    sample_profile = {}
    v_counter = 0
    t_total = 0
    t1 = time.time()
    for line in in_file:
        l = line.decode('utf-8').strip('\n')
        if l[0] == '>':
            # Header line: the sample ID is the part before the first underscore.
            l = l.strip('>')
            sample = l[:l.find('_')]
        else:
            if verbose and v_counter % v == 0:
                t2 = time.time()
                t_diff = (t2 - t1) / 60
                t_total += t_diff
                print('Processing read %s. Last batch: %.3f minutes. '
                      'Total time: %.3f hours.' % (v_counter, t_diff, t_total / 60))
                t1 = time.time()
            v_counter += 1
            # Slide a window of length k across the read.
            M = len(l) - k + 1
            for n in range(M):
                nts = l[n:n + k]
                kmer = ''
                for nt in nts:
                    try:
                        kmer += alphabet[nt]
                    except KeyError:
                        # Ambiguous base (e.g. N): the k-mer comes out short
                        # and fails the length check below.
                        continue
                if len(kmer) == k and kmer in model_wv:
                    kmer_counter[kmer] += 1
                    try:
                        read_counter[sample] += 1
                    except KeyError:
                        # First in-vocabulary k-mer for this sample.
                        read_counter[sample] = 1
                        sample_profile[sample] = {kmer: 1}
                        continue
                    try:
                        sample_profile[sample][kmer] += 1
                    except KeyError:
                        sample_profile[sample][kmer] = 1
    # Normalize the global k-mer counts to frequencies.
    total_kmers = np.sum([count for count in kmer_counter.values()])
    kmer_counter = {kmer: count / total_kmers
                    for kmer, count in kmer_counter.items()}
    print('\nSaving results.')
    six.moves.cPickle.dump(
        {'sample_profile': sample_profile,
         'kmer_counter': kmer_counter,
         'read_counter': read_counter},
        open(path_kmers, 'wb'))
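# A minimal usage sketch for gen_kmer_profile, assuming a Word2Vec model of
# k-mers has already been trained and saved; the model and output paths below
# are hypothetical.
if __name__ == '__main__':
    gen_kmer_profile(
        path_reads='seqs.fna.gz',       # FASTA(.gz) whose '>sample_read' headers carry the sample ID
        path_model='kmers_w2v.model',   # hypothetical path to a saved gensim Word2Vec model
        path_kmers='kmer_profile.pkl',  # output pickle with sample_profile/kmer_counter/read_counter
        k=6)                            # k-mer length; 6 is only an example value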
import collections
import gzip
import time

import numpy as np
from sklearn.decomposition import TruncatedSVD

import emb


def embed_samples(path, model, samp_ids, k=None, a=1e-4, n_components=1,
                  path_match=None, checks=False, verbose=True, v=2500):
    """Build SIF-weighted sample embeddings from a file of k-merized reads."""
    if path_match is not None:
        file_match = gzip.open(path_match, 'wb')
    samp_ids_counts = collections.Counter(samp_ids.values())
    samp_idx = {samp: i for i, samp in enumerate(set(samp_ids.values()))}
    n_samps = len(samp_idx)
    # Corpus-wide k-mer frequencies, used in the SIF weights a / (a + f).
    f_kmers = [(w, model.wv.vocab[w].count) for w in model.wv.vocab]
    n_corpus = np.sum([count for _, count in f_kmers])
    f_kmers = {kmer: count / n_corpus for kmer, count in f_kmers}
    file_open = emb.open_file_method(path)
    file = file_open(path)
    wemb_samp = np.zeros((model.layer1_size, n_samps), dtype='float64')
    if verbose:
        t1 = time.time()
        t_total = 0
        print('Beginning file sweep.\n')
    wemb_read_check = 0
    wemb_samp_check = 0
    for i, line in enumerate(file):
        kmers = line.decode('utf-8').split(' ')
        if len(kmers) > 0:
            # Drop the trailing newline token and out-of-vocabulary k-mers.
            kmers = [kmer for kmer in kmers[:-1] if kmer in model.wv]
            samp_id = samp_ids[i]
            wemb_read = np.zeros(model.layer1_size, dtype='float64')
            wemb_read_count = 0
            for kmer in kmers:
                # Smooth inverse-frequency (SIF) weighting: rare k-mers count more.
                wemb_read += model.wv[kmer] * a / (a + f_kmers[kmer])
                wemb_read_count += 1
            if path_match is not None:
                match = str(wemb_read_count) + ','
                file_match.write(match.encode())
            if wemb_read_count > 0:  # guard against reads with no in-vocab k-mers
                wemb_read /= wemb_read_count
                # Average the read embedding into its sample's column.
                wemb_samp[:, samp_idx[samp_id]] += (wemb_read
                                                    / samp_ids_counts[samp_id])
            if checks:
                wemb_read_tmp = np.max(np.abs(wemb_read))
                if wemb_read_tmp > wemb_read_check:
                    wemb_read_check = wemb_read_tmp
                wemb_samp_tmp = np.max(np.abs(wemb_samp[:, samp_idx[samp_id]]))
                if wemb_samp_tmp > wemb_samp_check:
                    wemb_samp_check = wemb_samp_tmp
        if verbose and i % v == 0:
            t2 = time.time()
            t_diff = (t2 - t1) / 60
            t_total += t_diff / 60
            print('Processed read %s/%s in %.2f minutes (Total: %.2f hours).'
                  % (str(i), str(len(samp_ids)), t_diff, t_total))
            if checks:
                print('Max w-read = %.3f\nMax w-samp = %.3f\n'
                      % (wemb_read_check, wemb_samp_check))
            t1 = time.time()
    if path_match is not None:
        file_match.close()
    if verbose:
        print('Performing SVD on weighted embedding matrix (%s, %s)'
              % (wemb_samp.shape[0], wemb_samp.shape[1]))
    # Remove the leading principal component(s), as in the SIF scheme.
    svd = TruncatedSVD(n_components=n_components, n_iter=7, random_state=0)
    svd.fit(wemb_samp)
    pc = svd.components_
    wemb_samp -= wemb_samp.dot(pc.T) * pc
    return wemb_samp
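# A usage sketch for embed_samples, assuming the reads were already tokenized
# into one space-separated k-mer line per read (see the scripts below) and
# that samp_ids maps line index -> sample ID; the file names are illustrative.
model = Word2Vec.load('kmers_w2v.model')  # hypothetical model path
samp_ids = {i: read_id[:read_id.find('_')] for i, read_id in enumerate(ids)}
wemb = embed_samples('query_6_kmers.csv.gz', model, samp_ids)
# wemb is (embedding_dim x n_samples); the a / (a + f) weighting and the
# principal-component removal follow the SIF scheme of Arora et al.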
# data_dir, name, k, reads_fn, and v are defined earlier in this script (not shown).
ids_fn = os.path.join(data_dir, name + '_' + str(k) + '_ids.pkl')
kmers_fn = os.path.join(data_dir, name + '_' + str(k) + '_kmers.csv.gz')
if os.path.exists(ids_fn):
    print('%s already exists' % (ids_fn))
    sys.exit()
if os.path.exists(kmers_fn):
    print('%s already exists' % (kmers_fn))
    sys.exit()

alphabet = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T'}

print(name + ':\tLoading reads.')
open_file = open_file_method(reads_fn)
in_file = open_file(reads_fn)
out_kmers = gzip.open(kmers_fn, 'w')
ids = []
read_idx = 0
t1 = time.time()
for line in in_file:
    l = line.decode('utf-8').strip('\n')
    if l[0] == '>':
        # Header line: record the read ID; report progress every v reads.
        ids.append(l.strip('>'))
        read_idx += 1
        if read_idx % v == 0:
            t_diff = str(round((time.time() - t1) / 60, 1)) + ' min.'
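# open_file_method is a project helper (elsewhere called as emb.open_file_method)
# whose definition is not shown in these excerpts. A plausible minimal sketch,
# assuming it simply dispatches on the .gz suffix:
import gzip

def open_file_method(path):
    """Return an opener for path: gzip.open for .gz files, built-in open otherwise."""
    return gzip.open if path.endswith('.gz') else open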
# name, k, and v are defined earlier in this script (not shown).
reads_fn = 'gg_13_5.fasta.gz'  # default: Greengenes 13_5 reference sequences
if name == 'kegg':
    reads_fn = 'reference_seqs.fna.gz'
if name == 'query':  # query_hmp
    reads_fn = 'seqs.fna.gz'
if name == 'query_oral':
    reads_fn = 'seqs_oral.fna.gz'
ids_fn = name + '_' + str(k) + '_ids.pkl'
kmers_fn = name + '_' + str(k) + '_kmers.csv.gz'

alphabet = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T'}

print(name + ':\tLoading reads.')
open_file = emb.open_file_method(reads_fn)
in_file = open_file(reads_fn)
out_kmers = gzip.open(kmers_fn, 'w')
ids = []
read_idx = 0
t1 = time.time()
for line in in_file:
    l = line.decode('utf-8').strip('\n')
    if l[0] == '>':
        ids.append(l.strip('>'))
        read_idx += 1
        if read_idx % v == 0:
            t_diff = str(round((time.time() - t1) / 60, 1)) + ' min.'
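# Both loading scripts above are cut off before the sequence branch of the
# read loop. A self-contained sketch of the tokenization step they build
# toward, assuming each read becomes one line of space-separated overlapping
# k-mers and that k-mers containing bases outside ACGT are skipped; the name
# kmerize is hypothetical.
def kmerize(seq, k, alphabet=frozenset('ACGT')):
    """Return the overlapping k-mers of seq whose bases all lie in alphabet."""
    return [seq[n:n + k] for n in range(len(seq) - k + 1)
            if set(seq[n:n + k]) <= alphabet]

# e.g., inside the read loop:
#   out_kmers.write((' '.join(kmerize(l, k)) + '\n').encode('utf-8'))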
# kegg_fn_in, kegg_fn, query_fn_in, query_fn, and the *_ids_fn paths are
# defined earlier in this script (not shown).
# Copy the input files locally, retrying until the sizes match.
while True:
    print('Copying ' + kegg_fn_in)
    copyfile(kegg_fn_in, kegg_fn)
    if os.path.getsize(kegg_fn_in) == os.path.getsize(kegg_fn):
        break
while True:
    print('Copying ' + query_fn_in)
    copyfile(query_fn_in, query_fn)
    if os.path.getsize(query_fn_in) == os.path.getsize(query_fn):
        break

kegg_ids = six.moves.cPickle.load(open(kegg_ids_fn, 'rb'))['ids']
query_ids = six.moves.cPickle.load(open(query_ids_fn, 'rb'))['ids']

file_open = emb.open_file_method(kegg_fn)
kegg_file = file_open(kegg_fn)

print('Calculating reference counts.')
# One entry per reference read: its k-mer list, k-mer set, and multiset counts.
r_lines = [line.decode('utf-8').split() for line in kegg_file]
r_kmers = [set(line) for line in r_lines]
r_counts = [Counter(line) for line in r_lines]

file_open = emb.open_file_method(query_fn)
query_file = file_open(query_fn)

nn_count = dict()
nn = dict()
print('Finding nearest neighbors')
t1 = time.time()
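# The neighbor-search loop itself is truncated in this excerpt. A hedged
# sketch of one plausible scoring step, assuming each query read is assigned
# the reference with the largest min-count k-mer overlap; the query line here
# is illustrative only.
q_counts = Counter('AAGT AGTC GTCA AAGT'.split())  # one query read's k-mers
scores = [sum(min(q_counts[km], r_count[km]) for km in q_counts)
          for r_count in r_counts]
best = max(range(len(scores)), key=scores.__getitem__)
# kegg_ids[best] would then be the nearest reference for this query read.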
while True:
    print('Copying ' + ref_fn_in)
    copyfile(ref_fn_in, ref_fn)
    if os.path.getsize(ref_fn_in) == os.path.getsize(ref_fn):
        break
while True:
    print('Copying ' + query_fn_in)
    copyfile(query_fn_in, query_fn)
    if os.path.getsize(query_fn_in) == os.path.getsize(query_fn):
        break

r_ids = six.moves.cPickle.load(open(r_ids_fn, 'rb'))['ids']
q_ids = six.moves.cPickle.load(open(q_ids_fn, 'rb'))['ids']

file_open = emb.open_file_method(ref_fn)
r_file = file_open(ref_fn)

print('Calculating reference counts.')
r_lines = [line.decode('utf-8').split() for line in r_file]
r_kmers = [set(line) for line in r_lines]
r_counts = [Counter(line) for line in r_lines]

file_open = emb.open_file_method(query_fn)
q_file = file_open(query_fn)

print('Finding nearest neighbors')

def worker(lines):
    result = {
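# The worker body is cut off above. A hedged sketch of how it might be
# completed and run under multiprocessing, assuming the same min-count-overlap
# score as the serial version; the chunking and return format are assumptions.
from multiprocessing import Pool

def worker_sketch(lines):
    """Map each (query index, decoded k-mer line) pair to its best reference."""
    result = {}
    for i, line in lines:
        q_counts = Counter(line.split())
        scores = [sum(min(q_counts[km], r_count[km]) for km in q_counts)
                  for r_count in r_counts]
        best = max(range(len(scores)), key=scores.__getitem__)
        result[q_ids[i]] = (r_ids[best], scores[best])
    return result

# chunks = ...  # batches of (index, decoded line) pairs from q_file
# with Pool() as pool:
#     results = pool.map(worker_sketch, chunks)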