def gen_kmer_profile(path_reads,
                     path_model,
                     path_kmers,
                     k,
                     verbose=True,
                     v=10000):
    """Build per-sample k-mer count profiles from a FASTA read file.

    Scans every read in ``path_reads`` and counts each length-``k``
    window that (a) consists only of A/C/G/T and (b) is in the Word2Vec
    model's vocabulary. Pickles three structures to ``path_kmers``:

    * ``sample_profile``: {sample: {kmer: count}}
    * ``kmer_counter``: corpus-wide relative k-mer frequencies
    * ``read_counter``: {sample: total matched k-mers}

    Sample ids are taken from FASTA headers as the text before the first
    underscore.

    Parameters
    ----------
    path_reads : str
        Path to the (possibly compressed) FASTA file.
    path_model : str
        Path to a saved gensim Word2Vec model.
    path_kmers : str
        Output path for the pickled result dict.
    k : int
        K-mer length.
    verbose : bool
        Print progress every ``v`` reads.
    v : int
        Progress-report interval (in reads).
    """
    model = Word2Vec.load(path_model)
    model_wv = model.wv
    del model  # keep only the vectors; frees the rest of the model

    # Valid nucleotides; anything else (N, ambiguity codes, ...) is dropped.
    alphabet = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T'}

    open_file = emb.open_file_method(path_reads)
    in_file = open_file(path_reads)

    kmer_counter = {kmer: 0 for kmer in model_wv.vocab}
    read_counter = {}
    sample_profile = {}

    sample = None
    v_counter = 0
    t_total = 0
    t1 = time.time()
    for line in in_file:

        l = line.decode('utf-8').strip('\n')
        if not l:
            # Skip blank lines (previously raised IndexError on l[0]).
            continue
        if l[0] == '>':
            # FASTA header: sample id is the prefix before the first '_'.
            l = l.strip('>')
            sample = l[:l.find('_')]
        else:

            if verbose:
                if v_counter % v == 0:
                    t2 = time.time()
                    t_diff = (t2 - t1) / 60
                    t_total += t_diff
                    print(
                        'Processing read %s. Last batch: %.3f minutes. Total time: %.3f hours.'
                        % (v_counter, t_diff, t_total / 60))
                    t1 = time.time()
                v_counter += 1

            M = len(l) - k + 1
            for n in range(M):
                # Keep only A/C/G/T characters of this window; a window
                # containing any invalid character yields a short k-mer
                # and is discarded by the length check below.
                window = l[n:n + k]
                kmer = ''.join(nt for nt in window if nt in alphabet)
                if len(kmer) == k and kmer in model_wv:
                    kmer_counter[kmer] += 1
                    if sample not in read_counter:
                        # First matched k-mer for this sample.
                        read_counter[sample] = 1
                        sample_profile[sample] = {kmer: 1}
                    else:
                        read_counter[sample] += 1
                        profile = sample_profile[sample]
                        profile[kmer] = profile.get(kmer, 0) + 1

    in_file.close()  # BUGFIX: input handle was never closed

    # Normalize corpus-wide counts to relative frequencies.
    total_kmers = np.sum([count for count in kmer_counter.values()])
    kmer_counter = {
        kmer: count / total_kmers
        for kmer, count in kmer_counter.items()
    }

    print('\nSaving results.')
    # BUGFIX: context manager ensures the output file is closed.
    with open(path_kmers, 'wb') as out_file:
        six.moves.cPickle.dump(
            {
                'sample_profile': sample_profile,
                'kmer_counter': kmer_counter,
                'read_counter': read_counter
            }, out_file)
def embed_samples(path,
                  model,
                  samp_ids,
                  k=None,
                  a=1e-4,
                  n_components=1,
                  path_match=None,
                  checks=False,
                  verbose=True,
                  v=2500):
    """Compute frequency-weighted sample embeddings from per-read k-mer lists.

    Each line of ``path`` is a space-separated list of k-mers for one read.
    A read vector is the ``a / (a + f_kmer)``-weighted average of the
    in-vocabulary k-mer word vectors; sample vectors average their reads'
    vectors. Finally the top ``n_components`` principal component(s) are
    projected out of the sample matrix.

    Parameters
    ----------
    path : str
        Path to the (possibly compressed) k-mer file, one read per line.
    model : trained gensim Word2Vec model.
    samp_ids : dict
        Maps line index -> sample identifier.
    k : unused; kept for interface compatibility.
    a : float
        Weighting constant: frequent k-mers are down-weighted.
    n_components : int
        Number of principal components removed at the end.
        NOTE(review): the broadcast in the removal step only lines up for
        n_components == 1 — confirm before using larger values.
    path_match : str or None
        If given, per-read matched-k-mer counts are written (gzip) here.
    checks : bool
        Track and report max absolute embedding values.
    verbose : bool
        Print progress every ``v`` reads.

    Returns
    -------
    np.ndarray of shape (model.layer1_size, n_samples).
    """
    file_match = None
    if path_match is not None:
        # BUGFIX: was `gzip.open(path_matches, 'r')` — undefined name and
        # read mode, although the handle is written to below.
        file_match = gzip.open(path_match, 'wb')

    samp_ids_counts = collections.Counter(samp_ids.values())
    samp_idx = {samp: i for i, samp in enumerate(set(samp_ids.values()))}
    n_samps = len(samp_idx)

    # Corpus-wide relative k-mer frequencies used for the weighting.
    f_kmers = [(w, model.wv.vocab[w].count) for w in model.wv.vocab]
    n_corpus = np.sum([count for _, count in f_kmers])
    f_kmers = {kmer: count / n_corpus for kmer, count in f_kmers}

    file_open = emb.open_file_method(path)
    file = file_open(path)

    wemb_samp = np.zeros((model.layer1_size, n_samps), dtype='float64')

    if verbose:
        t1 = time.time()
        t_total = 0
        print('Beginning file sweep.\n')

    wemb_read_check = 0
    wemb_samp_check = 0
    for i, line in enumerate(file):

        kmers = line.decode('utf-8').split(' ')

        if len(kmers) > 0:

            # Drop the trailing token (newline artifact) and keep only
            # k-mers present in the model vocabulary.
            kmers = [kmer for kmer in kmers[:-1] if kmer in model.wv]

            samp_id = samp_ids[i]

            wemb_read = np.zeros(model.layer1_size, dtype='float64')
            wemb_read_count = 0

            for kmer in kmers:
                wemb_read += model.wv[kmer] * a / (a + f_kmers[kmer])
                wemb_read_count += 1

            if file_match is not None:
                match = str(wemb_read_count) + ','
                # BUGFIX: was `match.encode` (bound method, not bytes).
                file_match.write(match.encode())

            if wemb_read_count > 0:
                # BUGFIX: guard against reads with no in-vocabulary k-mers,
                # which previously divided by zero and injected NaNs.
                wemb_read /= wemb_read_count
                wemb_samp[:, samp_idx[samp_id]] += (
                    wemb_read / samp_ids_counts[samp_id])

            if checks:
                wemb_read_tmp = np.max(np.abs(wemb_read))
                if wemb_read_tmp > wemb_read_check:
                    wemb_read_check = wemb_read_tmp
                wemb_samp_tmp = np.max(np.abs(wemb_samp[:, samp_idx[samp_id]]))
                if wemb_samp_tmp > wemb_samp_check:
                    wemb_samp_check = wemb_samp_tmp

            if verbose:
                if i % v == 0:
                    t2 = time.time()
                    t_diff = (t2 - t1) / 60
                    t_total += t_diff / 60
                    print(
                        'Processed read %s/%s in %.2f minutes (Total: %.2f hours).'
                        % (str(i), str(len(samp_ids)), t_diff, t_total))
                    if checks:
                        print('Max w-read = %.3f\nMax w-samp = %.3f\n' %
                              (wemb_read_check, wemb_samp_check))
                    t1 = time.time()

    if file_match is not None:
        # BUGFIX: previously closed unconditionally -> NameError when
        # path_match is None.
        file_match.close()

    if verbose:
        print('Performing SVD on weighted embedding matrix (%s, %s)' %
              (wemb_samp.shape[0], wemb_samp.shape[1]))

    # Remove the dominant principal component(s) of the sample matrix.
    svd = TruncatedSVD(n_components=n_components, n_iter=7, random_state=0)
    svd.fit(wemb_samp)
    pc = svd.components_
    wemb_samp -= wemb_samp.dot(pc.T) * pc

    return wemb_samp
# Example #3
# 0
# NOTE(review): top-level script fragment — `data_dir`, `name`, `k`,
# `reads_fn`, `v` and `open_file_method` are defined outside this excerpt,
# and the read loop below is truncated mid-body.

# Output paths: pickled read ids and gzipped k-mer csv for this dataset/k.
ids_fn = os.path.join(data_dir, name + '_' + str(k) + '_ids.pkl')
kmers_fn = os.path.join(data_dir, name + '_' + str(k) + '_kmers.csv.gz')

# Refuse to overwrite existing outputs.
if os.path.exists(ids_fn):
    print('%s already exists' % (ids_fn))
    sys.exit()
if os.path.exists(kmers_fn):
    print('%s already exists' % (kmers_fn))
    sys.exit()

# Valid nucleotide characters.
alphabet = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T'}

print(name + ':\tLoading reads.')

# `open_file_method` presumably picks an opener for the file format
# (gzip vs plain) — confirm in its definition.
open_file = open_file_method(reads_fn)
in_file = open_file(reads_fn)

out_kmers = gzip.open(kmers_fn, 'w')

ids = []
read_idx = 0
t1 = time.time()

for line in in_file:
    l = line.decode('utf-8').strip('\n')
    if l[0] == '>':
        # FASTA header: record the read id and report progress every v reads.
        ids.append(l.strip('>'))
        read_idx += 1
        if read_idx % v == 0:
            t_diff = str(round((time.time() - t1) / 60, 1)) + ' min.'
# Example #4
# 0
    # NOTE(review): fragment — the `if` header guarding this first branch is
    # outside the excerpt; `name`, `k`, `v` and `emb` are defined elsewhere,
    # and the read loop at the end is truncated mid-body.
    reads_fn = 'gg_13_5.fasta.gz'
if name == 'kegg':
    reads_fn = 'reference_seqs.fna.gz'
if name == 'query':  # query_hmp
    reads_fn = 'seqs.fna.gz'
if name == 'query_oral':
    reads_fn = 'seqs_oral.fna.gz'

# Output paths: pickled read ids and gzipped k-mer csv for this dataset/k.
ids_fn = name + '_' + str(k) + '_ids.pkl'
kmers_fn = name + '_' + str(k) + '_kmers.csv.gz'

# Valid nucleotide characters.
alphabet = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T'}

print(name + ':\tLoading reads.')

# Pick an opener appropriate for the file (emb helper — confirm semantics).
open_file = emb.open_file_method(reads_fn)
in_file = open_file(reads_fn)

out_kmers = gzip.open(kmers_fn, 'w')

ids = []
read_idx = 0
t1 = time.time()

for line in in_file:
    l = line.decode("utf-8").strip('\n')
    if l[0] == '>':
        # FASTA header: record the read id and report progress every v reads.
        ids.append(l.strip('>'))
        read_idx += 1
        if read_idx % v == 0:
            t_diff = str(round((time.time() - t1) / 60, 1)) + ' min.'
# Example #5
# 0
    # NOTE(review): fragment — the enclosing scope and the path variables
    # (kegg_fn_in, kegg_fn, query_fn_in, query_fn, *_ids_fn) are defined
    # outside this excerpt; the nearest-neighbor section is truncated.

    # Copy the KEGG reference file, retrying until source and destination
    # sizes match (guards against an incomplete copy).
    while True:
        print('Copying ' + kegg_fn_in)
        copyfile(kegg_fn_in,kegg_fn)
        if os.path.getsize(kegg_fn_in) == os.path.getsize(kegg_fn):
            break

    # Same retry-copy for the query file.
    while True:
        print('Copying ' + query_fn_in)
        copyfile(query_fn_in,query_fn)
        if os.path.getsize(query_fn_in) == os.path.getsize(query_fn):
            break

    # Read ids pickled earlier under the 'ids' key.
    kegg_ids = six.moves.cPickle.load(open(kegg_ids_fn,'rb'))['ids']
    query_ids = six.moves.cPickle.load(open(query_ids_fn,'rb'))['ids']

    file_open = emb.open_file_method(kegg_fn)
    kegg_file = file_open(kegg_fn)

    # Per-reference-read k-mer lists, their unique sets, and their counts.
    print('Calculating reference counts.')
    r_lines = [line.decode('utf-8').split() for line in kegg_file]
    r_kmers = [set(line) for line in r_lines]
    r_counts = [Counter(line) for line in r_lines]

    file_open = emb.open_file_method(query_fn)
    query_file = file_open(query_fn)

    # Accumulators for the nearest-neighbor search below (truncated).
    nn_count = dict()
    nn = dict()

    print('Finding nearest neighbors')
    t1 = time.time()
# Example #6
# 0
    # NOTE(review): fragment — the enclosing scope and the path variables
    # (ref_fn_in, ref_fn, query_fn_in, query_fn, r_ids_fn, q_ids_fn) are
    # defined outside this excerpt; `def worker` is truncated mid-body.

    # Copy the reference file, retrying until source and destination sizes
    # match (guards against an incomplete copy).
    while True:
        print('Copying ' + ref_fn_in)
        copyfile(ref_fn_in, ref_fn)
        if os.path.getsize(ref_fn_in) == os.path.getsize(ref_fn):
            break

    # Same retry-copy for the query file.
    while True:
        print('Copying ' + query_fn_in)
        copyfile(query_fn_in, query_fn)
        if os.path.getsize(query_fn_in) == os.path.getsize(query_fn):
            break

    # Read ids pickled earlier under the 'ids' key.
    r_ids = six.moves.cPickle.load(open(r_ids_fn, 'rb'))['ids']
    q_ids = six.moves.cPickle.load(open(q_ids_fn, 'rb'))['ids']

    file_open = emb.open_file_method(ref_fn)
    r_file = file_open(ref_fn)

    # Per-reference-read k-mer lists, their unique sets, and their counts.
    print('Calculating reference counts.')
    r_lines = [line.decode('utf-8').split() for line in r_file]
    r_kmers = [set(line) for line in r_lines]
    r_counts = [Counter(line) for line in r_lines]

    file_open = emb.open_file_method(query_fn)
    q_file = file_open(query_fn)

    print('Finding nearest neighbors')

    # Worker for the (truncated) parallel nearest-neighbor search.
    def worker(lines):

        result = {