Example #1
0
    def infer(self, fp, steps=1000, fmt='pfamscan', truncate_by=0):
        '''
        Infer a mean-centred (and optionally normalised) vector for a genome.

        One vector is inferred from the annotation file <fp> for every model
        in self.models; the matching entry of self.means is subtracted from
        it and the centred vectors are averaged. <steps>, <fmt> and
        <truncate_by> are passed through to infer_genome_vector().

        Returns a float32 array with a leading axis of length 1, i.e. the
        result is NOT reshaped to a flat vector. If self.norm is 'l2' the row
        is scaled to unit length first.
        '''
        first_ensemble_call = (self.mode == 'ensemble'
                               and not self.warn_on_ensemble_inference)
        if first_ensemble_call:
            eprint(
                '''Warning: Inference w/ a model ensemble will work well if you don't combine the resulting vectors w/ the indexed ones. This is because small variations in the inference will magnify in model ensembles to offset the inferred and indexed vectors by more than they actually differ.
                ''')
            # flip the flag so the warning is emitted only once per instance
            self.warn_on_ensemble_inference = True

        # one centred vector per (model, mean) pair
        centred = [
            infer_genome_vector(fp,
                                model,
                                steps=steps,
                                fmt=fmt,
                                truncate_by=truncate_by) - mu
            for model, mu in zip(self.models, self.means)
        ]

        # ensemble vector, wrapped in a list and cast to float32 for the
        # norm and the index search
        ensemble = np.array([np.mean(centred, axis=0)], dtype='float32')

        if self.norm != 'l2':
            return ensemble

        # The result is deliberately left with its leading axis of length 1
        # (no reshape); note that this differs from the flat vectors stored
        # for the trained genomes, which matters when combining the two.
        # On the diff btw/ (100,) and (100, 1) see
        # stackoverflow.com/questions/22053050
        return normalize(ensemble, norm=self.norm, axis=1)
Example #2
0
def search(annotation, topn, models, mode, taxonomy, out):
    '''
    Usage:

    \b
    nanotext search \\
        --embedding embedding.genomes.model --topn 3 --out - \\
        --genome .../tara/annotation/TARA_ION_MAG_00012/orfs.domtbl.tsv
    # Loading model ...
    # Inferring genome vector ...
    # GCA_000634215.1 0.9344
    # GCF_000759935.1 0.9282
    # GCF_000759855.1 0.9276
    # Done.
    '''
    # NOTE(review): the docstring above is kept verbatim because it looks
    # like it is rendered as command-line help (note the "\b" marker); the
    # option names it shows were not checked against the actual CLI options.
    #
    # Infers a vector for <annotation> with a GenomeModel built from <models>
    # in the given <mode>, writes the <topn> hits as "name<TAB>similarity"
    # lines to <out> and, if <taxonomy> is given, writes the taxonomy of the
    # hits to "taxonomy.tsv" in the current working directory.
    from nanotext.classes import GenomeModel
    from nanotext.io import smart_open, eprint
    from nanotext.utils import get_taxa_from_names

    eprint('Loading model ...')
    model = GenomeModel(models, mode=mode, norm='l2')
    v = model.infer(annotation, fmt='pfamscan', steps=1000)

    # The hits are walked twice below (output + taxonomy lookup), so make
    # sure they are a re-iterable sequence.
    sim = list(model.search(v, topn))
    with smart_open(out) as fh:
        for name, cos in sim:
            fh.write(f'{name}\t{round(float(cos), 4)}\n')

    if taxonomy:
        # A comprehension instead of `names, _ = zip(*sim)`: the latter
        # raises ValueError when there is not a single hit.
        names = [name for name, _ in sim]
        df = get_taxa_from_names(taxonomy, names)
        df.to_csv('taxonomy.tsv', sep='\t', index=None)

    eprint('Done.')
Example #3
0
def index_model(names, models, norm='l2'):
    '''
    Average the vectors of several models per name and build a faiss index.

    For every entry of <names> the vectors found under that key in each of
    <models> are averaged (with a single model this is the original vector).
    A name that is missing from at least one model is skipped and counted.

    norm:
        - falsy: vectors are indexed as they are with an L2 (Euclidean) index
        - 'l2':  vectors are scaled to unit length and indexed with an inner
                 product index (the inner product of two unit length vectors
                 is their cosine similarity)
        - anything else raises ValueError

    Returns (found, db, index): the names that were indexed, the float32
    matrix of their (possibly normalised) vectors in the same order, and the
    faiss index holding them.

    Raises ValueError for an unsupported <norm>, for an empty <models> and
    when none of <names> is present in all models.

    To normalize or not to normalize:

    - stats.stackexchange.com/questions/177905
    - stackoverflow.com/questions/36034454

    Usage:

    fp = f'{base}/models/{n}/nanotext_r89.model'
    model3 = load_embedding(fp)
    m3 = subtract_mean(model3)
    found, m, index = index_model(names, [m1, m2, m3], norm=norm)
    '''
    # Validate the cheap things first so that a bad call fails immediately
    # instead of after all vectors have been averaged.
    if norm and norm != 'l2':
        raise ValueError('This norm is not supported, abort!')
    if len(models) == 0:
        raise ValueError('At least one model is required to build an index.')

    import faiss
    import numpy as np
    from sklearn.preprocessing import normalize

    from nanotext.io import eprint

    m = []
    found, notfound = [], 0

    # first take mean of vectors ...
    for i in names:
        try:
            model_vv = [model[i] for model in models]
        except KeyError:
            # missing from at least one model -> skip the name entirely
            notfound += 1
            continue

        found.append(i)
        m.append(np.sum(model_vv, axis=0) / len(model_vv))

    if not m:
        # Without a single vector the dimension of the index is unknown.
        raise ValueError(
            'None of the requested names is present in all models.')

    db = np.array(m, dtype='float32')
    dim = db.shape[1]  # dimensions

    # ... then normalize
    if not norm:
        index = faiss.IndexFlatL2(dim)
    else:
        index = faiss.IndexFlatIP(dim)
        db = normalize(db, norm=norm, axis=1)

    index.add(db)
    if notfound > 0:
        fraction = round(notfound / len(names), 4)
        eprint(f'{notfound} entries ({fraction}) not found.')
    return found, db, index
Example #4
0
def get_taxa_from_names(db, names):
    '''Given a list of IDs return GTDB taxonomy

    Looks up <names> in the "accession_redux" column of the "metadata" table
    of the database <db> and returns a DataFrame with one row per name that
    was found and the columns

        name domain phylum class order family genus species

    The order of <names> is preserved, e.g. if they are sorted by distance.
    Names without a record are reported on stderr and left out. If nothing
    is found, an empty DataFrame (without columns) is returned.
    '''
    import pandas as pd

    from nanotext.io import dbopen, eprint

    names = list(names)
    # Query each ID only once; dict.fromkeys() keeps the first-seen order.
    queries = list(dict.fromkeys(names))

    # Stay well below SQLite's limit on bound parameters per statement
    # (999 in older builds).
    batch = 500

    rows = []
    if queries:
        with dbopen(db) as cursor:
            for start in range(0, len(queries), batch):
                chunk = queries[start:start + batch]
                # A single "?" cannot stand for a whole list, but one
                # placeholder per value can be generated. Binding the values
                # avoids interpolating a Python tuple repr into the SQL
                # (which needed a hack for the single-name case and breaks
                # on names containing quotes).
                # NOTE(review): assumes dbopen() yields a cursor using the
                # "?" (qmark) parameter style, as sqlite3 does -- confirm.
                marks = ', '.join('?' * len(chunk))
                statement = (
                    'SELECT accession_redux, gtdb_taxonomy FROM metadata '
                    f'WHERE accession_redux IN ({marks})')
                cursor.execute(statement, chunk)
                rows.extend(cursor.fetchall())

    taxa = {}
    for name, taxon in rows:
        # 'd__Bacteria;p__Cyanobacteriota;c__Cyanobacteriia;[...]'
        taxa[name] = [name] + [j.split('__')[1] for j in taxon.split(';')]

    # order of names is preserved, e.g. when ordered by distance
    found, notfound = [], []
    for i in names:
        taxon = taxa.get(i)
        if taxon:
            found.append(taxon)
        else:
            notfound.append(i)

    if notfound:
        eprint(f'Did not find {len(notfound)} out of {len(names)} queries:')
        eprint(notfound)

    df = pd.DataFrame.from_records(found)
    if not df.empty:
        # only label the columns if there is at least one record
        df.columns = (
            'name domain phylum class order family genus species'.split())
    return df
Example #5
0
 def subset(self, names):
     '''
     Given a list of names, return a dict of name: vector

     Names that are not part of self.embedding are skipped; how many were
     skipped is reported through eprint(). The returned dict is a new
     object, the stored vectors themselves are not copied.
     '''
     # Read straight from the stored mapping: it is only looked up, never
     # modified, so copying all of it on every call is wasted work.
     lookup = self.embedding
     embedding = {}
     notfound = 0
     for i in names:
         try:
             embedding[i] = lookup[i]
         except KeyError:
             notfound += 1
     eprint(f'{notfound} records not found')
     return embedding
Example #6
0
def predict(genome, embedding, db, model, out, topn):
    '''
    From a <genome> w/ annotated protein domains predict a phenotype. Requires
    the learned <model> (genotype-phenotype mapping) as well as a genome
    <embedding>. Return the closest <topn> vectors from a database <db>.

    Usage:

    \b
    nanotext predict \\
        --out - \\
        --model data/media_prediction.h5 \\
        --db data/embedding.media.json \\
        --embedding data/embedding.genomes.model \\
        --genome data/TARA_ION_MAG_00012.domtbl.tsv \\
        --topn 3
    '''
    # Steps: infer a vector for <genome>, map it through the predictive
    # <model>, compare the prediction against the vectors in the JSON file
    # <db> using cosim2() and write "name<TAB>cosine" lines to <out>.
    import json
    import os

    # The log level has to be in the environment BEFORE tensorflow is
    # imported; setting it afterwards (as was done previously) comes too
    # late to silence the messages emitted while the library loads.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # turn off debugging info

    import numpy as np
    import tensorflow as tf  # noqa: F401 -- not used directly in here
    from keras.models import load_model

    from nanotext.io import load_embedding, smart_open, eprint
    from nanotext.utils import infer_genome_vector

    eprint('Loading embedding model for genomes ...')
    e = load_embedding(embedding)
    eprint('Inferring genome vector ...')
    v = infer_genome_vector(genome, e)

    eprint('Loading media vector database ...')
    with open(db, 'r') as file:
        vv = json.load(file)  # vv .. vectors

    eprint('Loading predictive model ...')
    nn = load_model(model)

    y_hat = nn.predict(np.array([v]))[0]  # [0] .. only single genome for now
    sim = cosim2(y_hat, vv, topn)
    with smart_open(out) as fh:
        for name, cos in sim:
            fh.write(f'{name}\t{round(cos, 4)}\n')
    eprint('Done.')
Example #7
0
def evaluate(model, corpus, ecotypes, outfile):
    '''
    Run a small test battery and write the scores to <outfile>:

    - SOMO task -- find the domain in a sequence that does not fit
    - Ecotype task -- separate niche-specific subpopulations of misc species

    Each line of <outfile> is a tab-separated "score, (sub)task" pair.
    Nothing is returned. The ecotype tables are read from
    <ecotypes>/<task>.tsv.
    '''
    ecotype_tasks = ['vibrio', 'prochlorococcus', 'pseudomonas']

    with open(outfile, 'w+') as out:

        # (1) SOMO works on the corpus and the model as passed in
        eprint('SOMO task ...')
        somo_score = odd_one(corpus, model, n_not_odd=5)
        out.write(str(somo_score) + '\tSOMO\n')

        # (2) the ecotype tasks need the loaded embedding
        eprint('Ecotype task ...')
        loaded = load_embedding(model)
        for task in ecotype_tasks:
            _, eco = load_ecotypes(f'{ecotypes}/{task}.tsv')
            # keep only the second element of each value pair
            labels = {key: second for key, (_, second) in eco.items()}
            scores = ecotype_task(labels, loaded)
            for subtask, score in scores.items():
                if subtask == 'NA':
                    continue
                out.write(f'{score}\t{subtask}\n')

        eprint('Evaluation done.')
Example #8
0
def train(corpus, config, out, threads):
    '''
    Usage:

    \b
    nanotext train \\
        --corpus corpus.txt \\
        --out nanotext.model \\
        --params config.json

    '''
    # Trains a Doc2Vec model on <corpus> using <threads> workers and saves it
    # to <out>. Hyperparameters are read from the JSON file <config>; without
    # one, the preselected defaults below are used.
    if config:
        with open(config, 'r') as file:
            params = json.load(file)
    else:
        eprint('Will use preselected parameters ...')
        params = dict(
            vector_size=100,
            hs=0,
            negative=5,
            min_count=3,
            sample=0.001,
            window=10,
            dm=0,
            dbow_words=1,
            epochs=10,
            alpha=0.025,
            min_alpha=0.0001,
            seed=42,
        )

    # TODO: use <corpus_file> arg
    # https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Any2Vec_Filebased.ipynb
    # model = Doc2Vec(corpus_file=corpus, workers=threads, **params)
    model = Doc2Vec(workers=threads, **params)
    eprint('Setup:', model)

    # the same stream object is handed to both the vocabulary scan and the
    # training run
    eprint('Building vocabulary ...')
    documents = CorpusStream(corpus)
    model.build_vocab(documents)

    eprint('Training starts ...')
    model.train(documents,
                total_examples=model.corpus_count,
                epochs=model.epochs)

    model.save(out)
Example #9
0
def get_vectors(l, model, normalized=False):
    '''Get array of document vectors given an ID list and a model

    Every ID in <l> is looked up in model.docvecs; IDs that are missing are
    skipped and their share is reported on stderr.

    Returns (found, m): the IDs that were found and a float64 array holding
    their vectors in the same order. With <normalized> each row is scaled to
    unit (l2) length. If no ID is found at all, the array is empty and no
    normalization is attempted.
    '''
    import numpy as np
    from sklearn.preprocessing import normalize

    from nanotext.io import eprint

    m, found = [], []
    cnt = 0

    for i in l:
        try:
            m.append(model.docvecs[i])
            found.append(i)
        except KeyError:
            cnt += 1

    m = np.array(m, dtype='float64')
    # normalize() rejects an empty (1-D) array, so skip it when there is
    # nothing to scale.
    if normalized and found:
        m = normalize(m, norm='l2', axis=1)
    if cnt:
        # Report an actual percentage: the message is labelled "%", but the
        # plain fraction used to be printed.
        percent = round(100 * cnt / len(l), 2)
        eprint(f'{cnt} entries missing ({percent} %)')
    return found, m
Example #10
0
def evaluate(model, corpus, outfile, clusters):
    '''
    A test battery whose results are written as JSON to <outfile>:

    - SOMO -- tests word embedding quality
    - Ecotype task -- homogeneity, completeness and v-score of the cluster
      labels in the table <clusters> against the HL/LL clades
    - Median cosine distance btw/ the genomes of the two clades; stored as
      1 - median under the key "ecotypes_distance"

    <clusters> is a tab-separated table with a header line; column 3 holds
    the genome UID, column 4 the clade and column 7 the cluster label
    (counting from 1). Rows with the noise label -1 are ignored.

    Ideas for further tasks:

    - king queen
    - various clusterings w/ associated tables
        - e coli
        - clostridia
        - chlamydia
        - prochlorococcus

    closest genomes prochlorococcus or Tara
    '''
    results = {}

    # SOMO task -- tests word embedding quality
    eprint('SOMO task ...')
    SOMO = odd_one(corpus, model, n_not_odd=5)
    results['SOMO'] = SOMO

    # Ecotype task -- tests document embedding quality
    eprint('Ecotype task ...')
    truth, cluster_labels = [], []
    d = defaultdict(list)

    with open(clusters, 'r') as file:
        _ = next(file)  # header
        for line in file:
            row = line.strip().split('\t')
            clade = row[3]
            cluster = row[6]
            # The label is read from text, so it has to be compared to the
            # string '-1'. Comparing to the integer -1 is always unequal and
            # therefore never filtered out the noise points.
            if (clade in ['HL', 'LL']) and (cluster != '-1'):
                d[clade].append(row[2])  # genome UID
                truth.append(clade)
                cluster_labels.append(cluster)
    h, c, v = [round(i, 4) for i in hcv(truth, cluster_labels)]
    results.update(
        dict(zip('homogeneity completeness vscore'.split(), [h, c, v])))

    # Distance
    eprint('Median cosine distance btw/ points of different ecotypes ...')
    model = load_embedding(model)
    l = []
    for i, j in product(d['HL'], d['LL']):
        try:
            a = model.docvecs[i]
            b = model.docvecs[j]
        except KeyError:
            # at least one of the two genomes is not part of the model
            continue
        l.append(cosine(a, b))

    results['ecotypes_distance'] = round(float(1 - np.median(l)), 4)

    with open(outfile, 'w+') as out:
        json.dump(results, out, indent=4)
Example #11
0
    def __init__(self, fp, mode='ensemble', norm=None, names=None):
        '''
        Load one or more embedding models below the directory <fp> and index
        their (mean-subtracted, averaged) document vectors.

        mode:  which model subdirectories to load -- 'ensemble' loads '22',
               '45' and '93', 'core' only '93' and 'accessory' only '22';
               each is expected to contain "nanotext_r89.model". Any other
               value raises ValueError.
        norm:  passed on to index_model(); also used by later inference.
        names: the entries to index; defaults to all entries of the first
               model.
        '''
        self.mode = mode
        if mode == 'ensemble':
            nn = ['22', '45', '93']
        elif mode == 'core':
            nn = ['93']
        elif mode == 'accessory':
            nn = ['22']
        else:
            raise ValueError(
                'Mode not implemented (try "ensemble", "core" or "accessory")')

        self.models = []
        for n in nn:
            p = Path(fp) / f'{n}/nanotext_r89.model'
            self.models.append(load_embedding(str(p)))

        self.norm = norm
        eprint('Subtracting mean from model(s) ...')
        self.nomean = self._demean(self.models)

        # Always set self.names: previously it was only assigned when no
        # names were passed, so passing <names> ended in an AttributeError
        # a few lines further down.
        if names:
            self.names = names
        else:
            self.names = self.models[0].docvecs.index2entity

        self.dim = len(self.models[0].docvecs[0])

        eprint('Indexing model(s) ...')
        if norm:
            eprint(f'{self.norm} norm will be applied to vectors')
        found, m, self.index = index_model(self.names, list(self.nomean),
                                           self.norm)
        self.embedding = dict(zip(found, m))

        # Per-model mean over all document vectors; kept so that it can be
        # subtracted from vectors that are inferred later on.
        self.means = []
        for model in self.models:
            mu = np.mean([model.docvecs[i] for i in range(len(model.docvecs))],
                         axis=0)
            self.means.append(mu)

        self.warn_on_ensemble_inference = False
Example #12
0
def cluster_subset(model, rank, name, taxonomy, outfile, soft, ecotypes,
                   projection_method):
    '''
    Project and cluster the document vectors of one taxon and write the
    result as a tab-separated table to <outfile>.

    TODO: https://github.com/lmcinnes/umap/issues/90

    Iterate over a taxonomic rank such as class and cluster using HDBSCAN. One
    reason we believe we can do this is that at higher ranks there are clear
    boundaries between organisms. The main motivation behind it is that
    HDBSCAN clusters are rather coarse when the whole dataset is clustered at
    once, and "soft-clustering" does not scale.

    Arguments:

    - model: path of the embedding, handed to load_embedding()
    - rank, name: the taxon to select, e.g. "class" and "Clostridia"; only
      the first letter of <rank> is used to build the search string
    - taxonomy: tab-separated taxonomy file, the ID is in the first column
    - outfile: where the table is written to
    - soft: stored in <args> but not used in this function
    - ecotypes: optional tab-separated file (w/ header) of curated genomes;
      if given, those genomes are added to the sample and the columns
      clade, subclade and collection are added to the output
    - projection_method: "tsne" or "umap" (lower case) for the 2D
      coordinates; anything else aborts w/ exit code -1

    python ~/Dropbox/repos_git/nanotext/nanotext/workflows/train_nanotext/scripts/cluster.py -m nanotext_r89.model --name Clostridia --taxonomy /Users/phi/data_local/databases/gtdb/bac_taxonomy_r83.tsv -o clusters.tsv
    '''
    # args = Namespace(
    #     model='nanotext_r89.model',
    #     rank='class',
    #     name='Oxyphotobacteria',
    #     taxonomy='/Users/phi/data_local/databases/gtdb/bac_taxonomy_r83.tsv',
    #     outfile='clusters.tsv',
    #     soft=False,
    #     )

    # Bundle the arguments so that the body reads like the original script
    # that worked on parsed command line arguments.
    args = Namespace(model=model,
                     rank=rank,
                     name=name,
                     taxonomy=taxonomy,
                     outfile=outfile,
                     soft=soft,
                     ecotypes=ecotypes)
    # The bare string below is only a note listing the UMAP defaults.
    '''
    umap.UMAP(
    ['n_neighbors=15', 'n_components=2', "metric='euclidean'", 'n_epochs=None', 'learning_rate=1.0', "init='spectral'", 'min_dist=0.1', 'spread=1.0', 'set_op_mix_ratio=1.0', 'local_connectivity=1.0', 'repulsion_strength=1.0', 'negative_sample_rate=5', 'transform_queue_size=4.0', 'a=None', 'b=None', 'random_state=None', 'metric_kwds=None', 'angular_rp_forest=False', 'target_n_neighbors=-1', "target_metric='categorical'", 'target_metric_kwds=None', 'target_weight=0.5', 'transform_seed=42', 'verbose=False'],)
    '''
    # UMAP settings for the 2D coordinates written to the output
    config_umap_visualisation = {
        'metric': 'cosine',
        'n_components': 2,
        # 'n_neighbors': 5,
        # min_dist=0.05,
        # 'spread': 5,
        'random_state': 42,
    }

    # UMAP settings for the 10D reduction computed before clustering
    config_umap_dim_reduction = {
        'metric': 'cosine',
        'n_components': 10,
        # 'n_neighbors': 10,
        # 'min_dist': 0.05,
        # 'spread': 5,
        'random_state': 42,
    }

    # min_cluster_size 3 leaf works great
    config_hdbscan = {
        'min_cluster_size': 5,
        # 'min_samples': 1,
        'cluster_selection_method': 'eom',
    }

    # Filter taxonomy file for a given rank
    # (plain substring match on e.g. "c__Clostridia" anywhere in the line)
    names = []
    with open(args.taxonomy, 'r') as file:
        for line in file:
            if f'{args.rank[0]}__{args.name}' in line:  # d__ for domain etc.
                names.append(line.strip().split('\t')[0])
    eprint(f'There are {len(names)} data points for {args.rank} {args.name}.')

    # Extract only those vectors
    # (IDs that are not part of the model are skipped silently)
    model = load_embedding(args.model)
    m, found = [], []
    for i in names:
        try:
            m.append(model.docvecs[strip_name(i)])
            found.append(strip_name(i))
        except KeyError:
            continue

    if args.ecotypes:
        # genome -> (ecotype, subtype, curator); an ecotype w/ an underscore
        # is treated as a subtype and its prefix as the ecotype
        ecotypes = {}
        with open(args.ecotypes) as csvfile:
            _ = next(csvfile)
            reader = csv.reader(csvfile, delimiter='\t')
            for row in reader:
                genome, e, curator = row[0], row[-2], row[-1]
                if '_' in e:
                    subtype = e
                    e = e.split('_')[0]
                else:
                    subtype = 'NA'
                ecotypes[genome] = (e, subtype, curator)

        # Extend sample list
        # TODO: this will be unnecessary once we have the r89 tax
        for i in ecotypes.keys():
            try:
                m.append(model.docvecs[i])
                found.append(i)
            except KeyError:
                continue

    m = np.array(m, dtype='float64')
    # NOTE(review): m may include the ecotype genomes added above while
    # len(names) does not, so this ratio can exceed 100; it also divides by
    # zero if no line of the taxonomy file matched.
    ratio = int(round(m.shape[0] / len(names), 2) * 100)
    eprint(f'Of those, {m.shape[0]} ({ratio}%) are present in the model.')

    # the upper-cased name is only used for the message, the comparison
    # below is against the lower-case spelling
    pm = projection_method.upper()
    eprint(f'Projecting points (visualisation) using {pm} ...')
    if projection_method == 'tsne':
        projection = TSNE(n_components=2, random_state=42).fit_transform(m)
    elif projection_method == 'umap':
        reducer = umap.UMAP(**config_umap_visualisation)
        projection = reducer.fit_transform(m)
        # projection[-10:] == reducer.embedding_[-10:]
    else:
        eprint('No valid projection method. Abort!')
        sys.exit(-1)

    eprint('Projecting points (dimension reduction) ...')
    # Reduce dimensions before clustering
    reducer = umap.UMAP(**config_umap_dim_reduction)
    m_redux = reducer.fit_transform(m)

    eprint('Clustering ...')
    m_norm = normalize(m_redux, norm='l2', axis=1)
    # prediction_data=True, eom/ leaf
    clusterer = hdbscan.HDBSCAN(**config_hdbscan)
    # NOTE(review): the clustering runs on the 2D <projection>; m_redux and
    # m_norm are computed above but not used afterwards -- confirm whether
    # fit_predict(m_norm) was intended.
    cluster_labels = clusterer.fit_predict(projection)
    # Or soft clustering: init clusterer w/ prediction_data=True
    # soft_clusters = hdbscan.all_points_membership_vectors(clusterer)
    # cluster_labels = [np.argmax(x) for x in soft_clusters]

    # One row per point: 2D coordinates, ID and cluster label, plus the
    # curated columns if an ecotype table was given.
    if args.ecotypes:
        with open(args.outfile, 'w+') as out:
            out.write('c1\tc2\tname\tclade\tsubclade\tcollection\tcluster\n')

            for i, j, k in zip(projection, found, cluster_labels):
                c1, c2 = i
                # genomes w/o a curated record get NA, NA and "GTDB"
                clade, subclade, curator = ecotypes.get(
                    j, 2 * ['NA'] + ['GTDB'])
                out.write(
                    f'{c1}\t{c2}\t{j}\t{clade}\t{subclade}\t{curator}\t{k}\n')
    else:
        with open(args.outfile, 'w+') as out:
            out.write('c1\tc2\tname\tcluster\n')

            for i, j, k in zip(projection, found, cluster_labels):
                c1, c2 = i
                out.write(f'{c1}\t{c2}\t{j}\t{k}\n')

    eprint('Done.')
Example #13
0
def taxonomy(query, taxonomy, embedding, topn, outfile, fmt, steps):
    '''
    Given a query vector, get the <n> closest vectors and their taxonomy and
    then report their <raw> taxonomy or use <majority vote> to identify the
    most likely (?) one.

    Usage:

    \b
    nanotext taxonomy \\
        --embedding nanotext_r89.model --taxonomy bac_taxonomy_r86.tsv \\
        --query JFOD01_pfam.tsv --fmt pfamscan --topn 10 -o results.json

    '''
    # What the code below actually does:
    #
    # 1. infer a vector for <query> and fetch its <topn> nearest neighbours
    # 2. find the lowest rank at which all neighbours w/ a taxonomy record
    #    agree
    # 3. collect every genome of that taxon from the <taxonomy> file and
    #    project their vectors into 2D w/ t-SNE
    # 4. write a tab-separated table to <outfile>: name, c1, c2, the seven
    #    ranks and the similarity to the query ('NA' for genomes that are
    #    not among the neighbours)
    #
    # Raises ValueError if the neighbours do not agree at any rank.
    #
    # TODO: new fmt (name, cos, ranks) and a 2nd output w/ a majority vote
    # across columns.
    from collections import defaultdict

    import numpy as np
    from sklearn.manifold import TSNE

    from nanotext.io import load_taxonomy_gtdb, load_embedding, eprint
    from nanotext.utils import infer_genome_vector, strip_name

    ranks = [
        'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']

    db = load_taxonomy_gtdb(taxonomy)
    model = load_embedding(embedding)

    v_query = infer_genome_vector(query, model, fmt=fmt, steps=steps)
    sim = model.docvecs.most_similar([v_query], topn=topn)

    taxcollector = {i: [] for i in ranks}
    distance = {}

    for name, cos_sim in sim:
        distance[name] = round(cos_sim, 4)
        try:
            lineage = db[name]
        except KeyError:
            eprint(f'{name} has no taxonomy record')
            continue
        for k, v in zip(ranks, lineage):
            taxcollector[k].append(v)

    # What is the last uniform rank?
    cache = ()
    for i in ranks:
        if (len(set(taxcollector[i])) == 1) and (taxcollector[i][0] != ''):
            cache = (i, taxcollector[i][0])

    if not cache:
        # Happens if no neighbour has a taxonomy record or if they already
        # differ at the first rank; previously this surfaced as an
        # IndexError on the empty tuple.
        raise ValueError(
            'The closest vectors do not share a taxon at any rank, abort!')

    # Collect the UIDs for this rank.
    eprint(f'Will collect all vectors for {cache[0]} {cache[1]} ...')
    # d__ for domain etc.
    # NOTE(review): this is a plain substring match, so e.g. p__Firmicutes
    # also selects lines w/ p__Firmicutes_A -- confirm that this is wanted.
    needle = f'{cache[0][0]}__{cache[1]}'
    names = []
    with open(taxonomy, 'r') as file:
        for line in file:
            if needle in line:
                names.append(line.strip().split('\t')[0])

    # Collect the associated document vector for each UID.
    m, found = [], []
    for name in names:
        uid = strip_name(name)
        try:
            m.append(model.docvecs[uid])
        except KeyError:
            continue
        found.append(uid)

    # Project into 2D. Only t-SNE is used; the UMAP reducer that used to be
    # set up here was never applied and has been removed together w/ its
    # log message.
    m = np.array(m, dtype='float64')
    eprint(f'Projecting with TSNE ...')
    projection = TSNE(n_components=2, random_state=42).fit_transform(m)

    results = defaultdict(list)
    for i, j in zip(found, projection):
        results[i].extend(j)
        results[i].extend(db[i])

    # Add distance info.
    for k in results:
        results[k].append(distance.get(k, 'NA'))

    with open(outfile, 'w+') as out:
        out.write('\t'.join(
            'name c1 c2 domain phylum class order family genus species cos'.split())+'\n')
        for k, v in results.items():
            line = '\t'.join([str(i) for i in [k]+v])+'\n'
            out.write(line)
Example #14
0
    'min_samples': 1,
    'cluster_selection_method': 'eom',
}

# Load the embedding and take every document it contains.
model = load_embedding(args.model)
names = model.docvecs.index2entity

# https://github.com/scikit-learn-contrib/hdbscan
# because t-SNE for clustering is no good
# stats.stackexchange.com/questions/308132
# Stack the document vectors into one float64 matrix, one row per name.
m = []
for i in names:  # names .. just a list of accession IDs
    m.append(model.docvecs[i])
m = np.array(m, dtype='float64')

eprint('Projecting points (dimension reduction) ...')
# Reduce dimensions before clustering
reducer = umap.UMAP(**config_umap_dim_reduction)
m_redux = reducer.fit_transform(m)
# Scale each reduced vector to unit length, then cluster the scaled vectors.
m_norm = normalize(m_redux, norm='l2', axis=1)
clusterer = hdbscan.HDBSCAN(**config_hdbscan)
cluster_labels = clusterer.fit_predict(m_norm)

# We now want to extract the clustering hierarchy from the clusterer object
# (i.e. the tree).
# condensed tree to newick
# stackoverflow.com/questions/46444454
g = clusterer.condensed_tree_.to_networkx()

# Find root. The root of a tree has an indegree of 0.
# stackoverflow.com/questions/4122390