def infer(self, fp, steps=1000, fmt='pfamscan', truncate_by=0):
    if (self.mode == 'ensemble') and (not self.warn_on_ensemble_inference):
        eprint(
            '''Warning: Inference w/ a model ensemble only works well if you
don't combine the resulting vectors w/ the indexed ones. Small variations in
the inference are magnified across the ensemble and offset the inferred and
indexed vectors by more than they actually differ.
''')
        self.warn_on_ensemble_inference = True  # print only once

    bag = []
    for model, mu in zip(self.models, self.means):
        v = infer_genome_vector(
            fp, model, steps=steps, fmt=fmt, truncate_by=truncate_by)
        v_ = v - mu
        bag.append(v_)

    ve = np.mean(bag, axis=0)             # ensemble vector
    ve = np.array([ve], dtype='float32')  # cast for norm and index search

    if self.norm == 'l2':
        ve = normalize(ve, norm=self.norm, axis=1)
        # Note: w/o a .reshape(self.dim) the shape is (1, dim), not (dim,)
        # like the model's vectors; this causes problems when we want to
        # combine trained and inferred vectors. On the difference between
        # (100,) and (100, 1) see stackoverflow.com/questions/22053050
    return ve
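# A minimal sketch of combining an inferred vector w/ the indexed ones
# (single-model mode only, see the ensemble warning above): infer() returns
# shape (1, dim), so flatten it before mixing it w/ the (dim,)-shaped vectors
# in self.embedding. <model> is an assumed GenomeModel instance.
v = model.infer('orfs.domtbl.tsv', fmt='pfamscan')  # shape (1, dim)
v_flat = v.reshape(-1)                              # shape (dim,)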
def search(annotation, topn, models, mode, taxonomy, out):
    '''
    Usage:

    \b
    nanotext search \\
        --embedding embedding.genomes.model --topn 3 --out - \\
        --genome .../tara/annotation/TARA_ION_MAG_00012/orfs.domtbl.tsv

    # Loading model ...
    # Inferring genome vector ...
    # GCA_000634215.1 0.9344
    # GCF_000759935.1 0.9282
    # GCF_000759855.1 0.9276
    # Done.
    '''
    from nanotext.classes import GenomeModel
    from nanotext.io import load_embedding, smart_open, eprint
    from nanotext.utils import infer_genome_vector, get_taxa_from_names

    eprint('Loading model ...')
    model = GenomeModel(models, mode=mode, norm='l2')

    eprint('Inferring genome vector ...')
    v = model.infer(annotation, fmt='pfamscan', steps=1000)
    sim = model.search(v, topn)

    with smart_open(out) as fh:
        for name, cos in sim:
            fh.write(f'{name}\t{round(float(cos), 4)}\n')

    if taxonomy:
        names, _ = zip(*sim)
        df = get_taxa_from_names(taxonomy, names)
        df.to_csv('taxonomy.tsv', sep='\t', index=None)

    eprint('Done.')
def index_model(names, models, norm='l2'):
    '''
    To normalize or not to normalize:

    - stats.stackexchange.com/questions/177905
    - stackoverflow.com/questions/36034454

    Usage:

        fp = f'{base}/models/{n}/nanotext_r89.model'
        model3 = load_embedding(fp)
        m3 = subtract_mean(model3)
        found, m, index = index_model(names, [m1, m2, m3], norm=norm)
    '''
    import faiss
    import numpy as np
    from sklearn.preprocessing import normalize

    from nanotext.io import eprint

    m = []
    found, notfound = [], 0

    # First take the mean of each entry's vectors across models ...
    for i in names:
        model_vv = []
        try:
            for model in models:
                model_vv.append(model[i])
        except KeyError:
            notfound += 1
            continue
        sum_ = np.sum(model_vv, axis=0) / len(model_vv)
        found.append(i)
        m.append(sum_)
        # If only one model is present, this returns the original vector.

    db = np.array(m, dtype='float32')
    dim = db.shape[1]  # number of dimensions

    # ... then normalize.
    if not norm:
        index = faiss.IndexFlatL2(dim)
    elif norm == 'l2':
        index = faiss.IndexFlatIP(dim)
        db = normalize(db, norm=norm, axis=1)
        # The inner product (IP) of two unit-length vectors equals their
        # cosine similarity.
    else:
        raise ValueError('This norm is not supported, abort!')

    index.add(db)

    if notfound > 0:
        fraction = round(notfound / len(names), 4)
        eprint(f'{notfound} entries ({fraction}) not found.')
    return found, db, index
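# A minimal sketch of querying the index returned by index_model(), assuming
# norm='l2' (so inner product == cosine similarity); the query here simply
# reuses the first indexed vector for illustration.
import numpy as np
from sklearn.preprocessing import normalize

q = normalize(db[:1], norm='l2', axis=1).astype('float32')  # shape (1, dim)
D, I = index.search(q, 5)  # similarities and row indices, each shape (1, 5)
hits = [(found[ix], float(sim)) for ix, sim in zip(I[0], D[0])]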
def get_taxa_from_names(db, names):
    '''Given a list of IDs, return the GTDB taxonomy.

    The order of <names> is preserved, e.g. if they are sorted by distance.
    '''
    import pandas as pd

    from nanotext.utils import strip_name
    from nanotext.io import dbopen, eprint

    with dbopen(db) as cursor:
        # Cannot use placeholders here, see
        # stackoverflow.com/questions/31277027
        # Needs to be a tuple, otherwise OperationalError: no such table ...
        n = tuple(names)
        # Hack that covers the case of a single query: tuple([1]) -> (1,)
        # and the trailing comma causes an SQL syntax error.
        if len(n) == 1:
            n = n * 2
        statement = (
            'SELECT accession_redux, gtdb_taxonomy '
            f'FROM metadata WHERE accession_redux IN {n}')
        cursor.execute(statement)
        l = cursor.fetchall()

    # Order of names is preserved, e.g. when ordered by distance.
    taxa = {}
    for name, taxon in l:
        # 'd__Bacteria;p__Cyanobacteriota;c__Cyanobacteriia;[...]'
        taxa[name] = [name] + [j.split('__')[1] for j in taxon.split(';')]

    found, notfound = [], []
    for i in names:
        taxon = taxa.get(i, None)
        if taxon:
            found.append(taxon)
        else:
            notfound.append(i)

    if notfound:
        eprint(f'Did not find {len(notfound)} out of {len(names)} queries:')
        eprint(notfound)

    df = pd.DataFrame.from_records(found)  # preserves order
    if df.empty:  # no query found -- len(pd.DataFrame.from_records([]))
        return df
    else:
        df.columns = 'name domain phylum class order family genus species'.split()
        return df
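# A minimal usage sketch; the SQLite file name is illustrative -- any database
# w/ a <metadata> table holding <accession_redux> and <gtdb_taxonomy> works.
df = get_taxa_from_names(
    'gtdb_metadata.sqlite', ['GCA_000634215.1', 'GCF_000759935.1'])
# columns: name domain phylum class order family genus species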
def subset(self, names):
    '''Given a list of names, return a dict of name: vector.'''
    d = self.embedding.copy()

    embedding = {}
    notfound = 0
    for i in names:
        try:
            v = d[i]
            embedding[i] = v
        except KeyError:
            notfound += 1
    eprint(f'{notfound} records not found')
    return embedding
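# A minimal usage sketch, assuming <model> is an indexed GenomeModel instance;
# the accession IDs are illustrative.
vectors = model.subset(['GCA_000634215.1', 'GCF_000759935.1'])
# vectors maps each found accession to its indexed (mean-subtracted) vector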
def predict(genome, embedding, db, model, out, topn):
    '''
    From a <genome> w/ annotated protein domains, predict a phenotype.
    Requires the learned <model> (genotype-phenotype mapping) as well as a
    genome <embedding>. Returns the closest <topn> vectors from a database
    <db>.

    Usage:

    \b
    nanotext predict \\
        --out - \\
        --model data/media_prediction.h5 \\
        --db data/embedding.media.json \\
        --embedding data/embedding.genomes.model \\
        --genome data/TARA_ION_MAG_00012.domtbl.tsv \\
        --topn 3
    '''
    import json
    import os

    import numpy as np

    # Set this before TensorFlow/Keras is first imported, otherwise the
    # debugging info is not turned off.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    from keras.models import load_model

    from nanotext.io import load_embedding, smart_open, eprint
    # Note: cosim2 is assumed to live in nanotext.utils next to
    # infer_genome_vector.
    from nanotext.utils import infer_genome_vector, cosim2

    eprint('Loading embedding model for genomes ...')
    e = load_embedding(embedding)

    eprint('Inferring genome vector ...')
    v = infer_genome_vector(genome, e)

    eprint('Loading media vector database ...')
    with open(db, 'r') as file:
        vv = json.load(file)  # vv .. vectors

    eprint('Loading predictive model ...')
    nn = load_model(model)
    y_hat = nn.predict(np.array([v]))[0]  # [0] .. only a single genome for now

    sim = cosim2(y_hat, vv, topn)
    with smart_open(out) as fh:
        for name, cos in sim:
            fh.write(f'{name}\t{round(cos, 4)}\n')
    eprint('Done.')
def evaluate(model, corpus, ecotypes, outfile):
    '''
    A test battery:

    - SOMO task -- find the domain in a sequence that does not fit
    - Ecotype task -- separate niche-specific subpopulations of misc. species

    Returns a list of accuracy/(sub)task pairs.
    '''
    with open(outfile, 'w+') as out:
        # (1)
        eprint('SOMO task ...')
        SOMO = odd_one(corpus, model, n_not_odd=5)
        out.write(f'{str(SOMO)}\tSOMO\n')

        # (2)
        eprint('Ecotype task ...')
        m = load_embedding(model)
        for task in ['vibrio', 'prochlorococcus', 'pseudomonas']:
            rank, eco = load_ecotypes(f'{ecotypes}/{task}.tsv')
            d = {k: v2 for k, (v1, v2) in eco.items()}
            for k, v in ecotype_task(d, m).items():
                if k != 'NA':
                    out.write(f'{v}\t{k}\n')
    eprint('Evaluation done.')
def train(corpus, config, out, threads):
    '''
    Usage:

    \b
    nanotext train \\
        --corpus corpus.txt \\
        --out nanotext.model \\
        --params config.json
    '''
    if not config:
        eprint('Will use preselected parameters ...')
        params = {
            'vector_size': 100,
            'hs': 0,
            'negative': 5,
            'min_count': 3,
            'sample': 0.001,
            'window': 10,
            'dm': 0,
            'dbow_words': 1,
            'epochs': 10,
            'alpha': 0.025,
            'min_alpha': 0.0001,
            'seed': 42,
        }
    else:
        with open(config, 'r') as file:
            params = json.load(file)

    # TODO: use the <corpus_file> argument, see
    # github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Any2Vec_Filebased.ipynb
    # model = Doc2Vec(corpus_file=corpus, workers=threads, **params)
    model = Doc2Vec(workers=threads, **params)
    eprint('Setup:', model)

    eprint('Building vocabulary ...')
    stream = CorpusStream(corpus)
    model.build_vocab(stream)

    eprint('Training starts ...')
    _ = model.train(
        stream, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(out)
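# A minimal sketch of using the trained model directly w/ gensim; the file
# name and the domain tokens are illustrative.
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load('nanotext.model')
doc = ['PF00009', 'PF03144', 'PF03764']         # protein domains as "words"
v = model.infer_vector(doc, steps=1000)
hits = model.docvecs.most_similar([v], topn=3)  # closest training genomes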
def get_vectors(l, model, normalized=False):
    '''Get an array of document vectors given an ID list and a model.'''
    import numpy as np
    from sklearn.preprocessing import normalize

    from nanotext.io import eprint

    m, found = [], []
    cnt = 0
    for i in l:
        try:
            m.append(model.docvecs[i])
            found.append(i)
        except KeyError:
            cnt += 1
    m = np.array(m, dtype='float64')

    if normalized:
        m = normalize(m, norm='l2', axis=1)
    if cnt:
        eprint(f'{cnt} entries missing ({round(cnt/len(l)*100, 4)}%)')
    return found, m
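# A minimal usage sketch: w/ L2-normalized rows, the pairwise cosine
# similarity matrix is simply the dot product (IDs are illustrative).
found, m = get_vectors(
    ['GCA_000634215.1', 'GCF_000759935.1'], model, normalized=True)
cos = m @ m.T  # shape (len(found), len(found))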
def evaluate(model, corpus, outfile, clusters):
    '''
    A test battery:

    - SOMO
    - king/queen analogies
    - various clusterings w/ associated tables:
      E. coli, Clostridia, Chlamydia, Prochlorococcus
    - closest genomes (Prochlorococcus or Tara)
    '''
    results = {}

    # SOMO task -- tests word embedding quality
    eprint('SOMO task ...')
    SOMO = odd_one(corpus, model, n_not_odd=5)
    results['SOMO'] = SOMO

    # Ecotype task -- tests document embedding quality
    eprint('Ecotype task ...')
    truth, cluster_labels = [], []
    d = defaultdict(list)
    with open(clusters, 'r') as file:
        _ = next(file)  # header
        for line in file:
            row = line.strip().split('\t')
            clade = row[3]
            cluster = row[6]
            # Note: cluster labels are strings here, so compare against '-1'
            # (the HDBSCAN noise label), not the int -1.
            if (clade in ['HL', 'LL']) and (cluster != '-1'):
                d[clade].append(row[2])  # genome UID
                truth.append(clade)
                cluster_labels.append(cluster)

    h, c, v = [round(i, 4) for i in hcv(truth, cluster_labels)]
    results.update(
        dict(zip('homogeneity completeness vscore'.split(), [h, c, v])))

    # Distance
    eprint('Median cosine distance btw/ points of different ecotypes ...')
    model = load_embedding(model)
    l = []
    for i, j in product(d['HL'], d['LL']):
        try:
            a = model.docvecs[i]
            b = model.docvecs[j]
            l.append(cosine(a, b))
        except KeyError:
            continue
    results['ecotypes_distance'] = round(float(1 - np.median(l)), 4)

    with open(outfile, 'w+') as out:
        json.dump(results, out, indent=4)
def __init__(self, fp, mode='ensemble', norm=None, names=None):
    self.mode = mode
    if mode == 'ensemble':
        nn = ['22', '45', '93']
    elif mode == 'core':
        nn = ['93']
    elif mode == 'accessory':
        nn = ['22']
    else:
        raise ValueError(
            'Mode not implemented (try "ensemble", "core" or "accessory")')

    self.models = []
    for n in nn:
        p = Path(fp) / f'{n}/nanotext_r89.model'
        model = load_embedding(str(p))
        self.models.append(model)
    self.norm = norm

    eprint('Subtracting mean from model(s) ...')
    self.nomean = self._demean(self.models)

    if not names:
        self.names = self.models[0].docvecs.index2entity
    else:
        self.names = names
    self.dim = len(self.models[0].docvecs[0])

    eprint('Indexing model(s) ...')
    if norm:
        eprint(f'{self.norm} norm will be applied to vectors')
    found, m, self.index = index_model(
        self.names, [i for i in self.nomean], self.norm)
    self.embedding = dict(zip(found, m))

    self.means = []
    for model in self.models:
        mu = np.mean(
            [model.docvecs[i] for i in range(len(model.docvecs))], axis=0)
        self.means.append(mu)

    self.warn_on_ensemble_inference = False
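# A minimal usage sketch, assuming <fp> points to a directory that contains
# the pretrained submodels 22/, 45/ and 93/, each holding nanotext_r89.model;
# the paths are illustrative.
model = GenomeModel('path/to/models', mode='ensemble', norm='l2')
v = model.infer('orfs.domtbl.tsv', fmt='pfamscan', steps=1000)
for name, cos in model.search(v, 3):
    print(name, round(float(cos), 4))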
def cluster_subset(model, rank, name, taxonomy, outfile, soft, ecotypes,
                   projection_method):
    '''
    Iterate over a taxonomic rank such as class and cluster w/ HDBSCAN. One
    reason we believe we can do this is that at higher ranks there are clear
    boundaries between organisms. The main motivation is that HDBSCAN
    clusters are rather coarse when the whole dataset is clustered at once,
    and "soft clustering" does not scale.

    TODO: https://github.com/lmcinnes/umap/issues/90

    Usage:

    python ~/Dropbox/repos_git/nanotext/nanotext/workflows/train_nanotext/scripts/cluster.py \\
        -m nanotext_r89.model --name Clostridia \\
        --taxonomy /Users/phi/data_local/databases/gtdb/bac_taxonomy_r83.tsv \\
        -o clusters.tsv
    '''
    args = Namespace(
        model=model, rank=rank, name=name, taxonomy=taxonomy,
        outfile=outfile, soft=soft, ecotypes=ecotypes)

    config_umap_visualisation = {
        'metric': 'cosine',
        'n_components': 2,
        # 'n_neighbors': 5,
        # 'min_dist': 0.05,
        # 'spread': 5,
        'random_state': 42,
    }
    config_umap_dim_reduction = {
        'metric': 'cosine',
        'n_components': 10,
        # 'n_neighbors': 10,
        # 'min_dist': 0.05,
        # 'spread': 5,
        'random_state': 42,
    }
    # min_cluster_size 3 w/ 'leaf' selection works great
    config_hdbscan = {
        'min_cluster_size': 5,
        # 'min_samples': 1,
        'cluster_selection_method': 'eom',
    }

    # Filter the taxonomy file for the given rank.
    names = []
    with open(args.taxonomy, 'r') as file:
        for line in file:
            if f'{args.rank[0]}__{args.name}' in line:  # d__ for domain etc.
                names.append(line.strip().split('\t')[0])
    eprint(f'There are {len(names)} data points for {args.rank} {args.name}.')

    # Extract only those vectors.
    model = load_embedding(args.model)
    m, found = [], []
    for i in names:
        try:
            m.append(model.docvecs[strip_name(i)])
            found.append(strip_name(i))
        except KeyError:
            continue

    if args.ecotypes:
        ecotypes = {}
        with open(args.ecotypes) as csvfile:
            _ = next(csvfile)  # header
            reader = csv.reader(csvfile, delimiter='\t')
            for row in reader:
                genome, e, curator = row[0], row[-2], row[-1]
                if '_' in e:
                    subtype = e
                    e = e.split('_')[0]
                else:
                    subtype = 'NA'
                ecotypes[genome] = (e, subtype, curator)

        # Extend the sample list.
        # TODO: this will be unnecessary once we have the r89 taxonomy.
        for i in ecotypes.keys():
            try:
                m.append(model.docvecs[i])
                found.append(i)
            except KeyError:
                continue

    m = np.array(m, dtype='float64')
    ratio = int(round(m.shape[0] / len(names), 2) * 100)
    eprint(f'Of those, {m.shape[0]} ({ratio}%) are present in the model.')

    pm = projection_method.upper()
    eprint(f'Projecting points (visualisation) using {pm} ...')
    if projection_method == 'tsne':
        projection = TSNE(n_components=2, random_state=42).fit_transform(m)
    elif projection_method == 'umap':
        reducer = umap.UMAP(**config_umap_visualisation)
        projection = reducer.fit_transform(m)
        # projection[-10:] == reducer.embedding_[-10:]
    else:
        eprint('No valid projection method. Abort!')
        sys.exit(-1)

    eprint('Projecting points (dimension reduction) ...')
    # Reduce dimensions before clustering.
    reducer = umap.UMAP(**config_umap_dim_reduction)
    m_redux = reducer.fit_transform(m)

    eprint('Clustering ...')
    m_norm = normalize(m_redux, norm='l2', axis=1)
    clusterer = hdbscan.HDBSCAN(**config_hdbscan)
    # Cluster the reduced, normalized vectors, not the 2D visualisation.
    cluster_labels = clusterer.fit_predict(m_norm)
    # For soft clustering: init the clusterer w/ prediction_data=True, then
    # soft_clusters = hdbscan.all_points_membership_vectors(clusterer)
    # cluster_labels = [np.argmax(x) for x in soft_clusters]

    if args.ecotypes:
        with open(args.outfile, 'w+') as out:
            out.write('c1\tc2\tname\tclade\tsubclade\tcollection\tcluster\n')
            for i, j, k in zip(projection, found, cluster_labels):
                c1, c2 = i
                clade, subclade, curator = ecotypes.get(
                    j, 2 * ['NA'] + ['GTDB'])
                out.write(
                    f'{c1}\t{c2}\t{j}\t{clade}\t{subclade}\t{curator}\t{k}\n')
    else:
        with open(args.outfile, 'w+') as out:
            out.write('c1\tc2\tname\tcluster\n')
            for i, j, k in zip(projection, found, cluster_labels):
                c1, c2 = i
                out.write(f'{c1}\t{c2}\t{j}\t{k}\n')
    eprint('Done.')
def taxonomy(query, taxonomy, embedding, topn, outfile, fmt, steps):
    '''
    Given a query vector, get the <topn> closest vectors and their taxonomy,
    then report either their raw taxonomy or use a majority vote to identify
    the most likely one.

    TODO: new fmt, i.e. name, cos, ranks; a second output w/ the majority
    vote across columns.

    Usage:

    \b
    nanotext taxonomy \\
        --embedding nanotext_r89.model --taxonomy bac_taxonomy_r86.tsv \\
        --query JFOD01_pfam.tsv --fmt pfamscan --topn 10 -o results.json
    '''
    from collections import Counter, defaultdict
    import json
    import random

    import numpy as np
    from sklearn.manifold import TSNE
    import umap

    from nanotext.io import load_taxonomy_gtdb, load_embedding, eprint
    from nanotext.utils import infer_genome_vector, strip_name

    config_umap_visualisation = {
        'metric': 'cosine',
        'n_components': 2,
        # 'repulsion_strength': 5,
        # 'n_neighbors': 5,
        # 'min_dist': 0.05,
        # 'spread': 5,
        'random_state': 42,
    }

    ranks = [
        'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']

    db = load_taxonomy_gtdb(taxonomy)
    model = load_embedding(embedding)

    v_query = infer_genome_vector(query, model, fmt=fmt, steps=steps)
    sim = model.docvecs.most_similar([v_query], topn=topn)

    taxcollector = {i: [] for i in ranks}
    distance = {}
    for name, cos_sim in sim:
        distance[name] = round(cos_sim, 4)
        try:
            for k, v in zip(ranks, db[name]):
                taxcollector[k].append(v)
        except KeyError:
            eprint(f'{name} has no taxonomy record')
            continue

    # What is the last uniform rank?
    cache = ()
    for i in ranks:
        if (len(set(taxcollector[i])) == 1) and (taxcollector[i][0] != ''):
            cache = (i, taxcollector[i][0])
            continue
        else:
            pass

    # Collect the UIDs for this rank, e.g. p__Firmicutes_A.
    eprint(f'Will collect all vectors for {cache[0]} {cache[1]} ...')
    names = []
    with open(taxonomy, 'r') as file:
        for line in file:
            if f'{cache[0][0]}__{cache[1]}' in line:  # d__ for domain etc.
                names.append(line.strip().split('\t')[0])

    # Collect the associated document vector for each UID.
    m, found = [], []
    for name in names:
        try:
            m.append(model.docvecs[strip_name(name)])
            found.append(strip_name(name))
        except KeyError:
            continue

    # Project into 2D.
    m = np.array(m, dtype='float64')
    reducer = umap.UMAP(**config_umap_visualisation)
    # eprint('Projecting with UMAP ...')
    # projection = reducer.fit_transform(m)
    eprint('Projecting with TSNE ...')
    projection = TSNE(n_components=2, random_state=42).fit_transform(m)

    results = defaultdict(list)
    # results['query'].extend(reducer.transform([v_query])[0])
    # results['query'].extend(7*['query'])
    for i, j in zip(found, projection):
        results[i].extend(j)
        results[i].extend(db[i])

    # Add distance info.
    for k, v in results.items():
        results[k].append(distance.get(k, 'NA'))

    # Majority vote (not active yet, see TODO above):
    # majority, majority_ratio = {}, {}
    # for rank, taxa in vote.items():
    #     results['raw'][rank] = taxa
    #     cnt = Counter(taxa)
    #     maxn = max(cnt.values())
    #     hits = [k for k, v in cnt.items() if v == maxn]
    #     pick = random.choice(hits)
    #     majority[rank] = pick
    #     majority_ratio[rank] = round(cnt[pick]/len(taxa), 2)
    # results['majority'] = majority
    # results['ratio'] = majority_ratio

    with open(outfile, 'w+') as out:
        out.write('\t'.join(
            'name c1 c2 domain phylum class order family genus species cos'.split()) + '\n')
        # json.dump(dict(sorted(results.items())), out, indent=4)
        for k, v in results.items():
            line = '\t'.join([str(i) for i in [k] + v]) + '\n'
            out.write(line)
    'min_samples': 1,
    'cluster_selection_method': 'eom',
}

model = load_embedding(args.model)
names = model.docvecs.index2entity

# github.com/scikit-learn-contrib/hdbscan
# Cluster in the reduced space, because t-SNE output is no good for clustering:
# stats.stackexchange.com/questions/308132
m = []
for i in names:  # names .. just a list of accession IDs
    m.append(model.docvecs[i])
m = np.array(m, dtype='float64')

eprint('Projecting points (dimension reduction) ...')
# Reduce dimensions before clustering.
reducer = umap.UMAP(**config_umap_dim_reduction)
m_redux = reducer.fit_transform(m)
m_norm = normalize(m_redux, norm='l2', axis=1)

clusterer = hdbscan.HDBSCAN(**config_hdbscan)
cluster_labels = clusterer.fit_predict(m_norm)

# We now want to extract the clustering hierarchy (i.e. the tree) from the
# clusterer object.
# Condensed tree to Newick: stackoverflow.com/questions/46444454
g = clusterer.condensed_tree_.to_networkx()

# Find the root. The root of a tree has an indegree of 0.
# stackoverflow.com/questions/4122390