def __init__(self, fasta_file, split_depth, classes, testfrac):
    """Initialization of the Split class

    Args:
        fasta_file (str): the path to the reference fasta file to use
        split_depth (int): the taxonomic level at which to split data
            between test and train.
        classes (dict): a dict of NCBI Taxonomy ids as keys and the number
            of samples to sample for each class
        testfrac (float): The proportion of the data to use for testing
            the model

    Returns:
        None
    """
    logging.info("Initializing vica.split_shred.Split object")
    logging.info("loading pyfaidx index, or creating index if not present")
    self.pyfaidx_obj = pyfaidx.Fasta(fasta_file, read_ahead=100)
    logging.info("Loading ete3 NCBI taxonomy data object")
    self.tax_instance = ete3.NCBITaxa()
    self.pruned_tree = None
    logging.info("Profiling sequences taxonomically")
    self.profile = self.set_profile(fasta_file)
    self.test_subtrees = None
    self.train_subtrees = None
    self.depth = split_depth
    self.composition = {}
    self.classes = classes
    self.testfrac = testfrac
    self.ranks = {
        'no rank': None,
        'superkingdom': 0,
        'kingdom': 1,
        'subkingdom': 2,
        'superphylum': 3,
        'phylum': 4,
        'subphylum': 5,
        'superclass': 6,
        'class': 7,
        'subclass': 8,
        'infraclass': 9,
        'superorder': 10,
        'order': 11,
        'suborder': 12,
        'infraorder': 13,
        'parvorder': 14,
        'superfamily': 15,
        'family': 16,
        'subfamily': 17,
        'tribe': 18,
        'subtribe': 19,
        'genus': 20,
        'subgenus': 21,
        'species group': 22,
        'species subgroup': 23,
        'species': 24,
        'subspecies': 25,
        'varietas': 26,
        'forma': 27,
    }
    self.iranks = {v: k for k, v in self.ranks.items()}
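# Hypothetical usage sketch (not from the source): assumes the full Split
# class from vica.split_shred is importable, that "refseq.fasta" exists, and
# that the per-class sample counts below are illustrative. Per the ranks
# table above, split_depth=16 corresponds to the 'family' level.
def _example_split():
    splitter = Split(
        fasta_file="refseq.fasta",
        split_depth=16,
        classes={2: 100000, 2157: 100000, 2759: 100000, 10239: 100000},
        testfrac=0.1,
    )
    return splitter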
def match_taxa(tree, labels, backbone_method):
    if '_' in labels[0]:
        splist = label2sciname(labels=labels, in_delim='_', out_delim=' ')
    else:
        splist = labels
    if backbone_method.startswith('ncbi'):
        ncbi = ete3.NCBITaxa()
        leaf_names = [ln.replace('_', ' ') for ln in tree.get_leaf_names()]
        leaf_name_set = set(leaf_names)
    for sp, label in zip(splist, labels):
        if backbone_method.startswith('ncbi'):
            lineage = get_lineage(sp, ncbi, rank='no')
            ancestor_names = ncbi.get_taxid_translator(lineage)
            ancestor = list(leaf_name_set.intersection(set(ancestor_names.values())))
            if len(ancestor) > 1:
                txt = ('Multiple hits. Excluded from the output. '
                       'Taxon in the list = {}, Taxa in the tree = {}\n')
                sys.stderr.write(txt.format(sp, ','.join(ancestor)))
                continue
            elif len(ancestor) == 0:
                txt = 'No hit. Excluded from the output. Taxon = {}\n'
                sys.stderr.write(txt.format(sp))
                continue
        elif backbone_method == 'user':
            ancestor = [sp]
        for leaf in tree.get_leaves():
            if leaf.name == ancestor[0]:
                leaf.has_taxon = True
                leaf.taxon_names.append(label)
                break
    return tree
def get_taxonomy_identifier_from_name(name_taxonomy, mode="infer", ncbi=None, into=pd.Series):
    """
    name_taxonomy can be either a single string or an iterable of strings
    """
    accepted_modes = {'batch', 'singular'}
    # Get database
    if ncbi is None:
        ncbi = ete3.NCBITaxa()
    if mode == "infer":
        if not is_nonstring_iterable(name_taxonomy):
            mode = "singular"
        else:
            mode = "batch"
    assert mode != "infer", "Cannot infer `mode`. Please explicitly provide mode."
    if mode == "singular":
        name_taxonomy = [name_taxonomy]
    name_taxonomy = [*map(lambda x: x.strip(), name_taxonomy)]
    Se_taxonomy = pd.Series(
        ncbi.get_name_translator(name_taxonomy)).sort_index().map(lambda x: x[0])
    if mode == "singular":
        return Se_taxonomy.values[0]
    if mode == "batch":
        idx_missing = set(name_taxonomy) - set(Se_taxonomy.index)
        number_of_name_taxonomy_missing = len(idx_missing)
        if number_of_name_taxonomy_missing > 0:
            Se_missing = pd.Series([np.nan] * number_of_name_taxonomy_missing,
                                   index=idx_missing)
            Se_taxonomy = pd.concat([Se_taxonomy, Se_missing])
        return into(Se_taxonomy)
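# Usage sketch (names and the taxid in the comment are illustrative; assumes
# the local ete3 taxonomy database has already been downloaded):
def _example_name_to_taxid():
    single = get_taxonomy_identifier_from_name("Escherichia coli")  # -> 562
    batch = get_taxonomy_identifier_from_name(["Homo sapiens", "Bacteria"])
    return single, batch  # batch is a pd.Series mapping name -> taxid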
def get_lineages(labels, rank):
    if '_' in labels[0]:
        splist = label2sciname(labels=labels, in_delim='_', out_delim=' ')
    else:
        splist = labels
    ncbi = ete3.NCBITaxa()
    lineages = dict()
    for sp, label in zip(splist, labels):
        lineages[label] = get_lineage(sp, ncbi, rank)
    return lineages
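# For reference, a minimal sketch of what the (not shown) get_lineage helper
# presumably wraps: ete3's NCBITaxa.get_lineage, which returns the
# root-to-taxon list of taxids. Taxid 9606 is an illustrative example.
def _example_raw_lineage(taxid=9606):
    ncbi = ete3.NCBITaxa()
    lineage = ncbi.get_lineage(taxid)           # e.g. [1, 131567, ..., 9606]
    names = ncbi.get_taxid_translator(lineage)  # {taxid: scientific name}
    return [names[t] for t in lineage]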
def gen_taxon(self):
    """ Generate taxon for the model from knowledge base """
    kb = self.knowledge_base
    model = self.model
    ncbi_taxa = ete3.NCBITaxa()
    taxon_name = ncbi_taxa.get_taxid_translator([kb.cell.taxon])[kb.cell.taxon]
    taxon_rank = ncbi_taxa.get_rank([kb.cell.taxon])[kb.cell.taxon]
    model_taxon = wc_lang.core.Taxon(id='taxon', name=taxon_name, model=model,
                                     rank=wc_lang.core.TaxonRank[taxon_rank])
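# Standalone illustration of the two NCBITaxa lookups above (taxid 562,
# Escherichia coli, is an arbitrary example):
def _example_taxon_lookup(taxid=562):
    import ete3
    ncbi_taxa = ete3.NCBITaxa()
    name = ncbi_taxa.get_taxid_translator([taxid])[taxid]  # 'Escherichia coli'
    rank = ncbi_taxa.get_rank([taxid])[taxid]              # 'species'
    return name, rank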
def get_tree(taxa, genomes, savename=None):
    """ Generates a taxonomic tree using the NCBI taxonomy.

    :param taxa: an iterable of NCBI taxids to build the topology from
    :param genomes: an iterable of genome taxids that must appear as leaves
    :param savename: optional basename for the output newick/pickle files
    :return: tree_string: a newick string
             tree: an ete3 object
    """
    ncbi = ete3.NCBITaxa()
    tax = set(taxa)
    genomes = set(genomes)
    tax.remove(0)
    print(len(tax))
    tree = ete3.PhyloTree(name='')
    tree.add_child(name='131567')
    topo = ncbi.get_topology(tax, collapse_subspecies=False)
    tax = set([str(taxid) for taxid in tax])
    tree.add_child(topo)
    orphans = list(genomes - set([x.name for x in tree.get_leaves()]))
    print('missing taxa:')
    print(len(orphans))
    Entrez.email = config_utils.email
    orphans_info1 = {}
    orphans_info2 = {}
    for x in orphans:
        search_handle = Entrez.efetch('taxonomy', id=str(x), retmode='xml')
        record = next(Entrez.parse(search_handle))
        print(record)
        orphans_info1[record['ParentTaxId']] = x
        orphans_info2[x] = [lin['TaxId'] for lin in record['LineageEx']]
    for n in tree.traverse():
        if n.name in orphans_info1:
            n.add_sister(name=orphans_info1[n.name])
            print(n)
    orphans = set(genomes) - set([x.name for x in tree.get_leaves()])
    tree = add_orphans(orphans_info2, tree, genomes)
    orphans = set(genomes) - set([x.name for x in tree.get_leaves()])
    tree_string = tree.write(format=1)
    if savename is None:
        with open(config_utils.datadir + 'mastertree.nwk', 'w') as nwkout:
            nwkout.write(tree_string)
        with open(config_utils.datadir + 'mastertree.pkl', 'wb') as pklout:
            pklout.write(pickle.dumps(tree))
    else:
        with open(config_utils.datadir + savename + '_master_tree.nwk', 'w') as nwkout:
            nwkout.write(tree_string)
        with open(config_utils.datadir + savename + '_master_tree.pkl', 'wb') as pklout:
            pklout.write(pickle.dumps(tree))
    return tree_string, tree
def get_mrca_taxid(multi_counts):
    ncbi = ete3.NCBITaxa()
    max_count = multi_counts[:, 1].max()
    is_max_count = (multi_counts[:, 1] == max_count)
    max_taxids = multi_counts[is_max_count, 0]
    mrca_taxid = 1
    max_ancestor_num = 0
    for mt in max_taxids:
        ancestor_num = len(ncbi.get_lineage(mt))
        if ancestor_num > max_ancestor_num:
            mrca_taxid = mt
            max_ancestor_num = ancestor_num
    return mrca_taxid
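# Usage sketch (values are made up): per the indexing above, multi_counts is
# assumed to be an (N, 2) integer array of [taxid, count] rows. Of the equally
# most frequent taxids, the one with the longest NCBI lineage (i.e. the most
# specific) is returned.
def _example_mrca_taxid():
    import numpy
    counts = numpy.array([[562, 5], [9606, 5], [2, 1]])
    return get_mrca_taxid(counts)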
def taxonomic_annotation(tree):
    ncbi = ete3.NCBITaxa()
    for leaf in tree.iter_leaves():
        leaf_name_split = leaf.name.split("_")
        binom_name = leaf_name_split[0] + " " + leaf_name_split[1]
        leaf.sci_name = binom_name
        name2id = ncbi.get_name_translator(names=[leaf.sci_name])
        if len(name2id[leaf.sci_name]) > 1:
            print(leaf.sci_name, "has", len(name2id[leaf.sci_name]), "taxids.")
        leaf.taxid = name2id[leaf.sci_name][0]
    tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="taxid")
    return tree
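# Usage sketch: leaf names are assumed to follow the "Genus_species[_suffix]"
# convention the function splits on; the newick below is illustrative.
def _example_taxonomic_annotation():
    tree = ete3.PhyloTree("(Homo_sapiens_1,Pan_troglodytes_1);")
    return taxonomic_annotation(tree)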
def buildSpeciesTree(taxon, gfaln):
    """
    Build a species tree from the ncbi taxonomy and species in the gene tree,
    and write it to a file.

    @param1 taxon: taxon under which ncbi species are considered.
    @param2 gfaln: path of the alignment
    @return the species tree file path.
    """
    ncbi = ete3.NCBITaxa(dbfile="/opt/DGINN/taxa.sqlite")
    sptree = ncbi.get_descendant_taxa(taxon, collapse_subspecies=True,
                                      return_tree=True)
    spleaves = [ncbi.get_taxid_translator([x]) for x in sptree.get_leaf_names()]
    accns = list(SeqIO.parse(gfaln, 'fasta'))
    gLeavesSp = [name.id.split("_")[0] for name in accns]

    ## link Taxids & species names abbreviations
    leavesNames = {}
    for x in spleaves:
        for k, v in x.items():
            tax = v.replace(".", "").replace("-", "").split(" ")
            newTax = tax[0][:3].lower() + "".join([i[:3].title() for i in tax[1:]])
            leavesNames[k] = newTax

    # List of tax ids of species in gene tree
    lTaxids = []
    for x in gLeavesSp:
        lTaxids += [str(k) for k, v in leavesNames.items() if v == x[0:6]]
    lTaxids = list(set(lTaxids))

    # restriction of species tree to these taxa
    sptree.prune(lTaxids)

    # back to correct leaf names
    for x in sptree:
        x.name = leavesNames[int(x.name)]

    spTreeFile = "/".join(gfaln.split("/")[:-1] + ["species_tree.tree"])
    sptree.write(format=9, outfile=spTreeFile)
    return spTreeFile
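# The core ete3 call used above, shown standalone (the taxon name is
# illustrative, and this sketch uses the default database location rather
# than the dbfile that DGINN pins explicitly):
def _example_descendant_tree(taxon="Primates"):
    ncbi = ete3.NCBITaxa()
    # returns an ete3 tree of the species (subspecies collapsed) under `taxon`
    return ncbi.get_descendant_taxa(taxon, collapse_subspecies=True,
                                    return_tree=True)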
def taxid2tree(lineages, taxid_counts):
    ncbi = ete3.NCBITaxa()
    is_multiple = (taxid_counts[:, 1] > 1)
    multi_counts = taxid_counts[is_multiple, :]
    clades = list()
    for i in numpy.arange(multi_counts.shape[0]):
        taxid = multi_counts[i, 0]
        count = multi_counts[i, 1]
        ancestors = ncbi.get_lineage(taxid)
        new_clade = ete3.PhyloNode()
        new_clade.ancestors = ancestors
        new_clade = populate_leaves(new_clade, taxid, lineages)
        clades = add_new_clade(clades, new_clade)
    assert len(clades) == 1, 'Failed to merge clades into a single tree.'
    tree = clades[0]
    return tree
def __init__(self, MongoDB=None, db=None, username=None, password=None,
             collection_str=None, authSource='admin', readPreference='nearest'):
    super().__init__(MongoDB=MongoDB, db=db, username=username, password=password,
                     authSource=authSource, readPreference=readPreference)
    self._ncbi_taxa = ete3.NCBITaxa()
    self.collection = self.db_obj[collection_str]
def translateHeader(header_string):
    # >10224:0029f1 "pub_gene_id":"Sakowv30031477m", "pub_og_id":"EOG091G08IZ",
    # "og_name":"guanine nucleotide binding protein-like 3 (nucleolar) ","level":33208
    # The first number is an NCBI taxon id; the second is a unique hexadecimal id.

    def unquote(x):
        x = x.replace('"', '')
        x = x.replace("'", "")
        return x

    def get_key_colon_value(x):
        y = x.strip().split(":")
        if not len(y) == 2:
            print(x, y)
        return (y[0], y[1])

    # Find the first space, indicating the end of the IDs
    entry_start = header_string.find('>')
    if entry_start < 0:
        entry_start = 0
    else:
        entry_start = entry_start + 1
    id_end = header_string.find(' ')
    id_flds = header_string[entry_start:id_end].split(":")
    ncbi_taxon_id = int(id_flds[0])
    taxon_dict = ete3.NCBITaxa().get_taxid_translator([ncbi_taxon_id])
    if ncbi_taxon_id in taxon_dict:
        taxon_name = taxon_dict[ncbi_taxon_id]
    else:
        #print(id_flds)
        taxon_name = MISSING_TAXON

    rest = header_string[id_end:]
    brace_begin = rest.find('{')
    if brace_begin > 0:
        rest = rest[(brace_begin + 1):]
        rest = rest.replace("}", '')

    # This pattern means:
    # find either things that look like "stuff":"stuff" or like "stuff":number
    y = re.compile(r'("([^":]*)":"([^"]|"")*")|("([^"]*)":(\d+))')

    def pickcolon(x):
        y = [e for e in x if e.find(":") > 0]
        return y[0]

    flds = [pickcolon(x).split(":") for x in y.findall(rest)]
    res_dict = dict([(unquote(x[0]), unquote(x[1])) for x in flds])
    res_dict["taxon"] = taxon_name
    return res_dict
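# Usage sketch with an OrthoDB-style header like the one quoted in the comment
# above (MISSING_TAXON is assumed to be a module-level constant defined
# alongside this function):
def _example_translate_header():
    hdr = ('>10224:0029f1 "pub_gene_id":"Sakowv30031477m", '
           '"pub_og_id":"EOG091G08IZ", "level":33208')
    info = translateHeader(hdr)
    # info["taxon"]     -> scientific name for NCBI taxid 10224
    # info["pub_og_id"] -> "EOG091G08IZ"
    return info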
def __init__(self, TreePath, AlignementPath, uniprotTaxonomy):
    """This class takes the path to the Newick tree, the fasta alignment from
    which the tree is derived, and the path to the parsed uniprot taxonomy."""
    self.TreePath = TreePath
    self.AlignementPath = AlignementPath

    # Rewrite the alignment in place, keeping only the first token of each header
    f = open(self.AlignementPath)
    lines = f.readlines()
    out = []
    for line in lines:
        if line[0] == '>':
            out.append(line.split(' ')[0] + '\n')
        else:
            out.append(line)
    f.close()
    f = open(self.AlignementPath, 'w')
    for o in out:
        f.write(o)
    f.close()

    self.tree = ete3.PhyloTree(newick=TreePath, alignment=AlignementPath)
    self.tree.set_species_naming_function(self.parse_sp_name)

    # Build uniprot <-> NCBI <-> species lookup tables from the parsed taxonomy
    self.uniprot2ncbi = {}
    self.uniprot2species = {}
    self.ncbiID2species = {}
    self.ncbi = ete3.NCBITaxa()
    f = open(uniprotTaxonomy)
    lines = f.readlines()
    for line in lines:
        s = line.strip().split('\t')
        uniprotID = s[0]
        ncbiID = s[1].split(' ')[0]
        specie = s[2].split(',')[-1]
        self.uniprot2ncbi[uniprotID] = ncbiID
        self.uniprot2species[uniprotID] = specie
        self.ncbiID2species[ncbiID] = specie
    f.close()

    # Rename leaves to "<ncbiID>_<uniprotID>" and collect the taxids
    self.treeTaxa = []
    leaves = self.tree.get_leaves()
    for leaf in leaves:
        uniprotID = leaf.name.split('|')[0].split('_')[1]
        ncbiID = self.uniprot2ncbi[uniprotID]
        leaf.name = "%s_%s" % (ncbiID, leaf.name.split('|')[0].split('_')[1])
        # leaf.species = self.uniprot2species[uniprotID]
        self.treeTaxa.append(int(ncbiID))
    self.NCBITaxonomy = self.ncbi.get_topology(self.treeTaxa,
                                               intermediate_nodes=True)
def run(fastafile, outdir, length=5000, n_per_class=100000, testfrac=0.1,
        splitlevel="family",
        classes={2: "Bacteria", 2157: "Archaea", 2759: "Eukaryota", 10239: "Viruses"},
        configpath=vica.CONFIG_PATH):
    """Shred all sequences to the desired length."""
    try:
        global config
        with open(configpath) as cfgfile:
            config = yaml.safe_load(cfgfile)
        # Read data as pyfaidx object
        seqobj = _read_data(fastafile)
        ncbi = ete3.NCBITaxa()
        df = _profile_sequences(seqobj, ncbi, splitlevel, classes)
        cd = _split_levels(testfrac=testfrac, df=df, classes=classes)
        _select_contigs(n_per_class=n_per_class, cd=cd, outdir=outdir,
                        length=length, df=df, seqobj=seqobj)
        testtaxa = _read_taxid_from_fasta(outdir=outdir)
        logging.info("Wrote {} NCBI taxonomy ids to the file 'test_taxids.txt'. "
                     "This file is used to exclude test taxa from minhash "
                     "during training".format(testtaxa))
    except Exception:
        logging.exception("vica.split_train logged the following exception:")
def __init__(self,
             reference_tree_file='/work/Alphas_and_Cyanos/rooted_partitions-with_named_branches.treefile',
             assembly_summary='/work/Alphas_and_Cyanos/assembly_summary_genbank.txt',
             output_folder='.',
             tree_folder='/work/Alphas_and_Cyanos/ranger_input_trees',
             reconciliations_folder='/work/Alphas_and_Cyanos/reconciliations',
             reconciliation_sufix='.reconciliation'):
    self.ncbi = ete3.NCBITaxa()
    self.named_reference_tree = ete3.Tree(reference_tree_file, format=1)
    self.output_folder = output_folder.strip()
    self.tree_folder = tree_folder.strip()
    self.reconciliations_folder = reconciliations_folder.strip()
    self.reconciliation_sufix = reconciliation_sufix.strip()

    header = ('assembly_accession bioproject biosample wgs_master refseq_category '
              'taxid species_taxid organism_name infraspecific_name isolate '
              'version_status assembly_level release_type genome_rep seq_rel_date '
              'asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path '
              'excluded_from_refseq relation_to_type_material').split()
    genbank_summary = pd.read_table(assembly_summary, comment='#', header=None,
                                    names=header,
                                    dtype={'taxid': str,
                                           'infraspecific_name': str})
    genbank_summary['refseq_category'] = genbank_summary['refseq_category'].str.lower()
    genbank_summary['assembly_level'] = genbank_summary['assembly_level'].str.lower()
    genbank_summary['genome_rep'] = genbank_summary['genome_rep'].str.lower()
    genbank_summary.set_index('assembly_accession', inplace=True)
    genbank_summary.index = [re.sub(r'\.\d+$', '', index).replace('_', '')
                             for index in genbank_summary.index]
    self.assembly_summary = genbank_summary.reindex(
        self.named_reference_tree.get_leaf_names())
            tree_to_root.set_outgroup(leaf)
            break
        else:
            is_it_monophyletic, clade_type, fucking_up = tree_to_root.check_monophyly(
                node.get_leaf_names(), 'name', unrooted=False)
            if is_it_monophyletic:
                equivalent = tree_to_root.get_common_ancestor(node.get_leaf_names())
                tree_to_root.set_outgroup(equivalent)
            else:
                tree_to_root.set_outgroup(fucking_up.pop())
                equivalent = tree_to_root.get_common_ancestor(node.get_leaf_names())
                tree_to_root.set_outgroup(equivalent)
            break
    return tree_to_root


ncbi = ete3.NCBITaxa()
os.chdir('/work/Alphas_and_Cyanos')
species_tree = ete3.Tree('rooted_partitions-with_BB_support.treefile', format=0)
species_tree = ete3.Tree('rooted_partitions-with_named_branches.treefile', format=1)

########################################################################################################################
#                                                                                                                      #
#                                                                                                                      #
########################################################################################################################
single_optimal_rooting = []
with cd('reconciliations/ranger_roots'):
    for group in os.listdir('.'):
        if not os.path.isdir(group) or not os.path.isfile('%s/%s.reconciliation1' % (group, group)):
            continue
        if os.path.isfile('%s/%s.optResolution1.ranger_input' % (group, group)) and \
                not os.path.isfile('%s/%s.optResolution2.ranger_input' % (group, group)):
def load_content(self):
    """ Collect and parse all data from the CORUM website and add it to the SQLite database """
    database_url = self.ENDPOINT_DOMAINS['corum']
    req = self.requests_session
    session = self.session

    # Extract all files and save to the current directory
    response = req.get(database_url)
    z = zipfile.ZipFile(BytesIO(response.content))
    z.extractall(self.cache_dirname)
    self.cwd = os.path.join(self.cache_dirname, 'allComplexes.txt')

    # create object to find NCBI taxonomy IDs
    ncbi_taxa = ete3.NCBITaxa()

    with open(self.cwd, 'r') as file:
        i_entry = 0
        for entry in csv.DictReader(file, delimiter='\t'):
            # entry/line number in file
            i_entry += 1

            # stop if the maximum desired number of entries has been reached
            if i_entry > self.max_entries:
                break

            # replace 'None' strings with None
            for key, val in entry.items():
                if val == 'None':
                    entry[key] = None

            # extract attributes
            complex_id = int(entry['ComplexID'])
            complex_name = entry['ComplexName']
            cell_line = entry['Cell line']
            su_uniprot = entry['subunits(UniProt IDs)']  # sets of string IDs separated by ;
            su_entrez = entry['subunits(Entrez IDs)']  # sets of int IDs separated by ;
            pur_method = entry['Protein complex purification method']
            go_id = entry['GO ID']  # sets of IDs separated by ; e.g. GO:0005634
            go_dsc = entry['GO description']
            funcat_id = entry['FunCat ID']
            funcat_dsc = entry['FunCat description']
            pubmed_id = int(entry['PubMed ID'])
            protein_name = entry['subunits(Protein name)']
            gene_name = entry['subunits(Gene name)']
            gene_syn = entry['Synonyms']
            disease_cmt = entry['Disease comment']
            su_cmt = entry['Subunits comment']
            complex_cmt = entry['Complex comment']
            swissprot_id = entry['SWISSPROT organism']

            """ ----------------- Apply field level corrections ----------------- """
            # Split the semicolon-separated lists of subunits into protein components,
            # ignoring semicolons inside square brackets
            su_uniprot_list = parse_list(su_uniprot)
            su_entrez_list = parse_list(su_entrez)
            protein_name_list = parse_list(correct_protein_name_list(protein_name))

            # check that list lengths match
            if len(protein_name_list) != len(su_entrez_list):
                msg = 'Unequal number of uniprot/entrez subunits at line {}\n  {}\n  {}'.format(
                    i_entry, '; '.join(protein_name_list), '; '.join(su_entrez_list))
                raise Exception(msg)
            if len(su_uniprot_list) != len(su_entrez_list):
                msg = 'Unequal number of uniprot/entrez subunits at line {}\n  {}\n  {}'.format(
                    i_entry, '; '.join(su_uniprot_list), '; '.join(su_entrez_list))
                raise Exception(msg)

            # Fix the redundancy issue with the swissprot_id field
            if swissprot_id:
                swissprot_id, _, _ = swissprot_id.partition(';')
                ncbi_name, _, _ = swissprot_id.partition(' (')
                result = ncbi_taxa.get_name_translator([ncbi_name])
                ncbi_id = result[ncbi_name][0]
            else:
                ncbi_id = None

            """ ----------------- Export the entries to the SQLite database ----------------- """
            if ncbi_id:
                q = session.query(Taxon).filter(Taxon.ncbi_id == ncbi_id)
                if session.query(q.exists()).scalar():
                    taxon = q.first()
                else:
                    taxon = Taxon(ncbi_id=ncbi_id, swissprot_id=swissprot_id)
                    session.add(taxon)
            else:
                taxon = None

            observation = Observation(cell_line=cell_line, pur_method=pur_method,
                                      pubmed_id=pubmed_id, taxon=taxon)
            session.add(observation)

            complex = Complex(complex_id=complex_id, complex_name=complex_name,
                              go_id=go_id, go_dsc=go_dsc, funcat_id=funcat_id,
                              funcat_dsc=funcat_dsc, su_cmt=su_cmt,
                              complex_cmt=complex_cmt, disease_cmt=disease_cmt,
                              observation=observation)
            session.add(complex)

            for su_uniprot, su_entrez, protein_name in zip(
                    su_uniprot_list, su_entrez_list, protein_name_list):
                subunit = Subunit(su_uniprot=su_uniprot, su_entrezs=su_entrez,
                                  protein_name=protein_name, gene_name=gene_name,
                                  gene_syn=gene_syn, complex=complex)
                session.add(subunit)

    session.commit()
def checkTaxid(taxid=None):
    if taxid == '1' or int(taxid) in ete3.NCBITaxa().get_descendant_taxa(1, intermediate_nodes=True):
        return True
    else:
        return False
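# Usage sketch. Note that this enumerates every descendant of the root
# (taxid 1) on each call, so it is slow; True means the taxid exists in the
# local taxonomy dump (562, Escherichia coli, is an arbitrary example).
def _example_check_taxid():
    return checkTaxid('562')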
def init(cfg, args):
    import ete3
    ncbi = ete3.NCBITaxa()
    ncbi.update_taxonomy_database()
    print('NCBI Taxonomy database is installed in {}.'.format(ncbi.dbfile))
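# The same update outside the cfg/args wrapper (a sketch; downloading and
# rebuilding the NCBI taxdump takes time and disk space, and by default ete3
# stores it under ~/.etetoolkit/taxa.sqlite):
def _example_update_taxonomy_db():
    import ete3
    ncbi = ete3.NCBITaxa()           # creates the database on first use
    ncbi.update_taxonomy_database()  # re-downloads the latest NCBI taxdump
    return ncbi.dbfile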
def makeTaxidsListFp(taxid=None):
    outFp = privateBlastDirPath / 'taxids_gis' / taxid / 'taxids'
    if not (outFp.is_file() and outFp.stat().st_size > 0):
        outFp.open(mode='wt').writelines(
            sorted([taxid + '\n'] +
                   [str(i) + '\n' for i in ete3.NCBITaxa().get_descendant_taxa(taxid)]))
    if not (outFp.is_file() and outFp.stat().st_size > 0):
        return None
    else:
        return outFp
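# Usage sketch (hypothetical: assumes privateBlastDirPath is a module-level
# pathlib.Path and that the <taxid> subdirectory already exists):
def _example_taxids_list():
    # writes the taxid itself plus all descendant taxids, one per line
    return makeTaxidsListFp('562')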
def get_taxonomy_lineage_from_identifier(identifiers, name=None, translate_ids=True,
                                         ncbi: ete3.NCBITaxa = None, verbose=0,
                                         mode="infer", include_taxid=True,
                                         include_levels=["id_taxon", "phylum",
                                                         "class", "order", "family",
                                                         "genus", "species"]):
    """
    Input: Single taxonomy ID or a collection of identifiers w/ {id_orf: id_taxon}
    Output: pd.Series/pd.DataFrame of rank and taxonomy labels
    modes: {'infer', 'batch', 'singular'}
    """
    accepted_modes = {'batch', 'singular'}
    verbose = int(verbose)

    # Get database
    if ncbi is None:
        ncbi = ete3.NCBITaxa()

    # Infer mode
    if mode == "infer":
        if is_dict_like(identifiers):
            mode = "batch"
        else:
            mode = "singular"
        if verbose > 0:
            print(f"Inferred mode: {mode}", file=sys.stderr)
    assert mode != "infer", "Cannot infer `mode`. Please explicitly provide mode."

    # Singular
    if mode == "singular":
        # Handle missing data
        if pd.isnull(identifiers):
            return pd.Series([], name=name)
        # Access the database
        try:
            id_taxon = int(identifiers)
            if name is None:
                name = id_taxon
            lineage = ncbi.get_lineage(id_taxon)
            ranks = dict(filter(lambda x: x[1] != "no rank",
                                ncbi.get_rank(lineage).items()))
            Se_taxonomy = pd.Series(ncbi.get_taxid_translator(ranks.keys()),
                                    name=name)
            if translate_ids:
                Se_taxonomy.index = Se_taxonomy.index.map(lambda x: ranks[x])
            return Se_taxonomy
        # Handle taxonomy IDs that are not in the database
        except ValueError:
            if verbose > 1:
                print(identifiers, file=sys.stderr)
            return pd.Series([], name=name)

    # Batch
    if mode == "batch":
        if not is_query_class(identifiers, "Series"):
            identifiers = pd.Series(identifiers)
        # Group ORFs by taxonomy identifier
        dataframes = list()
        for id_taxon, group in tqdm(pd_series_collapse(identifiers).iteritems(),
                                    "Searching lineage from taxonomy identifier"):
            number_of_orfs_in_group = len(group)
            Se_taxonomy = get_taxonomy_lineage_from_identifier(
                identifiers=id_taxon, name=None, translate_ids=translate_ids,
                ncbi=ncbi, verbose=verbose, mode="singular")
            df_taxon = pd.DataFrame(number_of_orfs_in_group * [Se_taxonomy])
            df_taxon.index = group
            if include_taxid:
                df_taxon["id_taxon"] = id_taxon
            dataframes.append(df_taxon)
        df_collection = pd.concat(dataframes, axis=0)
        if include_levels is None:
            include_levels = df_collection.columns
        else:
            include_levels = [*filter(lambda level: level in df_collection.columns,
                                      include_levels)]
        idx_missing_orfs = set(identifiers.index) - set(df_collection.index)
        number_of_orfs_missing = len(idx_missing_orfs)
        if number_of_orfs_missing > 0:
            A = np.empty((number_of_orfs_missing, df_collection.shape[1]))
            A[:] = np.nan
            df_missing = pd.DataFrame(A, index=idx_missing_orfs,
                                      columns=df_collection.columns)
            return pd.concat([df_collection, df_missing],
                             axis=0).loc[identifiers.index, include_levels]
        else:
            return df_collection.loc[identifiers.index, include_levels]
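# Usage sketch (taxids and ORF names are illustrative):
def _example_lineage_lookup():
    single = get_taxonomy_lineage_from_identifier(562)  # pd.Series, rank -> name
    batch = get_taxonomy_lineage_from_identifier({"orf_1": 562, "orf_2": 9606})
    return single, batch  # batch is a pd.DataFrame indexed by ORF id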
def main():
    args = setup_args()
    print('Start')
    print('Checking commands:')
    # Check the input and the output, in other words validate the commands
    check_input_file(args.input_xml, '.xml')
    output = check_output_path(args.output)

    # Build a dict of species names and their occurrence counts
    handle = open(args.input_xml)
    blast_records = NCBIXML.parse(handle)
    species_set = dict()
    try:
        while True:
            blast_record = next(blast_records)
            for i in range(len(blast_record.alignments)):
                name = str(pull_species_name(blast_record.alignments[i].hit_def))
                if species_set.get(name) is None:
                    entry = 1
                else:
                    entry = species_set.get(name) + 1
                species_set.update({name: entry})
    except StopIteration:
        print('The dictionary of occurrence is built. \nCreating a .csv file ...')
        with open(output + '/frequency_dictionary.csv', 'w') as csv_file:
            writer = csv.writer(csv_file, delimiter=';')
            writer.writerow(['name', 'frequency_value'])
            for key, value in species_set.items():
                writer.writerow([key, value])

    # This stage requires time, because the script downloads the database from
    # NCBI and parses it. ncbi_db.log records the date of the last update.
    print('This stage requires additional time.')
    ncbi = ete3.NCBITaxa()
    if not os.path.isfile(os.getcwd() + '/ncbi_db.log'):
        logging.basicConfig(filename="ncbi_db.log", level=logging.INFO)
        logging.info("Taxonomy DB was updated {}".format(datetime.datetime.now()))
        ncbi.update_taxonomy_database()

    names_list = list(species_set.keys())
    name2taxid = ncbi.get_name_translator(names_list)

    insecta_dict = {}
    mammalia_dict = {}
    bacteria_dict = {}
    archaea_dict = {}
    others_dict = {}
    for taxid in list(name2taxid.values()):
        try:
            names = ncbi.get_taxid_translator(ncbi.get_lineage(taxid[0]))
            lineage_names = [names[t] for t in ncbi.get_lineage(taxid[0])]
            if any("Insecta" in s for s in lineage_names):
                insecta_dict.update(ncbi.get_taxid_translator(taxid))
            elif any("Mammalia" in s for s in lineage_names):
                mammalia_dict.update(ncbi.get_taxid_translator(taxid))
            elif any("Bacteria" in s for s in lineage_names):
                bacteria_dict.update(ncbi.get_taxid_translator(taxid))
            elif any("Archaea" in s for s in lineage_names):
                archaea_dict.update(ncbi.get_taxid_translator(taxid))
            else:
                others_dict.update(ncbi.get_taxid_translator(taxid))
        except ValueError:
            print('Missed taxid: {}'.format(taxid))

    dict_keeper = [insecta_dict, mammalia_dict, bacteria_dict, archaea_dict,
                   others_dict]
    dict_names = ['/insecta_dict.csv', '/mammalia_dict.csv', '/bacteria_dict.csv',
                  '/archaea_dict.csv', '/others_dict.csv']
    for i, elem in enumerate(dict_keeper):
        with open(output + dict_names[i], 'w') as csv_file:
            writer = csv.writer(csv_file)
            for key, value in elem.items():
                writer.writerow([key, value])
    print('Parsing is done.')

    # If the taxon list is given, search for child taxa
    if args.input_taxon is not None:
        check_input_file(args.input_taxon, '.txt')
        search_taxon(args.input_taxon, args.input_xml, dict_keeper, output)
def map_taxonomic_level(self, df, taxa_table=None):
    '''
    :param self:
    :param df: df containing all transfers
    :param taxa_table: tab-delimited file with all taxa present in the transfer
        df; should be part of an assembly_summary, either genbank or refseq
    :return:
    '''
    ncbi = ete3.NCBITaxa()

    # read taxa table and make the necessary changes
    taxa_df = pd.read_csv(taxa_table, sep='\t')

    # important: RANGER understands "_" as the separation between genome and
    # gene IDs, so we have to remove it from the taxa table in order to match
    # the data structure in the rest of the pipe.
    # ... RANGER is also very picky about its naming conventions: tips cannot
    # contain "."
    taxa_df['Unnamed: 0'] = taxa_df['Unnamed: 0'].apply(
        lambda x: x.replace('_', '').split('.')[0])
    taxa_df['accession'] = taxa_df['accession'].apply(
        lambda x: x.replace('_', '').split('.')[0])
    taxa_df.set_index('Unnamed: 0', inplace=True)

    # this df will hold all taxonomic data for all leaves in the species tree
    taxonomy_df = pd.DataFrame()

    # traverse all leaves in the species tree and tag each with its taxonomic
    # classification
    for leaf in self.species_tree.get_leaf_names():
        # Kelsey's naming convention wasn't the most consistent, so let's try
        # two distinct ways to find a leaf in the taxa table
        if leaf in taxa_df.index:
            node_name = taxa_df.index[taxa_df.index == leaf][0]
        elif leaf in taxa_df.accession.values:
            node_name = taxa_df.query('accession==@leaf').index[0]
        else:
            # if neither works, skip this leaf
            continue

        # does this leaf have a valid taxid in our taxa table?
        if pd.notnull(taxa_df.loc[node_name, 'taxid']):
            # sweet, what is it?
            taxid = taxa_df.loc[node_name, 'taxid']

            # create a lineage dict that we can manipulate, traversing all NCBI
            # classification available for the taxid
            lineage = {taxon_rank: taxon
                       for taxon, taxon_rank
                       in ncbi.get_rank(ncbi.get_lineage(taxid)).items()}

            # add the highest-resolution taxonomic information we have
            lineage['leaf_name'] = leaf
            taxonomy_df = taxonomy_df.append(lineage, ignore_index=True)

    # our most specific information about each leaf will work as our "primary key"
    taxonomy_df.set_index('leaf_name', inplace=True)

    # drop all columns not related to commonly used taxonomic ranks
    to_drop = []
    for column in taxonomy_df.columns:
        if column not in ['class', 'species', 'superkingdom', 'genus', 'order',
                          'phylum', 'family', 'kingdom']:
            to_drop.append(column)
    taxonomy_df.drop(to_drop, axis='columns', inplace=True)

    transfer_df = df.copy()
    for index, row in transfer_df.iterrows():
        donor_descendants = next(
            self.species_tree.iter_search_nodes(ranger_name=row.donor)
        ).get_leaf_names()
        recipient_descendants = next(
            self.species_tree.iter_search_nodes(ranger_name=row.recipient)
        ).get_leaf_names()

        donor_taxonomy = taxonomy_df.loc[[taxon for taxon in donor_descendants
                                          if taxon in taxonomy_df.index]]
        recipient_taxonomy = taxonomy_df.loc[[taxon for taxon in recipient_descendants
                                              if taxon in taxonomy_df.index]]
        if not donor_taxonomy.shape[0] or not recipient_taxonomy.shape[0]:
            # if no descendant has a valid taxid, ignore it...
            continue

        donor_taxonomy.dropna(axis=1, how='any', inplace=True)
        recipient_taxonomy.dropna(axis=1, how='any', inplace=True)

        donor_taxonomy = next(donor_taxonomy.loc[
            :, np.invert(donor_taxonomy.T.duplicated().values)].iterrows())[1]
        recipient_taxonomy = next(recipient_taxonomy.loc[
            :, np.invert(recipient_taxonomy.T.duplicated().values)].iterrows())[1]

        common_ranks = donor_taxonomy.index.intersection(recipient_taxonomy.index)
        for rank in ['species', 'genus', 'family', 'order', 'class', 'phylum',
                     'kingdom', 'superkingdom']:
            if rank in common_ranks[donor_taxonomy[common_ranks] ==
                                    recipient_taxonomy[common_ranks]]:
                break
        transfer_df.loc[index, 'transfer_level'] = rank

    return transfer_df
def main(inputtree, outbase, div=True, features=None, stem_or_crown="crown",
         byrank='', byage=None, bylist=None, bysize=None):
    """byrank: when the rank is included in or equal to 'byrank';
    byage: collapse any node of age <= byage;
    bylist: read list of nodes from file;
    bysize: collapse oldest nodes with size < bysize."""
    group_feature_rate = def_group_feature_rate(stem_or_crown)
    tree = ete3.PhyloTree(inputtree, format=1, quoted_node_names=False)

    outsuffix = '-stem' if stem_or_crown == 'stem' else ''
    if byrank:
        outsuffix += '-%s' % byrank
    if byage:
        outsuffix += '-age%g' % byage
    if bylist:
        outsuffix += '-list' + op.splitext(op.basename(bylist))[0]
    if bysize:
        outsuffix += '-size%d' % bysize

    outnames = {'tsv': (outbase + '%s.tsv' % outsuffix),
                'subtrees': (outbase + '%s.subtrees.nwk' % outsuffix),
                'tree': (outbase + '%s.nwk' % outsuffix)}
    for out in outnames.values():
        if op.exists(out):
            logger.error("%r already exists, quitting", out)
            return 1

    columns = [outsuffix.lstrip('-'), 'size', 'branches', 'age', 'tot_len']
    #'crown_age', 'stem_age']
    if div:
        columns.extend(('div_rate', 'gamma', 'ncbi_sp_sampling'))
    if features:
        columns.extend(features)

    if byrank or div:
        logger.info("Loading taxonomy")
        ncbi = ete3.NCBITaxa()
        name2taxid = ncbi.get_name_translator(
            [node.name.replace('_', ' ') if node.is_leaf() else node.name
             for node in tree.traverse()])
        # Won't return anything for names not found
        #if rank:
        #taxid2rank = ncbi.get_rank(chain(*name2taxid.values()))
        taxid2name = ncbi.get_taxid_translator(chain(*name2taxid.values()))
    else:
        name2taxid, taxid2name = None, None

    is_leaf_fn = make_is_leaf_fn(byrank, byage, bylist, bysize,
                                 name2taxid, taxid2name)

    with open(outnames['tsv'], 'w') as outtsv, \
         open(outnames['subtrees'], 'w') as outsub:
        outtsv.write('\t'.join(columns) + '\n')
        logger.info("Iterating over found clades")
        for node in tree.iter_leaves(is_leaf_fn):
            outsub.write(node.write(features, format=1,
                                    format_root_node=True) + '\n')
            # Collapse
            size = len(node)
            branches = len(node.get_descendants())
            _, age = node.get_farthest_leaf()
            tot_len = sum(d.dist for d in node.iter_descendants())
            if stem_or_crown == 'stem':
                age += node.dist
                tot_len += node.dist
            values = [node.name, size, branches, age, tot_len]
            if div:
                div_rate = float(size) / age if age else np.NaN
                gamma_stat = div_gamma(node)
                try:
                    nodetaxids = name2taxid[node.name.replace('_', ' ')]
                    if len(nodetaxids) > 1:
                        nodetaxids = [match_duplicate_taxid(nodetaxids, node,
                                                            taxid2name, ncbi)]
                except KeyError:
                    # This clade isn't in the taxonomy (example: Atlantogenata):
                    # take descendant nodes and join them
                    valid_tax_children = get_valid_tax_children(node, name2taxid)
                    vtc_names = [vtc.name.replace(' ', '_')
                                 for vtc in valid_tax_children]
                    logger.warning('%r not found in NCBI Taxonomy. Merging '
                                   'the node children %s to get the '
                                   'descendant counts.', node.name, vtc_names)
                    nodetaxids = []
                    for vtc_n, vtc in zip(vtc_names, valid_tax_children):
                        vtc_taxids = name2taxid[vtc_n]
                        if len(vtc_taxids) == 1:
                            nodetaxids.append(vtc_taxids[0])
                        else:
                            nodetaxids.append(match_duplicate_taxid(
                                vtc_taxids, vtc, taxid2name, ncbi))
                ncbi_sp = list(chain(*(ncbi.get_descendant_taxa(nt, rank_limit='species')
                                       for nt in nodetaxids)))
                #collapse_subspecies=True))
                sp_sampling = float(size) / len(ncbi_sp)
                values.extend((div_rate, gamma_stat, sp_sampling))
            if features:
                ft_rates = group_feature_rate(node, features)
                values += ft_rates.tolist()
            outtsv.write('\t'.join(str(v) for v in values) + '\n')

    tree.write(outfile=outnames['tree'], format=1, is_leaf_fn=is_leaf_fn,
               format_root_node=True)
def load_content(self, endpoint='corum'):
    """ Collect and parse all data from the CORUM website into JSON files and add to the NoSQL database """
    database_url = self.ENDPOINT_DOMAINS[endpoint]
    _, _, collection = self.con_db(self.collection)
    os.makedirs(os.path.join(self.cache_dirname, self.collection), exist_ok=True)

    if self.verbose:
        print('Downloading list of all complexes: ...')
    response = requests.get(database_url)
    response.raise_for_status()

    # Extract all files and save to the current directory
    if self.verbose:
        print('... Done!')
        print('Unzipping and parsing complex list ...')
    z = zipfile.ZipFile(BytesIO(response.content))
    z.extractall(self.cache_dirname)

    if endpoint == 'corum':
        cwd = os.path.join(self.cache_dirname, 'allComplexes.txt')
    else:
        cwd = os.path.join(self.cache_dirname, 'spliceComplexes.txt')

    # create object to find NCBI taxonomy IDs
    ncbi_taxa = ete3.NCBITaxa()

    with open(cwd, 'r') as file:
        i_entry = 0
        for entry in csv.DictReader(file, delimiter='\t'):
            # entry/line number in file
            i_entry += 1

            # stop if the maximum desired number of entries has been reached
            if i_entry > self.max_entries:
                break

            # replace 'None' strings with None
            for key, val in entry.items():
                if val == 'None':
                    entry[key] = None

            # extract attributes
            complex_id = int(entry['ComplexID'])
            entry['complex_id'] = complex_id  # replace string value with int value
            complex_name = entry['ComplexName']
            cell_line = entry['Cell line']
            pur_method = entry['Protein complex purification method']
            go_id = entry['GO ID']  # sets of IDs separated by ; e.g. GO:0005634
            go_dsc = entry['GO description']
            funcat_id = entry['FunCat ID']
            funcat_dsc = entry['FunCat description']
            pubmed_id = int(entry['PubMed ID'])
            entry['pubmed_id'] = pubmed_id
            gene_name = entry['subunits(Gene name)']
            gene_syn = entry['subunits(Gene name syn)']
            complex_syn = entry['Synonyms']
            disease_cmt = entry['Disease comment']
            su_cmt = entry['Subunits comment']
            complex_cmt = entry['Complex comment']
            su_uniprot = entry['subunits(UniProt IDs)']  # sets of string IDs separated by ;
            su_entrez = entry['subunits(Entrez IDs)']  # sets of int IDs separated by ;
            protein_name = entry['subunits(Protein name)']
            swissprot_id = entry['SWISSPROT organism']

            """ ----------------- Apply field level corrections ----------------- """
            # Split the semicolon-separated lists of subunits into protein components,
            # ignoring semicolons inside square brackets
            su_uniprot_list = parse_list(su_uniprot)
            entry['subunits_isoform_id'] = su_uniprot_list
            parsed_su_uniprot_list = parse_subunits(su_uniprot_list)
            entry['subunits_uniprot_id'] = parsed_su_uniprot_list
            del entry['subunits(UniProt IDs)']

            su_entrez_list = parse_list(su_entrez)
            entry['subunits_entrez_id'] = su_entrez_list
            del entry['subunits(Entrez IDs)']

            go_id_list = parse_list(go_id)
            entry['go_id'] = go_id_list
            del entry['GO ID']

            go_dsc_list = parse_list(go_dsc)
            entry['go_description'] = go_dsc_list
            del entry['GO description']

            funcat_id_list = parse_list(funcat_id)
            entry['funcat_id'] = funcat_id_list
            del entry['FunCat ID']

            funcat_dsc_list = parse_list(funcat_dsc)
            entry['funcat_description'] = funcat_dsc_list
            del entry['FunCat description']

            gene_name_list = parse_list(gene_name)
            entry['subunits_gene_name'] = gene_name_list
            del entry['subunits(Gene name)']

            gene_syn_list = parse_list(gene_syn)
            entry['subunits_gene_name_synonym'] = gene_syn_list
            del entry['subunits(Gene name syn)']

            protein_name_list = parse_list(correct_protein_name_list(protein_name))
            entry['subunits_protein_name'] = protein_name_list
            del entry['subunits(Protein name)']

            # check that list lengths match
            if len(protein_name_list) != len(su_entrez_list):
                msg = 'Unequal number of uniprot/entrez subunits at line {}\n  {}\n  {}'.format(
                    i_entry, '; '.join(protein_name_list), '; '.join(su_entrez_list))
                raise Exception(msg)
            if len(su_uniprot_list) != len(su_entrez_list):
                msg = 'Unequal number of uniprot/entrez subunits at line {}\n  {}\n  {}'.format(
                    i_entry, '; '.join(su_uniprot_list), '; '.join(su_entrez_list))
                raise Exception(msg)

            # Fix the redundancy issue with the swissprot_id field
            if swissprot_id:
                swissprot_id, _, _ = swissprot_id.partition(';')
                ncbi_name, _, _ = swissprot_id.partition(' (')
                result = ncbi_taxa.get_name_translator([ncbi_name])
                ncbi_id = result[ncbi_name][0]
            else:
                ncbi_id = None
            entry['SWISSPROT_organism_NCBI_ID'] = ncbi_id
            del entry['SWISSPROT organism']

            file_name = 'corum_' + str(entry['complex_id']) + '.json'
            full_path = os.path.join(self.cache_dirname, self.collection, file_name)
            with open(full_path, 'w') as f:
                f.write(json.dumps(entry, indent=4))

            collection.update_one({'ComplexID': entry['ComplexID']},
                                  {'$set': entry},
                                  upsert=True)

    return collection
def search_taxon(in_taxon, in_xml, dict_keeper, output):
    ncbi = ete3.NCBITaxa()
    if not os.path.isfile(os.getcwd() + '/ncbi_db.log'):
        logging.basicConfig(filename="ncbi_db.log", level=logging.INFO)
        logging.info("Taxonomy DB was updated {}".format(datetime.datetime.now()))
        ncbi.update_taxonomy_database()

    # Merge the per-group dicts into one dict with unique keys to simplify lookups
    overall_dict = dict()
    for d in dict_keeper:
        overall_dict.update(d)

    handle = open(in_xml)
    blast_records = NCBIXML.parse(handle)
    species_query_set = dict()
    try:
        while True:
            blast_record = next(blast_records)
            for i in range(len(blast_record.alignments)):
                new_name = str(pull_species_name(blast_record.alignments[i].hit_def))
                for curr_taxid in ncbi.get_name_translator([new_name]).values():
                    for j in range(len(curr_taxid)):
                        species_query_set.update({curr_taxid[j]: blast_record.query})
    except StopIteration:
        pass

    # Get a dict of taxon ids for the taxa listed in the input file
    taxon_dict = ncbi.get_name_translator(read_taxons(in_taxon))
    print('Taxa that were found are listed in "taxons_found.csv"')
    print('Search is started...')
    output_file = open(output + '/taxons_found.csv', 'w')
    exam_counter = 0
    for idx, key in enumerate(taxon_dict):
        tree = ncbi.get_descendant_taxa(key, collapse_subspecies=True,
                                        return_tree=True)
        if type(tree) == list:
            if overall_dict.get(taxon_dict[key][0]) is not None:
                """
                print('{0}:{1} is in your results'.format(key, overall_dict.get(taxon_dict[key][0])))
                print('\t{0} is at the {1}'.format(key, species_query_set.get(taxon_dict[key][0])))
                """
                output_file.write('{0}; {1}; {2} \n'.format(
                    key,
                    overall_dict.get(taxon_dict[key][0]),
                    species_query_set.get(taxon_dict[key][0])))
                exam_counter += 1
        elif type(tree) == ete3.PhyloNode:
            for child in tree.children:
                # We need only the taxid number
                potential_ids = re.findall('\\b\\d+\\b', child.get_ascii())
                for potential_id in potential_ids:
                    if overall_dict.get(int(potential_id)) is not None:
                        """
                        print('{0}:{1} is in your results'.format(key, overall_dict[int(potential_id)]))
                        print('\t{0} is at the {1}'.format(key, species_query_set[int(potential_id)]))
                        """
                        output_file.write('{0}; {1}; {2} \n'.format(
                            key,
                            overall_dict[int(potential_id)],
                            species_query_set[int(potential_id)]))
                        exam_counter += 1
        else:
            print('{0}: {1} is unknown'.format(key, taxon_dict[key]))
    output_file.close()
    print('Search is finished.')
    print('Total found: {}'.format(exam_counter))