Example 1
    def __init__(self, fasta_file, split_depth, classes, testfrac):
        """Initialization of the Split class

        Args:
            fasta_file (str): the path to the reference fasta file to use
            split_depth (int): the taxonomic level at which to split data between
                test and train.
            classes (dict): a dict mapping NCBI Taxonomy ids to the number of
                samples to draw for each class
            testfrac (float): The proportion of the data to use for testing the model

        Returns:
            None
        """
        logging.info("Initializing vica.split_shred.Split object")
        logging.info("loading pyfaidx index, or creating index if not present")
        self.pyfaidx_obj = pyfaidx.Fasta(fasta_file, read_ahead=100)
        logging.info("Loading ete3 NCBI taxonomy data object")
        self.tax_instance = ete3.NCBITaxa()
        self.pruned_tree = None
        logging.info("Profiling sequences taxonomically")
        self.profile = self.set_profile(fasta_file)
        self.test_subtrees = None
        self.train_subtrees = None
        self.depth = split_depth
        self.composition = {}
        self.classes = classes
        self.testfrac = testfrac
        self.ranks = {
            'no rank': None,
            'superkingdom': 0,
            'kingdom': 1,
            'subkingdom': 2,
            'superphylum': 3,
            'phylum': 4,
            'subphylum': 5,
            'superclass': 6,
            'class': 7,
            'subclass': 8,
            'infraclass': 9,
            'superorder': 10,
            'order': 11,
            'suborder': 12,
            'infraorder': 13,
            'parvorder': 14,
            'superfamily': 15,
            'family': 16,
            'subfamily': 17,
            'tribe': 18,
            'subtribe': 19,
            'genus': 20,
            'subgenus': 21,
            'species group': 22,
            'species subgroup': 23,
            'species': 24,
            'subspecies': 25,
            'varietas': 26,
            'forma': 27
        }
        self.iranks = {v: k for k, v in self.ranks.items()}
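A minimal usage sketch for the initializer above; the FASTA path and per-class sample counts are hypothetical, and the class is assumed to be vica.split_shred.Split, as the log messages suggest.

# Hypothetical usage (path and counts are made up):
splitter = Split(fasta_file="refseq_genomic.fa",
                 split_depth=16,                      # 16 == 'family' in self.ranks
                 classes={2: 100000, 10239: 100000},  # NCBI taxid -> samples per class
                 testfrac=0.1)
print(splitter.iranks[16])  # -> 'family'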
Example 2
def match_taxa(tree, labels, backbone_method):
    if '_' in labels[0]:
        splist = label2sciname(labels=labels, in_delim='_', out_delim=' ')
    else:
        splist = labels
    if backbone_method.startswith('ncbi'):
        ncbi = ete3.NCBITaxa()
    leaf_names = [ln.replace('_', ' ') for ln in tree.get_leaf_names()]
    leaf_name_set = set(leaf_names)
    for sp, label in zip(splist, labels):
        if backbone_method.startswith('ncbi'):
            lineage = get_lineage(sp, ncbi, rank='no')
            ancestor_names = ncbi.get_taxid_translator(lineage)
            ancestor = leaf_name_set.intersection(set(ancestor_names.values()))
            ancestor = list(ancestor)
            if (len(ancestor) > 1):
                txt = 'Multiple hits. Excluded from the output. Taxon in the list = {}, Taxa in the tree = {}\n'
                sys.stderr.write(txt.format(sp, ','.join(list(ancestor))))
                continue
            elif (len(ancestor) == 0):
                txt = 'No hit. Excluded from the output. Taxon = {}\n'
                sys.stderr.write(txt.format(sp))
                continue
        elif backbone_method == 'user':
            ancestor = [sp]
        for leaf in tree.get_leaves():
            if (leaf.name == ancestor[0]):
                leaf.has_taxon = True
                leaf.taxon_names.append(label)
                break
    return tree
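A hedged usage sketch for the 'user' backbone method; the label2sciname and get_lineage helpers are not shown above, so this sticks to the branch that avoids them. Leaves are assumed to carry the has_taxon/taxon_names attributes that the function mutates.

import ete3

tree = ete3.Tree("((tipA,tipB),tipC);")
for leaf in tree.get_leaves():
    leaf.add_features(has_taxon=False, taxon_names=[])
tree = match_taxa(tree, ["tipA", "tipC"], backbone_method="user")
print([(leaf.name, leaf.taxon_names) for leaf in tree.get_leaves()])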
Example 3
def get_taxonomy_identifier_from_name(name_taxonomy,
                                      mode="infer",
                                      ncbi=None,
                                      into=pd.Series):
    """
    name_taxonomy can be either a single string or iterable of strings
    """
    accepted_modes = {'batch', 'singular'}

    # Get database
    if ncbi is None:
        ncbi = ete3.NCBITaxa()

    if mode == "infer":
        if not is_nonstring_iterable(name_taxonomy):
            mode = "singular"
        else:
            mode = "batch"
    assert mode in accepted_modes, "Cannot infer `mode`.  Please explicitly provide mode."

    if mode == "singular":
        name_taxonomy = [name_taxonomy]
    name_taxonomy = [*map(lambda x: x.strip(), name_taxonomy)]
    Se_taxonomy = pd.Series(ncbi.get_name_translator(
        name_taxonomy)).sort_index().map(lambda x: x[0])
    if mode == "singular":
        return Se_taxonomy.values[0]
    if mode == "batch":
        idx_missing = set(name_taxonomy) - set(Se_taxonomy.index)
        number_of_name_taxonomy_missing = len(idx_missing)
        if number_of_name_taxonomy_missing > 0:
            Se_missing = pd.Series([np.nan] * number_of_name_taxonomy_missing,
                                   index=idx_missing)
            Se_taxonomy = pd.concat([Se_taxonomy, Se_missing])
        return into(Se_taxonomy)
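For reference, a usage sketch of both modes, assuming ete3's local taxonomy database has been built and the is_nonstring_iterable helper behaves as its name suggests:

# Singular mode returns a bare taxid; batch mode returns `into` (pd.Series by default).
taxid = get_taxonomy_identifier_from_name("Escherichia coli")                 # e.g. 562
series = get_taxonomy_identifier_from_name(["Homo sapiens", "Mus musculus"])  # name -> taxid
print(taxid, series.to_dict())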
Example 4
def get_lineages(labels, rank):
    if '_' in labels[0]:
        splist = label2sciname(labels=labels, in_delim='_', out_delim=' ')
    else:
        splist = labels
    ncbi = ete3.NCBITaxa()
    lineages = dict()
    for sp, label in zip(splist, labels):
        lineages[label] = get_lineage(sp, ncbi, rank)
    return lineages
Example 5
    def gen_taxon(self):
        """ Generate taxon for the model from knowledge base """
        kb = self.knowledge_base
        model = self.model

        ncbi_taxa = ete3.NCBITaxa()
        taxon_name = ncbi_taxa.get_taxid_translator([kb.cell.taxon])[kb.cell.taxon]
        taxon_rank = ncbi_taxa.get_rank([kb.cell.taxon])[kb.cell.taxon]
        model_taxon = wc_lang.core.Taxon(id='taxon', name=taxon_name, model=model, 
            rank=wc_lang.core.TaxonRank[taxon_rank]) 
Example 6
def get_tree(taxa, genomes, savename=None):
    """
    Generates a taxonomic tree using the NCBI taxonomy.

    :param taxa: iterable of NCBI taxids to place in the tree
    :param genomes: iterable of genome taxids expected as leaves
    :param savename: optional basename for the saved newick/pickle files
    :return: tree_string: a newick string; tree: an ete3 object
    """
    ncbi = ete3.NCBITaxa()
    tax = set(taxa)
    genomes = set(genomes)
    tax.discard(0)
    print(len(tax))

    tree = ete3.PhyloTree(name='')
    tree.add_child(name='131567')

    topo = ncbi.get_topology(tax, collapse_subspecies=False)
    tax = set([str(taxid) for taxid in tax])
    tree.add_child(topo)
    orphans = list(genomes - set([x.name for x in tree.get_leaves()]))
    print('missing taxa:')
    print(len(orphans))
    Entrez.email = config_utils.email
    orphans_info1 = {}
    orphans_info2 = {}
    for x in orphans:
        search_handle = Entrez.efetch('taxonomy', id=str(x), retmode='xml')
        record = next(Entrez.parse(search_handle))
        print(record)
        orphans_info1[record['ParentTaxId']] = x
        orphans_info2[x] = [entry['TaxId'] for entry in record['LineageEx']]
    for n in tree.traverse():
        if n.name in orphans_info1:
            n.add_sister(name=orphans_info1[n.name])
            print(n)
    orphans = set(genomes) - set([x.name for x in tree.get_leaves()])
    tree = add_orphans(orphans_info2, tree, genomes)
    orphans = set(genomes) - set([x.name for x in tree.get_leaves()])
    tree_string = tree.write(format=1)
    if savename is None:
        with open(config_utils.datadir + 'mastertree.nwk', 'w') as nwkout:
            nwkout.write(tree_string)
        with open(config_utils.datadir + 'mastertree.pkl', 'wb') as pklout:
            pklout.write(pickle.dumps(tree))
    else:
        with open(config_utils.datadir + savename + '_master_tree.nwk',
                  'w') as nwkout:
            nwkout.write(tree_string)
        with open(config_utils.datadir + savename + '_master_tree.pkl',
                  'wb') as pklout:
            pklout.write(pickle.dumps(tree))
    return tree_string, tree
Example 7
def get_mrca_taxid(multi_counts):
    ncbi = ete3.NCBITaxa()
    max_count = multi_counts[:, 1].max()
    is_max_count = (multi_counts[:, 1] == max_count)
    max_taxids = multi_counts[is_max_count, 0]
    mrca_taxid = 1
    max_ancestor_num = 0
    for mt in max_taxids:
        ancestor_num = len(ncbi.get_lineage(mt))
        if (ancestor_num > max_ancestor_num):
            mrca_taxid = mt
            max_ancestor_num = ancestor_num
    return mrca_taxid
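A small sketch of the expected input: a two-column numpy array of (taxid, count) rows. Ties on the top count are broken in favor of the taxid with the longest NCBI lineage, so the deeper node wins here (a populated local taxonomy database is assumed).

import numpy

multi_counts = numpy.array([[9606, 5],     # Homo sapiens (species)
                            [9605, 5],     # Homo (genus) -- same count, shorter lineage
                            [40674, 2]])   # Mammalia
print(get_mrca_taxid(multi_counts))  # expected: 9606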
Example 8
def taxonomic_annotation(tree):
    ncbi = ete3.NCBITaxa()
    for leaf in tree.iter_leaves():
        leaf_name_split = leaf.name.split("_")
        binom_name = leaf_name_split[0] + " " + leaf_name_split[1]
        leaf.sci_name = binom_name
        name2id = ncbi.get_name_translator(names=[leaf.sci_name])
        if len(name2id[leaf.sci_name]) > 1:
            print(leaf.sci_name, "has", len(name2id[leaf.sci_name]), "taxids.")
        leaf.taxid = name2id[leaf.sci_name][0]
    tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(
        taxid_attr="taxid")
    return tree
Example 9
def buildSpeciesTree(taxon, gfaln):
    """
  Build a species tree from the ncbi taxonomy and species in the gene tree,
  and write it in a file.
  
  @param1 taxon: taxon under which ncbi species are considered.
  @param2 gftree: path of the alignment
  @return the species tree file path.
  """

    ncbi = ete3.NCBITaxa(dbfile="/opt/DGINN/taxa.sqlite")

    sptree = ncbi.get_descendant_taxa(taxon,
                                      collapse_subspecies=True,
                                      return_tree=True)
    spleaves = [
        ncbi.get_taxid_translator([x]) for x in sptree.get_leaf_names()
    ]

    accns = list(SeqIO.parse(gfaln, 'fasta'))
    gLeavesSp = [name.id.split("_")[0] for name in accns]

    ## link Taxids & species names abbreviations
    leavesNames = {}
    for x in spleaves:
        for k, v in x.items():
            tax = v.replace(".", "").replace("-", "").split(" ")
            newTax = tax[0][:3].lower() + "".join(
                [i[:3].title() for i in tax[1:]])
            leavesNames[k] = newTax

    # List of tax ids of species in gene tree
    lTaxids = []
    for x in gLeavesSp:
        lTaxids += [str(k) for k, v in leavesNames.items() if v == x[0:6]]

    lTaxids = list(set(lTaxids))

    # restriction of species tree to these taxons
    sptree.prune(lTaxids)

    # back to correct leaves names
    for x in sptree:
        x.name = leavesNames[int(x.name)]

    spTreeFile = "/".join(gfaln.split("/")[:-1] + ["species_tree.tree"])
    sptree.write(format=9, outfile=spTreeFile)

    return spTreeFile
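A hypothetical call, assuming the DGINN taxa.sqlite is available and the FASTA record ids start with the six-letter species abbreviation the function reconstructs (e.g. "homSap_..."):

# Writes species_tree.tree next to the alignment and returns its path.
sptree_path = buildSpeciesTree("Primates", "work/gene_family.fasta")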
Example 10
def taxid2tree(lineages, taxid_counts):
    ncbi = ete3.NCBITaxa()
    is_multiple = (taxid_counts[:, 1] > 1)
    multi_counts = taxid_counts[is_multiple, :]
    clades = list()
    for i in numpy.arange(multi_counts.shape[0]):
        taxid = multi_counts[i, 0]
        count = multi_counts[i, 1]
        ancestors = ncbi.get_lineage(taxid)
        new_clade = ete3.PhyloNode()
        new_clade.ancestors = ancestors
        new_clade = populate_leaves(new_clade, taxid, lineages)
        clades = add_new_clade(clades, new_clade)
    assert len(clades) == 1, 'Failed to merge clades into a single tree.'
    tree = clades[0]
    return tree
Example 11
 def __init__(self,
              MongoDB=None,
              db=None,
              username=None,
              password=None,
              collection_str=None,
              authSource='admin',
              readPreference='nearest'):
     super().__init__(MongoDB=MongoDB,
                      db=db,
                      username=username,
                      password=password,
                      authSource=authSource,
                      readPreference=readPreference)
     self._ncbi_taxa = ete3.NCBITaxa()
     self.collection = self.db_obj[collection_str]
Example 12
def translateHeader(header_string):
	# >10224:0029f1 "pub_gene_id":"Sakowv30031477m", "pub_og_id":"EOG091G08IZ", "og_name":"guanine nucleotide binding protein-like 3 (nucleolar) ","level":33208
	# first number is an NCBI taxon id
	# second number is a unique hexadecimal id
	def unquote(x):
		x = x.replace('"','')
		x = x.replace("'","")
		return x
	def get_key_colon_value(x):
		y = x.strip().split(":")
		if not len(y)==2:
			print(x,y)
		return (y[0],y[1])
	# Find first space, indicating end of IDs
	entry_start = header_string.find('>')
	if entry_start<0:
		entry_start=0
	else:
		entry_start = entry_start+1
	id_end = header_string.find(' ')
	id_flds = header_string[entry_start:id_end].split(":")
	ncbi_taxon_id = int(id_flds[0])
	taxon_dict = ete3.NCBITaxa().get_taxid_translator([ncbi_taxon_id])
	if ncbi_taxon_id in taxon_dict:
		taxon_name = taxon_dict[ncbi_taxon_id]
	else:
		#print(id_flds)
		taxon_name = MISSING_TAXON
	rest = header_string[id_end:]
	brace_begin = rest.find('{')
	if brace_begin>0:
		rest = rest[(brace_begin+1):]
		rest = rest.replace("}",'')
	# This pattern means:
	# Find either things that look like "stuff":"stuff" or like "stuff":number
	y = re.compile(r'("([^":]*)":"([^"]|"")*")|("([^"]*)":(\d+))')
	def pickcolon(x):
		y = [e for e in x if e.find(":")>0]
		return y[0]
	flds = [pickcolon(x).split(":") for x in y.findall(rest)]
	res_dict = dict([(unquote(x[0]),unquote(x[1])) for x in flds])
	res_dict["taxon"] = taxon_name
	return res_dict
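A usage sketch with a header shaped like the one in the comment above; MISSING_TAXON and a populated ete3 taxonomy database are assumed.

header = ('>10224:0029f1 "pub_gene_id":"Sakowv30031477m", '
          '"pub_og_id":"EOG091G08IZ", "level":33208')
info = translateHeader(header)
print(info["taxon"], info["pub_og_id"], info["level"])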
Example 13
 def __init__(self, TreePath, AlignementPath, uniprotTaxonomy):
     """This class takes the path to the Newick Tree, the fasta alignment from which the tree is derived and the path to the parsed uniprot taxonomy."""
     self.TreePath = TreePath
     self.AlignementPath = AlignementPath
     f = open(self.AlignementPath)
     lines = f.readlines()
     out = []
     for line in lines:
         if line[0] == '>':
             out.append(line.split(' ')[0] + '\n')
         else:
             out.append(line)
     f.close()
     f = open(self.AlignementPath, 'w')
     for o in out:
         f.write(o)
     f.close()
     self.tree = ete3.PhyloTree(newick=TreePath, alignment=AlignementPath)
     self.tree.set_species_naming_function(self.parse_sp_name)
     self.uniprot2ncbi = {}
     self.uniprot2species = {}
     self.ncbiID2species = {}
     self.ncbi = ete3.NCBITaxa()
     f = open(uniprotTaxonomy)
     lines = f.readlines()
     for line in lines:
         s = line.strip().split('\t')
         uniprotID = s[0]
         ncbiID = s[1].split(' ')[0]
         specie = s[2].split(',')[-1]
         self.uniprot2ncbi[uniprotID] = ncbiID
         self.uniprot2species[uniprotID] = specie
         self.ncbiID2species[ncbiID] = specie
     self.treeTaxa = []
     leaves = self.tree.get_leaves()
     for leaf in leaves:
         uniprotID = leaf.name.split('|')[0].split('_')[1]
         ncbiID = self.uniprot2ncbi[uniprotID]
         leaf.name = "%s_%s" % (ncbiID, leaf.name.split('|')[0].split('_')[1])
         # leaf.species = self.uniprot2species[uniprotID]
         self.treeTaxa.append(int(ncbiID))
     self.NCBITaxonomy = self.ncbi.get_topology(self.treeTaxa, intermediate_nodes=True)
Example 14
def run(fastafile, outdir, length=5000, n_per_class=100000,
        testfrac=0.1, splitlevel="family",
        classes={2: "Bacteria",
                 2157: "Archaea",
                 2759: "Eukaryota",
                 10239: "Viruses"},
        configpath=vica.CONFIG_PATH):
    """shred all sequences to the desired length"""
    try:
        global config
        with open(configpath) as cf:
            config = yaml.safe_load(cf)
        # Read data as pyfaidx object
        seqobj = _read_data(fastafile)
        ncbi = ete3.NCBITaxa()
        df = _profile_sequences(seqobj, ncbi, splitlevel, classes)
        cd = _split_levels(testfrac=testfrac, df=df, classes=classes)
        _select_contigs(n_per_class=n_per_class, cd=cd, outdir=outdir,length=length,df=df, seqobj=seqobj)
        testtaxa = _read_taxid_from_fasta(outdir=outdir)
        logging.info("Wrote {} NCBI taxomomy ids to the file 'test_taxids.txt'. This file isused to exclude test taxa from minhash during training".format(testtaxa))
    except Exception:
        logging.exception("vica.split_train logged the following exception:")
Example 15
    def __init__(
        self,
        reference_tree_file='/work/Alphas_and_Cyanos/rooted_partitions-with_named_branches.treefile',
        assembly_summary='/work/Alphas_and_Cyanos/assembly_summary_genbank.txt',
        output_folder='.',
        tree_folder='/work/Alphas_and_Cyanos/ranger_input_trees',
        reconciliations_folder='/work/Alphas_and_Cyanos/reconciliations',
        reconciliation_sufix='.reconciliation',
    ):

        self.ncbi = ete3.NCBITaxa()
        self.named_reference_tree = ete3.Tree(reference_tree_file, format=1)
        self.output_folder = output_folder.strip()
        self.tree_folder = tree_folder.strip()
        self.reconciliations_folder = reconciliations_folder.strip()
        self.reconciliation_sufix = reconciliation_sufix.strip()

        header = 'assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate version_status assembly_level release_type genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material'.split(
        )
        genbank_summary = pd.read_table(assembly_summary,
                                        comment='#',
                                        header=None,
                                        names=header,
                                        dtype={
                                            'taxid': str,
                                            'infraspecific_name': str
                                        })
        genbank_summary['refseq_category'] = genbank_summary[
            'refseq_category'].str.lower()
        genbank_summary['assembly_level'] = genbank_summary[
            'assembly_level'].str.lower()
        genbank_summary['genome_rep'] = genbank_summary[
            'genome_rep'].str.lower()
        genbank_summary.set_index('assembly_accession', inplace=True)
        genbank_summary.index = [
            re.sub(r'\.\d+$', '', index).replace('_', '')
            for index in genbank_summary.index
        ]
        self.assembly_summary = genbank_summary.reindex(
            self.named_reference_tree.get_leaf_names())
Example 16
# NOTE: the opening of this rooting helper is missing from the source; the
# fragment below resumes inside a loop over candidate outgroup nodes.
            tree_to_root.set_outgroup(leaf)
            break
        else:
            is_it_monophyletic, clade_type, fucking_up = tree_to_root.check_monophyly(node.get_leaf_names(), 'name', unrooted=False)
            if is_it_monophyletic:
                equivalent = tree_to_root.get_common_ancestor(node.get_leaf_names())
                tree_to_root.set_outgroup(equivalent)
            else:
                tree_to_root.set_outgroup(fucking_up.pop())
                equivalent = tree_to_root.get_common_ancestor(node.get_leaf_names())
                tree_to_root.set_outgroup(equivalent)
            break

    return tree_to_root

ncbi = ete3.NCBITaxa()
os.chdir('/work/Alphas_and_Cyanos')
species_tree = ete3.Tree('rooted_partitions-with_BB_support.treefile', format=0)
species_tree = ete3.Tree('rooted_partitions-with_named_branches.treefile', format=1)

########################################################################################################################
#                                                                                                                      #
#                                                                                                                      #
########################################################################################################################

single_optimal_rooting = []
with cd('reconciliations/ranger_roots'):
    for group in os.listdir('.'):
        if not os.path.isdir(group) or not os.path.isfile('%s/%s.reconciliation1' %(group, group)):
            continue
        if os.path.isfile('%s/%s.optResolution1.ranger_input' %(group, group)) and not os.path.isfile('%s/%s.optResolution2.ranger_input' %(group, group)):
Example 17
    def load_content(self):
        """ Collect and parse all data from CORUM website and add to SQLite database """
        database_url = self.ENDPOINT_DOMAINS['corum']
        req = self.requests_session
        session = self.session

        # Extract All Files and save to current directory
        response = req.get(database_url)
        z = zipfile.ZipFile(BytesIO(response.content))
        z.extractall(self.cache_dirname)
        self.cwd = os.path.join(self.cache_dirname, 'allComplexes.txt')

        # create object to find NCBI taxonomy IDs
        ncbi_taxa = ete3.NCBITaxa()

        with open(self.cwd, 'r') as file:
            i_entry = 0
            for entry in csv.DictReader(file, delimiter='\t'):
                # entry/line number in file
                i_entry += 1

                # stop if the maximum desired number of entries has been reached
                if i_entry > self.max_entries:
                    break

                # replace 'None' strings with None
                for key, val in entry.items():
                    if val == 'None':
                        entry[key] = None

                # extract attributes
                complex_id = int(entry['ComplexID'])
                complex_name = entry['ComplexName']
                cell_line = entry['Cell line']
                su_uniprot = entry[
                    'subunits(UniProt IDs)']  # SETS OF STRING IDS SEPARATED BY ;
                su_entrez = entry[
                    'subunits(Entrez IDs)']  # SETS OF INT IDS SEPARATED BY ;
                pur_method = entry['Protein complex purification method']
                go_id = entry[
                    'GO ID']  # SETS OF GO IDS SEPARATED BY ; e.g. GO:0005634
                go_dsc = entry['GO description']
                funcat_id = entry['FunCat ID']
                funcat_dsc = entry['FunCat description']
                pubmed_id = int(entry['PubMed ID'])
                protein_name = entry['subunits(Protein name)']
                gene_name = entry['subunits(Gene name)']
                gene_syn = entry['Synonyms']
                disease_cmt = entry['Disease comment']
                su_cmt = entry['Subunits comment']
                complex_cmt = entry['Complex comment']
                swissprot_id = entry['SWISSPROT organism']
                """ ----------------- Apply field level corrections-----------------"""
                # Split the semicolon-separated lists of subunits into protein components,
                # ignoring semicolons inside square brackets
                su_uniprot_list = parse_list(su_uniprot)
                su_entrez_list = parse_list(su_entrez)
                protein_name_list = parse_list(
                    correct_protein_name_list(protein_name))

                # check list lengths match
                if len(protein_name_list) != len(su_entrez_list):
                    msg = 'Unequal number of uniprot/entrez subunits at line {}\n  {}\n  {}'.format(
                        i_entry, '; '.join(protein_name_list),
                        '; '.join(su_entrez_list))
                    raise Exception(msg)

                if len(su_uniprot_list) != len(su_entrez_list):
                    msg = 'Unequal number of uniprot/entrez subunits at line {}\n  {}\n  {}'.format(
                        i_entry, '; '.join(su_uniprot_list),
                        '; '.join(su_entrez_list))
                    raise Exception(msg)

                # Fix the redundancy issue with swissprot_id field
                if swissprot_id:
                    swissprot_id, _, _ = swissprot_id.partition(';')
                    ncbi_name, _, _ = swissprot_id.partition(' (')
                    result = ncbi_taxa.get_name_translator([ncbi_name])
                    ncbi_id = result[ncbi_name][0]
                else:
                    ncbi_id = None
                """ ----------------- Export the entries to the SQLite database ----------------- """

                if ncbi_id:
                    q = session.query(Taxon).filter(Taxon.ncbi_id == ncbi_id)
                    if session.query(q.exists()).scalar():
                        taxon = q.first()
                    else:
                        taxon = Taxon(ncbi_id=ncbi_id,
                                      swissprot_id=swissprot_id)
                        session.add(taxon)
                else:
                    taxon = None

                observation = Observation(cell_line=cell_line,
                                          pur_method=pur_method,
                                          pubmed_id=pubmed_id,
                                          taxon=taxon)
                session.add(observation)

                complex = Complex(complex_id=complex_id,
                                  complex_name=complex_name,
                                  go_id=go_id,
                                  go_dsc=go_dsc,
                                  funcat_id=funcat_id,
                                  funcat_dsc=funcat_dsc,
                                  su_cmt=su_cmt,
                                  complex_cmt=complex_cmt,
                                  disease_cmt=disease_cmt,
                                  observation=observation)
                session.add(complex)

                for su_uniprot, su_entrez, protein_name in zip(
                        su_uniprot_list, su_entrez_list, protein_name_list):
                    subunit = Subunit(su_uniprot=su_uniprot,
                                      su_entrezs=su_entrez,
                                      protein_name=protein_name,
                                      gene_name=gene_name,
                                      gene_syn=gene_syn,
                                      complex=complex)
                    session.add(subunit)

        session.commit()
Example 18
def checkTaxid(taxid=None):
    # True for the root taxid ('1') or any taxid present in the local taxonomy database
    return taxid == '1' or int(taxid) in ete3.NCBITaxa().get_descendant_taxa(1, intermediate_nodes=True)
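For example (note that enumerating every descendant of the root is slow on a full taxonomy database):

print(checkTaxid(taxid='9606'))  # True for Homo sapiens once the local DB is populated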
Example 19
def init(cfg, args):
    import ete3
    ncbi = ete3.NCBITaxa()
    ncbi.update_taxonomy_database()
    print('NCBI Taxonomy database is installed in {}.'.format(ncbi.dbfile))
Example 20
def makeTaxidsListFp(taxid=None):
    outFp = privateBlastDirPath / 'taxids_gis' / taxid / 'taxids'
    if not (outFp.is_file() and outFp.stat().st_size > 0):
        lines = [taxid + '\n'] + [str(i) + '\n' for i in ete3.NCBITaxa().get_descendant_taxa(taxid)]
        outFp.open(mode='wt').writelines(sorted(lines))
    if not (outFp.is_file() and outFp.stat().st_size > 0):
        return None
    return outFp
Example 21
def get_taxonomy_lineage_from_identifier(identifiers,
                                         name=None,
                                         translate_ids=True,
                                         ncbi: ete3.NCBITaxa = None,
                                         verbose=0,
                                         mode="infer",
                                         include_taxid=True,
                                         include_levels=[
                                             "id_taxon", "phylum", "class",
                                             "order", "family", "genus",
                                             "species"
                                         ]):
    """
    Input: Single Taxonomy ID or a collection of identifiers w/ {id_orf:id_taxon}
    Output: pd.Series/pd.DataFrame of rank and taxonomy labels
    modes: {'infer', 'batch', 'singular'}
    """
    accepted_modes = {'batch', 'singular'}
    verbose = int(verbose)
    # Get database
    if ncbi is None:
        ncbi = ete3.NCBITaxa()

    # Infer mode
    if mode == "infer":
        if is_dict_like(identifiers):
            mode = "batch"
        else:
            mode = "singular"
        if verbose > 0:
            print(f"Inferred mode: {mode}", file=sys.stderr)
    assert mode in accepted_modes, "Cannot infer `mode`.  Please explicitly provide mode."

    # Singular
    if mode == "singular":
        # Handle missing data
        if pd.isnull(identifiers):
            return pd.Series([], name=name)

        # Access the database
        try:
            id_taxon = int(identifiers)
            if name is None:
                name = id_taxon
            lineage = ncbi.get_lineage(id_taxon)
            ranks = dict(
                filter(lambda x: x[1] != "no rank",
                       ncbi.get_rank(lineage).items()))
            Se_taxonomy = pd.Series(ncbi.get_taxid_translator(ranks.keys()),
                                    name=name)
            if translate_ids:
                Se_taxonomy.index = Se_taxonomy.index.map(lambda x: ranks[x])
            return Se_taxonomy

        # Handle taxonomy IDs that are not in database
        except ValueError:
            if verbose > 1:
                print(identifiers, file=sys.stderr)
            return pd.Series([], name=name)
    # Batch
    if mode == "batch":
        if not is_query_class(identifiers, "Series"):
            identifiers = pd.Series(identifiers)

        # Group each taxonomy identifier
        dataframes = list()
        for id_taxon, group in tqdm(
                pd_series_collapse(identifiers).iteritems(),
                "Searching lineage from taxonomy identifier"):
            number_of_orfs_in_group = len(group)
            Se_taxonomy = get_taxonomy_lineage_from_identifier(
                identifiers=id_taxon,
                name=None,
                translate_ids=translate_ids,
                ncbi=ncbi,
                verbose=verbose,
                mode="singular")
            df_taxon = pd.DataFrame(number_of_orfs_in_group * [Se_taxonomy])
            df_taxon.index = group
            if include_taxid:
                df_taxon["id_taxon"] = id_taxon
            dataframes.append(df_taxon)
        df_collection = pd.concat(dataframes, axis=0)
        if include_levels is None:
            include_levels = df_collection.columns
        else:
            include_levels = [
                *filter(lambda level: level in df_collection.columns,
                        include_levels)
            ]

        idx_missing_orfs = set(identifiers.index) - set(df_collection.index)
        number_of_orfs_missing = len(idx_missing_orfs)
        if number_of_orfs_missing > 0:
            A = np.empty((number_of_orfs_missing, df_collection.shape[1]))
            A[:] = np.nan
            df_missing = pd.DataFrame(A,
                                      index=idx_missing_orfs,
                                      columns=df_collection.columns)
            return pd.concat([df_collection, df_missing],
                             axis=0).loc[identifiers.index, include_levels]
        else:
            return df_collection.loc[identifiers.index, include_levels]
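A hedged sketch of both modes; helpers such as is_dict_like, is_query_class, and pd_series_collapse are assumed to come from the surrounding module, and a local taxonomy database is assumed.

lineage = get_taxonomy_lineage_from_identifier(562)        # singular: pd.Series of rank -> name
orf_to_taxon = {"orf_1": 562, "orf_2": 9606}
df = get_taxonomy_lineage_from_identifier(orf_to_taxon)    # batch: pd.DataFrame, one row per ORF
print(lineage["species"], df.loc["orf_2", "species"])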
Example 22
def main():
    args = setup_args()

    print('Start')
    print('Checking commands:')

    # Here we check the input and the output,
    # in other words, we check the correction of the commands

    check_input_file(args.input_xml, '.xml')
    output = check_output_path(args.output)

    # We need to get a set with names
    handle = open(args.input_xml)
    blast_records = NCBIXML.parse(handle)

    species_set = dict()
    try:
        while True:
            blast_record = next(blast_records)
            for alignment in blast_record.alignments:
                name = str(pull_species_name(alignment.hit_def))
                species_set[name] = species_set.get(name, 0) + 1
    except StopIteration:
        print(
            'The dictionary of occurrence is built. \nCreating a .csv file ...'
        )

    with open(output + '/frequency_dictionary.csv', 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter=';')
        writer.writerow(['name', 'frequency_value'])
        for key, value in species_set.items():
            writer.writerow([key, value])

    # This stage requires time, because script should download database from NCBI and parse it.
    # ncbi_db.log allows to check the date of the last update.
    print('This stage requires additional time.')
    ncbi = ete3.NCBITaxa()
    if not os.path.isfile(os.getcwd() + '/ncbi_db.log'):
        logging.basicConfig(filename="ncbi_db.log", level=logging.INFO)
        logging.info("Taxonomy DB was updated {}".format(
            datetime.datetime.now()))
        ncbi.update_taxonomy_database()

    names_list = list(species_set.keys())
    name2taxid = ncbi.get_name_translator(names_list)

    insecta_dict = {}
    mammalia_dict = {}
    bacteria_dict = {}
    archaea_dict = {}
    others_dict = {}

    for taxid_list in name2taxid.values():
        try:
            lineage = ncbi.get_lineage(taxid_list[0])
            lineage_names = list(ncbi.get_taxid_translator(lineage).values())
            if any("Insecta" in s for s in lineage_names):
                insecta_dict.update(ncbi.get_taxid_translator(taxid_list))
            elif any("Mammalia" in s for s in lineage_names):
                mammalia_dict.update(ncbi.get_taxid_translator(taxid_list))
            elif any("Bacteria" in s for s in lineage_names):
                bacteria_dict.update(ncbi.get_taxid_translator(taxid_list))
            elif any("Archaea" in s for s in lineage_names):
                archaea_dict.update(ncbi.get_taxid_translator(taxid_list))
            else:
                others_dict.update(ncbi.get_taxid_translator(taxid_list))
        except ValueError:
            print('Missed taxid: {}'.format(taxid_list))

    dict_keeper = [
        insecta_dict, mammalia_dict, bacteria_dict, archaea_dict, others_dict
    ]
    dict_names = [
        '/insecta_dict.csv', '/mammalia_dict.csv', '/bacteria_dict.csv',
        '/archaea_dict.csv', '/others_dict.csv'
    ]

    for i, elem in enumerate(dict_keeper):
        with open(output + dict_names[i], 'w') as csv_file:
            writer = csv.writer(csv_file)
            for key, value in elem.items():
                writer.writerow([key, value])

    print('Parsing is done.')

    # If the taxon list is given, we search for child taxa
    if args.input_taxon is not None:
        check_input_file(args.input_taxon, '.txt')
        search_taxon(args.input_taxon, args.input_xml, dict_keeper, output)
Example 23
    def map_taxonomic_level(self, df, taxa_table=None):
        '''
        :param self:
        :param df: df containing all transfers
        :param taxa_table: tab-delimited file with all taxa present in the transfer df; should be part of an \
        assembly_summary, either from GenBank or RefSeq
        :return:
        '''
        ncbi                  = ete3.NCBITaxa()

        #
        #read taxa table and make the necessary changes
        taxa_df               = pd.read_csv(taxa_table, sep='\t')
        #
        #important: RANGER understands "_" as the separator between genome and gene IDs, so we have to remove it from
        #   the taxa table in order to match the data structure in the rest of the pipe
        #... ranger is also very picky about its naming conventions, tips cannot contain "."
        taxa_df['Unnamed: 0'] = taxa_df['Unnamed: 0'].apply(lambda x: x.replace('_', '').split('.')[0])
        taxa_df['accession']  = taxa_df['accession'].apply(lambda x: x.replace('_', '').split('.')[0])
        taxa_df.set_index('Unnamed: 0', inplace=True)

        #
        #this df will hold all taxonomic data for all leaves in the species tree
        taxonomy_df = pd.DataFrame()

        #
        #traverse through all leaves in the species tree and add tags regarding its taxonomic classification
        for leaf in self.species_tree.get_leaf_names():
            #
            #kelsey's naming convention wasn't the most consistent, so let's try two distinct ways to find a leaf in the
            #   taxa table
            #
            if leaf in taxa_df.index:
                node_name = taxa_df.index[taxa_df.index == leaf][0]
            elif leaf in taxa_df.accession.values:
                node_name = taxa_df.query('accession==@leaf').index[0]
            else:
                #
                #if neither lookup works, skip this leaf
                continue

            #does this leaf have a valid taxid in our taxa table?
            if pd.notnull(taxa_df.loc[node_name, 'taxid']):
                #
                #sweet, what is it?
                taxid = taxa_df.loc[node_name, 'taxid']

                #
                #create a lineage dict that we can manipulate
                lineage = {taxon_rank: taxon
                           #
                           #traverse all NCBI classification available for its taxid
                           for taxon, taxon_rank in ncbi.get_rank(ncbi.get_lineage(taxid)).items()
                           }
                #
                #add the highest level taxonomic information we have
                lineage['leaf_name'] = leaf
                #
                #I said there would be a LOT of things here...
                taxonomy_df = pd.concat([taxonomy_df, pd.DataFrame([lineage])], ignore_index=True)
        #
        #our most specific information about each leaf will work as our "primary key"
        taxonomy_df.set_index('leaf_name', inplace=True)

        #
        #drop all columns not related to commonly used taxonomic ranks
        to_drop = []
        for column in taxonomy_df.columns:
            if column not in ['class', 'species', 'superkingdom', 'genus',
                              'order', 'phylum', 'family', 'kingdom']:
                to_drop.append(column)
        taxonomy_df.drop(to_drop, axis='columns', inplace=True)

        transfer_df = df.copy()
        for index, row in transfer_df.iterrows():
            donor_descendants = next(
                self.species_tree.iter_search_nodes(ranger_name=row.donor)
            ).get_leaf_names()
            recipient_descendants = next(
                self.species_tree.iter_search_nodes(ranger_name=row.recipient)
            ).get_leaf_names()

            donor_taxonomy     = taxonomy_df.loc[[taxon for taxon in donor_descendants     if taxon in taxonomy_df.index]]
            recipient_taxonomy = taxonomy_df.loc[[taxon for taxon in recipient_descendants if taxon in taxonomy_df.index]]

            if not donor_taxonomy.shape[0] or not recipient_taxonomy.shape[0]:
                #
                # if no descendant has a valid taxid, ignore it...
                continue

            donor_taxonomy.dropna(    axis=1, how='any', inplace=True)
            recipient_taxonomy.dropna(axis=1, how='any', inplace=True)

            donor_taxonomy = next(donor_taxonomy.loc[:,
                                  np.invert(donor_taxonomy.T.duplicated().values)
                                  ].iterrows())[1]
            recipient_taxonomy = next(recipient_taxonomy.loc[:,
                                      np.invert(recipient_taxonomy.T.duplicated().values)
                                      ].iterrows())[1]

            common_ranks = donor_taxonomy.index.intersection(recipient_taxonomy.index)

            for rank in ['species', 'genus', 'family', 'order',
                         'class', 'phylum', 'kingdom', 'superkingdom']:
                if rank in common_ranks[donor_taxonomy[common_ranks] == recipient_taxonomy[common_ranks]]:
                    break

            transfer_df.loc[index, 'transfer_level'] = rank

        return (transfer_df)
Example 24
def main(inputtree,
         outbase,
         div=True,
         features=None,
         stem_or_crown="crown",
         byrank='',
         byage=None,
         bylist=None,
         bysize=None):
    """byrank: when the rank is included in or equal to 'byrank';
       byage:  collapse any node of age <= byage;
       bylist: read list of nodes from file;
       bysize: collapse oldest nodes with size < bysize."""
    group_feature_rate = def_group_feature_rate(stem_or_crown)

    tree = ete3.PhyloTree(inputtree, format=1, quoted_node_names=False)

    outsuffix = '-stem' if stem_or_crown == 'stem' else ''

    if byrank:
        outsuffix += '-%s' % byrank
    if byage:
        outsuffix += '-age%g' % byage
    if bylist:
        outsuffix += '-list' + op.splitext(op.basename(bylist))[0]
    if bysize:
        outsuffix += '-size%d' % bysize

    outnames = {
        'tsv': (outbase + '%s.tsv' % outsuffix),
        'subtrees': (outbase + '%s.subtrees.nwk' % outsuffix),
        'tree': (outbase + '%s.nwk' % outsuffix)
    }

    for out in outnames.values():
        if op.exists(out):
            logger.error("%r already exists, quitting", out)
            return 1

    columns = [outsuffix.lstrip('-'), 'size', 'branches', 'age',
               'tot_len']  #'crown_age', 'stem_age']
    if div: columns.extend(('div_rate', 'gamma', 'ncbi_sp_sampling'))
    if features: columns.extend(features)

    if byrank or div:
        logger.info("Loading taxonomy")
        ncbi = ete3.NCBITaxa()

        name2taxid = ncbi.get_name_translator(
                            [node.name.replace('_', ' ') if node.is_leaf() \
                                else node.name for node in tree.traverse()])
        # Won't return anything for names not found

        #if rank:
        #taxid2rank = ncbi.get_rank(chain(*name2taxid.values()))
        taxid2name = ncbi.get_taxid_translator(chain(*name2taxid.values()))
    else:
        name2taxid, taxid2name = None, None

    is_leaf_fn = make_is_leaf_fn(byrank, byage, bylist, bysize, name2taxid,
                                 taxid2name)

    with open(outnames['tsv'], 'w') as outtsv, \
         open(outnames['subtrees'], 'w') as outsub:

        outtsv.write('\t'.join(columns) + '\n')

        logger.info("Iterating over found clades")
        for node in tree.iter_leaves(is_leaf_fn):
            outsub.write(
                node.write(features, format=1, format_root_node=True) + '\n')

            # Collapse
            size = len(node)
            branches = len(node.get_descendants())
            _, age = node.get_farthest_leaf()
            tot_len = sum(d.dist for d in node.iter_descendants())
            if stem_or_crown == 'stem':
                age += node.dist
                tot_len += node.dist
            values = [node.name, size, branches, age, tot_len]
            if div:
                div_rate = float(size) / age if age else np.nan
                gamma_stat = div_gamma(node)

                try:
                    nodetaxids = name2taxid[node.name.replace('_', ' ')]
                    if len(nodetaxids) > 1:
                        nodetaxids = [
                            match_duplicate_taxid(nodetaxids, node, taxid2name,
                                                  ncbi)
                        ]

                except KeyError:
                    # This clade isn't in the taxonomy (example: Atlantogenata)
                    # take descendant nodes and join them
                    valid_tax_children = get_valid_tax_children(
                        node, name2taxid)
                    vtc_names = [
                        vtc.name.replace(' ', '_')
                        for vtc in valid_tax_children
                    ]

                    logger.warning(
                        '%r not found in NCBI Taxonomy. Merging '
                        'the node children %s to get the '
                        'descendant counts.', node.name, vtc_names)

                    nodetaxids = []
                    for vtc_n, vtc in zip(vtc_names, valid_tax_children):
                        vtc_taxids = name2taxid[vtc_n]
                        if len(vtc_taxids) == 1:
                            nodetaxids.append(vtc_taxids[0])
                        else:
                            nodetaxids.append(
                                match_duplicate_taxid(vtc_taxids, vtc,
                                                      taxid2name, ncbi))

                ncbi_sp = list(chain(*(ncbi.get_descendant_taxa(nt,
                                                rank_limit='species') \
                                       for nt in nodetaxids)))
                #collapse_subspecies=True))
                sp_sampling = float(size) / len(ncbi_sp)
                values.extend((div_rate, gamma_stat, sp_sampling))

            if features:
                ft_rates = group_feature_rate(node, features)
                values += ft_rates.tolist()

            outtsv.write('\t'.join(str(v) for v in values) + '\n')

    tree.write(outfile=outnames['tree'],
               format=1,
               is_leaf_fn=is_leaf_fn,
               format_root_node=True)
Example 25
    def load_content(self, endpoint='corum'):
        """ Collect and parse all data from CORUM website into JSON files and add to NoSQL database """
        database_url = self.ENDPOINT_DOMAINS[endpoint]
        _, _, collection = self.con_db(self.collection)
        os.makedirs(os.path.join(
            self.cache_dirname, self.collection), exist_ok=True)

        if self.verbose:
            print('Download list of all complexes: ...')

        response = requests.get(database_url)
        response.raise_for_status()
        # Extract All Files and save to current directory

        if self.verbose:
            print('... Done!')
        if self.verbose:
            print('Unzipping and parsing complex list ...')

        z = zipfile.ZipFile(BytesIO(response.content))
        z.extractall(self.cache_dirname)
        if endpoint == 'corum':
            cwd = os.path.join(self.cache_dirname, 'allComplexes.txt')
        else:
            cwd = os.path.join(self.cache_dirname, 'spliceComplexes.txt')

        # create object to find NCBI taxonomy IDs
        ncbi_taxa = ete3.NCBITaxa()

        with open(cwd, 'r') as file:
            i_entry = 0
            for entry in csv.DictReader(file, delimiter='\t'):
                # entry/line number in file
                i_entry += 1

                # stop if the maximum desired number of entries has been reached
                if i_entry > self.max_entries:
                    break

                # replace 'None' strings with None
                for key, val in entry.items():
                    if val == 'None':
                        entry[key] = None

                # extract attributes
                complex_id = int(entry['ComplexID'])
                entry['complex_id'] = complex_id #replace string value with int value
                complex_name = entry['ComplexName']
                cell_line = entry['Cell line']
                pur_method = entry['Protein complex purification method']
                # SETS OF GO IDS SEPARATED BY ; e.g. GO:0005634
                go_id = entry['GO ID']
                go_dsc = entry['GO description']
                funcat_id = entry['FunCat ID']
                funcat_dsc = entry['FunCat description']
                pubmed_id = int(entry['PubMed ID'])
                entry['pubmed_id'] = pubmed_id
                gene_name = entry['subunits(Gene name)']
                gene_syn = entry['subunits(Gene name syn)']
                complex_syn = entry['Synonyms']
                disease_cmt = entry['Disease comment']
                su_cmt = entry['Subunits comment']
                complex_cmt = entry['Complex comment']

                su_uniprot = entry['subunits(UniProt IDs)']  # SETS OF STRING IDS SEPARATED BY ;
                su_entrez = entry['subunits(Entrez IDs)']  # SETS OF INT IDS SEPARATED BY ;
                protein_name = entry['subunits(Protein name)']
                swissprot_id = entry['SWISSPROT organism']

                """ ----------------- Apply field level corrections-----------------"""
                # Split the semicolon-separated lists of subunits into protein components,
                # ignoring semicolons inside square brackets
                su_uniprot_list = parse_list(su_uniprot)
                entry['subunits_isoform_id'] = su_uniprot_list
                parsed_su_uniprot_list = parse_subunits(su_uniprot_list)
                entry['subunits_uniprot_id'] = parsed_su_uniprot_list
                del entry['subunits(UniProt IDs)']
                
                su_entrez_list = parse_list(su_entrez)
                entry['subunits_entrez_id'] = su_entrez_list
                del entry['subunits(Entrez IDs)']
                
                go_id_list = parse_list(go_id)
                entry['go_id'] = go_id_list
                del entry['GO ID']
                
                go_dsc_list = parse_list(go_dsc)
                entry['go_description'] = go_dsc_list
                del entry['GO description']

                funcat_id_list = parse_list(funcat_id)
                entry['funcat_id'] = funcat_id_list
                del entry['FunCat ID']

                funcat_dsc_list = parse_list(funcat_dsc)
                entry['funcat_description'] = funcat_dsc_list
                del entry['FunCat description']


                gene_name_list = parse_list(gene_name)
                entry['subunits_gene_name'] = gene_name_list
                del entry['subunits(Gene name)']

                gene_syn_list = parse_list(gene_syn)
                entry['subunits_gene_name_synonym'] = gene_syn_list
                del entry['subunits(Gene name syn)']

                protein_name_list = parse_list(
                    correct_protein_name_list(protein_name))
                entry['subunits_protein_name'] = protein_name_list
                del entry['subunits(Protein name)']


                # check list lengths match
                if len(protein_name_list) != len(su_entrez_list):
                    msg = 'Unequal number of uniprot/entrez subunits at line {}\n  {}\n  {}'.format(
                        i_entry, '; '.join(protein_name_list), '; '.join(su_entrez_list))
                    raise Exception(msg)

                if len(su_uniprot_list) != len(su_entrez_list):
                    msg = 'Unequal number of uniprot/entrez subunits at line {}\n  {}\n  {}'.format(
                        i_entry, '; '.join(su_uniprot_list), '; '.join(su_entrez_list))
                    raise Exception(msg)

                # Fix the redundancy issue with swissprot_id field
                if swissprot_id:
                    swissprot_id, _, _ = swissprot_id.partition(';')
                    ncbi_name, _, _ = swissprot_id.partition(' (')
                    result = ncbi_taxa.get_name_translator([ncbi_name])
                    ncbi_id = result[ncbi_name][0]
                else:
                    ncbi_id = None
                entry['SWISSPROT_organism_NCBI_ID'] = ncbi_id
                del entry['SWISSPROT organism']

                file_name = 'corum_' + str(entry['complex_id']) + '.json'
                full_path = os.path.join(
                    self.cache_dirname, self.collection, file_name)

                with open(full_path, 'w') as f:
                    f.write(json.dumps(entry, indent=4))

                
                collection.update_one({'ComplexID': entry['ComplexID']},
                                      {'$set': entry},
                                      upsert=True)
                    

        return collection
Example 26
def search_taxon(in_taxon, in_xml, dict_keeper, output):
    ncbi = ete3.NCBITaxa()
    if not os.path.isfile(os.getcwd() + '/ncbi_db.log'):
        logging.basicConfig(filename="ncbi_db.log", level=logging.INFO)
        logging.info("Taxonomy DB was updated {}".format(
            datetime.datetime.now()))
        ncbi.update_taxonomy_database()
    # Now we create one dict with unique keys, so that we can search easier
    overall_dict = dict()
    for d in dict_keeper:
        overall_dict.update(d)

    handle = open(in_xml)
    blast_records = NCBIXML.parse(handle)

    species_query_set = dict()
    try:
        while True:
            blast_record = next(blast_records)
            for i in range(len(blast_record.alignments)):
                new_name = str(
                    pull_species_name(blast_record.alignments[i].hit_def))
                for curr_taxid in ncbi.get_name_translator([new_name
                                                            ]).values():
                    for j in range(len(curr_taxid)):
                        species_query_set.update(
                            {curr_taxid[j]: blast_record.query})
    except StopIteration:
        pass

    # This will allow us to get a dict with taxon_id's from our input file
    taxon_dict = ncbi.get_name_translator(read_taxons(in_taxon))
    print('Matched taxa will be written to "taxons_found.csv"')
    print('Search started...')
    output_file = open(output + '/taxons_found.csv', 'w')
    exam_counter = 0

    for idx, key in enumerate(taxon_dict):
        tree = ncbi.get_descendant_taxa(key,
                                        collapse_subspecies=True,
                                        return_tree=True)
        if type(tree) == list:
            if overall_dict.get(taxon_dict[key][0]) is not None:
                """
                print('{0}:{1} is in your results'.format(key, overall_dict.get(taxon_dict[key][0])))
                print('\t{0} is at the {1}'.format(key, species_query_set.get(taxon_dict[key][0])))
                """
                output_file.write('{0}; {1}; {2} \n'.format(
                    key, overall_dict.get(taxon_dict[key][0]),
                    species_query_set.get(taxon_dict[key][0])))
                exam_counter += 1
        elif type(tree) == ete3.PhyloNode:
            for child in tree.children:
                # We need only the taxid number
                potential_ids = re.findall(r'\b\d+\b', child.get_ascii())
                for potential_id in potential_ids:
                    if overall_dict.get(int(potential_id)) is not None:
                        """
                        print('{0}:{1} is in your results'.format(key, overall_dict[int(potential_id)]))
                        print('\t{0} is at the {1}'.format(key, species_query_set[int(potential_id)]))
                        """
                        output_file.write('{0}; {1}; {2} \n'.format(
                            key, overall_dict[int(potential_id)],
                            species_query_set[int(potential_id)]))
                        exam_counter += 1
        else:
            print('{0}: {1} is unknown'.format(key, taxon_dict[key]))
    output_file.close()
    print('Search is finished.')
    print('Total found {}'.format(exam_counter))