コード例 #1
0
def get_taxonomy(species_name,
                 name_format="Genus species",
                 ranks=None,
                 update_db=False):
    """Look up the NCBI lineage of a species by its scientific name.

    Parameters
    ----------
    species_name
        Name of the species; converted with ``str()`` before use.
    name_format : str
        ``"Genus species"`` uses the name unchanged; ``"Genus_species"``
        replaces underscores with spaces before the lookup.
    ranks : list of str, optional
        Rank names to report. When ``None`` the whole lineage is returned.
    update_db : bool
        When equal to ``True`` the local taxonomy database is updated first.

    Returns
    -------
    dict or list
        * name not found, ``ranks`` is ``None``: ``['unknown']``
        * name not found, ``ranks`` given: ``['unknown'] * len(ranks)``
        * name found, ``ranks`` is ``None``: the ``{taxid: name}`` dict of
          the whole lineage
        * name found, ``ranks`` given: the lineage names whose rank equals
          one of ``ranks``, grouped in the order of ``ranks``. Ranks absent
          from the lineage contribute nothing, so the list may be shorter
          than ``ranks``.
    """
    species_name = str(species_name)
    ncbi = NCBITaxa()
    # Explicit comparison kept on purpose: a truthy non-bool value (for
    # example the string "False") must not trigger a database update.
    if update_db == True:
        ncbi.update_taxonomy_database()
    if name_format == "Genus_species":
        species_name = species_name.replace("_", " ")

    species_id = ncbi.get_name_translator([species_name])
    if not species_id:
        if ranks is None:
            return ['unknown']
        return ['unknown'] * len(ranks)

    # Only the first taxid registered for the name is used.
    lineage_ids = ncbi.get_lineage(species_id[species_name][0])
    names = ncbi.get_taxid_translator(lineage_ids)
    if ranks is None:
        return names

    lineage_rk = ncbi.get_rank(lineage_ids)
    # ``names`` already covers every lineage taxid, so no further database
    # queries are needed to translate the matching ids.
    return [
        names[rk_id]
        for rk in ranks
        for rk_id, rk_rk in lineage_rk.items()
        if rk_rk == rk
    ]
コード例 #2
0
def get_tax(cma_file):
    """Find the lowest common ancestor of the species named in a cma file.

    Header lines (those starting with ``>``) are scanned for an organism
    name enclosed in square brackets. Each name is translated to an NCBI
    taxid; names that cannot be translated are skipped, as are the taxids
    32630 and 10239 (synthetic constructs and viruses, per the original
    inline comment).

    Parameters
    ----------
    cma_file : str
        Path of the cma file to read.

    Returns
    -------
    tuple
        ``(lca_name, tax_list, tree_labeled)`` where ``lca_name`` is the
        scientific name of the root of the topology built from the collected
        taxids, ``tax_list`` is the ``{taxid: name}`` dict for those taxids
        and ``tree_labeled`` is the ASCII rendering of the topology.
    """
    ncbi = NCBITaxa()
    org_regex = r'\[(.*)\]'
    excluded_taxids = {32630, 10239}
    taxid_list = []
    # The context manager closes the file; the original left it open.
    with open(cma_file, 'r') as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            find_org_name = re.search(org_regex, line)
            if find_org_name is None:
                continue
            org_name = find_org_name.group(1)
            translated = ncbi.get_name_translator([org_name])
            # Read the taxid from the result dict directly instead of
            # regex-parsing its string representation.
            hits = next(iter(translated.values()), None)
            if not hits:
                continue
            taxid = hits[0]
            if taxid not in excluded_taxids:
                taxid_list.append(taxid)

    tax_list = ncbi.get_taxid_translator(taxid_list)
    tree = ncbi.get_topology(taxid_list)
    tree_labeled = tree.get_ascii(attributes=['sci_name', 'taxid'])

    # The root node's name carries the taxid of the common ancestor; the
    # original recovered it from the repr of the uncalled bound method.
    lca_id = int(tree.get_tree_root().name)
    lca_name = ncbi.get_taxid_translator([lca_id]).get(lca_id, str(lca_id))
    return (lca_name, tax_list, tree_labeled)
コード例 #3
0
def main():
    """Make queries against NCBI Taxa databases.

    Reads the command-line arguments, optionally updates the local taxonomy
    database, resolves a taxon name to its TaxID when a name was given,
    prints information about the selected taxon (verbosity > 0) and lists
    its descendant taxa, optionally writing their TaxIDs to ``args.outfile``.

    Raises
    ------
    SystemExit
        When ``args.name`` is given but has no TaxID in the database.
    """
    # Get commandline args
    args = get_args()

    # Instantiate the ete NCBI taxa object
    ncbi = NCBITaxa()

    if args.verbose > 1:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    # Update the database if required.
    if args.update is True:
        if args.verbose > 1:
            print(
                "Updating the taxonomy database. This may take several minutes..."
            )
        ncbi.update_taxonomy_database()

    # If a name was provided instead of a TaxID, convert and store it.
    if args.name:
        name_hits = ncbi.get_name_translator([args.name])
        if not name_hits.get(args.name):
            # Fail with a readable message instead of a bare KeyError.
            raise SystemExit("No TaxID found for name: %s" % args.name)
        args.taxid = name_hits[args.name][0]

    if args.verbose > 0:
        tax_dict = {}
        # If a name was provided, simply add it to dict
        if args.name:
            tax_dict['Name'] = args.name
        # If not, do the opposite conversion to the above and store that
        else:
            tax_dict['Name'] = ncbi.get_taxid_translator([args.taxid
                                                          ])[args.taxid]

        # Continue to populate the taxa dict with other information
        tax_dict['TaxID'] = args.taxid
        tax_dict['Rank'] = ncbi.get_rank([args.taxid])
        tax_dict['Lineage'] = ncbi.get_taxid_translator(
            ncbi.get_lineage(args.taxid))

        print("Information about your selected taxa:")
        pretty(tax_dict)

    # Main feature of the script is to get all taxa within a given group.
    descendent_taxa = ncbi.get_descendant_taxa(args.taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)
    print("Descendent taxa for TaxID: %s" % (args.taxid))

    # Under python3, zip = izip. In python2, this list could be very large, and memory intensive
    # Suggest the script is run with python3
    if args.verbose > 0:
        for dtn, dt in zip(descendent_taxa_names, descendent_taxa):
            print("%s\t%s" % (dtn, dt))

    if args.outfile:
        with open(args.outfile, 'w') as ofh:
            # One TaxID per line; the loop variable no longer shadows ``id``.
            ofh.writelines(str(taxid) + '\n' for taxid in descendent_taxa)
コード例 #4
0
def main():
    """Summarise read counts per taxon at the requested rank.

    Reads a tab-separated table (first line skipped as a header) whose
    columns 0-6 hold taxids for superkingdom .. species and whose column 7
    holds a count. For every row the taxid at the requested rank is
    translated to a name; when that cell is ``'NA'`` the nearest higher
    rank (at most four levels up) is used instead and one ``'_NA'`` suffix
    is appended per level climbed. Counts are summed per name and written
    to the output file, largest first.
    """
    ncbi = NCBITaxa()
    args = get_args()
    infile = args.infile
    rank = args.classify_order
    out_file = args.outfile

    level = {
        'superkingdom': 0,
        'phylum': 1,
        'class': 2,
        'order': 3,
        'family': 4,
        'genus': 5,
        'species': 6
    }
    # The original fallback chain looked at most this many ranks upwards.
    max_fallback = 4

    h_num = level[rank]

    d_count = {}
    with open(infile, 'r') as in_f:
        reader = csv.reader(in_f, delimiter='\t')
        next(reader, None)  # skip the header; tolerate an empty file
        for row in reader:
            name = 'NA'
            # Never climb above column 0: the original indexed
            # ``row[h_num - k]`` unconditionally, which wraps to negative
            # indices for shallow ranks and reads unrelated columns from
            # the end of the row.
            for steps_up in range(min(h_num, max_fallback) + 1):
                cell = row[h_num - steps_up]
                if cell != 'NA':
                    taxid2name = ncbi.get_taxid_translator([cell])
                    name = taxid2name[int(cell)] + '_NA' * steps_up
                    break
            d_count[name] = d_count.get(name, 0) + int(row[7])

    with open(out_file, 'w') as o_f:
        for key in sorted(d_count, key=d_count.get, reverse=True):
            print(key, '\t', d_count[key], file=o_f)
コード例 #5
0
    def from_taxid(cls, taxid: int) -> "Lineage":
        """
        Build a `Lineage` from an NCBI taxonomy id.

        The lineage of `taxid` is fetched, its rank names are capitalised
        and "Superkingdom" is renamed to "Kingdom". The fields of the class
        are then filled in declaration order, stopping at the first field
        whose rank is missing from the lineage.

        Parameters
        ----------
        taxid : int
            A valid NCBI taxonomy id

        Returns
        -------
        "Lineage"
            Instance of the `Lineage` class
        """
        database = NCBITaxa()
        ancestors = database.get_lineage(taxid)
        name_of = database.get_taxid_translator(ancestors)

        # Map capitalised rank name -> taxid.
        taxid_of_rank = {}
        for node, rank_name in database.get_rank(ancestors).items():
            taxid_of_rank[rank_name.capitalize()] = node
        if "Superkingdom" in taxid_of_rank:
            taxid_of_rank["Kingdom"] = taxid_of_rank.pop("Superkingdom")

        taxa: Dict[str, str] = {}
        for field in cls._fields:
            if field not in taxid_of_rank:
                # Stop at the first gap; later fields are left unset.
                break
            taxa[field] = name_of[taxid_of_rank[field]]
        return cls(**taxa)
コード例 #6
0
ファイル: runSingle.py プロジェクト: BIONF/fDOG
def getTaxName(taxId):
    """Return the NCBI scientific name for ``taxId``, or ``'UNK'``.

    ``'UNK'`` is returned when ``taxId`` cannot be converted to an integer
    or when the taxonomy database has no entry for it. The identifier is
    validated before the database is opened, and the former bare
    ``except:`` no longer hides unrelated errors.
    """
    try:
        tax_id = int(taxId)
    except (TypeError, ValueError):
        return 'UNK'
    ncbi = NCBITaxa()
    return ncbi.get_taxid_translator([tax_id]).get(tax_id, 'UNK')
コード例 #7
0
    def get_lineage(self,taxid):
        """Return the scientific names along the lineage of ``taxid``.

        The names are listed in the order of the taxids returned by
        ``NCBITaxa.get_lineage``.
        """
        taxa_db = NCBITaxa()
        lineage_ids = taxa_db.get_lineage(taxid)
        id_to_name = taxa_db.get_taxid_translator(lineage_ids)
        ordered_names = []
        for node in lineage_ids:
            ordered_names.append(id_to_name[node])
        return ordered_names
コード例 #8
0
def get_metadata(records: List[SeqRecord]):
    """Build a taxonomy table for ``records`` and save it as ``metadata.csv``.

    The ``"organism"`` annotation of every record is translated to a taxid
    (the first hit is used). For each record the lineage names at the
    sought ranks are collected together with the record id under ``"aid"``.

    Returns the resulting ``pandas.DataFrame``; the same frame is written
    to ``metadata.csv`` in the current working directory.
    """
    ncbi = NCBITaxa()

    organisms = [record.annotations["organism"] for record in records]
    taxids_by_name = ncbi.get_name_translator(organisms)

    sought_ranks = [
        "superkingdom", "order", "family", "subfamily", "genus", "species"
    ]

    rows = []
    for record in records:
        taxid = taxids_by_name[record.annotations["organism"]][0]
        lineage = ncbi.get_lineage(taxid)
        rank_of = ncbi.get_rank(lineage)
        name_of = ncbi.get_taxid_translator(lineage)

        row = {}
        for node in lineage:
            node_rank = rank_of[node]
            if node_rank in sought_ranks:
                row[node_rank] = name_of[node]
        row["aid"] = record.id
        rows.append(row)

    df = pd.DataFrame(rows)
    df.to_csv("metadata.csv")

    return df
コード例 #9
0
def get_full_lineage(otus):
    """Write ``full_lineages_updated.tsv`` with the full lineage of each OTU.

    Requires the ete3 library for any taxid other than 0, 1 and 2.

    Parameters
    ----------
    otus : list
        Taxids to resolve (the keys of the updated input dictionary, per
        the original comment). The list is no longer modified in place.

    Output
    ------
    A tab-separated file ``full_lineages_updated.tsv`` with the columns
    ``OTU`` and ``lineage``; lineage names are joined with ``;``.
    """
    # Taxids 0, 1 and 2 get fixed lineages and are never sent to the
    # database (the original comment notes an error when a 0 is present).
    fixed_lineages = {0: "", 1: "root", 2: "root;Bacteria"}

    lineages = {
        taxid: text
        for taxid, text in fixed_lineages.items() if taxid in otus
    }
    pending = [otu for otu in otus if otu not in fixed_lineages]

    if pending:
        # Imported lazily so the taxonomy database is only opened when
        # there is something to look up.
        from ete3 import NCBITaxa

        ncbi = NCBITaxa()
        for entrie in pending:
            lineage = ncbi.get_lineage(entrie)  # list of lineage taxids
            names = ncbi.get_taxid_translator(lineage)  # {taxid: name}
            # Follow the order of ``lineage``: the translator dict is not
            # guaranteed to be ordered root -> leaf.
            lineages[entrie] = ";".join(names[taxid] for taxid in lineage)

    lineages_df = pd.DataFrame(list(lineages.items()),
                               columns=["OTU", "lineage"])
    lineages_df.to_csv("full_lineages_updated.tsv",
                       sep="\t",
                       index=False,
                       header=True)
コード例 #10
0
def get_full_lineages(otus):
    """Write ``full_lineages.tsv`` with the full lineage of each OTU.

    Requires the ete3 library for any taxid other than 0, 1 and 2.

    Parameters
    ----------
    otus : list
        Taxids to resolve (the OTUs of the table obtained from the
        ``get_otus`` function, per the original comment). The list is no
        longer modified in place.

    Output
    ------
    A tab-separated file ``full_lineages.tsv`` with the columns ``OTU`` and
    ``LINEAGE``; lineage names are joined with ``;``. A confirmation line
    is printed once the file has been written.
    """
    # Taxids 0, 1 and 2 get fixed lineages and are never sent to the
    # database (the original comment notes an error when a 0 is present).
    fixed_lineages = {0: "", 1: "root", 2: "root;Bacteria"}

    lineages = {
        taxid: text
        for taxid, text in fixed_lineages.items() if taxid in otus
    }
    pending = [otu for otu in otus if otu not in fixed_lineages]

    if pending:
        # Imported lazily so the taxonomy database is only opened when
        # there is something to look up.
        from ete3 import NCBITaxa

        ncbi = NCBITaxa()
        for entrie in pending:
            lineage = ncbi.get_lineage(entrie)  # list of lineage taxids
            names = ncbi.get_taxid_translator(lineage)  # {taxid: name}
            # Follow the order of ``lineage``: the translator dict is not
            # guaranteed to be ordered root -> leaf.
            lineages[entrie] = ";".join(names[taxid] for taxid in lineage)

    lineages_df = pd.DataFrame(list(lineages.items()),
                               columns=["OTU", "LINEAGE"])
    lineages_df.to_csv("full_lineages.tsv", sep="\t", index=False, header=True)
    print("full lineage file created")
コード例 #11
0
ファイル: alignment.py プロジェクト: andreyfsch/byte_pipette
    def correct_erroneous_repres_of_taxon_instances():
        """Repair protein entries flagged as erroneous taxon representatives.

        Every instance returned by
        ``ProteinDatabase.get_erroneous_repres_of_taxon_instances`` is
        inspected: ``instance[1]`` is passed to the NCBI lineage lookups and
        ``instance[2]`` to ``get_protein_entry``. Depending on the ranks and
        child counts of the instance and of the entry's field 5 (called
        ``representative_id`` here), representative flags are cleared or
        reassigned through ``update_protein_entry``.

        Takes no arguments and returns nothing; prints debugging information
        and a separator line.
        """
        protDB = db_handling.ProteinDatabase()
        ncbi = NCBITaxa()

        erroneous_instances = protDB.get_erroneous_repres_of_taxon_instances()

        for instance in erroneous_instances:
            # instance[1] is used as a taxid throughout this loop.
            lineage = ncbi.get_lineage(instance[1])
            lineage_ranks = ncbi.get_rank(lineage)
            # NOTE(review): the entry is subscripted here before the
            # truthiness check below. If get_protein_entry can return
            # None or an empty row, this line raises before the `else`
            # branch is reached - confirm against ProteinDatabase.
            representative_id = protDB.get_protein_entry(instance[2])[5]
            lineage_translation = ncbi.get_taxid_translator(lineage)
            if protDB.get_protein_entry(instance[2]):
                if representative_id != None:
                    # Same lookup as above, repeated.
                    representative_id = protDB.get_protein_entry(
                        instance[2])[5]
                    print('representative_id', representative_id)
                    lineage_representative = ncbi.get_lineage(
                        representative_id)
                    print('lineage_representative', lineage_representative)
                    lineage_representative_ranks = ncbi.get_rank(
                        lineage_representative)
                    print('lineage_representative_ranks',
                          lineage_representative_ranks)
                    # Only act when the instance and the representative sit
                    # at the same taxonomic rank.
                    if lineage_ranks[
                            instance[1]] == lineage_representative_ranks[
                                representative_id]:
                        count_instance = protDB.get_count_of_children_of_repres(
                            instance[1])
                        count_representative = protDB.get_count_of_children_of_repres(
                            representative_id)
                        if count_representative >= count_instance:
                            # The representative has at least as many
                            # children: clear the flag keyed by instance[1].
                            protDB.update_protein_entry(
                                {'representative_of_taxon': None}, instance[1])
                        else:
                            # The instance has more children: point the
                            # representative (and, presumably, the entries
                            # it represented - verify the helper) at
                            # instance[1], then clear its flag.
                            protDB.update_protein_entry_by_repres_by(
                                {'represented_by': instance[1]},
                                representative_id)
                            protDB.update_protein_entry(
                                {'represented_by': instance[1]},
                                representative_id)
                            protDB.update_protein_entry(
                                {'representative_of_taxon': None},
                                representative_id)
            else:
                # No (truthy) entry for instance[2]: move the representative
                # flag from instance[0] to instance[2] and record the taxon
                # name and rank under representative_id.
                protDB.update_protein_entry({'representative_of_taxon': None},
                                            instance[0])
                protDB.update_protein_entry(
                    {'representative_of_taxon': instance[1]}, instance[2])
                protDB.update_protein_entry(
                    {
                        'taxon_name_representative':
                        lineage_translation[instance[1]]
                    }, representative_id)
                protDB.update_protein_entry(
                    {'representative_taxon_rank': lineage_ranks[instance[1]]},
                    representative_id)

        print(
            '---------------------------------------------------------------------------------'
        )
コード例 #12
0
def contig_tax(annot_df, ncbi_db, min_prot, prop_annot, tax_thres):
    """Yield the taxonomic lineage of each viral contig.

    This function takes the annotation table generated by
    viral_contig_maps.py and generates, per contig, a row that provides its
    taxonomic lineage based on the corresponding ViPhOG annotations.

    Parameters
    ----------
    annot_df : pandas.DataFrame
        Annotation table; the columns ``Contig``, ``Best_hit`` and
        ``Label`` are read.
    ncbi_db : str
        Path handed to ``NCBITaxa(dbfile=...)``.
    min_prot : number
        Minimum number of hits that must carry a taxon at a rank for that
        rank to be considered.
    prop_annot : float
        Minimum fraction of the contig's proteins that must have a hit
        (``Best_hit != "No hit"``).
    tax_thres : float
        Minimum fraction of the hits at a rank that the most frequent taxon
        must reach to be accepted.

    Yields
    ------
    list
        ``[contig, genus, subfamily, family, order]``. Ranks are tried from
        genus upwards; as soon as one is accepted, it and all higher ranks
        are filled from the accepted taxon's lineage. Rejected ranks hold
        ``""`` (too few hits) or the observed proportion (below
        ``tax_thres``).
    """
    ncbi = NCBITaxa(dbfile=ncbi_db)
    tax_rank_order = ["genus", "subfamily", "family", "order"]
    contig_set = set(annot_df["Contig"])

    for contig in contig_set:
        contig_lineage = [contig]
        contig_df = annot_df[annot_df["Contig"] == contig]
        total_prot = len(contig_df)
        annot_prot = sum(contig_df["Best_hit"] != "No hit")
        if annot_prot < prop_annot * total_prot:
            # Too few annotated proteins: leave every rank empty.
            contig_lineage.extend([""] * 4)
            yield contig_lineage
            continue

        contig_hits = contig_df[pd.notnull(
            contig_df["Label"])]["Label"].values
        taxid_list = [
            ncbi.get_name_translator([item])[item][0] for item in contig_hits
        ]
        # One {rank: taxid} mapping per hit, restricted to the ranks of
        # interest.
        hit_lineages = [{
            y: x
            for x, y in ncbi.get_rank(ncbi.get_lineage(item)).items()
            if y in tax_rank_order
        } for item in taxid_list]

        for rank_index, rank in enumerate(tax_rank_order):
            taxon_list = [item.get(rank) for item in hit_lineages]
            total_hits = sum(pd.notnull(taxon_list))
            if total_hits < min_prot:
                contig_lineage.append("")
                continue

            count_hits = Counter(
                [item for item in taxon_list if pd.notnull(item)])
            # Most frequent taxon; ties go to the first one counted, as
            # with the original stable descending sort.
            best_hit = max(count_hits.items(), key=lambda pair: pair[1])
            prop_hits = best_hit[1] / total_hits
            if prop_hits < tax_thres:
                # NOTE(review): the proportion itself is stored here, not
                # an empty string as for the other rejected ranks - confirm
                # this is intended.
                contig_lineage.append(prop_hits)
                continue

            # Build the rank -> taxid map of the accepted taxon once; the
            # original rebuilt it (with a database query) for every
            # remaining rank.
            best_lineage = ncbi.get_lineage(best_hit[0])
            best_rank_to_taxid = {
                y: x
                for x, y in ncbi.get_rank(best_lineage).items()
            }
            for higher_rank in tax_rank_order[rank_index:]:
                key = best_rank_to_taxid.get(higher_rank)
                if pd.notnull(key):
                    contig_lineage.append(
                        ncbi.get_taxid_translator([key])[key])
                else:
                    contig_lineage.append("")
            break
        yield contig_lineage
コード例 #13
0
ファイル: alignment.py プロジェクト: andreyfsch/byte_pipette
    def ellect_taxon_rank_representatives_step(self, taxon_rank_id,
                                               taxon_rank):
        """Elect one protein as representative of a taxon, if possible.

        The comparisons stored for ``taxon_rank_id`` / ``taxon_rank`` are
        turned into an adjacency mapping (protein id -> list of protein ids
        it was paired with). The protein with the longest list is chosen;
        every protein on its list is marked as represented by it and the
        chosen entry is tagged as representative of the taxon.

        Returns ``True`` when a representative was elected and ``False``
        when there were no comparisons, in which case only the taxon label
        is printed.
        """
        protDB = db_handling.ProteinDatabase()
        ncbi = NCBITaxa()
        paralell = Parallelization()

        taxon_name = ncbi.get_taxid_translator([taxon_rank_id])[taxon_rank_id]

        compared_pairs = protDB.get_comparisons_same_taxon_id(
            taxon_rank_id, taxon_rank)

        # Symmetric adjacency lists built from the compared pairs.
        neighbours = {}
        for pair in compared_pairs:
            left, right = pair[0], pair[1]
            neighbours.setdefault(left, []).append(right)
            neighbours.setdefault(right, []).append(left)

        if not neighbours:
            sys.stdout.write("\033[K")
            print(taxon_rank + ' ' + taxon_name)
            return False

        chosen_item = max(neighbours, key=lambda k: len(neighbours[k]))
        represented = neighbours[chosen_item]

        # Parallel argument lists: one update payload per represented id.
        params_a = [{'represented_by': chosen_item} for _ in represented]
        params_b = list(represented)

        bar = Bar(taxon_rank + ' ' + taxon_name, max=len(params_a))
        paralell.parallelize_7(protDB.update_protein_entry,
                               [params_a, params_b],
                               bar=bar)
        bar.finish()

        protDB.update_protein_entry(
            {
                'representative_of_taxon': taxon_rank_id,
                'representative_taxon_rank': taxon_rank,
                'taxon_name_representative': taxon_name
            }, chosen_item)

        return True
コード例 #14
0
def getNcbiName(taxonName):
    """Translate a ``NAME@taxid@...`` label into its NCBI scientific name.

    The second ``@``-separated field of ``taxonName`` is taken as the
    taxonomy id. The original ``taxonName`` is returned unchanged when the
    label has no such field, when the field is not an integer, or when the
    id is not in the taxonomy database.
    """
    try:
        tax_id = int(taxonName.split('@')[1])
    except (IndexError, ValueError):
        # A label without a usable id used to raise IndexError because the
        # split sat outside the try block; fall back like a failed lookup.
        return taxonName
    ncbi = NCBITaxa()
    return ncbi.get_taxid_translator([tax_id]).get(tax_id, taxonName)
コード例 #15
0
def from_name2ids(phylum_name, dataset='genbank', return_d2ids=False):
    """
    Retrieve assembly ids and metadata rows for the taxa under given names.

    :param phylum_name: one or more taxon names separated by ``;``
        (empty fields are ignored), e.g. ``"Nitrospirae;"``
    :param dataset: prefix of the assembly summary files to read,
        ``{dataset}_{domain}_assembly_summary.txt`` inside
        ``metadata_files_dir``
    :param return_d2ids: when true, return only the mapping from domain
        name to descendant taxids and do not read the summary files
    :return: ``domain2dids`` when ``return_d2ids`` is true, otherwise the
        tuple ``(domain2aids, collect_info)`` where ``domain2aids`` maps
        the lower-cased domain to the first column of the matching rows
        and ``collect_info`` holds the matching rows themselves
    """
    phylum_names = [name for name in phylum_name.split(';') if name]
    ncbi = NCBITaxa()

    p2tid = ncbi.get_name_translator(phylum_names)

    for name in phylum_names:
        if not p2tid.get(name):
            print(f" '{name}'' not found. please check the name")
    # Only the first taxid of each resolved name is used.
    tids = [p2tid[name][0] for name in phylum_names if p2tid.get(name)]

    domain2dids = defaultdict(list)
    descend_ids = []
    for tid in tids:
        lineages = ncbi.get_lineage(tid)
        ranks = ncbi.get_rank(lineages)
        ranks = {v: k for k, v in ranks.items()}
        names = ncbi.get_taxid_translator(lineages)
        # Raises KeyError when the lineage has no 'superkingdom' rank.
        domain = names[ranks['superkingdom']]

        _descend_ids = ncbi.get_descendant_taxa(tid, intermediate_nodes=True)
        descend_ids += _descend_ids
        domain2dids[domain].extend(_descend_ids)
    print(f"in total, {len(descend_ids)} taxids were found. ")
    if return_d2ids:
        return domain2dids

    domain2aids = defaultdict(list)
    collect_info = []
    descend_ids = set(descend_ids)
    for domain in domain2dids:
        d = domain.lower()
        metadata = join(metadata_files_dir,
                        f"{dataset}_{d}_assembly_summary.txt")
        tqdm.write(
            f'read {metadata} which last modified at : {time.ctime(os.path.getmtime(metadata))}'
        )
        # The context manager closes the summary file; the original left
        # one handle open per domain.
        with open(metadata) as handle:
            for row in tqdm(handle):
                if row.startswith("GC"):
                    rows = row.split('\t')
                    if int(rows[5]) in descend_ids:
                        collect_info.append(row)
                        domain2aids[d].append(rows[0])
    return domain2aids, collect_info
コード例 #16
0
def name_output(taxid, gcf):
    """
    Build an output name from an organism's scientific name and its gcf.

    The scientific name of ``taxid`` is looked up, single quotes are
    removed, slashes become underscores and ``gcf`` is appended after a
    single space.
    """
    key = int(taxid)
    scientific_name = NCBITaxa().get_taxid_translator([key])[key]
    cleaned = scientific_name.replace("'", '').replace("/", '_')
    return " ".join((cleaned, gcf))
コード例 #17
0
ファイル: addTaxon1s.py プロジェクト: BIONF/HaMStR
def getTaxName(taxId):
    """Return a five-letter abbreviation for the taxon ``taxId``.

    The abbreviation is the first three letters of the first word plus the
    first two letters of the second word of the NCBI scientific name, upper
    case, after removing every character that is not a letter, digit or
    whitespace. ``"UNK" + taxId`` is returned when the id is not an
    integer, is unknown to the database, or the cleaned name has fewer than
    two words.
    """
    # str() keeps the fallback working for integer ids; the original
    # concatenation raised TypeError for them.
    fallback = "UNK" + str(taxId)
    try:
        tax_id = int(taxId)
    except (TypeError, ValueError):
        return fallback

    ncbi = NCBITaxa()
    try:
        ncbiName = ncbi.get_taxid_translator([tax_id])[tax_id]
        # Raw string avoids the invalid-escape warning for ``\s``; the
        # digit class is 0-9 (the original 1-9 silently dropped zeros).
        ncbiName = re.sub(r'[^a-zA-Z0-9\s]+', '', ncbiName)
        taxName = ncbiName.split()
        name = taxName[0][:3].upper() + taxName[1][:2].upper()
    except (KeyError, IndexError):
        name = fallback
    return name
コード例 #18
0
def extract_taxa(mpwt_taxon_file, taxon_output_file, tree_output_file):
    """Extract taxonomy ranks from NCBI taxon IDs and write a tree file.

    Every row of the mpwt taxon file whose second column does not contain
    the text "taxon" is treated as ``(species, taxid, ...)``. For each, the
    names at phylum .. species rank are written to the output table along
    with a label made of the first four letters of the phylum and a
    running counter. An ASCII rendering of the topology of all taxids is
    finally written to the tree file.

    Args:
        mpwt_taxon_file (str): mpwt taxon file for species in sbml folder
        taxon_output_file (str): path to phylum output file
        tree_output_file (str): path to tree output file

    """
    ncbi = NCBITaxa()

    wanted_ranks = ["phylum", "class", "order", "family", "genus", "species"]
    missing = "no_information"

    taxon_ids = []
    phylum_count = {}

    with open(taxon_output_file, "w") as phylum_file:
        writer = csv.writer(phylum_file, delimiter="\t")
        writer.writerow(["species", "taxid", "phylum_number"] + wanted_ranks)

        with open(mpwt_taxon_file, "r") as taxon_file:
            for record in csv.reader(taxon_file, delimiter="\t"):
                taxid_field = record[1]
                if "taxon" in taxid_field:
                    # Header row.
                    continue

                taxon_ids.append(taxid_field)
                lineage = ncbi.get_lineage(taxid_field)
                rank_of = ncbi.get_rank(lineage)
                name_of = ncbi.get_taxid_translator(lineage)
                name_by_rank = {
                    rank: name_of[node]
                    for node, rank in rank_of.items()
                }
                ranks = [name_by_rank.get(rank, missing)
                         for rank in wanted_ranks]

                phylum = missing if ranks[0] == missing else ranks[0][:4]

                # Counter per phylum label; for the "no_information" label
                # every occurrence after the first stores "" instead of a
                # number.
                if phylum not in phylum_count:
                    phylum_count[phylum] = 1
                elif phylum == missing:
                    phylum_count[phylum] = ""
                else:
                    phylum_count[phylum] += 1

                label = phylum + str(phylum_count[phylum])
                writer.writerow([record[0], taxid_field, label] + ranks)

    tree = ncbi.get_topology(taxon_ids)

    with open(tree_output_file, "w") as tree_file:
        tree_file.write(tree.get_ascii(attributes=["sci_name", "rank"]))
コード例 #19
0
def get_rank_dict(taxa_name=None):
    """Return the names of the seven main ranks in a taxon's lineage.

    The taxon is looked up by name. When the full name is unknown and it
    has more than one word, the first word alone (possibly a genus name)
    is tried instead. Progress and failures are reported on stderr.

    Parameters
    ----------
    taxa_name : str
        Scientific name to look up.

    Returns
    -------
    dict or None
        Mapping with the keys ``kingdom``, ``phylum``, ``class``,
        ``order``, ``family``, ``genus`` and ``species``; ranks missing
        from the lineage hold ``'NA'``. ``None`` when no taxid is found.
    """
    ncbi = NCBITaxa()
    name_dict = ncbi.get_name_translator([taxa_name])
    if not name_dict:
        ## try only the first word (which may be a genus name?)
        print("can not find taxid for", taxa_name, file=sys.stderr)
        words = taxa_name.split()
        if len(words) > 1:
            taxa_name = words[0]
            print("try to search %s instead..." % taxa_name, file=sys.stderr)
            name_dict = ncbi.get_name_translator([taxa_name])

        if not name_dict:
            # ``taxa_name`` stays a string here; the original rebound it to
            # the word list and printed a list repr for one-word names.
            print("can not find taxid for %s, maybe it's a misspelling.\n" %
                  taxa_name,
                  file=sys.stderr)
            return None

    lineage_taxid_list = ncbi.get_lineage(name_dict[taxa_name][0])

    rank_dict = dict.fromkeys(
        ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'],
        'NA')

    # Two queries for the whole lineage instead of two per lineage node.
    lineage_ranks = ncbi.get_rank(lineage_taxid_list)
    lineage_names = ncbi.get_taxid_translator(lineage_taxid_list)
    for taxid in lineage_taxid_list:
        rank = lineage_ranks[taxid]
        if rank in rank_dict:
            rank_dict[rank] = lineage_names[taxid]

    return rank_dict
コード例 #20
0
def taxid2lineage(taxid):
    """Return ``{rank: name}`` for the main ranks in the lineage of ``taxid``.

    Only superkingdom, phylum, class, order, family, genus and species are
    reported, in that order; ranks absent from the lineage are omitted.
    """
    ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    # Query the ranks once; the original repeated this database call for
    # every entry of ``ranks``.
    name_by_rank = {
        rank: names[node]
        for node, rank in ncbi.get_rank(lineage).items()
        if rank in ranks
    }
    return {rank: name_by_rank[rank] for rank in ranks if rank in name_by_rank}
コード例 #21
0
ファイル: addTaxon1s.py プロジェクト: BIONF/HaMStR
def checkTaxId(taxId):
    """Print what the NCBI taxonomy database knows about ``taxId``.

    Prints a warning when the id is not an integer or is missing from the
    database, a warning when its rank is not ``species``, and otherwise the
    id together with its scientific name. Returns nothing.
    """
    not_found = '\033[92mWARNING: %s not found in NCBI taxonomy database!\033[0m'
    try:
        tax_id = int(taxId)
    except (TypeError, ValueError):
        # Reject malformed ids before opening the database.
        print(not_found % taxId)
        return

    ncbi = NCBITaxa()
    try:
        # Single rank lookup; the original issued the same query twice,
        # once outside the try block.
        rank = ncbi.get_rank([tax_id])[tax_id]
        if not rank == 'species':
            print('\033[92mWARNING: rank of %s is not SPECIES (%s)\033[0m' % (taxId, rank))
        else:
            print('\033[92mNCBI taxon info: %s %s\033[0m' % (taxId, ncbi.get_taxid_translator([tax_id])[tax_id]))
    except KeyError:
        print(not_found % taxId)
コード例 #22
0
ファイル: topology.py プロジェクト: ThThalamas/pyproteinsExt
 def get_taxo(self, function_get_taxid):
     """Attach a ``Taxo`` record (taxid, name, rank) to this object.

     ``function_get_taxid`` is called with ``self`` to obtain the taxid.
     Name and rank stay ``None`` when the taxonomy database returns
     nothing for the taxid; the rank is only looked up once a name was
     found. The result is stored in ``self.taxo``.
     """
     database = NCBITaxa()
     taxid = function_get_taxid(self)

     taxname = taxrank = None
     names = database.get_taxid_translator([taxid])
     if names:
         key = int(taxid)
         taxname = names[key]
         rank_lookup = database.get_rank([taxid])
         if rank_lookup:
             taxrank = rank_lookup[key]

     self.taxo = Taxo(taxid, taxname, taxrank)
コード例 #23
0
def get_tax_lineage(taxonid, source, max_attempts=None):
    """Return taxonomy lineage information

    This function uses Biopython library to connect NCBI database
    and search for taxonomy information or ete3 to download
    taxdump file and search the information locally. Results are
    cached in the module-level ``LINEAGES`` dict.

    Parameters
    -------------
    taxonid : string
        Taxonomic id of the species
    source : string
        Source to be used to collect the info about the taxonid;
        ``"taxdump"`` selects the local ete3 database, anything else
        queries NCBI Entrez
    max_attempts : int, optional
        Maximum number of Entrez requests before giving up. The default
        ``None`` retries without limit, which was the only behaviour
        before this parameter existed.

    Returns
    -------------
    lineage: dict
        Species lineage, mapping rank to scientific name

    Raises
    -------------
    RuntimeError
        When ``max_attempts`` requests returned no data.

    """

    if taxonid in LINEAGES:
        return LINEAGES[taxonid]

    if source == "taxdump":
        ncbi_taxdump = NCBITaxa()
        lineage_ids = ncbi_taxdump.get_lineage(taxonid)
        ranks = ncbi_taxdump.get_rank(lineage_ids)
        names = ncbi_taxdump.get_taxid_translator(lineage_ids)
        lineage = {ranks[i]:names[i] for i in lineage_ids}

        LINEAGES[taxonid] = lineage
        return LINEAGES[taxonid]

    data = ""
    attempts = 0
    # An empty answer is retried just like a failed request, so without a
    # limit a taxid that Entrez cannot resolve keeps this loop running.
    while not data:
        if max_attempts is not None and attempts >= max_attempts:
            raise RuntimeError(
                "No taxonomy data for {} after {} attempts".format(
                    taxonid, attempts))
        attempts += 1
        try:
            Entrez.email = "*****@*****.**"
            handle = Entrez.efetch(id = taxonid, db = "taxonomy", retmode = "xml")
            try:
                data = Entrez.read(handle)
            finally:
                # Close the handle even when parsing fails.
                handle.close()
        except Exception:
            with open(LOG, "a") as log:
                print("Error when searching information about {}".format(taxonid),
                    file=log)

    lineage = {d["Rank"]:d["ScientificName"] for d in data[0]["LineageEx"]}
    lineage[data[0]["Rank"]] = data[0]["ScientificName"]
    LINEAGES[taxonid] = lineage

    return LINEAGES[taxonid]
コード例 #24
0
def get_lineage_sciname_at_desired_ranks(taxid, desired_ranks):
    """Retrieve lineage information at desired taxonomic ranks.

    Returns a tuple ``(ranks, ranks_sciname)``: the taxid found in the
    lineage of ``taxid`` for each desired rank (``0`` when the rank is
    absent) and the matching scientific names (``'NA'`` for the zeros).
    """
    # initiate an instance of the ncbi taxonomy database
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)

    # invert taxid -> rank into rank -> taxid
    taxid_by_rank = {}
    for node, rank_name in ncbi.get_rank(lineage).items():
        taxid_by_rank[rank_name] = node

    ranks = []
    for wanted in desired_ranks:
        node = taxid_by_rank.get(wanted)
        ranks.append(0 if node is None else node)

    translation = ncbi.get_taxid_translator(ranks)
    ranks_sciname = []
    for node in ranks:
        if node != 0:
            ranks_sciname.append(translation[node])
        else:
            ranks_sciname.append('NA')
    return (ranks, ranks_sciname)
コード例 #25
0
ファイル: alignment.py プロジェクト: andreyfsch/byte_pipette
    def assign_rank_representation(self, rank='species'):
        """Assign a representative taxon at ``rank`` to unassigned entries.

        For every entry returned by ``get_entries_no_representative`` the
        lineage of its taxon id (``entry[1]``) is fetched. When the lineage
        lookup emits a warning, the last word of the last warning message
        is taken as the replacement taxon id and stored on the entry. The
        lineage member whose rank equals ``rank`` is then recorded as the
        entry's representative taxon when either

        * the entry's own rank differs from ``rank``,
          ``bigger_than_rank_taxon`` is false for it, and such a lineage
          member exists; or
        * that did not apply, ``entry[2]`` is ``None`` and such a lineage
          member exists.
        """
        protDB = db_handling.ProteinDatabase()
        ncbi = NCBITaxa()
        entries_no_representative = protDB.get_entries_no_representative()

        def store_representative(entry_id, rank_id, taxon_name):
            # Single place for the update payload shared by both cases.
            protDB.update_protein_entry(
                {
                    'representative_of_taxon': rank_id,
                    'representative_taxon_rank': rank,
                    'taxon_name_representative': taxon_name
                }, entry_id)

        for entry in entries_no_representative:
            taxon_id = entry[1]
            with warnings.catch_warnings(record=True) as caught:
                warnings.simplefilter("always")
                lineage = ncbi.get_lineage(taxon_id)
                last_message = caught[-1].message if caught else None
                if last_message:
                    # The new taxon id is the last token of the warning.
                    taxon_id = int(str(last_message).split()[-1])
                    protDB.update_protein_entry(
                        {'representative_of_taxon': taxon_id}, entry[0])

            lineage_ranks = ncbi.get_rank(lineage)
            lineage_translation = ncbi.get_taxid_translator(lineage)

            # Last lineage member carrying the requested rank, '' if none.
            ellected_rank_id = ''
            for rank_id, lineage_rank in lineage_ranks.items():
                if lineage_rank == rank:
                    ellected_rank_id = rank_id
            print(entry[0])

            already_stored = False
            own_rank = lineage_ranks[taxon_id]
            if own_rank != rank:
                if not self.bigger_than_rank_taxon(
                        own_rank, rank) and ellected_rank_id != '':
                    store_representative(
                        entry[0], ellected_rank_id,
                        lineage_translation[ellected_rank_id])
                    already_stored = True
            if entry[2] is None and not already_stored \
                    and ellected_rank_id != '':
                store_representative(entry[0], ellected_rank_id,
                                     lineage_translation[ellected_rank_id])
コード例 #26
0
def make_krona_table(f, db):
    """Build a Krona-style abundance table from a tab-separated taxonomic report.

    The report is read without a header; its six columns are named
    ``clade_percent``, ``clade_reads``, ``reads``, ``rank``, ``taxid`` and
    ``name`` (presumably a Kraken-style report -- confirm against the caller).
    Only rows with ``reads > 0`` are considered, and each one is handled
    according to its one-letter rank code:

    * ``"-"``: kept as a ``leaf`` only when the taxon itself has NCBI rank
      ``"no rank"`` and its direct parent is a ``species``; skipped otherwise.
    * ``"U"``: kept as unclassified, with every rank column left empty.
    * ``D/P/C/O/F/G/S``: kept; any other code is skipped.

    :param f: path or buffer accepted by ``pandas.read_csv``.
    :param db: taxonomy database file handed to ``NCBITaxa``; when falsy,
        ``NCBITaxa`` is created without arguments.
    :return: ``pandas.DataFrame`` with the columns ``abundance``, the seven
        standard ranks and ``leaf``.  Each kept row is indexed by its position
        among the rows with reads, so skipped rows leave gaps in the index.
    """
    ncbi_taxa = NCBITaxa(db) if db else NCBITaxa()
    krona_ranks = ["superkingdom", "phylum", "class", "order", "family", "genus",
                   "species"]
    columns = ["abundance"] + krona_ranks + ["leaf"]
    krona_table = pd.DataFrame(columns=columns)
    one_letter_ranks = {"D": "superkingdom", "P": "phylum", "C": "class", "O": "order", "F": "family", "G": "genus",
                        "S": "species"}
    df = pd.read_csv(f, header=None, names=["clade_percent", "clade_reads", "reads", "rank", "taxid", "name"], sep="\t")
    df = df.loc[df.reads > 0]

    # Row frames are collected and concatenated once at the end; concatenating
    # inside the loop re-copies the growing table on every iteration.
    frames = [krona_table]
    for j, i in enumerate(df.index):
        r = df.loc[i]
        taxid = r["taxid"]
        one_letter_rank = r["rank"]
        lineage = None  # looked up at most once per row
        if one_letter_rank == "-":
            rank = ncbi_taxa.get_rank([taxid])[taxid]
            lineage = ncbi_taxa.get_lineage(taxid)
            try:
                parent_taxid = lineage[-2]
            except IndexError:
                # A one-element lineage has no parent; compare the taxon with itself.
                parent_taxid = taxid
            parent_rank = ncbi_taxa.get_rank([parent_taxid])[parent_taxid]
            if not (rank == "no rank" and parent_rank == "species"):
                continue
            rank = "leaf"
        elif one_letter_rank == "U":
            rank = "unclassified"
        else:
            # TODO: Shouldn't be too many reads mapped directly to ranks not in the krona table, but check eventually
            rank = one_letter_ranks.get(one_letter_rank)
            if rank is None:
                continue

        res = dict.fromkeys(columns, "")
        res["abundance"] = r["reads"]
        if rank != "unclassified":
            if lineage is None:
                lineage = ncbi_taxa.get_lineage(taxid)
            rank_dict = ncbi_taxa.get_rank(lineage)
            name_dict = ncbi_taxa.get_taxid_translator(lineage)
            for dict_taxid, dict_rank in rank_dict.items():
                if dict_rank in res:
                    res[dict_rank] = name_dict[dict_taxid]
            if rank not in krona_ranks:
                res["leaf"] = r["name"]
        frames.append(pd.DataFrame(res, index=[j])[columns])

    if len(frames) == 1:
        # Nothing was kept: hand back the empty, correctly-labelled table.
        return krona_table
    return pd.concat(frames)
コード例 #27
0
def taxid_to_lineage_string(taxid):
    """Format the ranked lineage of ``taxid`` as a ``;``-separated string.

    Every lineage member whose rank is listed in ``tax_order`` contributes one
    ``<first letter of rank>_<name>`` entry, where the name comes from
    ``NCBITaxa.get_taxid_translator``.  The NCBI rank ``superkingdom`` is
    reported as ``domain``.

    :param taxid: taxonomy identifier passed to ``NCBITaxa.get_lineage``.
    :return: the joined entries, or an empty string when no lineage member
        carries one of the listed ranks.
    """
    # NOTE(review): entries follow this list's order, so a kingdom entry is
    # emitted before the domain entry -- confirm that is what consumers expect.
    tax_order = ['kingdom', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    # One rank query for the whole lineage instead of one per (level, taxid) pair.
    ranks = ncbi.get_rank(lineage)

    parts = []
    for level in tax_order:
        for tid in names:
            rank = ranks[tid]
            if rank == 'superkingdom':
                rank = 'domain'
            if rank == level:
                parts.append(level[0] + '_' + names[tid])
    return ';'.join(parts)
コード例 #28
0
class SynTax:
    """Synopsis: SynTax class contains all the relevant taxonomy to mine"""
    def __init__(self):
        self.ncbi = NCBITaxa()

    def get_descendants(self, domain: str, taxon_rank: str) -> List[str]:
        """Synopsis: Fetch the names of all descendant taxa of ``domain``.

        :param domain: parent taxon handed to ``get_descendant_taxa``
            (must be in title case).
        :param taxon_rank: value passed as ``rank_limit`` to
            ``get_descendant_taxa``.
        :return: translated names, in the order the descendant taxids were
            returned; taxids without a translation are left out.
        """
        taxids = list(
            self.ncbi.get_descendant_taxa(domain,
                                          rank_limit=taxon_rank,
                                          collapse_subspecies=True))
        if not taxids:
            return []
        # Translate all ids with a single lookup rather than one query per
        # taxid, then restore the descendant order from ``taxids``.
        names = self.ncbi.get_taxid_translator(taxids)
        return [names[taxid] for taxid in taxids if taxid in names]
コード例 #29
0
def get_taxonomic_group_mapping(group_ids: List[str],
                                selected_rank: str) -> Tuple[Dict, Dict]:
    """
    Function to create a mapping from NCBI-taxon ids to groups which are used to split the provided
    training records into training and validation sets

    :param group_ids: List of identifiers that should be NCBI taxon ids
    :param selected_rank: selected standard rank determining on which level the set is split in
                          training and validation-set; matched case-insensitively. Any value that
                          is not a standard rank is replaced by the result of ``auto_select_rank``
    :return: tuple ``(group_name_mapping, group_id_mapping)``: the first maps every input taxon id
             to the name of its ancestor at the selected rank ("unknown" if the lineage has no
             entry at that rank), the second maps it to an integer group number
    """
    ncbi = NCBITaxa()
    standard_ranks = [
        "superkingdom", "phylum", "class", "order", "family", "genus",
        "species"
    ]
    # Lineage ranks are compared verbatim further down, so the validated rank
    # must be carried forward in the same lower-case form; otherwise e.g.
    # "Genus" passes validation but never matches and every taxon ends up in
    # the fall-back group.
    normalized_rank = selected_rank.lower()
    if normalized_rank in standard_ranks:
        selected_rank = normalized_rank
    else:
        selected_rank = auto_select_rank(group_ids)

    taxon_ids_set = set(group_ids)
    taxon_ancestor_mapping = {}

    for taxon in taxon_ids_set:
        lineage = ncbi.get_lineage(int(taxon))
        ids_of_ranks = ncbi.get_rank(lineage)
        taxon_ancestor_mapping[
            taxon] = 0  # fall-back value if sample does not have an entry on this level
        for ancestor_id, rank in ids_of_ranks.items():
            if rank == selected_rank:
                taxon_ancestor_mapping[taxon] = ancestor_id

    ancestor_ids = set(taxon_ancestor_mapping.values())
    ancestor_names = ncbi.get_taxid_translator(ancestor_ids)
    ancestor_names[0] = "unknown"  # name of the fall-back group
    ancestor_enumeration = {
        ancestor_id: x
        for x, ancestor_id in enumerate(ancestor_ids)
    }

    group_name_mapping = {
        taxon: ancestor_names[taxon_ancestor_mapping[taxon]]
        for taxon in group_ids
    }
    group_id_mapping = {
        taxon: ancestor_enumeration[taxon_ancestor_mapping[taxon]]
        for taxon in group_ids
    }

    return group_name_mapping, group_id_mapping
コード例 #30
0
def get_ncbi_taxonomy(taxid):
    """Return the ranked part of a taxon's lineage as a ';'-prefixed name path.

    The lineage of ``taxid`` is walked in the order ``get_lineage`` returns
    it.  Members whose rank is "no rank" are dropped; each remaining member
    contributes ";" followed by its translated name, so a non-empty result
    always starts with ";".
    """
    taxa_db = NCBITaxa()

    lineage_ids = taxa_db.get_lineage(taxid)
    id_to_name = taxa_db.get_taxid_translator(lineage_ids)
    id_to_rank = taxa_db.get_rank(lineage_ids)

    ranked_names = [
        id_to_name[node]
        for node in lineage_ids
        if id_to_rank[node] != "no rank"
    ]
    return "".join(";" + label for label in ranked_names)
コード例 #31
0
ファイル: ete_ncbiquery.py プロジェクト: Ward9250/ete
def run(args):
    """Resolve the search terms to NCBI taxids and print the requested report.

    Attributes read from ``args``:

    * ``search``: terms to look up; terms that parse as integers are used as
      taxids, everything else is translated as a taxon name.  The process
      exits with status -1 when no term is given.
    * ``fuzzy``: when truthy, names without an exact translation are retried
      with ``get_fuzzy_name_translation(name, args.fuzzy)``.
    * ``tree`` / ``descendants`` / ``info``: output mode, checked in that
      order; ``tree`` is switched on when none of the three is set.
    * ``collapse_subspecies``, ``rank_limit``, ``full_lineage``: forwarded to
      the ``NCBITaxa`` tree and descendant queries.
    """
    # add lineage profiles/stats

    from ete3 import NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    for hit in name2tax.values():
        # A name may translate to a list of taxids (which cannot be used as a
        # dict key) or to a single taxid; register every id individually.
        hits = hit if isinstance(hit, (list, tuple, set)) else [hit]
        for tid in hits:
            all_taxids[tid] = None

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warning("%s unknown names", len(not_found_names))
        name2realname = {}
        name2score = {}
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim
                log.info("%s matched to %s (%s)", name, realname, name2score[name])
        # Only names that the fuzzy search could not resolve either are
        # still missing.
        not_found_names = not_found_names - set(name2realname)

    if not_found_names:
        log.warning("[%s] could not be translated into taxids!" % ','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            # dict views are not iterators; wrap in iter() to take the only key.
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" % (target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit, return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit)
            # str(): an untranslated taxid falls back to the integer id, which
            # str.join would reject.
            print('\t'.join([str(taxid), str(translator.get(taxid, taxid)), ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))

    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''), named_lineage, lineage_string]))
コード例 #32
0
#!/usr/bin/python
# Print the rank and name of every member of a taxid's lineage, one per line.
# Written so that it runs unchanged on Python 2 and Python 3.

Usage = """
Print taxid's lineage and ranks
by default prints to the stdout
Usage:
  taxid_ranks.py taxid > ouput.txt

Arun Seetharam
[email protected]
taxid_ranks.py -version 1.0
04/13/2017
"""
from ete3 import NCBITaxa
import sys
ncbi = NCBITaxa()
if len(sys.argv)<2:
    print(Usage)
else:
    lineage = ncbi.get_lineage((sys.argv[1]))
    names = ncbi.get_taxid_translator(lineage)
    for taxid in lineage:
        # A single %-formatted argument prints "[{taxid: rank}] [name]" on
        # both interpreters; print(a, b) would print a tuple on Python 2.
        print("%s %s" % ([ncbi.get_rank([taxid])], [names[taxid]]))
#    print [names[taxid] for taxid in lineage]
#    print [ncbi.get_rank([taxid]) for taxid in lineage]
#    print [ncbi.get_rank([name]) for name in names]
コード例 #33
0
#!/usr/bin/python3
from ete3 import NCBITaxa

ncbi = NCBITaxa()
diamond_path = "/home/anna/bioinformatics/diplonema/dpapi_genome_diamond.tsv"
out_path = "/home/anna/bioinformatics/diplonema/dpapi_genome_diamond_annotation.tsv"

taxids = []

with open(diamond_path) as input_f:
    for line in input_f:
        newtaxid = line.split("\t")[1]
        taxids.append(newtaxid)

taxids_nr = list(set(taxids))
tax_names = ncbi.get_taxid_translator(taxids_nr)

input_f = open(diamond_path, "r")
output_f = open(out_path, 'w')

for line in input_f:
    line_split = line.rstrip().split("\t")
    id = line_split[0]
    taxid = line_split[1]
    evalue = line_split[2]
    if taxid == "0":
        name = "None"
        is_bacteria = 0
    else:
        name = tax_names[int(taxid)]
        is_bacteria = 1 if 2 in ncbi.get_lineage(taxid) else 0