コード例 #1
0
ファイル: alignment.py プロジェクト: andreyfsch/byte_pipette
    def correct_erroneous_repres_of_taxon_instances():
        """Reconcile protein entries flagged as erroneous taxon representatives.

        Fetches the flagged instances from the protein database and, for each
        one, compares it with the representative referenced by its related
        protein entry. When both sit at the same taxonomic rank, the one with
        the smaller ``get_count_of_children_of_repres`` value loses its
        ``representative_of_taxon`` flag.

        NOTE(review): the layout of ``instance`` is not visible here. From its
        use below, ``instance[1]`` is passed to the NCBI taxonomy lookups and
        ``instance[0]`` / ``instance[2]`` are used as protein entry keys --
        confirm against ``get_erroneous_repres_of_taxon_instances``.
        """
        protDB = db_handling.ProteinDatabase()
        ncbi = NCBITaxa()

        erroneous_instances = protDB.get_erroneous_repres_of_taxon_instances()

        for instance in erroneous_instances:
            lineage = ncbi.get_lineage(instance[1])
            lineage_ranks = ncbi.get_rank(lineage)
            # NOTE(review): the entry is subscripted here, before the
            # truthiness check a few lines below. If get_protein_entry can
            # return None or an empty value this line raises and the `else`
            # branch is never reached -- confirm what it returns on a miss.
            representative_id = protDB.get_protein_entry(instance[2])[5]
            lineage_translation = ncbi.get_taxid_translator(lineage)
            if protDB.get_protein_entry(instance[2]):
                if representative_id != None:
                    # Same lookup as above; the value is fetched a second time.
                    representative_id = protDB.get_protein_entry(
                        instance[2])[5]
                    print('representative_id', representative_id)
                    lineage_representative = ncbi.get_lineage(
                        representative_id)
                    print('lineage_representative', lineage_representative)
                    lineage_representative_ranks = ncbi.get_rank(
                        lineage_representative)
                    print('lineage_representative_ranks',
                          lineage_representative_ranks)
                    # Only act when the instance taxon and the representative
                    # have the same taxonomic rank.
                    if lineage_ranks[
                            instance[1]] == lineage_representative_ranks[
                                representative_id]:
                        count_instance = protDB.get_count_of_children_of_repres(
                            instance[1])
                        count_representative = protDB.get_count_of_children_of_repres(
                            representative_id)
                        if count_representative >= count_instance:
                            # The existing representative wins (ties included):
                            # clear the flag on the instance side.
                            protDB.update_protein_entry(
                                {'representative_of_taxon': None}, instance[1])
                        else:
                            # The instance wins: re-point everything to it and
                            # clear the flag on the old representative.
                            protDB.update_protein_entry_by_repres_by(
                                {'represented_by': instance[1]},
                                representative_id)
                            protDB.update_protein_entry(
                                {'represented_by': instance[1]},
                                representative_id)
                            protDB.update_protein_entry(
                                {'representative_of_taxon': None},
                                representative_id)
            else:
                # Runs when the related protein entry is falsy. The updates
                # below still use `representative_id` computed above.
                protDB.update_protein_entry({'representative_of_taxon': None},
                                            instance[0])
                protDB.update_protein_entry(
                    {'representative_of_taxon': instance[1]}, instance[2])
                protDB.update_protein_entry(
                    {
                        'taxon_name_representative':
                        lineage_translation[instance[1]]
                    }, representative_id)
                protDB.update_protein_entry(
                    {'representative_taxon_rank': lineage_ranks[instance[1]]},
                    representative_id)

        # Printed once, after all instances have been processed.
        print(
            '---------------------------------------------------------------------------------'
        )
コード例 #2
0
def contig_tax(annot_df, ncbi_db, min_prot, prop_annot, tax_thres):
    """Yield one taxonomic-lineage row per viral contig.

    Each yielded row is a list whose first element is the contig name,
    followed by four entries that correspond to the ranks genus, subfamily,
    family and order (in that order).

    Args:
        annot_df: annotation table with at least the columns ``Contig``,
            ``Best_hit`` and ``Label``.
        ncbi_db: path handed to ``NCBITaxa(dbfile=...)``.
        min_prot: minimum number of hits that must carry a taxon at a rank
            before that rank is considered.
        prop_annot: minimum fraction of a contig's proteins that must have a
            hit (``Best_hit != "No hit"``) for the contig to be classified.
        tax_thres: minimum fraction of the hits that the most frequent taxon
            must reach to be accepted.

    Yields:
        list: ``[contig, genus, subfamily, family, order]``. A rank without
        enough hits is ``""``; a rank whose best taxon falls below
        ``tax_thres`` holds that proportion instead of a name.
    """
    taxonomy = NCBITaxa(dbfile=ncbi_db)
    rank_levels = ["genus", "subfamily", "family", "order"]

    for contig in set(annot_df["Contig"]):
        row = [contig]
        contig_rows = annot_df[annot_df["Contig"] == contig]
        n_proteins = len(contig_rows)
        n_annotated = sum(contig_rows["Best_hit"] != "No hit")

        # Too few annotated proteins: emit an empty lineage for this contig.
        if n_annotated < prop_annot * n_proteins:
            row.extend([""] * 4)
            yield row
            continue

        labels = contig_rows[pd.notnull(contig_rows["Label"])]["Label"].values
        hit_taxids = []
        for label in labels:
            hit_taxids.append(taxonomy.get_name_translator([label])[label][0])

        # For every hit, map each rank of interest to the taxid found at
        # that rank in the hit's lineage.
        rank_maps = []
        for hit_taxid in hit_taxids:
            lineage_ranks = taxonomy.get_rank(taxonomy.get_lineage(hit_taxid))
            rank_maps.append({
                node_rank: node
                for node, node_rank in lineage_ranks.items()
                if node_rank in rank_levels
            })

        for level_index, level in enumerate(rank_levels):
            candidates = [rank_map.get(level) for rank_map in rank_maps]
            n_assigned = sum(pd.notnull(candidates))
            if n_assigned < min_prot:
                row.append("")
                continue

            tally = Counter(c for c in candidates if pd.notnull(c))
            top_taxid, top_count = sorted(
                tally.items(),
                key=lambda pair: pair[1],
                reverse=True,
            )[0]
            share = top_count / n_assigned
            if share < tax_thres:
                row.append(share)
                continue

            # Accepted: fill this rank and every higher rank from the
            # lineage of the winning taxon, then stop.
            top_lineage = taxonomy.get_lineage(top_taxid)
            taxid_at_rank = {
                node_rank: node
                for node, node_rank in taxonomy.get_rank(top_lineage).items()
            }
            higher_taxids = [
                taxid_at_rank.get(higher)
                for higher in rank_levels[level_index:]
            ]
            row.extend([
                taxonomy.get_taxid_translator([node])[node]
                if pd.notnull(node) else "" for node in higher_taxids
            ])
            break

        yield row
コード例 #3
0
ファイル: addTaxon1s.py プロジェクト: BIONF/HaMStR
def checkTaxId(taxId):
    """Print whether ``taxId`` exists in the NCBI taxonomy and is a species.

    Prints the scientific name when the taxon has rank ``species``, a warning
    when it has another rank, and a "not found" warning when the id cannot be
    resolved (unknown id, or a value that is not convertible to ``int``).

    Args:
        taxId: NCBI taxonomy id, as an int or a numeric string.
    """
    ncbi = NCBITaxa()
    try:
        # The lookup lives inside the try block so that a failing query is
        # reported through the "not found" warning below.
        rank = ncbi.get_rank([taxId])[int(taxId)]
        if rank != 'species':
            print('\033[92mWARNING: rank of %s is not SPECIES (%s)\033[0m' % (taxId, rank))
        else:
            print('\033[92mNCBI taxon info: %s %s\033[0m' % (taxId, ncbi.get_taxid_translator([taxId])[int(taxId)]))
    except (KeyError, ValueError, TypeError):
        # KeyError: id absent from the result; ValueError/TypeError: the id
        # is not convertible to int.
        print('\033[92mWARNING: %s not found in NCBI taxonomy database!\033[0m' % taxId)
コード例 #4
0
def make_krona_table(f, db):
    """Build a Krona-style abundance table from a tab-separated report.

    The report is read without a header as the columns ``clade_percent``,
    ``clade_reads``, ``reads``, ``rank``, ``taxid`` and ``name``. Only rows
    with ``reads > 0`` are used.

    Args:
        f: path or file-like object accepted by ``pandas.read_csv``.
        db: taxonomy database argument passed to ``NCBITaxa``; when falsy the
            default ``NCBITaxa()`` is used.

    Returns:
        pandas.DataFrame: columns ``abundance``, the seven standard ranks and
        ``leaf``. Each kept report row contributes one row, indexed by its
        position in the filtered report.
    """
    ncbi_taxa = NCBITaxa(db) if db else NCBITaxa()
    krona_ranks = ["superkingdom", "phylum", "class", "order", "family",
                   "genus", "species"]
    krona_table = pd.DataFrame(columns=["abundance"] + krona_ranks + ["leaf"])
    one_letter_ranks = {"D": "superkingdom", "P": "phylum", "C": "class", "O": "order", "F": "family", "G": "genus",
                        "S": "species"}
    df = pd.read_csv(f, header=None, names = ["clade_percent", "clade_reads", "reads", "rank", "taxid", "name"], sep="\t")
    df = df.loc[df.reads > 0]

    # Rows are collected here and concatenated once at the end; calling
    # pd.concat for every row copies the growing table each time.
    row_frames = []
    for j, i in enumerate(df.index):
        r = df.loc[i]
        taxid = r["taxid"]
        reads = r["reads"]
        name = r["name"]
        one_letter_rank = r["rank"]
        lineage = None
        if one_letter_rank == "-":
            rank = ncbi_taxa.get_rank([taxid])[taxid]
            lineage = ncbi_taxa.get_lineage(taxid)
            try:
                parent_taxid = lineage[-2]
            except IndexError:
                parent_taxid = taxid
            parent_rank = ncbi_taxa.get_rank([parent_taxid])[parent_taxid]
            # Unranked rows are only kept when they sit directly below a
            # species; those are reported in the "leaf" column.
            if rank == "no rank" and parent_rank == "species":
                rank = "leaf"
            else:
                continue
        elif one_letter_rank == "U":
            rank = "unclassified"
        else:
            try:
                rank = one_letter_ranks[one_letter_rank]
                #TODO: Shouldn't be too many reads mapped directly to ranks not in the krona table, but check eventually
            except KeyError:
                continue
        res = {"abundance": reads, "superkingdom": "", "phylum": "", "class": "", "order": "", "family": "",
               "genus": "", "species": "", "leaf": ""}
        if rank != "unclassified":
            if lineage is None:
                lineage = ncbi_taxa.get_lineage(taxid)
            rank_dict = ncbi_taxa.get_rank(lineage)
            name_dict = ncbi_taxa.get_taxid_translator(lineage)
            for dict_taxid, dict_rank in rank_dict.items():
                if dict_rank in res:
                    res[dict_rank] = name_dict[dict_taxid]
            if rank not in krona_ranks:
                res["leaf"] = name
        row_frames.append(pd.DataFrame(res, index=[j])[krona_table.columns])

    if row_frames:
        krona_table = pd.concat([krona_table] + row_frames)
    return krona_table
コード例 #5
0
def get_metadata(records: List[SeqRecord]):
    """Collect selected taxonomic ranks for each record and save them as CSV.

    The organism name in ``record.annotations["organism"]`` is resolved to an
    NCBI taxid, and the names found in its lineage at the ranks superkingdom,
    order, family, subfamily, genus and species are stored together with the
    record id under the key ``aid``.

    Args:
        records: sequence records carrying an ``organism`` annotation.

    Returns:
        pandas.DataFrame: one row per record. The same table is also written
        to ``metadata.csv`` in the current working directory.
    """
    taxonomy_db = NCBITaxa()

    organisms = [record.annotations["organism"] for record in records]
    taxids_by_name = taxonomy_db.get_name_translator(organisms)

    wanted_ranks = [
        "superkingdom", "order", "family", "subfamily", "genus", "species"
    ]

    rows = []
    for record in records:
        # Use the first taxid reported for the organism name.
        taxid = taxids_by_name[record.annotations["organism"]][0]
        lineage = taxonomy_db.get_lineage(taxid)
        rank_of = taxonomy_db.get_rank(lineage)
        name_of = taxonomy_db.get_taxid_translator(lineage)

        row = {}
        for node in lineage:
            if rank_of[node] in wanted_ranks:
                row[rank_of[node]] = name_of[node]
        row["aid"] = record.id
        rows.append(row)

    table = pd.DataFrame(rows)
    table.to_csv("metadata.csv")

    return table
コード例 #6
0
ファイル: alignment.py プロジェクト: andreyfsch/byte_pipette
    def sort_collection_by_taxon_rank(collection,
                                      key,
                                      rank='species',
                                      rank_id=None):
        """Lift taxids to a given rank and return the collection sorted by it.

        For every item, the taxid stored at position ``key`` is looked up in
        the NCBI taxonomy. If its lineage contains a taxon of rank ``rank``
        and the item's own taxid is not already at that rank, the taxid is
        replaced by the ancestor at that rank (only when that ancestor equals
        ``rank_id``, if ``rank_id`` is given). Replaced items become tuples.

        Args:
            collection: sequence of indexable items (e.g. tuples).
            key: index within each item that holds the taxid.
            rank: taxonomic rank to lift the taxids to.
            rank_id: optional taxid; when given, only items whose ancestor at
                ``rank`` equals this id are rewritten.

        Returns:
            tuple: the (possibly rewritten) items sorted by position ``key``.
        """
        ncbi = NCBITaxa()

        # Work on a copy: the caller's collection is left untouched, and
        # immutable inputs such as tuples are supported.
        new_collection = list(collection)

        for i, item in enumerate(collection):
            lineage = ncbi.get_lineage(item[key])
            lineage_ranks = ncbi.get_rank(lineage)
            if (rank not in lineage_ranks.values()
                    or item[key] not in lineage_ranks
                    or lineage_ranks[item[key]] == rank):
                continue
            for taxon, taxon_rank in lineage_ranks.items():
                if taxon_rank != rank:
                    continue
                if rank_id is None or taxon == rank_id:
                    new_collection[i] = tuple(
                        taxon if position == key else value
                        for position, value in enumerate(item))

        return tuple(sorted(new_collection, key=operator.itemgetter(key)))
コード例 #7
0
    def from_taxid(cls, taxid: int) -> "Lineage":
        """
        Build a `Lineage` from an NCBI taxonomy id.

        The lineage of `taxid` is looked up and, for each entry of
        `cls._fields` in order, the scientific name found at the rank of the
        same (capitalized) name is collected. NCBI's "superkingdom" rank is
        exposed as "Kingdom". Collection stops at the first field that has no
        matching rank in the lineage.

        Parameters
        ----------
        taxid : int
            A valid NCBI taxonomy id

        Returns
        -------
        "Lineage"
            Instance of the `Lineage` class
        """
        taxonomy = NCBITaxa()
        lineage_ids = taxonomy.get_lineage(taxid)
        name_by_id = taxonomy.get_taxid_translator(lineage_ids)

        id_by_rank = {}
        for node_id, node_rank in taxonomy.get_rank(lineage_ids).items():
            id_by_rank[node_rank.capitalize()] = node_id
        if "Superkingdom" in id_by_rank:
            id_by_rank["Kingdom"] = id_by_rank.pop("Superkingdom")

        taxa: Dict[str, str] = {}
        for field in cls._fields:
            if field not in id_by_rank:
                break
            taxa[field] = name_by_id[id_by_rank[field]]
        return cls(**taxa)
コード例 #8
0
def get_taxonomy(species_name,
                 name_format="Genus species",
                 ranks=None,
                 update_db=False):
    """Look up the NCBI lineage of a species name.

    Args:
        species_name: species name; converted with ``str()``.
        name_format: ``"Genus species"`` (name used as is) or
            ``"Genus_species"`` (underscores are replaced by spaces).
        ranks: optional iterable of rank names to extract from the lineage.
        update_db: when true, refresh the local taxonomy database first.

    Returns:
        * ``['unknown']`` (or ``['unknown'] * len(ranks)``) when the name is
          not found;
        * the ``{taxid: name}`` dict of the full lineage when ``ranks`` is
          ``None``;
        * otherwise a list with the lineage names found at the requested
          ranks, in the order of ``ranks``. Ranks absent from the lineage
          contribute no element, so the list may be shorter than ``ranks``.
    """
    species_name = str(species_name)
    ncbi = NCBITaxa()
    if update_db:
        ncbi.update_taxonomy_database()
    if name_format == "Genus_species":
        species_name = species_name.replace("_", " ")

    species_id = ncbi.get_name_translator([species_name])
    if not species_id:
        return ['unknown'] if ranks is None else ['unknown'] * len(ranks)

    lineage_ids = ncbi.get_lineage(species_id[species_name][0])
    names = ncbi.get_taxid_translator(lineage_ids)
    if ranks is None:
        return names

    lineage_rk = ncbi.get_rank(lineage_ids)
    parsed_names = []
    for rk in ranks:
        # `names` already holds every lineage name, so no further database
        # query is needed per match.
        parsed_names.extend(names[rk_id]
                            for rk_id, rk_rk in lineage_rk.items()
                            if rk_rk == rk)
    return parsed_names
コード例 #9
0
class TaxaRetriever(object):
    """Collect the descendant taxa of a category from the NCBI taxonomy.

    Attributes set by the constructor:
        ncbi: the ``NCBITaxa`` instance used for the lookups.
        species: list of descendant taxids of ``category`` (subspecies
            collapsed).
        ranks: ``{taxid: rank}`` for every entry of ``species``.
        taxas: list of the entries of ``species`` whose rank is ``species``.
    """

    # tested
    def __init__(self, category):
        self.ncbi = NCBITaxa()
        self.species = list(
            self.ncbi.get_descendant_taxa(category, collapse_subspecies=True))
        self.ranks = self.ncbi.get_rank(self.species)
        # Materialized as a list: on Python 3, filter() returns a one-shot
        # iterator that would be empty after its first traversal.
        self.taxas = [
            taxid for taxid in self.species
            if self.ranks[taxid] == 'species'
        ]
コード例 #10
0
def main():
    """Query the NCBI taxonomy database for the descendants of a taxon.

    Reads the command line via ``get_args()``, optionally updates the local
    taxonomy database, resolves a name to a TaxID when a name was given,
    prints information according to the verbosity level, and optionally
    writes the descendant TaxIDs to ``args.outfile`` (one per line).
    """
    args = get_args()
    verbosity = args.verbose

    # ete NCBI taxa handle used for every lookup below.
    taxonomy = NCBITaxa()

    if verbosity > 1:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    # Refresh the database only on explicit request.
    if args.update is True:
        if verbosity > 1:
            print(
                "Updating the taxonomy database. This may take several minutes..."
            )
        taxonomy.update_taxonomy_database()

    # A name takes precedence: translate it and store the resulting TaxID.
    if args.name:
        args.taxid = taxonomy.get_name_translator([args.name])[args.name][0]

    if verbosity > 0:
        # Use the given name, or translate the TaxID back into a name.
        if args.name:
            display_name = args.name
        else:
            display_name = taxonomy.get_taxid_translator([args.taxid
                                                          ])[args.taxid]
        summary = {
            'Name': display_name,
            'TaxID': args.taxid,
            'Rank': taxonomy.get_rank([args.taxid]),
            'Lineage': taxonomy.get_taxid_translator(
                taxonomy.get_lineage(args.taxid)),
        }

        print("Information about your selected taxa:")
        pretty(summary)

    # Main feature of the script: every taxon below the selected one.
    descendants = taxonomy.get_descendant_taxa(args.taxid)
    descendant_names = taxonomy.translate_to_names(descendants)
    print("Descendent taxa for TaxID: %s" % (args.taxid))

    # zip() is lazy on Python 3; on Python 2 it builds a full list, which can
    # be large here, so running under Python 3 is suggested.
    if verbosity > 0:
        for taxon_name, taxon_id in zip(descendant_names, descendants):
            print("%s\t%s" % (taxon_name, taxon_id))

    if args.outfile:
        with open(args.outfile, 'w') as out_handle:
            for taxon_id in descendants:
                out_handle.write(str(taxon_id) + '\n')
コード例 #11
0
def from_name2ids(phylum_name, dataset='genbank', return_d2ids=False):
    """Retrieve assembly ids and metadata rows for the given taxon names.

    :param phylum_name: one or more taxon names separated by ``;``
        (empty parts are ignored), e.g. ``"Nitrospirae;"``.
    :param dataset: prefix of the assembly summary file that is read,
        ``{dataset}_{domain}_assembly_summary.txt`` under
        ``metadata_files_dir``.
    :param return_d2ids: when true, return only the mapping
        ``{domain name: [descendant taxids]}`` and read no summary file.
    :return: ``domain2dids`` when ``return_d2ids`` is true, otherwise a tuple
        ``(domain2aids, collect_info)``: ``domain2aids`` maps the lower-cased
        domain name to the first column of every matching summary row, and
        ``collect_info`` holds the matching raw rows.
    """
    phylum_names = [name for name in phylum_name.split(';') if name]
    ncbi = NCBITaxa()

    p2tid = ncbi.get_name_translator(phylum_names)

    for name in phylum_names:
        if not p2tid.get(name):
            print(f" '{name}'' not found. please check the name")
    found_names = [name for name in phylum_names if p2tid.get(name)]
    # Only the first taxid reported for each name is used.
    tids = [p2tid[name][0] for name in found_names]
    tid2name = {p2tid[name][0]: name for name in found_names}

    domain2dids = defaultdict(list)
    descend_ids = []
    tid2dids = {}
    for tid in tids:
        lineages = ncbi.get_lineage(tid)
        ranks = ncbi.get_rank(lineages)
        ranks = {v: k for k, v in ranks.items()}
        names = ncbi.get_taxid_translator(lineages)
        domain = names[ranks['superkingdom']]

        _descend_ids = ncbi.get_descendant_taxa(tid, intermediate_nodes=True)
        tid2dids[tid2name[tid]] = len(_descend_ids)
        descend_ids += _descend_ids
        domain2dids[domain].extend(_descend_ids)
    print(f"in total, {len(descend_ids)} taxids were found. ")
    if return_d2ids:
        return domain2dids

    domain2aids = defaultdict(list)
    collect_info = []
    descend_ids = set(descend_ids)
    for domain in domain2dids:
        d = domain.lower()
        metadata = join(metadata_files_dir,
                        f"{dataset}_{d}_assembly_summary.txt")
        tqdm.write(
            f'read {metadata} which last modified at : {time.ctime(os.path.getmtime(metadata))}'
        )
        # The context manager closes the summary file once it has been read.
        with open(metadata) as summary_file:
            for row in tqdm(summary_file):
                if row.startswith("GC"):
                    rows = row.split('\t')
                    # Column 6 (index 5) is compared against the taxids.
                    if int(rows[5]) in descend_ids:
                        collect_info.append(row)
                        domain2aids[d].append(rows[0])
    return domain2aids, collect_info
コード例 #12
0
def extract_taxa(mpwt_taxon_file, taxon_output_file, tree_output_file):
    """Write a rank table and an ASCII taxonomy tree for a list of taxon IDs.

    Every row of the tab-separated input whose second column does not contain
    the text "taxon" is looked up in the NCBI taxonomy, and one row is written
    to the tab-separated output with the names at the ranks phylum to species.
    The ``phylum_number`` column holds the first four letters of the phylum
    followed by a running counter.

    Args:
        mpwt_taxon_file (str): mpwt taxon file for species in sbml folder
        taxon_output_file (str): path to phylum output file
        tree_output_file (str): path to tree output file

    """
    taxonomy = NCBITaxa()
    rank_columns = ["phylum", "class", "order", "family", "genus", "species"]

    collected_taxids = []
    phylum_counter = {}

    with open(taxon_output_file, "w") as out_handle:
        writer = csv.writer(out_handle, delimiter="\t")
        writer.writerow(["species", "taxid", "phylum_number"] + rank_columns)
        with open(mpwt_taxon_file, "r") as in_handle:
            for fields in csv.reader(in_handle, delimiter="\t"):
                # Rows whose second column contains "taxon" are skipped.
                if "taxon" in fields[1]:
                    continue

                taxid = fields[1]
                collected_taxids.append(taxid)
                lineage = taxonomy.get_lineage(taxid)
                rank_by_node = taxonomy.get_rank(lineage)
                name_by_node = taxonomy.get_taxid_translator(lineage)
                name_by_rank = {
                    node_rank: name_by_node[node]
                    for node, node_rank in rank_by_node.items()
                }
                rank_names = [
                    name_by_rank.get(column, "no_information")
                    for column in rank_columns
                ]

                if rank_names[0] == "no_information":
                    phylum = "no_information"
                else:
                    phylum = rank_names[0][:4]

                if phylum not in phylum_counter:
                    phylum_counter[phylum] = 1
                else:
                    # A repeated unknown phylum gets an empty suffix instead
                    # of a number.
                    phylum_counter[phylum] = (
                        "" if phylum == "no_information"
                        else phylum_counter[phylum] + 1)

                phylum_label = phylum + str(phylum_counter[phylum])
                writer.writerow([fields[0], taxid, phylum_label] + rank_names)

    tree = taxonomy.get_topology(collected_taxids)

    with open(tree_output_file, "w") as tree_handle:
        tree_handle.write(tree.get_ascii(attributes=["sci_name", "rank"]))
コード例 #13
0
def taxid2lineage(taxid):
    """Return the names of the standard ranks in the lineage of ``taxid``.

    Args:
        taxid: NCBI taxonomy id.

    Returns:
        dict: ``{rank: scientific name}`` for those of superkingdom, phylum,
        class, order, family, genus and species that occur in the lineage,
        inserted in that order.
    """
    ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    # Queried once; the rank map does not change between loop iterations.
    lineage_ranks = ncbi.get_rank(lineage)
    lineage_dict = {}
    for rank in ranks:
        for node, node_rank in lineage_ranks.items():
            if node_rank == rank:
                lineage_dict[rank] = names[node]
    return lineage_dict
コード例 #14
0
def get_lineage_at_desired_ranks(taxid, desired_ranks):
    """Return the lineage taxids of ``taxid`` at the requested ranks.

    The result has one entry per element of ``desired_ranks``, in the same
    order; a rank that does not occur in the lineage is reported as ``0``.
    """
    taxonomy = NCBITaxa()
    rank_of_node = taxonomy.get_rank(taxonomy.get_lineage(taxid))

    # Invert {taxid: rank} into {rank: taxid}.
    node_at_rank = {}
    for node, node_rank in rank_of_node.items():
        node_at_rank[node_rank] = node

    selected = []
    for wanted in desired_ranks:
        node = node_at_rank.get(wanted)
        selected.append(0 if node is None else node)
    return selected
コード例 #15
0
def check_ancestor(name: str, tax_id: int, rank: str = None) -> bool:
    """Tell whether the taxon called ``name`` is in the lineage of ``tax_id``.

    When ``rank`` is given, only lineage members of exactly that rank are
    considered.

    Raises:
        ValueError: if ``name`` cannot be translated into a taxonomy id.
    """
    taxonomy = NCBITaxa()
    candidate_ids = taxonomy.get_name_translator([name]).get(name, [])
    if not candidate_ids:
        raise ValueError("No taxonomy id for {}".format(name))

    lineage = taxonomy.get_lineage(tax_id)
    if rank is None:
        return any(node in candidate_ids for node in lineage)
    return any(
        taxonomy.get_rank([node]).get(node, '') == rank
        and node in candidate_ids
        for node in lineage)
コード例 #16
0
def get_rank_dict(taxa_name=None):
    """Map the main taxonomic ranks to the names in a taxon's lineage.

    The name is looked up in the NCBI taxonomy. If it is not found and
    consists of several words, the first word (possibly a genus name) is
    tried instead.

    Args:
        taxa_name: scientific name to look up.

    Returns:
        dict or None: ``{rank: name}`` for kingdom, phylum, class, order,
        family, genus and species, with ``'NA'`` for ranks missing from the
        lineage; ``None`` when no taxid can be found (a message is printed
        to stderr).
    """
    if not taxa_name:
        # Nothing to look up (covers the default None and the empty string).
        print("can not find taxid for", taxa_name, file=sys.stderr)
        return None

    ncbi = NCBITaxa()
    name_dict = ncbi.get_name_translator([taxa_name])
    if not name_dict:
        ## try only the first word (which may be a genus name?)
        print("can not find taxid for", taxa_name, file=sys.stderr)
        words = taxa_name.split()
        if len(words) > 1:
            taxa_name = words[0]
            print("try to search %s instead..." % taxa_name, file=sys.stderr)
            name_dict = ncbi.get_name_translator([taxa_name])

        if not name_dict:
            # taxa_name is still a string here, so the message shows the
            # name itself rather than a list representation.
            print("can not find taxid for %s, maybe it's a misspelling.\n" %
                  taxa_name,
                  file=sys.stderr)
            return None

    lineage_taxid_list = ncbi.get_lineage(name_dict[taxa_name][0])

    rank_dict = dict.fromkeys(
        ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'],
        'NA')

    # Resolve ranks and names for the whole lineage with one query each.
    lineage_ranks = ncbi.get_rank(lineage_taxid_list)
    lineage_names = ncbi.get_taxid_translator(lineage_taxid_list)
    for taxid in lineage_taxid_list:
        rank = lineage_ranks[taxid]
        if rank in rank_dict:
            rank_dict[rank] = lineage_names[taxid]

    return rank_dict
コード例 #17
0
def get_tax_lineage(taxonid, source):
    """Return taxonomy lineage information

    Results are cached in the module-level ``LINEAGES`` dict. On a cache
    miss the lineage is read from the local taxdump (ete3) when ``source``
    is ``"taxdump"``; otherwise it is fetched from the NCBI taxonomy
    database with Biopython's Entrez, retrying until data is returned.
    Failed attempts are appended to the ``LOG`` file.

    Parameters
    -------------
    taxonid : string
        Taxonomic id of the species
    source : string
        Source to be used to collect the info about the taxonid

    Returns
    -------------
    lineage: dict
        Species lineage, as a mapping from rank to scientific name

    """
    # Cache hit: nothing to look up.
    if taxonid in LINEAGES:
        return LINEAGES[taxonid]

    if source == "taxdump":
        taxdump = NCBITaxa()
        node_ids = taxdump.get_lineage(taxonid)
        rank_of = taxdump.get_rank(node_ids)
        name_of = taxdump.get_taxid_translator(node_ids)
        LINEAGES[taxonid] = {rank_of[node]: name_of[node] for node in node_ids}
        return LINEAGES[taxonid]

    # Any other source: query NCBI until a non-empty answer arrives.
    record = ""
    while not record:
        try:
            Entrez.email = "*****@*****.**"
            handle = Entrez.efetch(id = taxonid, db = "taxonomy", retmode = "xml")
            record = Entrez.read(handle)
            handle.close()
        except Exception:
            with open(LOG, "a") as log:
                print("Error when searching information about {}".format(taxonid),
                    file=log)

    entry = record[0]
    lineage = {node["Rank"]: node["ScientificName"] for node in entry["LineageEx"]}
    # Add the queried taxon itself to its ancestors.
    lineage[entry["Rank"]] = entry["ScientificName"]
    LINEAGES[taxonid] = lineage
    return LINEAGES[taxonid]
コード例 #18
0
ファイル: topology.py プロジェクト: ThThalamas/pyproteinsExt
 def get_taxo(self, function_get_taxid):
     """Resolve this object's taxid and store the result in ``self.taxo``.

     ``function_get_taxid`` is called with ``self`` to obtain the taxid. The
     scientific name and the rank are looked up in the NCBI taxonomy; either
     stays ``None`` when the lookup returns nothing.
     """
     taxonomy = NCBITaxa()
     name, rank = None, None
     taxid = function_get_taxid(self)
     names_by_id = taxonomy.get_taxid_translator([taxid])
     if names_by_id:
         name = names_by_id[int(taxid)]
         # The rank is only queried for taxids that have a name.
         ranks_by_id = taxonomy.get_rank([taxid])
         if ranks_by_id:
             rank = ranks_by_id[int(taxid)]
     self.taxo = Taxo(taxid, name, rank)
コード例 #19
0
    def get_rank(self):
        """ Get the rank of the taxon

        A rank is only reported when the taxon is itself an NCBI taxon
        (distance 0 from the nearest NCBI taxon) and NCBI assigns it a rank
        other than 'no rank'.

        Returns:
            :obj:`str`: rank of the taxon, or :obj:`None`
        """
        if self.distance_from_nearest_ncbi_taxon != 0:
            return None

        taxon_id = self.id_of_nearest_ncbi_taxon
        rank = NCBITaxa().get_rank([taxon_id])[taxon_id]
        return None if rank == 'no rank' else rank
コード例 #20
0
ファイル: alignment.py プロジェクト: andreyfsch/byte_pipette
    def assign_rank_representation(self, rank='species'):
        """Assign a representative taxon at ``rank`` to entries lacking one.

        For every entry returned by ``get_entries_no_representative`` the
        lineage of ``entry[1]`` is looked up. If that lookup emits a warning,
        the last token of the last warning message is taken as the new taxid
        and stored on the entry. The lineage member whose rank equals
        ``rank`` (the last one, if several) is then written to the entry as
        its representative taxon, together with its rank and name.

        NOTE(review): ``entry[0]`` is used as the entry key and ``entry[2]``
        is compared with ``None``; their exact meaning is not visible here.
        """
        protDB = db_handling.ProteinDatabase()
        ncbi = NCBITaxa()

        for entry in protDB.get_entries_no_representative():
            taxon_id = entry[1]
            with warnings.catch_warnings(record=True) as caught:
                warnings.simplefilter("always")
                lineage = ncbi.get_lineage(taxon_id)
                last_message = caught[-1].message if caught else None
                if last_message:
                    # The replacement taxid is the final word of the warning.
                    taxon_id = int(str(last_message).split()[-1])
                    protDB.update_protein_entry(
                        {'representative_of_taxon': taxon_id}, entry[0])

            lineage_ranks = ncbi.get_rank(lineage)
            lineage_translation = ncbi.get_taxid_translator(lineage)

            # Lineage member at the requested rank ('' when there is none).
            candidates = [
                node for node, node_rank in lineage_ranks.items()
                if node_rank == rank
            ]
            elected_id = candidates[-1] if candidates else ''
            print(entry[0])

            # First chance: the entry's own taxon is at another rank that
            # bigger_than_rank_taxon does not report as bigger than `rank`.
            should_assign = (
                lineage_ranks[taxon_id] != rank
                and not self.bigger_than_rank_taxon(lineage_ranks[taxon_id],
                                                    rank)
                and elected_id != '')
            # Second chance: entry[2] is still unset.
            if not should_assign:
                should_assign = entry[2] is None and elected_id != ''

            if should_assign:
                protDB.update_protein_entry(
                    {
                        'representative_of_taxon': elected_id,
                        'representative_taxon_rank': rank,
                        'taxon_name_representative':
                        lineage_translation[elected_id]
                    }, entry[0])
コード例 #21
0
def taxid_to_lineage_string(taxid):
    """Format the lineage of ``taxid`` as a ``;``-separated string.

    Each element is ``<first letter of rank>_<scientific name>``, emitted in
    the order kingdom, domain, phylum, class, order, family, genus, species.
    NCBI's ``superkingdom`` rank is reported as ``domain``. Ranks missing
    from the lineage are left out.

    Args:
        taxid: NCBI taxonomy id.

    Returns:
        str: the formatted lineage (empty when no listed rank is present).
    """
    tax_order = ['kingdom', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    # One rank query for the whole lineage instead of one per level and id.
    ranks = ncbi.get_rank(lineage)

    parts = []
    for level in tax_order:
        for tid in names:
            rank = ranks[tid]
            if rank == 'superkingdom':
                rank = 'domain'
            if rank == level:
                parts.append(level[0] + '_' + names[tid])
    return ';'.join(parts)
コード例 #22
0
def taxid_to_lineage(taxid):
    """
    Map each rank in the module-level ``desired_ranks`` to the taxid found at
    that rank in the lineage of ``taxid``.

    :param taxid: NCBI taxonomy id
    :return: dict ``{rank: taxid}``; ranks absent from the lineage map to
        ``None``
    """
    taxonomy = NCBITaxa()
    lineage_ranks = taxonomy.get_rank(taxonomy.get_lineage(taxid))

    taxid_at_rank = {}
    for node, node_rank in lineage_ranks.items():
        taxid_at_rank[node_rank] = node

    return {wanted: taxid_at_rank.get(wanted) for wanted in desired_ranks}
コード例 #23
0
def get_ncbi_taxa_rank(taxa_name):
    """Return ``(rank, taxid)`` for a taxon name.

    When several taxids are reported for the name, the last one is used.
    Both elements are the string ``"N/A"`` when the name is not found.
    """
    taxonomy = NCBITaxa()

    taxids_by_name = taxonomy.get_name_translator([taxa_name])
    if not taxids_by_name:
        return ("N/A", "N/A")

    matched_taxid = taxids_by_name[taxa_name][-1]
    matched_rank = taxonomy.get_rank([matched_taxid])[matched_taxid]
    return (matched_rank, matched_taxid)
コード例 #24
0
def get_taxonomic_group_mapping(group_ids: List[str],
                                selected_rank: str) -> Tuple[Dict, Dict]:
    """
    Function to create a mapping from NCBI-taxon ids to groups which are used to split the provided
    training records into training and validation sets

    :param group_ids: List of identifiers that should be NCBI taxon ids
    :param selected_rank: selected standard rank determining on which level the set is split in
                          training and validation-set (matched case-insensitively); any other
                          value makes the rank be chosen by ``auto_select_rank``
    :return: Tuple ``(group_name_mapping, group_id_mapping)``: the first maps every input taxon id
             to the name of its ancestor at the selected rank (``"unknown"`` if it has none), the
             second maps it to an integer group number
    """
    ncbi = NCBITaxa()
    standard_ranks = [
        "superkingdom", "phylum", "class", "order", "family", "genus",
        "species"
    ]
    # The rank is validated case-insensitively, so it must also be compared
    # in lower case below; otherwise e.g. "Genus" would pass the check but
    # never match a rank and every taxon would end up in the "unknown" group.
    normalized_rank = selected_rank.lower()
    if normalized_rank in standard_ranks:
        selected_rank = normalized_rank
    else:
        selected_rank = auto_select_rank(group_ids)

    taxon_ancestor_mapping = {}

    for taxon in set(group_ids):
        lineage = ncbi.get_lineage(int(taxon))
        ids_of_ranks = ncbi.get_rank(lineage)
        # fall-back value if sample does not have an entry on this level
        taxon_ancestor_mapping[taxon] = 0
        for ancestor_id, rank in ids_of_ranks.items():
            if rank == selected_rank:
                taxon_ancestor_mapping[taxon] = ancestor_id

    ancestor_ids = set(taxon_ancestor_mapping.values())
    ancestor_names = ncbi.get_taxid_translator(ancestor_ids)
    ancestor_names[0] = "unknown"
    # Number the distinct ancestors to obtain integer group ids.
    ancestor_enumeration = {
        ancestor_id: x
        for x, ancestor_id in enumerate(ancestor_ids)
    }

    group_name_mapping = {
        taxon: ancestor_names[taxon_ancestor_mapping[taxon]]
        for taxon in group_ids
    }
    group_id_mapping = {
        taxon: ancestor_enumeration[taxon_ancestor_mapping[taxon]]
        for taxon in group_ids
    }

    return group_name_mapping, group_id_mapping
コード例 #25
0
def get_ncbi_taxonomy(taxid):
    """Return the ranked lineage of ``taxid`` as a ``;``-prefixed path.

    Every lineage member whose rank is not "no rank" contributes
    ``";" + name``, in lineage order, so a non-empty result starts with
    ``;``.
    """
    taxonomy = NCBITaxa()

    lineage = taxonomy.get_lineage(taxid)
    names = taxonomy.get_taxid_translator(lineage)
    ranks = taxonomy.get_rank(lineage)

    ranked_names = [
        names[node] for node in lineage if ranks[node] != "no rank"
    ]
    return "".join(";" + name for name in ranked_names)
コード例 #26
0
class NCBIController:
    """Small convenience wrapper around an ``NCBITaxa`` instance."""

    def __init__(self):
        self.ncbi = NCBITaxa()

    def translate(self, taxid):
        """
        Look up the name of a single taxon.

        :param taxid: taxon id to translate
        :ret name stored for ``taxid`` by ``get_taxid_translator``
        """
        id_to_name = self.ncbi.get_taxid_translator([taxid])
        return id_to_name[taxid]

    def get_lineage(self, taxid, rank_lst=None):
        """
        Collect the lineage of ``taxid`` restricted to the ranks in ``rank_lst``.

        :param taxid: taxon id whose lineage is looked up
        :param rank_lst: ranks to keep; defaults to the seven ranks from
                         superkingdom down to species
        :ret dict with, per kept rank, ``rank -> taxid`` and
             ``rank + "_s" -> name``; an empty dict if any lookup raises
             KeyError or ValueError
        """
        wanted_ranks = rank_lst
        if wanted_ranks is None:
            wanted_ranks = [
                "superkingdom", "phylum", "class", "order", "family", "genus",
                "species"
            ]

        found = {}
        try:
            lineage_ids = self.ncbi.get_lineage(taxid)
            rank_by_id = self.ncbi.get_rank(lineage_ids)
            for lineage_id, lineage_rank in rank_by_id.items():
                if lineage_rank not in wanted_ranks:
                    continue
                found[lineage_rank] = lineage_id
                found[lineage_rank + "_s"] = self.translate(lineage_id)
        except (KeyError, ValueError):
            # A failed lookup discards anything collected so far.
            return {}
        return found

    def get_descendant(self, taxid, rank):
        """
        List the descendants of ``taxid`` that carry exactly the rank ``rank``.

        The descendant query is always issued with ``rank_limit="genus"``.

        :param taxid: ancestor taxon id
        :param rank: rank name a descendant must have to be returned
        :ret list of matching descendant taxon ids
        """
        descendant_ids = self.ncbi.get_descendant_taxa(taxid,
                                                       rank_limit="genus")
        rank_by_id = self.ncbi.get_rank(descendant_ids)
        return [
            descendant_id
            for descendant_id, descendant_rank in rank_by_id.items()
            if descendant_rank == rank
        ]
コード例 #27
0
def get_off_target_last_common_taxon_rank(df, target_rank, target_taxon):
    """Return the rank of the taxon shared by an off-target hit and the target.

    The off-target taxon is read from ``df.loc[target_rank]``. 'no rank' is
    returned when either taxon is 0, when the off-target taxon is null, or
    when the shared taxon found via ``get_topology`` is itself unranked.

    :param df: object indexed with ``.loc[target_rank]``
               (presumably a pandas Series row; confirm against caller)
    :param target_rank: label used to read the off-target taxon from ``df``
    :param target_taxon: taxon id of the target
    :return: rank string of the shared taxon, or 'no rank'
    """
    ncbi = NCBITaxa()
    unranked = 'no rank'

    off_target_taxon = df.loc[target_rank]
    # Bitwise '&' is kept on purpose: both comparisons are always evaluated.
    both_set = (target_taxon != 0) & (off_target_taxon != 0)
    if not both_set:
        return unranked
    if pd.isnull(off_target_taxon):
        return unranked

    shared_taxon = ncbi.get_topology([off_target_taxon, target_taxon])
    if shared_taxon.rank == unranked:
        return unranked

    shared_lineage = ncbi.get_lineage(shared_taxon.taxid)
    deepest_taxid = shared_lineage[-1]
    return ncbi.get_rank([deepest_taxid])[deepest_taxid]
コード例 #28
0
def compute_taxid_paths(unique_tax_id_hash, ):
    """Turn weighted taxon ids into lineage paths plus per-node statistics.

    :param unique_tax_id_hash: mapping of taxon id -> weight; ids that are
                               rejected with a ValueError are skipped
    :return: tuple ``(pathways, tax_name_ctr, path_output)`` where
             ``pathways`` is a list of lineages written as "taxid:name"
             labels, ``tax_name_ctr`` maps each label to
             ``[summed weight, node size, "passed"/"failed"]`` and
             ``path_output`` is the textual rank/lineage dump
    """
    taxonomy_db = NCBITaxa(NCBITaxaDbFile)
    hit_ceiling = 1000   # summed weights above this are no longer scaled
    capped_size = 40     # node size used once the ceiling is exceeded

    path_output = ""
    tax_name_ctr = {}
    pathways = []

    for raw_tax_id in unique_tax_id_hash:
        # The id may be a string that cannot be parsed as a taxon id.
        try:
            weight = unique_tax_id_hash[raw_tax_id]
            lineage_ids = taxonomy_db.get_lineage(int(raw_tax_id))

            # prepare output for CopraRNA
            rank_by_id = taxonomy_db.get_rank(lineage_ids)
            path_output += str(rank_by_id) + "\n"
            path_output += str(lineage_ids) + "\n\n"

            name_by_id = taxonomy_db.get_taxid_translator(lineage_ids)
            current_path = []
            for node_id in lineage_ids:
                label = ":".join((str(node_id), str(name_by_id[node_id])))
                if label not in tax_name_ctr:
                    tax_name_ctr[label] = [weight, 0, 0]
                else:
                    tax_name_ctr[label][0] += weight
                current_path.append(label)

            # Re-normalise every node seen so far after each lineage.
            for stats in tax_name_ctr.values():
                if stats[0] <= hit_ceiling:
                    stats[1] = math.sqrt(float(stats[0])) * 1.26
                    stats[2] = "passed"
                else:
                    stats[1] = capped_size
                    stats[2] = "failed"

            pathways.append(current_path)
        except ValueError:
            pass

    return pathways, tax_name_ctr, path_output
コード例 #29
0
ファイル: seqchoosers.py プロジェクト: KirkVM/biodb
def _add_normscore(pwxra):
    """Fill the 'normscore' slot of every curated sequence with its min-max scaled 'score', in place."""
    for curateseq in pwxra.curateseq.values:
        scores = pwxra.loc[:, curateseq, 'score']
        lowest = scores.min()
        pwxra.loc[:, curateseq, 'normscore'] = (scores - lowest) / (scores.max() - lowest)


def build_seqxds(vpwxracc_fpathstr, dbpathstr, vpwxrafull_fpathstr=None):
    """Build an xarray Dataset from stored score arrays plus taxonomy ids.

    The data array at ``vpwxracc_fpathstr`` (and, if given, the one at
    ``vpwxrafull_fpathstr``) is opened and its 'normscore' entries are
    filled with the min-max scaled 'score' per curated sequence. For every
    accession on the ``dbseq`` coordinate the taxid is read from the
    PROTEINGBS table and the taxon ids of its lineage are stored per rank.

    :param vpwxracc_fpathstr: path of the data array stored as 'vpwxra_cc'
    :param dbpathstr: path handed to ``seqdbutils.gracefuldbopen``
    :param vpwxrafull_fpathstr: optional path of a second data array,
                                stored as 'vpwxra_full'
    :return: Dataset with 'vpwxra_cc', optionally 'vpwxra_full', and 'taxra'
             (dbseq x ranks, NaN where no taxon id is known)
    """
    vpwxra_cc = xr.open_dataarray(vpwxracc_fpathstr)
    _add_normscore(vpwxra_cc)
    mergeds = xr.Dataset(data_vars={'vpwxra_cc': vpwxra_cc})
    if vpwxrafull_fpathstr is not None:
        vpwxra_full = xr.open_dataarray(vpwxrafull_fpathstr)
        _add_normscore(vpwxra_full)
        mergeds['vpwxra_full'] = vpwxra_full

    rank_names = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    taxra = xr.DataArray(np.full((len(vpwxra_cc.dbseq), len(rank_names)), np.nan),
                         coords=[vpwxra_cc.dbseq, rank_names],
                         dims=['dbseq', 'ranks'])

    ncbitaxa = NCBITaxa()
    conn = seqdbutils.gracefuldbopen(dbpathstr)
    try:
        conn.row_factory = sqlite3.Row
        c = conn.cursor()
        for accentry in vpwxra_cc.dbseq:
            acc = accentry.data.item(0)
            c.execute('''SELECT * FROM PROTEINGBS WHERE acc=(?)''', (acc, ))
            row = c.fetchone()
            taxid = row['taxid']
            if taxid is None:
                continue
            rankdict = ncbitaxa.get_rank(ncbitaxa.get_lineage(taxid))
            # If a rank occurs more than once in the lineage, the last one wins.
            for lineage_taxid, rank in rankdict.items():
                if rank in taxra.ranks:
                    taxra.loc[acc, rank] = lineage_taxid
    finally:
        # Release the connection even when a lookup above raises.
        conn.close()

    mergeds['taxra'] = taxra
    return mergeds
コード例 #30
0
def determine_unassigned_rank(taxid):
    """
    Given a taxid, will use ete3 to look at all its descendants. Based on what it finds, will infer what taxonomic
    level the taxid should be at. Useful for things that have 'no rank' according to NCBI.

    The result is the entry of tax_order one position before the earliest-listed rank found among the
    descendants. If that rank is already the first entry, the first entry itself is returned instead of
    wrapping around to the end of the list.

    :param taxid: NCBI taxid, should be an integer
    :return: string that says what taxonomy level we're at, one of the options from tax_order
    :raises IndexError: if no descendant has a rank listed in tax_order
    """
    tax_order = ['kingdom', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    descendants = ncbi.get_descendant_taxa(taxid, intermediate_nodes=True)
    # One rank lookup for all descendants instead of one query per descendant.
    descendant_ranks = set(ncbi.get_rank(descendants).values()) if descendants else set()
    rank_numbers = [tax_order.index(rank) for rank in descendant_ranks if rank in tax_order]
    if not rank_numbers:
        raise IndexError("no descendant of taxid %s has a rank in %s" % (taxid, tax_order))
    # Clamp at 0: a negative index would silently select 'species'.
    return tax_order[max(min(rank_numbers) - 1, 0)]
# Script: for each line of a tab-separated file whose last column is a taxid,
# print the line once per standard rank found in that taxid's lineage,
# followed by the rank and the taxon name. Lines whose taxid lookup raises
# ValueError are written to 'taxa-ids-not-found.txt'.
if len(sys.argv) == 1:
    sys.exit("USAGE: python %s <path/to/ncbi_gi_taxid_file> > <output.txt>" % sys.argv[0])

ncbi = NCBITaxa()
#ncbi.update_taxonomy_database()

hier = ["superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species"]

missing = []
# Both files are closed on every exit path.
with open('taxa-ids-not-found.txt', 'w') as fp, open(sys.argv[1]) as taxid_file:
    for x in taxid_file:
        record = x.rstrip()
        dat = record.split('\t')[-1]
        try:
            lineage = ncbi.get_lineage(dat)
            names = ncbi.get_taxid_translator(lineage)
            ranks = ncbi.get_rank(lineage)
        except ValueError:
            missing.append(record)
            continue

        # Keep only the lineage entries that carry one of the ranks in hier.
        d = {}
        for taxid, rank in ranks.items():
            if rank in hier:
                d[rank] = names[taxid]

        # Rows are emitted in alphabetical order of the rank name.
        for key in sorted(d):
            print(record + "\t" + str(key) + "\t" + d[key])
    fp.write('\n'.join(missing))
コード例 #32
0
#!/usr/bin/python

Usage = """
Print taxid's lineage and ranks
by default prints to the stdout
Usage:
  taxid_ranks.py taxid > ouput.txt

Arun Seetharam
[email protected]
taxid_ranks.py -version 1.0
04/13/2017
"""
from ete3 import NCBITaxa
import sys

if len(sys.argv) < 2:
    print(Usage)
else:
    # The taxonomy database is only opened when there is a taxid to look up.
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(sys.argv[1])
    names = ncbi.get_taxid_translator(lineage)
    # One rank query for the whole lineage instead of one per taxid.
    ranks = ncbi.get_rank(lineage)
    for taxid in lineage:
        # Same layout as before: "[{taxid: rank}] [name]" on one line.
        print("%s %s" % ([{taxid: ranks[taxid]}], [names[taxid]]))
コード例 #33
0
ファイル: ete_ncbiquery.py プロジェクト: Ward9250/ete
def run(args):
    """Resolve the search terms in ``args`` to taxids and dump the result.

    Numeric terms in ``args.search`` are used as taxids, all others are
    translated as names (optionally fuzzily when ``args.fuzzy`` is set).
    Exactly one of three outputs is produced, checked in this order:
    ``args.tree`` (the default when no mode is selected), ``args.descendants``,
    ``args.info``.

    :param args: parsed options; reads ``tree``, ``info``, ``descendants``,
                 ``search``, ``fuzzy``, ``collapse_subspecies``,
                 ``rank_limit`` and ``full_lineage``
    """
    # add lineage profiles/stats

    import re
    from ete3 import PhyloTree, NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    queries = []

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        queries.append(n)
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    all_taxids.update([(v, None) for v in list(name2tax.values())])

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                # NOTE(review): name2realname and name2score are not defined
                # in this function; confirm they exist at module level.
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" %sim

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" %','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            # A keys view/list is not an iterator, so wrap the dict in iter().
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit, return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                              intermediate_nodes=args.full_lineage,
                              rank_limit=args.rank_limit,
                              collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Keep the raw taxid as a feature before the node name is replaced.
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit)
            print('\t'.join([str(taxid), translator.get(taxid, taxid), ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))

    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in six.iteritems(translator):
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''), named_lineage, lineage_string]))