コード例 #1
0
ファイル: alignment.py プロジェクト: andreyfsch/byte_pipette
    def correct_erroneous_repres_of_taxon_instances():
        """Reconcile entries reported as erroneous taxon representatives.

        For every row returned by
        ``ProteinDatabase.get_erroneous_repres_of_taxon_instances()`` the
        function compares the row's taxon (``instance[1]``) with the value
        stored at index 5 of the protein entry looked up through
        ``instance[2]`` and rewrites the ``representative_of_taxon`` /
        ``represented_by`` fields through ``update_protein_entry``.

        The function takes no arguments, returns nothing, and prints debug
        information to stdout.

        NOTE(review): the exact column layout of ``instance`` and of the
        protein entry tuple is not visible here -- confirm against
        ``db_handling.ProteinDatabase``.
        """
        protDB = db_handling.ProteinDatabase()
        ncbi = NCBITaxa()

        erroneous_instances = protDB.get_erroneous_repres_of_taxon_instances()

        for instance in erroneous_instances:
            # Lineage and per-taxid rank of the taxon attached to this row.
            lineage = ncbi.get_lineage(instance[1])
            lineage_ranks = ncbi.get_rank(lineage)
            # NOTE(review): the entry is indexed here *before* the existence
            # check below; if get_protein_entry() returns None or an empty
            # row this line raises, so the ``else`` branch further down looks
            # unreachable in practice -- confirm the intended behavior.
            representative_id = protDB.get_protein_entry(instance[2])[5]
            lineage_translation = ncbi.get_taxid_translator(lineage)
            if protDB.get_protein_entry(instance[2]):
                if representative_id != None:
                    # Same lookup as above, repeated (third query for the
                    # same entry).
                    representative_id = protDB.get_protein_entry(
                        instance[2])[5]
                    print('representative_id', representative_id)
                    lineage_representative = ncbi.get_lineage(
                        representative_id)
                    print('lineage_representative', lineage_representative)
                    lineage_representative_ranks = ncbi.get_rank(
                        lineage_representative)
                    print('lineage_representative_ranks',
                          lineage_representative_ranks)
                    # Only act when both taxa sit at the same rank.
                    if lineage_ranks[
                            instance[1]] == lineage_representative_ranks[
                                representative_id]:
                        count_instance = protDB.get_count_of_children_of_repres(
                            instance[1])
                        count_representative = protDB.get_count_of_children_of_repres(
                            representative_id)
                        if count_representative >= count_instance:
                            # The stored representative has at least as many
                            # children: clear the field for instance[1].
                            protDB.update_protein_entry(
                                {'representative_of_taxon': None}, instance[1])
                        else:
                            # Otherwise point 'represented_by' at instance[1]
                            # and clear the old representative's field.
                            protDB.update_protein_entry_by_repres_by(
                                {'represented_by': instance[1]},
                                representative_id)
                            protDB.update_protein_entry(
                                {'represented_by': instance[1]},
                                representative_id)
                            protDB.update_protein_entry(
                                {'representative_of_taxon': None},
                                representative_id)
            else:
                # No protein entry for instance[2]: clear the field on
                # instance[0] and record instance[1] (id, name, rank) instead.
                # NOTE(review): ``representative_id`` is used as the target of
                # the last two updates although no entry was found -- verify.
                protDB.update_protein_entry({'representative_of_taxon': None},
                                            instance[0])
                protDB.update_protein_entry(
                    {'representative_of_taxon': instance[1]}, instance[2])
                protDB.update_protein_entry(
                    {
                        'taxon_name_representative':
                        lineage_translation[instance[1]]
                    }, representative_id)
                protDB.update_protein_entry(
                    {'representative_taxon_rank': lineage_ranks[instance[1]]},
                    representative_id)

        # Visual separator printed once, after all rows have been processed.
        print(
            '---------------------------------------------------------------------------------'
        )
コード例 #2
0
def contig_tax(annot_df, ncbi_db, min_prot, prop_annot, tax_thres):
    """Yield one taxonomic-assignment row per contig found in ``annot_df``.

    Each yielded list starts with the contig identifier and is followed by
    entries for the ranks genus, subfamily, family and order (in that order).

    * If fewer than ``prop_annot`` of the contig's proteins have a
      ``Best_hit`` other than ``"No hit"``, the four rank entries are empty
      strings.
    * Otherwise the ranks are visited from genus upwards. A rank supported by
      fewer than ``min_prot`` hits contributes ``""``; a rank whose most
      frequent taxon accounts for less than ``tax_thres`` of the hits
      contributes that proportion; the first rank that passes both checks
      contributes the taxon names for itself and every remaining rank, after
      which the search stops.

    ``ncbi_db`` is handed to ``NCBITaxa(dbfile=...)``.
    """
    taxa_db = NCBITaxa(dbfile=ncbi_db)
    tax_rank_order = ["genus", "subfamily", "family", "order"]

    for contig in set(annot_df["Contig"]):
        row = [contig]
        per_contig = annot_df[annot_df["Contig"] == contig]
        n_proteins = len(per_contig)
        n_annotated = sum(per_contig["Best_hit"] != "No hit")

        # Too few annotated proteins: no assignment at any rank.
        if n_annotated < prop_annot * n_proteins:
            row.extend([""] * 4)
            yield row
            continue

        labels = per_contig[pd.notnull(per_contig["Label"])]["Label"].values
        label_taxids = [
            taxa_db.get_name_translator([label])[label][0] for label in labels
        ]

        # One {rank: taxid} mapping per hit, restricted to the ranks of interest.
        hit_lineages = []
        for label_taxid in label_taxids:
            rank_by_node = taxa_db.get_rank(taxa_db.get_lineage(label_taxid))
            hit_lineages.append({
                node_rank: node
                for node, node_rank in rank_by_node.items()
                if node_rank in tax_rank_order
            })

        for position, rank in enumerate(tax_rank_order):
            taxon_list = [hit.get(rank) for hit in hit_lineages]
            total_hits = sum(pd.notnull(taxon_list))
            if total_hits < min_prot:
                row.append("")
                continue

            tally = Counter(
                taxon for taxon in taxon_list if pd.notnull(taxon))
            ranked = sorted(
                tally.items(), key=lambda pair: pair[1], reverse=True)
            top_taxid, top_count = ranked[0]
            prop_hits = top_count / total_hits
            if prop_hits < tax_thres:
                row.append(prop_hits)
                continue

            # Accepted: report this rank and all higher ranks from the
            # lineage of the winning taxon, then stop looking.
            taxid_by_rank = {
                node_rank: node
                for node, node_rank in taxa_db.get_rank(
                    taxa_db.get_lineage(top_taxid)).items()
            }
            for remaining_rank in tax_rank_order[position:]:
                node = taxid_by_rank.get(remaining_rank)
                if pd.notnull(node):
                    row.append(taxa_db.get_taxid_translator([node])[node])
                else:
                    row.append("")
            break

        yield row
コード例 #3
0
def make_krona_table(f, db):
    """Convert a tab-separated taxonomic report into a Krona-style table.

    The report ``f`` is read without a header as the columns
    ``clade_percent, clade_reads, reads, rank, taxid, name``; only rows with
    ``reads > 0`` are used.  (Presumably a Kraken-style report -- confirm
    against the caller.)

    Parameters
    ----------
    f : path or file-like
        Anything accepted by ``pandas.read_csv``.
    db : str or falsy
        Taxonomy database passed to ``NCBITaxa``; when falsy the ete3 default
        database is used.

    Returns
    -------
    pandas.DataFrame
        One row per retained report line with the columns ``abundance``,
        ``superkingdom`` ... ``species`` and ``leaf``.  Rows whose rank code
        is ``"-"`` are kept only when the taxon has "no rank" and its parent
        is a species (reported as ``leaf``); rows with rank codes that are
        not in the table are skipped.
    """
    ncbi_taxa = NCBITaxa(db) if db else NCBITaxa()

    krona_table = pd.DataFrame(columns = ["abundance","superkingdom","phylum","class","order","family","genus",
                                          "species","leaf"])
    one_letter_ranks = {"D": "superkingdom", "P": "phylum", "C": "class", "O": "order", "F": "family", "G": "genus",
                        "S": "species"}
    standard_ranks = set(one_letter_ranks.values())

    df = pd.read_csv(f, header=None, names = ["clade_percent", "clade_reads", "reads", "rank", "taxid", "name"], sep="\t")
    df = df.loc[df.reads > 0]

    # Collect the per-row frames and concatenate once at the end; growing the
    # table with pd.concat inside the loop copies it on every iteration.
    row_frames = []
    for j, i in enumerate(df.index):
        r = df.loc[i]
        taxid = r["taxid"]
        reads = r["reads"]
        name = r["name"]
        one_letter_rank = r["rank"]

        if one_letter_rank == "-":
            rank = ncbi_taxa.get_rank([taxid])[taxid]
            try:
                parent_taxid = ncbi_taxa.get_lineage(taxid)[-2]
            except IndexError:
                # Lineage of length one: the taxon is its own parent.
                parent_taxid = taxid
            parent_rank = ncbi_taxa.get_rank([parent_taxid])[parent_taxid]
            if rank == "no rank" and parent_rank == "species":
                rank = "leaf"
            else:
                continue
        elif one_letter_rank == "U":
            rank = "unclassified"
        else:
            try:
                rank = one_letter_ranks[one_letter_rank]
                #TODO: Shouldn't be too many reads mapped directly to ranks not in the krona table, but check eventually
            except KeyError:
                continue

        res = {"abundance": reads, "superkingdom": "", "phylum": "", "class": "", "order": "", "family": "",
               "genus": "", "species": "", "leaf": ""}
        if rank != "unclassified":
            # Fetch the lineage once and reuse it for ranks and names.
            lineage = ncbi_taxa.get_lineage(taxid)
            rank_dict = ncbi_taxa.get_rank(lineage)
            name_dict = ncbi_taxa.get_taxid_translator(lineage)
            for dict_taxid, dict_rank in rank_dict.items():
                if dict_rank in res:
                    res[dict_rank] = name_dict[dict_taxid]
            if rank not in standard_ranks:
                res["leaf"] = name
        row_frames.append(pd.DataFrame(res, index=[j])[krona_table.columns])

    if not row_frames:
        return krona_table
    return pd.concat([krona_table] + row_frames)
コード例 #4
0
def main(tree_path):
    """Rename the leaves of a tree to NCBI taxids and write a pruned copy.

    For every leaf of the tree at ``tree_path`` the leaf name (underscores
    replaced by spaces) is looked up in the NCBI taxonomy:

    * no match: the module-level ``byhand`` mapping is consulted, otherwise
      the leaf is labelled ``"not found"``;
    * several taxids for the name: the whole list is used as label and a
      message is printed;
    * exactly one taxid: it becomes the label and the leaf is flagged
      according to whether taxid 3398 (Magnoliophyta) is in its lineage.

    A label that was already used gets the suffix ``"_B"``.  The mapping is
    written to ``names_to_ids.csv``; the tree is then pruned to the species
    listed in a hard-coded file and written to ``tree_ids.tree``.

    The print calls take a single pre-formatted string so that they produce
    the same output under Python 2 and Python 3.
    """
    ncbi = NCBITaxa()

    tree = Tree(tree_path, format=1)

    names = []
    ids = []
    in_magnoliophyta = []
    for leaf in tree:
        name = leaf.name.replace("_", ' ')
        name2taxid = ncbi.get_name_translator([name])
        # get_name_translator maps each queried name to a *list* of taxids,
        # so ambiguity shows up in the length of that list, not of the dict.
        matches = name2taxid.get(name, [])

        if not matches:
            if name in byhand:
                taxid_label = byhand[name]
                magno = "yes"
            else:
                taxid_label = "not found"
                magno = ""
        elif len(matches) > 1:
            taxid_label = str(matches)
            magno = ""
            print("two ids:  %s" % name)
        else:
            taxid_label = str(matches[0])
            lineage = ncbi.get_lineage(taxid_label)
            if 3398 in lineage:  #3398 - magnoliophyta id
                magno = "yes"
            else:
                magno = "no"

        if taxid_label != "not found" and taxid_label in ids:
            print("duplicate:  %s %s" % (name, taxid_label))
            taxid_label += "_B"

        leaf.name = taxid_label

        names.append(name)
        ids.append(taxid_label)
        in_magnoliophyta.append(magno)

    df = pd.DataFrame({
        'name': names,
        'id': ids,
        'in magnoliophyta': in_magnoliophyta
    })
    df.to_csv('names_to_ids.csv')

    p = "/groups/itay_mayrose/nomihadar/trees/magnoliophyta_tree/sequences_filtered_zanne/species/intersect_mytree_zannetree_mangoete3.ls"
    with open(p, 'r') as f:
        lines = f.read().splitlines()

    # Drop three explicitly excluded ids before pruning.
    species = [x for x in lines if x not in ['58454', '142615', '77013']]

    tree.prune(list(set(species)), preserve_branch_length=True)

    tree.write(outfile="tree_ids.tree")
コード例 #5
0
    def from_taxid(cls, taxid: int) -> "Lineage":
        """
        Build a `Lineage` from an NCBI taxonomy id.

        The lineage of ``taxid`` is resolved through ``NCBITaxa``; rank names
        are capitalised and "Superkingdom" is exposed as "Kingdom".  Fields
        are filled in the order of ``cls._fields`` and filling stops at the
        first field for which the lineage has no rank.

        Parameters
        ----------
        taxid : int
            A valid NCBI taxonomy id

        Returns
        -------
        "Lineage"
            Instance of the `Lineage` class
        """
        taxonomy = NCBITaxa()
        taxid_path = taxonomy.get_lineage(taxid)
        name_by_taxid = taxonomy.get_taxid_translator(taxid_path)

        taxid_by_rank = {}
        for node_id, node_rank in taxonomy.get_rank(taxid_path).items():
            taxid_by_rank[node_rank.capitalize()] = node_id

        # NCBI calls the top level "superkingdom"; the class uses "Kingdom".
        if "Superkingdom" in taxid_by_rank:
            taxid_by_rank["Kingdom"] = taxid_by_rank.pop("Superkingdom")

        taxa: Dict[str, str] = {}
        for field in cls._fields:
            if field not in taxid_by_rank:
                break
            taxa[field] = name_by_taxid[taxid_by_rank[field]]
        return cls(**taxa)
コード例 #6
0
def get_metadata(records: List[SeqRecord]):
    """Collect taxonomy metadata for sequence records and save it as CSV.

    The ``organism`` annotation of every record is translated to a taxid
    (the first match is used) and the names at the ranks superkingdom,
    order, family, subfamily, genus and species are taken from its lineage.
    Each row also carries the record id under the key ``"aid"``.

    The table is written to ``metadata.csv`` and returned as a DataFrame.
    """
    taxonomy = NCBITaxa()

    organisms = []
    for record in records:
        organisms.append(record.annotations["organism"])
    taxids_by_name = taxonomy.get_name_translator(organisms)

    sought_ranks = [
        "superkingdom", "order", "family", "subfamily", "genus", "species"
    ]

    rows = []
    for record in records:
        organism = record.annotations["organism"]
        taxid = taxids_by_name[organism][0]
        lineage = taxonomy.get_lineage(taxid)
        rank_by_node = taxonomy.get_rank(lineage)
        name_by_node = taxonomy.get_taxid_translator(lineage)

        row = {}
        for node in lineage:
            node_rank = rank_by_node[node]
            if node_rank in sought_ranks:
                row[node_rank] = name_by_node[node]
        row["aid"] = record.id
        rows.append(row)

    df = pd.DataFrame(rows)
    df.to_csv("metadata.csv")

    return df
コード例 #7
0
def get_full_lineages(otus):
    """Write ``full_lineages.tsv`` with the full lineage of every OTU.

    Requires the ete3 library.

    Input: collection of OTU taxids (as obtained from the get_otus function).
        The special ids 0, 1 and 2 are given fixed lineages and are removed
        from ``otus`` in place, as before.
    Output: none; the tab-separated file ``full_lineages.tsv`` with the
        columns ``OTU`` and ``LINEAGE`` is created in the working directory.
    """
    from ete3 import NCBITaxa

    ncbi = NCBITaxa()

    lineages = {}

    # Ids that cannot (0) or need not (1, 2) be resolved through the database.
    for special_id, fixed_lineage in ((0, ""), (1, "root"), (2, "root;Bacteria")):
        if special_id in otus:
            lineages[special_id] = fixed_lineage
            otus.remove(special_id)

    for entrie in otus:
        lineage = ncbi.get_lineage(entrie)  # list of lineage taxids, root first
        # Dict taxid (int) -> name. Its iteration order is not the lineage
        # order, so the names are looked up by walking ``lineage`` itself.
        names = ncbi.get_taxid_translator(lineage)
        lineages[entrie] = ";".join(
            names[taxid] for taxid in lineage if taxid in names)

    lineages_df = pd.DataFrame(list(lineages.items()), columns=["OTU", "LINEAGE"])
    lineages_df.to_csv("full_lineages.tsv", sep="\t", index=False, header=True)
    print("full lineage file created")
コード例 #8
0
    def get_lineage(self,taxid):
        """Return the names along the NCBI lineage of ``taxid``.

        The names are listed in the order given by ``NCBITaxa.get_lineage``.
        """
        taxonomy = NCBITaxa()
        lineage_ids = taxonomy.get_lineage(taxid)
        name_by_id = taxonomy.get_taxid_translator(lineage_ids)

        ordered_names = []
        for node_id in lineage_ids:
            ordered_names.append(name_by_id[node_id])
        return ordered_names
コード例 #9
0
ファイル: alignment.py プロジェクト: andreyfsch/byte_pipette
    def sort_collection_by_taxon_rank(collection,
                                      key,
                                      rank='species',
                                      rank_id=None):
        """Lift the taxids of a collection to ``rank`` and sort by them.

        Args:
            collection: sequence of indexable records (e.g. tuples).
            key: position inside each record that holds an NCBI taxid.
            rank: target rank; a record whose taxid is not already at this
                rank gets the taxid of its ancestor at ``rank`` instead.
            rank_id: when given, only records whose ancestor at ``rank`` is
                this taxid are rewritten.

        Returns:
            tuple of records sorted by the (possibly replaced) value at
            ``key``.  The input collection itself is left untouched; the
            rewritten records are tuples.
        """
        ncbi = NCBITaxa()

        # Work on a copy so the caller's collection is not modified (and so
        # that immutable inputs such as tuples are accepted).
        new_collection = list(collection)

        for i, item in enumerate(collection):
            lineage = ncbi.get_lineage(item[key])
            lineage_ranks = ncbi.get_rank(lineage)
            # Nothing to do when the taxid is unknown to the rank lookup or
            # already sits at the requested rank.
            if (item[key] not in lineage_ranks
                    or lineage_ranks[item[key]] == rank):
                continue
            for taxon, taxon_rank in lineage_ranks.items():
                if taxon_rank != rank:
                    continue
                if rank_id is None or taxon == rank_id:
                    new_collection[i] = tuple(
                        taxon if k == key else value
                        for k, value in enumerate(item))

        return tuple(sorted(new_collection, key=operator.itemgetter(key)))
コード例 #10
0
def get_taxonomy(species_name,
                 name_format="Genus species",
                 ranks=None,
                 update_db=False):
    """Look up the NCBI lineage names of a species.

    Args:
        species_name: species name; converted with ``str``.
        name_format: ``"Genus species"`` (used as is) or ``"Genus_species"``
            (underscores are replaced by spaces before the lookup).
        ranks: optional list of rank names to report.
        update_db: when true, refresh the local taxonomy database first.

    Returns:
        * name not found: ``['unknown']``, or one ``'unknown'`` per requested
          rank when ``ranks`` is given;
        * ``ranks`` is None: the dict taxid -> name for the whole lineage;
        * otherwise: list of lineage names at the requested ranks, in the
          order of ``ranks`` (ranks absent from the lineage contribute no
          entry).
    """
    species_name = str(species_name)
    ncbi = NCBITaxa()
    if update_db:
        ncbi.update_taxonomy_database()
    if name_format == "Genus_species":
        species_name = species_name.replace("_", " ")

    species_id = ncbi.get_name_translator([species_name])
    if not species_id:
        return ['unknown'] if ranks is None else ['unknown'] * len(ranks)

    lineage_ids = ncbi.get_lineage(species_id[species_name][0])
    names = ncbi.get_taxid_translator(lineage_ids)
    if ranks is None:
        return names

    lineage_rk = ncbi.get_rank(lineage_ids)
    # ``names`` already covers every lineage taxid, so no further database
    # queries are needed to translate the matches.
    return [
        names[rk_id]
        for rk in ranks
        for rk_id, rk_rk in lineage_rk.items()
        if rk_rk == rk
    ]
コード例 #11
0
def get_full_lineage(otus):
    """Write ``full_lineages_updated.tsv`` with the lineage of every OTU.

    Requires the ete3 library.

    Input: list with the keys of the updated_input_dic (OTU taxids).  The
        special ids 0, 1 and 2 are given fixed lineages and are removed from
        ``otus`` in place, as before.
    Output: none; the tab-separated file ``full_lineages_updated.tsv`` with
        the columns ``OTU`` and ``lineage`` is created in the working
        directory.
    """
    from ete3 import NCBITaxa

    ncbi = NCBITaxa()

    lineages = {}

    # Ids that cannot (0) or need not (1, 2) be resolved through the database.
    for special_id, fixed_lineage in ((0, ""), (1, "root"), (2, "root;Bacteria")):
        if special_id in otus:
            lineages[special_id] = fixed_lineage
            otus.remove(special_id)

    for entrie in otus:
        lineage = ncbi.get_lineage(entrie)  # list of lineage taxids, root first
        # Dict taxid (int) -> name. Its iteration order is not the lineage
        # order, so the names are looked up by walking ``lineage`` itself.
        names = ncbi.get_taxid_translator(lineage)
        lineages[entrie] = ";".join(
            names[taxid] for taxid in lineage if taxid in names)

    lineages_df = pd.DataFrame(list(lineages.items()), columns=["OTU", "lineage"])
    lineages_df.to_csv("full_lineages_updated.tsv",
                       sep="\t",
                       index=False,
                       header=True)
コード例 #12
0
    def get_parent_taxa(self):
        """ Get parent taxa

        The parents are the NCBI lineage of the nearest NCBI taxon, followed
        by one taxon per leading group of words of the additional
        (beyond-NCBI) name; the taxon itself, which is the last element of
        that chain, is excluded.

        Returns:
            :obj:`list` of :obj:`Taxon`: list of parent taxa, or :obj:`None`
            if the taxon has no nearest NCBI taxon
        """
        if self.id_of_nearest_ncbi_taxon is None:
            return None

        cls = self.__class__
        ncbi_taxa = NCBITaxa()
        lineage = [
            cls(ncbi_id=ncbi_id)
            for ncbi_id in ncbi_taxa.get_lineage(self.id_of_nearest_ncbi_taxon)
        ]

        if self.additional_name_beyond_nearest_ncbi_taxon:
            base_name = ncbi_taxa.translate_to_names(
                [self.id_of_nearest_ncbi_taxon])[0]
            # The additional name starts with a separator character, hence [1:].
            names = self.additional_name_beyond_nearest_ncbi_taxon[1:].split(
                ' ')
            # Extend the base name one *word* at a time; slicing the word
            # list (not the characters of a single word) gives
            # "<base> w1", "<base> w1 w2", ...
            for i_rank in range(len(names)):
                lineage.append(
                    cls(name=' '.join([base_name] + names[0:i_rank + 1])))

        return lineage[0:-1]
コード例 #13
0
def from_name2ids(phylum_name, dataset='genbank', return_d2ids=False):
    """
    Retrieve assembly ids and metadata rows for the given taxon names.

    The descendant taxids of every name are collected per superkingdom
    ("domain"); the matching ``{dataset}_{domain}_assembly_summary.txt``
    files under ``metadata_files_dir`` are then scanned for rows whose
    sixth tab-separated column is one of those taxids.

    :param phylum_name: one or more taxon names separated by ``;``
    :param dataset: prefix of the assembly summary file names
    :param return_d2ids: if true, stop after the taxonomy lookup and return
        the mapping domain name -> list of descendant taxids
    :return: ``domain2dids`` when ``return_d2ids`` is true, otherwise the
        tuple ``(domain2aids, collect_info)`` where ``domain2aids`` maps the
        lower-cased domain to the first column of each matching row and
        ``collect_info`` holds the matching raw rows
    """
    phylum_names = [name for name in phylum_name.split(';') if name]
    # phylum_name = "Nitrospirae;"
    # phylum_tid = "40117"
    ncbi = NCBITaxa()

    p2tid = ncbi.get_name_translator(phylum_names)

    for name in phylum_names:
        if not p2tid.get(name):
            print(f" '{name}'' not found. please check the name")
    # Only the first taxid of every resolved name is used.
    tids = [p2tid[name][0] for name in phylum_names if p2tid.get(name)]

    domain2dids = defaultdict(list)
    descend_ids = []
    for tid in tids:
        lineages = ncbi.get_lineage(tid)
        rank2tid = {rank: node for node, rank in ncbi.get_rank(lineages).items()}
        names = ncbi.get_taxid_translator(lineages)
        domain = names[rank2tid['superkingdom']]

        _descend_ids = ncbi.get_descendant_taxa(tid, intermediate_nodes=True)
        descend_ids += _descend_ids
        domain2dids[domain].extend(_descend_ids)
    print(f"in total, {len(descend_ids)} taxids were found. ")
    if return_d2ids:
        return domain2dids

    domain2aids = defaultdict(list)
    collect_info = []
    descend_ids = set(descend_ids)
    # NOTE: every domain file is matched against the taxids of *all* domains
    # (unchanged from the previous behavior).
    for domain in domain2dids:
        d = domain.lower()
        metadata = join(metadata_files_dir,
                        f"{dataset}_{d}_assembly_summary.txt")
        tqdm.write(
            f'read {metadata} which last modified at : {time.ctime(os.path.getmtime(metadata))}'
        )
        # Context manager so the summary file is closed once it is consumed.
        with open(metadata) as summary:
            for row in tqdm(summary):
                if row.startswith("GC"):
                    rows = row.split('\t')
                    if int(rows[5]) in descend_ids:
                        collect_info.append(row)
                        domain2aids[d].append(rows[0])
    return domain2aids, collect_info
コード例 #14
0
def main():
    """Query the NCBI taxonomy for a taxon and list its descendant taxa.

    Command-line arguments come from ``get_args()``.  The taxon is given
    either as a TaxID or as a name (which is translated to a TaxID).  With
    verbosity above 0 a summary of the taxon and the descendant names are
    printed; when an output file is requested the descendant TaxIDs are
    written to it, one per line.
    """
    args = get_args()

    # The ete NCBI taxa object backs every query below.
    ncbi = NCBITaxa()

    verbose = args.verbose > 0
    very_verbose = args.verbose > 1

    if very_verbose:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    # Refresh the local database only on explicit request.
    if args.update is True:
        if very_verbose:
            print(
                "Updating the taxonomy database. This may take several minutes..."
            )
        ncbi.update_taxonomy_database()

    # A name takes precedence: translate it and store the resulting TaxID.
    if args.name:
        args.taxid = ncbi.get_name_translator([args.name])[args.name][0]

    if verbose:
        # Use the supplied name when there is one, otherwise translate the
        # TaxID back into a name.
        if args.name:
            display_name = args.name
        else:
            display_name = ncbi.get_taxid_translator([args.taxid])[args.taxid]

        tax_dict = {
            'Name': display_name,
            'TaxID': args.taxid,
            'Rank': ncbi.get_rank([args.taxid]),
            'Lineage': ncbi.get_taxid_translator(
                ncbi.get_lineage(args.taxid)),
        }

        print("Information about your selected taxa:")
        pretty(tax_dict)

    # Core feature: every taxon below the selected one.
    descendent_taxa = ncbi.get_descendant_taxa(args.taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)
    print("Descendent taxa for TaxID: %s" % (args.taxid))

    # Under python3 zip is lazy; under python2 this pairing builds a full
    # list, which can be large. Running with python3 is recommended.
    if verbose:
        for taxon_name, taxon_id in zip(descendent_taxa_names,
                                        descendent_taxa):
            print("%s\t%s" % (taxon_name, taxon_id))

    if args.outfile:
        with open(args.outfile, 'w') as ofh:
            ofh.writelines(
                str(taxon_id) + '\n' for taxon_id in descendent_taxa)
コード例 #15
0
def extract_taxa(mpwt_taxon_file, taxon_output_file, tree_output_file):
    """Write a taxonomy table and an ASCII taxonomy tree for a set of taxids.

    Every row of the tab-separated ``mpwt_taxon_file`` whose second column
    does not contain ``"taxon"`` is resolved against the NCBI taxonomy.  The
    output table holds the first two input columns, a label made of the
    first four letters of the phylum plus a running counter, and the names
    at the ranks phylum to species (``"no_information"`` where a rank is
    missing).

    Args:
        mpwt_taxon_file (str): mpwt taxon file for species in sbml folder
        taxon_output_file (str): path to phylum output file
        tree_output_file (str): path to tree output file

    """
    ncbi = NCBITaxa()

    reported_ranks = ["phylum", "class", "order", "family", "genus", "species"]
    seen_taxids = []
    label_counter = {}

    with open(taxon_output_file, "w") as phylum_file:
        writer = csv.writer(phylum_file, delimiter="\t")
        writer.writerow([
            "species", "taxid", "phylum_number", "phylum", "class", "order",
            "family", "genus", "species"
        ])
        with open(mpwt_taxon_file, "r") as taxon_file:
            for line in csv.reader(taxon_file, delimiter="\t"):
                # Skip the header line (its second column names the field).
                if "taxon" in line[1]:
                    continue

                seen_taxids.append(line[1])
                lineage = ncbi.get_lineage(line[1])
                rank_by_taxid = ncbi.get_rank(lineage)
                names = ncbi.get_taxid_translator(lineage)
                name_by_rank = {
                    rank: names[taxid]
                    for taxid, rank in rank_by_taxid.items()
                }
                ranks = [
                    name_by_rank.get(rank, "no_information")
                    for rank in reported_ranks
                ]

                phylum = ranks[0]
                if phylum != "no_information":
                    phylum = phylum[:4]

                # Running counter per label; a repeated "no_information"
                # label carries no number from its second occurrence on.
                if phylum in label_counter:
                    if phylum == "no_information":
                        label_counter[phylum] = ""
                    else:
                        label_counter[phylum] += 1
                else:
                    label_counter[phylum] = 1

                writer.writerow([
                    line[0], line[1],
                    phylum + str(label_counter[phylum]), *ranks
                ])

    tree = ncbi.get_topology(seen_taxids)

    with open(tree_output_file, "w") as tree_file:
        tree_file.write(tree.get_ascii(attributes=["sci_name", "rank"]))
コード例 #16
0
    def get_distance_to_root(self):
        """ Get the distance from the taxon to the root of the NCBI taxonomy tree

        The distance is the number of edges in the lineage of the nearest
        NCBI taxon plus the taxon's own distance from that nearest taxon.

        Returns:
            :obj:`int`: distance from the taxon to the root, or :obj:`None`
            if the taxon has no nearest NCBI taxon
        """
        if self.id_of_nearest_ncbi_taxon is None:
            # Previously this returned an undefined bare name (NameError).
            return None

        ncbi_taxa = NCBITaxa()
        lineage = ncbi_taxa.get_lineage(self.id_of_nearest_ncbi_taxon)
        return len(lineage) - 1 + self.distance_from_nearest_ncbi_taxon
コード例 #17
0
def get_lineage_at_desired_ranks(taxid, desired_ranks):
    """Return the lineage taxids of ``taxid`` at the requested ranks.

    The result has one entry per element of ``desired_ranks``, in the same
    order; a rank that does not occur in the lineage is reported as ``0``.
    """
    taxonomy = NCBITaxa()
    lineage = taxonomy.get_lineage(taxid)

    taxid_by_rank = {}
    for node_id, node_rank in taxonomy.get_rank(lineage).items():
        taxid_by_rank[node_rank] = node_id

    resolved = []
    for wanted_rank in desired_ranks:
        hit = taxid_by_rank.get(wanted_rank)
        resolved.append(0 if hit is None else hit)
    return resolved
コード例 #18
0
def get_rank_dict(taxa_name=None):
    """Return the names of a taxon's lineage at the seven main ranks.

    The name is looked up in the NCBI taxonomy; if it is unknown and
    consists of several words, the first word alone is tried (it may be a
    genus name).  Progress and failures are reported on stderr.

    Returns:
        dict mapping 'kingdom', 'phylum', 'class', 'order', 'family',
        'genus' and 'species' to the lineage name at that rank ('NA' where
        the lineage has no such rank), or None if no taxid could be found.
    """
    ncbi = NCBITaxa()
    name_dict = ncbi.get_name_translator([taxa_name])
    if not name_dict:
        ## try only the first word (which may be a genus name?)
        print("can not find taxid for", taxa_name, file=sys.stderr)
        # Keep ``taxa_name`` a string so the messages below show the name
        # itself rather than the repr of a word list.
        words = taxa_name.split()
        if len(words) > 1:
            taxa_name = words[0]
            print("try to search %s instead..." % taxa_name, file=sys.stderr)
            name_dict = ncbi.get_name_translator([taxa_name])

        if not name_dict:
            print("can not find taxid for %s, maybe it's a misspelling.\n" %
                  taxa_name,
                  file=sys.stderr)
            return None

    lineage_taxid_list = ncbi.get_lineage(name_dict[taxa_name][0])

    rank_dict = dict.fromkeys(
        ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'],
        'NA')

    # Resolve ranks and names for the whole lineage with one query each
    # instead of two queries per lineage node.
    ranks = ncbi.get_rank(lineage_taxid_list)
    names = ncbi.get_taxid_translator(lineage_taxid_list)
    for j in lineage_taxid_list:
        rank = ranks.get(j)
        if rank in rank_dict:
            rank_dict[rank] = names[j]

    return rank_dict
コード例 #19
0
def taxid2lineage(taxid):
    """Return the lineage names of ``taxid`` at the seven main ranks.

    The result maps rank name to taxon name for every rank among
    superkingdom, phylum, class, order, family, genus and species that
    occurs in the lineage; keys are inserted in that order.
    """
    ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    # The rank lookup does not depend on the rank being processed, so it is
    # done once instead of once per rank.
    rank_by_taxid = ncbi.get_rank(lineage)

    lineage_dict = dict()
    for rank in ranks:
        for node, node_rank in rank_by_taxid.items():
            if node_rank == rank:
                lineage_dict[rank] = names[node]
    return lineage_dict
コード例 #20
0
def check_ancestor(name: str, tax_id: int, rank: str = None) -> bool:
    """Tell whether the taxon called ``name`` is in the lineage of ``tax_id``.

    When ``rank`` is given, only lineage members of that rank are
    considered.

    Raises:
        ValueError: if ``name`` cannot be translated to a taxonomy id.
    """
    taxonomy = NCBITaxa()
    candidate_ids = taxonomy.get_name_translator([name]).get(name, [])
    if not candidate_ids:
        raise ValueError("No taxonomy id for {}".format(name))

    def _rank_accepted(node_id):
        # Without a rank filter every lineage member qualifies.
        if rank is None:
            return True
        return taxonomy.get_rank([node_id]).get(node_id, '') == rank

    return any(
        node_id in candidate_ids
        for node_id in taxonomy.get_lineage(tax_id)
        if _rank_accepted(node_id)
    )
コード例 #21
0
def get_tax_lineage(taxonid, source):
    """Return taxonomy lineage information

    The lineage is taken from the module-level ``LINEAGES`` cache when
    available.  Otherwise it is resolved either locally through ete3
    (``source == "taxdump"``) or remotely through Biopython's Entrez
    interface, and stored in the cache.

    Parameters
    -------------
    taxonid : string
        Taxonomic id of the species
    source : string
        Source to be used to collect the info about the taxonid;
        ``"taxdump"`` selects the local ete3 database, any other value the
        NCBI Entrez service

    Returns
    -------------
    lineage: dict
        Species lineage, mapping rank to scientific name

    """
    if taxonid in LINEAGES:
        return LINEAGES[taxonid]

    if source == "taxdump":
        ncbi_taxdump = NCBITaxa()
        lineage_ids = ncbi_taxdump.get_lineage(taxonid)
        ranks = ncbi_taxdump.get_rank(lineage_ids)
        names = ncbi_taxdump.get_taxid_translator(lineage_ids)
        LINEAGES[taxonid] = {ranks[i]:names[i] for i in lineage_ids}
        return LINEAGES[taxonid]

    Entrez.email = "*****@*****.**"
    # NOTE(review): the request is retried without limit or delay until it
    # returns data; a taxon id that never yields data loops forever.
    while True:
        data = ""
        try:
            handle = Entrez.efetch(id = taxonid, db = "taxonomy", retmode = "xml")
            try:
                data = Entrez.read(handle)
            finally:
                # Release the connection even when parsing fails.
                handle.close()
        except Exception:
            with open(LOG, "a") as log:
                print("Error when searching information about {}".format(taxonid),
                    file=log)

        if data:
            break

    lineage = {d["Rank"]:d["ScientificName"] for d in data[0]["LineageEx"]}
    # LineageEx holds the ancestors only; add the taxon itself.
    lineage[data[0]["Rank"]] = data[0]["ScientificName"]
    LINEAGES[taxonid] = lineage

    return LINEAGES[taxonid]
コード例 #22
0
ファイル: alignment.py プロジェクト: andreyfsch/byte_pipette
    def assign_rank_representation(self, rank='species'):
        """Record, for entries without a representative, their taxon at ``rank``.

        For every row returned by
        ``ProteinDatabase.get_entries_no_representative()`` the lineage of
        the row's taxon (``entry[1]``) is resolved.  If resolving it emits a
        warning, the last token of the last warning message is taken as the
        replacement taxon id and stored on the entry.  The lineage member at
        ``rank`` (if any) is then written to the entry as its representative
        taxon, together with the rank and the taxon name.

        NOTE(review): the meaning of ``entry[2]`` and of
        ``self.bigger_than_rank_taxon`` is not visible here -- confirm.
        """
        protDB = db_handling.ProteinDatabase()
        ncbi = NCBITaxa()

        for entry in protDB.get_entries_no_representative():
            taxon_id = entry[1]
            with warnings.catch_warnings(record=True) as caught:
                warnings.simplefilter("always")
                lineage = ncbi.get_lineage(taxon_id)
                last_message = None
                for recorded in caught:
                    last_message = recorded.message
                if last_message:
                    # The replacement taxon id is the final word of the warning.
                    taxon_id = int(str(last_message).split()[-1])
                    protDB.update_protein_entry(
                        {'representative_of_taxon': taxon_id}, entry[0])

            lineage_ranks = ncbi.get_rank(lineage)
            lineage_translation = ncbi.get_taxid_translator(lineage)

            # Last lineage member carrying the requested rank, '' if none.
            matching = [
                node_id for node_id, node_rank in lineage_ranks.items()
                if node_rank == rank
            ]
            ellected_rank_id = matching[-1] if matching else ''

            def store_representative():
                protDB.update_protein_entry(
                    {
                        'representative_of_taxon': ellected_rank_id,
                        'representative_taxon_rank': rank,
                        'taxon_name_representative':
                        lineage_translation[ellected_rank_id]
                    }, entry[0])

            print(entry[0])

            already_stored = False
            own_rank = lineage_ranks[taxon_id]
            if (own_rank != rank
                    and not self.bigger_than_rank_taxon(own_rank, rank)
                    and ellected_rank_id != ''):
                store_representative()
                already_stored = True

            if (entry[2] is None and not already_stored
                    and ellected_rank_id != ''):
                store_representative()
コード例 #23
0
def taxid_to_lineage_string(taxid):
    """Build a ``;``-separated lineage string for an NCBI taxid.

    Each element has the form ``<first letter of rank>_<scientific name>``
    (for example ``g_Homo``). Only the ranks listed in ``tax_order`` are
    emitted, in that order; NCBI's ``superkingdom`` is reported as ``domain``.

    :param taxid: NCBI taxon id whose lineage is rendered
    :return: the lineage string, or an empty string when no lineage member
             carries one of the ranks in ``tax_order``
    """
    tax_order = ['kingdom', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)

    # Resolve every rank with a single query instead of one query per
    # (level, taxid) pair, and apply the superkingdom -> domain rename once.
    ranks = {
        tid: ('domain' if rank == 'superkingdom' else rank)
        for tid, rank in ncbi.get_rank(list(names)).items()
    }

    parts = []
    for level in tax_order:
        for tid in names:
            if ranks[tid] == level:
                parts.append(level[0] + '_' + names[tid])
    return ';'.join(parts)
コード例 #24
0
def taxid_to_lineage(taxid):
    """
    Look up, for each rank in the module-level ``desired_ranks``, the taxid
    in the lineage of ``taxid`` that carries that rank.

    :param taxid: NCBI taxon id whose lineage is inspected
    :return: dict with one key per entry of ``desired_ranks``; the value is
             the lineage taxid holding that rank, or None when the lineage
             has no node of that rank
    """
    taxa_db = NCBITaxa()
    ranks_by_taxid = taxa_db.get_rank(taxa_db.get_lineage(taxid))

    # Invert taxid -> rank into rank -> taxid; when several lineage nodes
    # share a rank, the one iterated last wins.
    taxid_by_rank = {}
    for node_id, rank_name in ranks_by_taxid.items():
        taxid_by_rank[rank_name] = node_id

    return {wanted: taxid_by_rank.get(wanted) for wanted in desired_ranks}
コード例 #25
0
def get_taxonomic_group_mapping(group_ids: List[str],
                                selected_rank: str) -> Tuple[Dict, Dict]:
    """
    Function to create a mapping from NCBI-taxon ids to groups which are used to split the provided
    training records into training and validation sets

    :param group_ids: List of identifiers that should be NCBI taxon ids
    :param selected_rank: selected standard rank determining on which level the set is split in
                          training and validation-set; matched case-insensitively. Any value that
                          is not a standard rank is replaced by ``auto_select_rank(group_ids)``
    :return: Tuple ``(group_name_mapping, group_id_mapping)``: the first maps every input taxon id
             to the scientific name of its ancestor at the selected rank ("unknown" when it has
             none), the second maps it to an integer group index
    """
    ncbi = NCBITaxa()
    standard_ranks = [
        "superkingdom", "phylum", "class", "order", "family", "genus",
        "species"
    ]
    # Normalise once: the rank names compared against below are lower-case,
    # so a value such as "Genus" must be lowered for the comparison too,
    # not only for the validation.
    selected_rank = selected_rank.lower()
    if selected_rank not in standard_ranks:
        selected_rank = auto_select_rank(group_ids)

    taxon_ids_set = set(group_ids)
    taxon_ancestor_mapping = {}

    for taxon in taxon_ids_set:
        lineage = ncbi.get_lineage(int(taxon))
        ids_of_ranks = ncbi.get_rank(lineage)
        # fall-back value if sample does not have an entry on this level
        taxon_ancestor_mapping[taxon] = 0
        for ancestor_id, rank in ids_of_ranks.items():
            if rank == selected_rank:
                taxon_ancestor_mapping[taxon] = ancestor_id

    ancestor_ids = set(taxon_ancestor_mapping.values())
    ancestor_names = ncbi.get_taxid_translator(ancestor_ids)
    # Name for the fall-back group 0 used above.
    ancestor_names[0] = "unknown"
    ancestor_enumeration = {
        ancestor_id: x
        for x, ancestor_id in enumerate(ancestor_ids)
    }

    group_name_mapping = {
        taxon: ancestor_names[taxon_ancestor_mapping[taxon]]
        for taxon in group_ids
    }
    group_id_mapping = {
        taxon: ancestor_enumeration[taxon_ancestor_mapping[taxon]]
        for taxon in group_ids
    }

    return group_name_mapping, group_id_mapping
コード例 #26
0
def get_ncbi_taxonomy(taxid):
    """Return the ranked part of a taxid's lineage as a ``;``-prefixed path.

    Every lineage member whose rank is not "no rank" contributes
    ``";" + <scientific name>``, in lineage order, so a non-empty result
    starts with a semicolon.

    :param taxid: NCBI taxon id to look up
    :return: the concatenated path, or "" when no lineage member is ranked
    """
    taxa_db = NCBITaxa()

    lineage_ids = taxa_db.get_lineage(taxid)
    name_by_id = taxa_db.get_taxid_translator(lineage_ids)
    rank_by_id = taxa_db.get_rank(lineage_ids)

    ranked_names = [
        name_by_id[node_id]
        for node_id in lineage_ids
        if rank_by_id[node_id] != "no rank"
    ]
    return "".join(";" + label for label in ranked_names)
コード例 #27
0
def get_taxonomy(updateBool, spName):
    """Return the lineage of a species' genus without its first two entries.

    :param updateBool: the local taxonomy database is refreshed only when
                       this is exactly ``True``
    :param spName: species label; the text before the first underscore is
                   taken as the genus name
    :return: the genus lineage (list of taxids) from index 2 onwards
    """
    taxa_db = NCBITaxa()

    # Identity check on purpose: truthy values other than True do not
    # trigger the database update.
    if updateBool is True:
        taxa_db.update_taxonomy_database()

    genus_name, _, _ = spName.partition('_')

    # The translator maps the name to a list of taxids; the first one is used.
    genus_taxid = taxa_db.get_name_translator([genus_name])[genus_name][0]

    return taxa_db.get_lineage(genus_taxid)[2:]
コード例 #28
0
def main(InputMSA, output):
    """Write a FASTA file whose headers are annotated with NCBI lineage names.

    For every record returned by ``readAlg(InputMSA)`` the header is parsed
    with ``read_header``, the lineage of its ``taxid`` is translated to names,
    and a record ``>header|<last lineage name>|<lineage names from index 1>``
    followed by the sequence is written.

    :param InputMSA: alignment input passed to ``readAlg``
    :param output: path of the FASTA file to create (overwritten if present)
    """
    ncbi = NCBITaxa()
    # Call ncbi.update_taxonomy_database() here to refresh the local taxonomy.
    headers, seqs = readAlg(InputMSA)
    sys.stdout.write("Annotating headers for %d sequences..." % len(headers))

    # Open the output once for the whole run. Reopening it in write mode for
    # every record truncated the file each time, so only the final record
    # ever reached disk.
    with open(output, 'w') as output_fasta:
        for i, header in enumerate(headers):
            head_terms = read_header(header)
            lin = ncbi.get_lineage(head_terms["taxid"])
            lin_name = ncbi.translate_to_names(lin)
            output_fasta.write(">%s|%s|%s\n%s\n" %
                               (head_terms["header"], lin_name[-1],
                                ", ".join(lin_name[1:]), seqs[i]))
    sys.stdout.write("Done\n")
コード例 #29
0
ファイル: stats2tree_annotated.py プロジェクト: hamid58b/bio
def get_leaves_taxid(nodeset):
    """Return the taxids from ``stats.csv`` that descend from any given node.

    A taxid is selected when at least one element of ``nodeset`` appears in
    its NCBI lineage.

    :param nodeset: iterable of taxids acting as ancestor nodes
    :return: set of taxids (as read from the ``taxid`` column of
             ``stats.csv``) whose lineage contains a node from ``nodeset``
    """
    ncbi = NCBITaxa()
    df = pd.read_csv("stats.csv", names=['taxid', 'CDS', 'CDS_Mean', 'exon', 'exon_Mean', 'gene', 'gene_Mean', 'mRNA', 'mRNA_Mean'])
    taxid_list = df['taxid']
    gff_set = set()

    wanted_nodes = set(nodeset)
    if not wanted_nodes:
        return gff_set

    # Resolve each lineage once and intersect it with the node set, instead
    # of querying the taxonomy database again for every (node, taxid) pair.
    for taxid in taxid_list:
        try:
            lineage = ncbi.get_lineage(taxid)
        except ValueError:
            # Reported once per failing taxid.
            print ("error in getting get_lineage()")
            continue
        if not wanted_nodes.isdisjoint(lineage):
            gff_set.add(taxid)

    return gff_set
コード例 #30
0
ファイル: exec.py プロジェクト: openpaul/pygmes
 def fetchinfomap(self):
     """
     Fill ``self.modelinfomap`` on first use.

     When the map is still empty, ``info.csv`` is fetched from ``url`` via
     ``self.fetch_info`` and every comma-separated line with at least two
     fields adds an entry: first field -> NCBI lineage of the second field.
     A map that already has entries is left untouched.
     """
     ncbi = NCBITaxa()
     if len(self.modelinfomap) != 0:
         return

     info_location = "{}info.csv".format(url)
     info = self.fetch_info(info_location)
     logging.debug("Fetching models from {}".format(url))
     for row in info.split("\n"):
         fields = row.split(",")
         # Lineages are recomputed through ete3 on every load rather than
         # stored, so changes in the NCBI taxonomy are picked up (slow but
         # deliberate).
         if len(fields) > 1:
             self.modelinfomap[fields[0]] = ncbi.get_lineage(fields[1])
コード例 #31
0
ファイル: check_hgt_trees.py プロジェクト: AnnaNenarokova/ngs
def get_tags_leaves(tree, taxid_dict):
    """Tag every leaf of ``tree`` as "dpapi", "bacteria" or "other".

    Rules, applied in order for each leaf name (the sequence id):
    a name containing "DIPPA" is "dpapi"; a name missing from ``taxid_dict``
    is reported on stdout and tagged "other"; otherwise the taxid decides:
    91374 is "dpapi", a lineage containing taxid 2 is "bacteria", anything
    else is "other".

    :param tree: object exposing ``iter_leaves()`` whose leaves have ``name``
    :param taxid_dict: mapping from sequence id to a taxid ``int()`` accepts
    :return: dict mapping sequence id to its tag
    """
    ncbi_taxa = NCBITaxa()
    bacteria_taxid = 2
    dpapi_taxid = 91374
    leaf_tags = {}
    for leaf in tree.iter_leaves():
        seqid = leaf.name
        if "DIPPA" in seqid:
            tag = "dpapi"
        elif seqid not in taxid_dict:
            print (seqid, "is not in taxid dict!")
            tag = "other"
        else:
            taxid = int(taxid_dict[seqid])
            if taxid == dpapi_taxid:
                tag = "dpapi"
            elif bacteria_taxid in ncbi_taxa.get_lineage(taxid):
                tag = "bacteria"
            else:
                tag = "other"
        leaf_tags[seqid] = tag
    return leaf_tags
コード例 #32
0
ファイル: ete_lineage.py プロジェクト: jetjr/Bioinformatics
#!/usr/bin/env python

from ete3 import NCBITaxa
import sys
import os


# Print the NCBI lineage (list of taxids) of every taxid listed, one per
# line, in the file given as the first command-line argument.
args = sys.argv

if len(args) < 2:
    # Single pre-formatted argument so the message reads the same under
    # Python 2 (where print(a, b, c) would show a tuple) and Python 3.
    print("Usage: %s [IDs]" % args[0])
    sys.exit(1)

ncbi = NCBITaxa()

# The context manager closes the input file even if a lookup raises.
with open(args[1]) as id_file:
    for line in id_file:
        taxid = line.strip()
        if not taxid:
            # Skip blank lines instead of passing an empty id to ete3.
            continue
        print(ncbi.get_lineage(taxid))
コード例 #33
0
ファイル: lineage.py プロジェクト: andrewwhwang/fastaq2phylo
filenum  = args.filenum

# Debug overrides for running without command-line arguments:
# filename = 'C:/Users/Andrew.Hwang/Desktop/fastaq2phylo/output/blastout.txt'
# dbType = 'nt'
# filenum  = "0"

# Cache of Entrez lookups (sequence id -> lineage string) so each id is
# fetched from NCBI at most once.
memory = {}
# One "<lineage>::<pos>-<pos2>" string per input line.
writeLines = []
with open(filename, 'r') as f:
    for line in f:
        line_arr = line.split("\t")
        ID=line_arr[1]
        # Columns 2 and 3 expressed as a percentage of column 4.
        # NOTE(review): under Python 2 without `from __future__ import
        # division` the `/` floors before round() is applied - confirm
        # which rounding is intended.
        pos=int(round(100 * int(line_arr[2]) / int(line_arr[4])))
        pos2=int(round(100 * int(line_arr[3]) / int(line_arr[4])))
        if dbType == 'nt':
            # Column 1 is passed to ete3 as a taxid; resolve it locally.
            lineage = ncbi.get_lineage(ID)
            names = ncbi.get_taxid_translator(lineage)
            strList = [str(names[taxid]) for taxid in lineage]
            writeLines.append( '; '.join(strList) + "::" + str(pos)+"-"+str(pos2))
        elif dbType == 'viruses' or dbType == 'blood':
            # `in` replaces dict.has_key(), which no longer exists in Python 3.
            if ID not in memory:
                handle = Entrez.efetch(db='nucleotide', id=ID, retmode="xml")
                try:
                    records = Entrez.read(handle)
                finally:
                    # Release the network handle even if parsing fails.
                    handle.close()
                lineage = records[0]['GBSeq_taxonomy']
                organism = records[0]['GBSeq_organism']
                # Swap the last taxonomy element for the organism name.
                lineage = "; ".join(lineage.split("; ")[:-1] + [organism])
                memory[ID] = lineage
            writeLines.append(memory[ID] + "::" + str(pos)+"-"+str(pos2))
        else:
            # Column 5 minus its final character (presumably the trailing
            # newline - verify against the input format).
            writeLines.append(line_arr[5][:-1] + "::" + str(pos)+"-"+str(pos2))
```

# Script fragment: for every line of the input file, look up the NCBI lineage
# of the taxid in the last tab-separated column and print one output line per
# retained rank.
# NOTE(review): this fragment uses the Python 2 print statement and mixes
# tabs and spaces for indentation; it will not run unchanged on Python 3.

# Exit with a usage message when no input file is given.
if len(sys.argv) == 1:
	sys.exit("USAGE: python %s <path/to/ncbi_gi_taxid_file> > <output.txt>" % sys.argv[0])

ncbi = NCBITaxa()
#ncbi.update_taxonomy_database()

# NOTE(review): `fp` and `missing` are not used in the visible code;
# presumably the ValueError handler below records unknown ids - confirm.
fp = open('taxa-ids-not-found.txt', 'w')
# Ranks that are kept in the output; every other rank is dropped.
hier = ["superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species"]

missing = []
for x in open(sys.argv[1]):
    # The taxid is the last tab-separated field of the line.
    dat = x.rstrip().split('\t')[-1]
    try:
        lineage = ncbi.get_lineage(dat)
	names = ncbi.get_taxid_translator(lineage)
	ranks = ncbi.get_rank(lineage)
	
	# Keep only the lineage members whose rank is listed in `hier`.
	new_ranks = {}
	for keys in ranks:
		if ranks[keys] in hier:
			new_ranks[keys]=ranks[keys]
        
	# rank name -> scientific name
	d = {}
        # `lineage and new_ranks` evaluates to `new_ranks` whenever `lineage`
        # is non-empty, so this iterates the keys of `new_ranks`; it is not
        # an intersection of the two collections.
        for taxid in lineage and new_ranks:
		d[new_ranks[taxid]] = names[taxid]
	
	# One output line per rank, ordered alphabetically by rank name.
	for key in sorted(d):
		print x.rstrip() + "\t"+ str(key)+"\t"+d[key]
    # NOTE(review): the body of this handler is missing from this excerpt.
    except ValueError:
コード例 #35
0
#!/usr/bin/python

Usage = """
Print taxid's lineage and ranks
by default prints to the stdout
Usage:
  taxid_ranks.py taxid > ouput.txt

Arun Seetharam
[email protected]
taxid_ranks.py -version 1.0
04/13/2017
"""
from ete3 import NCBITaxa
import sys
# Print, for every member of the lineage of the taxid given on the command
# line, its rank mapping and its scientific name.
ncbi = NCBITaxa()
if len(sys.argv) < 2:
    print(Usage)
else:
    lineage = ncbi.get_lineage(sys.argv[1])
    names = ncbi.get_taxid_translator(lineage)
    for taxid in lineage:
        # A single pre-formatted argument gives the same
        # "[{taxid: rank}] [name]" line under Python 2 and Python 3.
        print("%s %s" % ([ncbi.get_rank([taxid])], [names[taxid]]))
コード例 #36
0
ファイル: efecht.py プロジェクト: GuiSeSanz/myScripts





# Optional step, switched off by default: resolve every gene's organism
# name to an NCBI taxid and attach the ids of its lineage to the gene.
NCBI = False
if NCBI:
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()
    #ncbi.update_taxonomy_database()
    taxIDlist = []
    for gene in geneList:
        name2taxID = ncbi.get_name_translator([gene.organism])
        # The translator returns a list of taxids per name; use the first.
        gene.taxID = name2taxID[gene.organism][0]
        for i in ncbi.get_lineage(gene.taxID):
            gene.addlineageid(i)
        taxIDlist.append(gene.taxID)

# Optional step, switched off by default: draw the topology of the collected
# taxids. It relies on `ncbi` and `taxIDlist` from the NCBI step above, so
# both flags have to be enabled together.
tree = False
if tree:
    tree = ncbi.get_topology(taxIDlist)
    # Parenthesised single-argument print parses under Python 2 and 3.
    print(tree.get_ascii(attributes=["sci_name", "rank"]))
    



コード例 #37
0
# Annotate DIAMOND hits: every input row "<id>\t<taxid>\t<evalue>..." is
# written back with the taxon's scientific name and a 0/1 bacteria flag.

# First pass: collect every taxid so the names are resolved in one query.
taxids = []

with open(diamond_path) as input_f:
    for line in input_f:
        newtaxid = line.split("\t")[1]
        taxids.append(newtaxid)

taxids_nr = list(set(taxids))
tax_names = ncbi.get_taxid_translator(taxids_nr)

# taxid -> 0/1 flag, so the lineage of a repeated taxid is looked up once.
bacteria_flags = {}

# Second pass: both files are managed by the `with` block and are closed
# even if a lookup raises.
with open(diamond_path, "r") as input_f, open(out_path, 'w') as output_f:
    for line in input_f:
        line_split = line.rstrip().split("\t")
        seq_id = line_split[0]
        taxid = line_split[1]
        evalue = line_split[2]
        if taxid == "0":
            # Taxid 0 means the hit has no taxon assigned.
            name = "None"
            is_bacteria = 0
        else:
            name = tax_names[int(taxid)]
            if taxid not in bacteria_flags:
                # Taxid 2 is checked for in the lineage to flag bacteria.
                bacteria_flags[taxid] = 1 if 2 in ncbi.get_lineage(taxid) else 0
            is_bacteria = bacteria_flags[taxid]

        output_f.write('{}\t{}\t{}\t{}\t{}\n'.format(seq_id, taxid, evalue, name, is_bacteria))
コード例 #38
0
ファイル: emapper.py プロジェクト: jhcepas/eggnog-mapper
def parse_args(parser):
    """Parse the command line and validate or normalise the option set.

    Besides validation, the returned namespace is adjusted: ``cpu == 0``
    becomes the machine's CPU count, ``servermode`` forces ``usemem``,
    ``annotate_hits_table`` forces ``no_search=True`` / ``no_annot=False``,
    and in hmmer mode ``guessdb`` is resolved to a ``db`` through the NCBI
    lineage and ``TAXID2LEVEL``.

    :param parser: argparse-style parser providing ``parse_args()`` and
                   ``error()``
    :return: the parsed (and adjusted) arguments namespace
    :raises emapperException: when a required database file is missing
    :raises SystemExit: after ``--version``, or through ``parser.error`` on
                        an invalid option combination
    """
    args = parser.parse_args()

    # Every print below passes a single argument in parentheses, which
    # behaves identically under Python 2 and Python 3.
    if args.version:
        print(get_version())
        sys.exit(0)

    if not args.no_annot and not pexists(EGGNOGDB_FILE):
        print(colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red'))
        raise emapperException()

    if args.mode == 'diamond' and not pexists(EGGNOG_DMND_DB):
        print(colorify('DIAMOND database data/eggnog_proteins.dmnd not present. Use download_eggnog_database.py to fetch it', 'red'))
        raise emapperException()

    # 0 means "use every available core"
    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # No --servermode available for diamond
    if args.mode == 'diamond' and args.servermode:
        parser.error('--mode [diamond] and --servermode are mutually exclusive')

    # Output file required unless running in servermode
    if not args.servermode and not args.output:
        parser.error('An output project name is required (-o)')

    # Servermode implies using mem-based databases
    if args.servermode:
        args.usemem = True

    # Direct annotation implies no searches
    if args.annotate_hits_table:
        args.no_search = True
        args.no_annot = False

    # Check inputs for running sequence searches
    if not args.no_search and not args.servermode:
        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # HMM (diamond mode needs no database option, so nothing is checked)
        if args.mode == 'hmmer':
            if not args.db and not args.guessdb:
                parser.error('HMMER mode requires specifying a target database (i.e. -d, --guessdb ))')
            if args.db and args.guessdb:
                parser.error('-d and --guessdb options are mutually exclusive')

            if args.guessdb:
                from ete3 import NCBITaxa
                ncbi = NCBITaxa()
                lineage = ncbi.get_lineage(args.guessdb)
                # Walk the reversed lineage and take the first taxid that
                # has an entry in TAXID2LEVEL.
                for tid in reversed(lineage):
                    if tid in TAXID2LEVEL:
                        print("%s %s" % (tid, TAXID2LEVEL[tid]))
                        args.db = TAXID2LEVEL[tid]
                        break

    return args
コード例 #39
0
ファイル: ete_ncbiquery.py プロジェクト: Ward9250/ete
def run(args):
    """Resolve the search terms to NCBI taxids and dump the requested view.

    Terms in ``args.search`` that parse as integers are used as taxids, the
    rest are translated as names (optionally fuzzily when ``args.fuzzy`` is
    set). Exactly one output is produced, chosen in this order: ``args.tree``
    (annotated tree, the default when no mode is selected),
    ``args.descendants`` (table of descendant taxa) or ``args.info`` (table
    with named and numeric lineages).

    :param args: parsed options; reads ``tree``, ``info``, ``descendants``,
                 ``search``, ``fuzzy``, ``collapse_subspecies``,
                 ``rank_limit`` and ``full_lineage``
    :raises SystemExit: when no search term was given
    """
    # add lineage profiles/stats

    from ete3 import NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    # Used as an ordered set of taxids (the values are always None).
    all_taxids = {}
    all_names = set()

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    for hits in name2tax.values():
        # get_name_translator maps every name to a *list* of taxids, which
        # cannot be used as a dict key itself; register each taxid. A scalar
        # value is still accepted for older translator behaviour.
        if isinstance(hits, (list, tuple, set)):
            for hit in hits:
                all_taxids[hit] = None
        else:
            all_taxids[hits] = None

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                # NOTE(review): name2realname and name2score are not defined
                # in this function; unless they exist at module level these
                # two lines raise NameError on the first fuzzy match.
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" %sim

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" %','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            # dict.keys() is not an iterator, so next() needs iter().
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit, return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                              intermediate_nodes=args.full_lineage,
                              rank_limit=args.rank_limit,
                              collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Node names hold the taxid until they are rewritten below, so
            # the taxid feature must be stored first.
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit)
            # str() keeps join() working when an unknown taxid falls back to
            # the integer taxid itself.
            print('\t'.join([str(taxid), str(translator.get(taxid, taxid)), ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))

    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''), named_lineage, lineage_string]))