コード例 #1
0
def search_regulator(args):
    """Validate the search arguments and prepare the working directory.

    Checks ``args.min_abundance``, creates the ``findmitoscaf`` directory
    (under ``args.temp_dir`` when that attribute exists, otherwise under the
    current working directory) and resolves ``args.required_taxa`` against
    the NCBI taxonomy database.

    Side effects on ``args``:
        findmitoscaf_dir: path of the working directory.
        taxa_ids: list of taxids matching ``args.required_taxa`` (only set
            when the name is known to the database).

    Returns:
        bool: True when every check passed, False otherwise. An unknown
        taxonomy name returns False immediately.
    """
    valid = True

    if args.min_abundance <= 0:
        print("Input minimum abundance is not valid.")
        valid = False

    from ete3 import NCBITaxa
    ncbi = NCBITaxa()

    base_dir = args.temp_dir if hasattr(args, 'temp_dir') else os.getcwd()
    args.findmitoscaf_dir = os.path.join(base_dir, 'findmitoscaf')
    try:
        safe_makedirs(args.findmitoscaf_dir)
    except FileExistsError:
        valid = False
        print('Diretory exist before validating, please check and remove it to prevent data loss.')
    except Exception:
        # Deliberate catch-all: any other failure is reported as a
        # validation error instead of aborting the run.
        valid = False
        print('Error occured when validating the directories, please check your permissions or things could be related.')

    # Query the database once and reuse the answer for both the membership
    # test and the taxid extraction.
    name_to_taxids = ncbi.get_name_translator([args.required_taxa])
    if args.required_taxa not in name_to_taxids:
        print("Specified taxanomy name not in NCBI taxanomy database.")
        return False
    args.taxa_ids = name_to_taxids[args.required_taxa]

    return valid
コード例 #2
0
def get_rank_dict(taxa_name=None):
    """Return the main taxonomic ranks of ``taxa_name``.

    The name is looked up in the NCBI taxonomy database. When the full name
    is unknown and it has more than one word, the first word alone is tried
    (it may be a genus name).

    Args:
        taxa_name: scientific name to look up.

    Returns:
        dict mapping 'kingdom', 'phylum', 'class', 'order', 'family',
        'genus' and 'species' to the matching lineage name ('NA' when the
        lineage has no node of that rank), or None when no taxid is found.
    """
    main_ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus',
                  'species')

    # Nothing to look up: report it and bail out before opening the database.
    if not taxa_name:
        print("can not find taxid for", taxa_name, file=sys.stderr)
        return None

    ncbi = NCBITaxa()
    name_dict = ncbi.get_name_translator([taxa_name])
    if not name_dict:
        ## try only the first word (which may be a genus name?)
        print("can not find taxid for", taxa_name, file=sys.stderr)
        words = taxa_name.split()
        if len(words) > 1:
            taxa_name = words[0]
            print("try to search %s instead..." % taxa_name, file=sys.stderr)
            name_dict = ncbi.get_name_translator([taxa_name])

        if not name_dict:
            print("can not find taxid for %s, maybe it's a misspelling.\n" %
                  taxa_name,
                  file=sys.stderr)
            return None

    lineage = ncbi.get_lineage(name_dict[taxa_name][0])

    # One batched query per table instead of two queries per lineage node.
    ranks = ncbi.get_rank(lineage)
    names = ncbi.get_taxid_translator(lineage)

    rank_dict = dict.fromkeys(main_ranks, 'NA')
    for taxid in lineage:
        rank = ranks.get(taxid)
        if rank in rank_dict and taxid in names:
            rank_dict[rank] = names[taxid]

    return rank_dict
コード例 #3
0
def read_old_kjV2(filename, artifact_threshold=0):
    """
    Takes kaiju output file and make them into python dictionary.

    The first line and the last three lines of the file are skipped. Only
    lines with exactly five tab-separated fields are used: field 3 is the
    read count and field 5 is a ';'-separated lineage whose last character
    is dropped before splitting. The most specific lineage name known to the
    NCBI database determines the taxonomy ID.

    Parameters
    ------------
    filename: str,
        location/file name of the kaiju output file

    artifact_threshold: int,
        artifact threshold; an OTU is only added to the dictionary when its
        read count is strictly greater than this value.

    Returns
    ------------
    readsDict: dict,
        dictionary file where:
                key : NCBI taxonomy ID
                value : number of reads
    """
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()

    # Read everything up front so the handle is closed before the lookups.
    with open(filename, 'r') as read_file:
        all_lines = read_file.readlines()

    readsDict = {}
    for line in all_lines[1:-3]:
        tokens = line.rstrip().split('\t')
        if len(tokens) != 5:
            continue

        count = int(tokens[2])
        if count <= artifact_threshold:
            # Below the threshold nothing is stored, so skip the lookups.
            continue

        taxa = [t.strip() for t in tokens[4][:-1].split(';')]

        # Walk from the most specific name towards the root until one of
        # them is known to the database.
        taxa_ids = None
        for otu_name in reversed(taxa):
            taxa_ids = ncbi.get_name_translator([otu_name]).get(otu_name)
            if taxa_ids:
                break

        # No name of this lineage could be resolved: skip the line instead
        # of failing with an IndexError.
        if not taxa_ids:
            continue

        readsDict[taxa_ids[0]] = count

    return readsDict
コード例 #4
0
def get_ncbi_taxonomy_species_tree(names_list):
    """Build the NCBI taxonomy topology spanning the given names.

    Each name is translated to its first matching taxid; a name unknown to
    the database raises KeyError.

    Args:
        names_list: sequence of scientific names.

    Returns:
        The tree returned by ``NCBITaxa.get_topology`` for those taxids.
    """
    ncbi = NCBITaxa()
    name_to_taxids = ncbi.get_name_translator(names_list)
    first_taxids = [name_to_taxids[name][0] for name in names_list]
    return ncbi.get_topology(first_taxids)
コード例 #5
0
def get_species_by_rank(species_list, group, rank):
    """Group the species of ``species_list`` by the nodes of a given rank.

    Args:
        species_list: collection of taxids as strings.
        group: name of the taxon whose descendants are explored.
        rank: taxonomic rank of the nodes that define a group.

    Returns:
        list of ``Group`` objects, one per node of rank ``rank`` under
        ``group`` that has at least one leaf taxid in ``species_list``.
    """
    ncbi = NCBITaxa()

    # Resolve the group name to its first taxid.
    root_taxid = ncbi.get_name_translator([group])[group][0]

    # Annotated tree of everything below the group.
    taxonomy = ncbi.get_descendant_taxa(root_taxid,
                                        collapse_subspecies=True,
                                        return_tree=True)

    groups = []
    for node in taxonomy.traverse("levelorder"):
        if node.rank != rank:
            continue

        # Leaf taxids of this node, restricted to the requested species.
        leaf_ids = (str(leaf.taxid) for leaf in node.get_leaves())
        kept = [leaf_id for leaf_id in leaf_ids if leaf_id in species_list]

        if kept:
            groups.append(Group(taxid=node.taxid, species=kept))

    return groups
コード例 #6
0
def get_tax(cma_file):
    """Find the lowest common ancestor of the species named in a CMA file.

    Organism names are taken from the text between square brackets on lines
    starting with ">". Names unknown to the NCBI database are ignored, as are
    taxids 32630 and 10239 (viruses and synthetic constructs).

    Args:
        cma_file: path of the CMA file to read.

    Returns:
        tuple ``(lca_name, tax_list, tree_labeled)``: the scientific name of
        the root of the topology spanning the collected taxids, the
        taxid -> name dict of those taxids, and an ASCII rendering of the
        topology annotated with ``sci_name`` and ``taxid``.
    """
    ncbi = NCBITaxa()
    org_regex = re.compile(r'\[(.*)\]')
    excluded_taxids = {32630, 10239}  # omit viruses and synthetic constructs

    taxid_list = []
    with open(cma_file, 'r') as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            found = org_regex.search(line)
            if found is None:
                continue
            org_name = found.group(1)
            # Read the taxid straight from the translator dict instead of
            # regex-parsing its string form; when a name has several taxids
            # the first one is used.
            taxids = ncbi.get_name_translator([org_name]).get(org_name)
            if taxids and taxids[0] not in excluded_taxids:
                taxid_list.append(taxids[0])

    tax_list = ncbi.get_taxid_translator(taxid_list)
    tree = ncbi.get_topology(taxid_list)
    tree_labeled = tree.get_ascii(attributes=['sci_name', 'taxid'])

    # The root node of the topology is named after the LCA taxid.
    lca_id = int(tree.get_tree_root().name)
    lca_name = ncbi.get_taxid_translator([lca_id]).get(lca_id, '')
    return (lca_name, tax_list, tree_labeled)
コード例 #7
0
def get_species_by_rank(group, rank):
    """Map every node of rank ``rank`` under ``group`` to its leaf taxids.

    Also prints the number of nodes found.

    Args:
        group: name of the taxon whose descendants are explored.
        rank: taxonomic rank of the nodes to collect.

    Returns:
        dict mapping the taxid of each node of the requested rank to the
        list of taxids of its leaves.
    """
    ncbi = NCBITaxa()

    group_id = ncbi.get_name_translator([group])[group][0]

    # get an annotated tree
    tree = ncbi.get_descendant_taxa(group_id,
                                    collapse_subspecies=True,
                                    return_tree=True)

    dic_ids = {}
    for node in tree.traverse("levelorder"):
        # keep only nodes of the requested rank
        if node.rank == rank:
            dic_ids[node.taxid] = [leaf.taxid for leaf in node.get_leaves()]

    # Single-argument call form: prints the same text on Python 2 and 3.
    print("# of {rank}: {num}".format(rank=rank, num=len(dic_ids)))

    return dic_ids
コード例 #8
0
def name_ancestors(timetreefile, to_table=False, ete3_algo=False, uniq=True):
    """Annotate the internal nodes of a tree with NCBI taxon names and print it.

    Leaves whose name (underscores replaced by spaces) is unknown to the NCBI
    database are removed from the tree with a warning.

    Args:
        timetreefile: tree file loaded with ``PhyloTree(..., format=1)``.
        to_table: when true, print one ``oldname<TAB>sci_name`` line per
            internal node; otherwise print the tree itself.
        ete3_algo: when true, annotate with ``NCBITaxa.annotate_tree``;
            otherwise with ``myannotate``.
        uniq: forwarded to ``matchrename_ncbitax``.
    """
    logger.info('Loading data')
    # NOTE: quoted_node_names is only available from ete3 v3.1.1
    timetree = PhyloTree(timetreefile, format=1, quoted_node_names=True)
    ncbi = NCBITaxa()

    species_names = [
        leaf_name.replace('_', ' ') for leaf_name in timetree.get_leaf_names()
    ]
    name2taxid = ncbi.get_name_translator(species_names)

    for leaf in timetree.get_leaves():
        species = leaf.name.replace('_', ' ')
        try:
            leaf.add_feature('taxid', name2taxid[species][0])
        except KeyError:
            logger.warning('Species %r not found', leaf.name)
            leaf.delete(prevent_nondicotomic=True, preserve_branch_length=True)

    logger.info('Placing common ancestors')
    if ete3_algo:
        ncbi.annotate_tree(timetree, 'taxid')
    else:
        myannotate(timetree, ncbi)
    matchrename_ncbitax(timetree, uniq)

    if to_table:
        for node in timetree.traverse():
            if not node.is_leaf():
                print(node.oldname + '\t' + getattr(node, 'sci_name', ''))
    else:
        print(timetree.write(format=1, format_root_node=True))
コード例 #9
0
def main(tree_path):
    """Rename the leaves of a tree to NCBI taxids and write the results.

    For every leaf the name (underscores replaced by spaces) is translated to
    a taxid; names unknown to the database fall back to the ``byhand`` table.
    A repeated id gets a "_B" suffix. Outputs:

    * ``names_to_ids.csv``: name, id and Magnoliophyta membership per leaf;
    * ``tree_ids.tree``: the renamed tree pruned to the species listed in a
      hard-coded file, minus three excluded ids.

    Args:
        tree_path: path of the tree file, loaded with ``Tree(..., format=1)``.
    """
    ncbi = NCBITaxa()

    tree = Tree(tree_path, format=1)

    names = []
    ids = []
    in_magnoliophyta = []
    for leaf in tree:
        name = leaf.name.replace("_", ' ')
        name2taxid = ncbi.get_name_translator([name])

        if not name2taxid:
            if name in byhand:
                taxid = byhand[name]
                magno = "yes"
            else:
                taxid = "not found"
                magno = ""

        elif len(name2taxid) > 1:
            # NOTE(review): name2taxid is built from a single name, so this
            # branch looks unreachable; it presumably meant
            # len(name2taxid[name]) > 1. Left as is because fixing it changes
            # the leaf names - confirm the intent before changing.
            taxid = str(name2taxid[name])
            magno = ""
            print("two ids:  %s" % name)
        else:
            taxid = str(name2taxid[name][0])
            lineage = ncbi.get_lineage(taxid)
            # 3398 - magnoliophyta id
            magno = "yes" if 3398 in lineage else "no"

        if taxid != "not found" and taxid in ids:
            print("duplicate:  %s %s" % (name, taxid))
            taxid += "_B"

        leaf.name = taxid

        names.append(name)
        ids.append(taxid)
        in_magnoliophyta.append(magno)

    df = pd.DataFrame({
        'name': names,
        'id': ids,
        'in magnoliophyta': in_magnoliophyta
    })
    df.to_csv('names_to_ids.csv')

    p = "/groups/itay_mayrose/nomihadar/trees/magnoliophyta_tree/sequences_filtered_zanne/species/intersect_mytree_zannetree_mangoete3.ls"
    with open(p, 'r') as f:
        lines = f.read().splitlines()

    species = [x for x in lines if x not in ['58454', '142615', '77013']]

    tree.prune(list(set(species)), preserve_branch_length=True)

    tree.write(outfile="tree_ids.tree")
コード例 #10
0
def get_taxonomy(species_name,
                 name_format="Genus species",
                 ranks=None,
                 update_db=False):
    """Look up the NCBI lineage of a species.

    Args:
        species_name: species name; converted with ``str()``.
        name_format: "Genus species" (used as is) or "Genus_species"
            (underscores are replaced by spaces).
        ranks: optional iterable of rank names to extract from the lineage.
        update_db: when equal to True, update the taxonomy database first.

    Returns:
        * name unknown: ``['unknown']``, or one 'unknown' per requested rank;
        * ``ranks`` is None: dict mapping every lineage taxid to its name;
        * otherwise: list of lineage names, in the order of ``ranks``. A rank
          absent from the lineage contributes no entry, so the list can be
          shorter than ``ranks``.
    """
    species_name = str(species_name)
    ncbi = NCBITaxa()
    # Kept as an equality test so only True (or 1) triggers the update.
    if update_db == True:  # noqa: E712
        ncbi.update_taxonomy_database()
    if name_format == "Genus_species":
        species_name = species_name.replace("_", " ")

    species_id = ncbi.get_name_translator([species_name])
    if not species_id:
        return ['unknown'] if ranks is None else ['unknown'] * len(ranks)

    lineage_ids = ncbi.get_lineage(species_id[species_name][0])
    names = ncbi.get_taxid_translator(lineage_ids)
    if ranks is None:
        return names

    lineage_rk = ncbi.get_rank(lineage_ids)
    parsed_names = []
    for rk in ranks:
        for rk_id, rk_rk in lineage_rk.items():
            if rk_rk == rk:
                # Reuse the names fetched above instead of querying again.
                parsed_names.append(names[rk_id])
    return parsed_names
コード例 #11
0
def get_metadata(records: List[SeqRecord]):
    """Build a taxonomy table for the given records and save it as CSV.

    For each record the organism annotation is translated to its first taxid
    and the names of its lineage at the ranks superkingdom, order, family,
    subfamily, genus and species are collected, together with the record id
    under the key "aid". The table is written to ``metadata.csv``.

    Args:
        records: records exposing ``annotations["organism"]`` and ``id``.

    Returns:
        pandas.DataFrame with one row per record.
    """
    ncbi = NCBITaxa()

    organisms = [record.annotations["organism"] for record in records]
    name_translator = ncbi.get_name_translator(organisms)

    sought_ranks = [
        "superkingdom", "order", "family", "subfamily", "genus", "species"
    ]

    metadata = []
    for record in records:
        taxid = name_translator[record.annotations["organism"]][0]
        lineage = ncbi.get_lineage(taxid)
        ranks = ncbi.get_rank(lineage)
        names = ncbi.get_taxid_translator(lineage)

        row = {}
        for node_id in lineage:
            node_rank = ranks[node_id]
            if node_rank in sought_ranks:
                row[node_rank] = names[node_id]
        row["aid"] = record.id
        metadata.append(row)

    df = pd.DataFrame(metadata)
    df.to_csv("metadata.csv")

    return df
コード例 #12
0
def contig_tax(annot_df, ncbi_db, min_prot, prop_annot, tax_thres):
    """Yield the taxonomic lineage of each viral contig in ``annot_df``.

    ``annot_df`` is the annotation table generated by viral_contig_maps.py;
    the assignment is based on the corresponding ViPhOG annotations (column
    "Label").

    Args:
        annot_df: table with the columns "Contig", "Best_hit" and "Label".
        ncbi_db: path of the taxonomy database passed to ``NCBITaxa``.
        min_prot: minimum number of hits carrying a taxon at a rank for that
            rank to be evaluated.
        prop_annot: minimum proportion of proteins with a hit
            ("Best_hit" != "No hit") for the contig to be evaluated.
        tax_thres: minimum proportion of hits that the most frequent taxon
            must reach to be assigned.

    Yields:
        list starting with the contig id, followed by one entry per rank in
        the order genus, subfamily, family, order. Ranks are tried in that
        order: "" when there are too few hits, the observed proportion when
        the best taxon is below ``tax_thres``; once a taxon is assigned, the
        names of its lineage at that rank and the remaining ranks are
        appended ("" where the lineage has no such rank).
    """
    ncbi = NCBITaxa(dbfile=ncbi_db)
    tax_rank_order = ["genus", "subfamily", "family", "order"]

    for contig in set(annot_df["Contig"]):
        contig_lineage = [contig]
        contig_df = annot_df[annot_df["Contig"] == contig]
        total_prot = len(contig_df)
        annot_prot = sum(contig_df["Best_hit"] != "No hit")

        # Too few annotated proteins: leave every rank empty.
        if annot_prot < prop_annot * total_prot:
            contig_lineage.extend([""] * 4)
            yield contig_lineage
            continue

        labels = contig_df[pd.notnull(contig_df["Label"])]["Label"].values
        taxids = [
            ncbi.get_name_translator([label])[label][0] for label in labels
        ]

        # For every hit: rank -> taxid, restricted to the ranks of interest.
        hit_lineages = []
        for taxid in taxids:
            rank_by_taxid = ncbi.get_rank(ncbi.get_lineage(taxid))
            hit_lineages.append({
                rk: tid
                for tid, rk in rank_by_taxid.items() if rk in tax_rank_order
            })

        for position, rank in enumerate(tax_rank_order):
            taxon_list = [lineage.get(rank) for lineage in hit_lineages]
            total_hits = sum(pd.notnull(taxon_list))
            if total_hits < min_prot:
                contig_lineage.append("")
                continue

            count_hits = Counter(
                taxon for taxon in taxon_list if pd.notnull(taxon))
            best_taxid, best_count = sorted(count_hits.items(),
                                            key=lambda pair: pair[1],
                                            reverse=True)[0]
            prop_hits = best_count / total_hits
            if prop_hits < tax_thres:
                contig_lineage.append(prop_hits)
                continue

            # A taxon is assigned: report it and its higher ranks, then stop.
            best_rank_to_taxid = {
                rk: tid
                for tid, rk in ncbi.get_rank(
                    ncbi.get_lineage(best_taxid)).items()
            }
            for higher_rank in tax_rank_order[position:]:
                key = best_rank_to_taxid.get(higher_rank)
                if pd.notnull(key):
                    contig_lineage.append(
                        ncbi.get_taxid_translator([key])[key])
                else:
                    contig_lineage.append("")
            break

        yield contig_lineage
コード例 #13
0
def main():
    """Make queries against NCBI Taxa databases.

    Reads the command-line arguments from ``get_args()``, optionally updates
    the taxonomy database, resolves ``args.name`` to a taxid when given, and
    reports the descendant taxa of ``args.taxid`` on stdout (when verbose)
    and in ``args.outfile`` (when set).
    """
    args = get_args()

    # Handle on the ete NCBI taxonomy database.
    ncbi = NCBITaxa()

    if args.verbose > 1:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    # Refresh the database only when explicitly requested.
    if args.update is True:
        if args.verbose > 1:
            print(
                "Updating the taxonomy database. This may take several minutes..."
            )
        ncbi.update_taxonomy_database()

    # A name takes precedence over a TaxID: convert it and store the result.
    if args.name:
        args.taxid = ncbi.get_name_translator([args.name])[args.name][0]

    if args.verbose > 0:
        # Use the given name, or translate the TaxID back to a name.
        if args.name:
            display_name = args.name
        else:
            display_name = ncbi.get_taxid_translator([args.taxid])[args.taxid]

        tax_dict = {
            'Name': display_name,
            'TaxID': args.taxid,
            'Rank': ncbi.get_rank([args.taxid]),
            'Lineage': ncbi.get_taxid_translator(ncbi.get_lineage(args.taxid)),
        }

        print("Information about your selected taxa:")
        pretty(tax_dict)

    # Main feature of the script: all taxa within the given group.
    descendent_taxa = ncbi.get_descendant_taxa(args.taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)
    print("Descendent taxa for TaxID: %s" % (args.taxid))

    # zip is lazy on Python 3; on Python 2 it builds the whole list, which
    # can be large, so Python 3 is the suggested interpreter.
    if args.verbose > 0:
        for taxon_name, taxon_id in zip(descendent_taxa_names,
                                        descendent_taxa):
            print("%s\t%s" % (taxon_name, taxon_id))

    if args.outfile:
        with open(args.outfile, 'w') as ofh:
            ofh.writelines(str(taxon_id) + '\n' for taxon_id in descendent_taxa)
コード例 #14
0
def from_name2ids(phylum_name, dataset='genbank', return_d2ids=False):
    """Retrieve assembly ids and metadata rows for the given taxa.

    :param phylum_name: one or more taxon names separated by ';'. Names
        unknown to the NCBI database are reported on stdout and skipped.
    :param dataset: prefix of the assembly summary files read from
        ``metadata_files_dir`` (``{dataset}_{domain}_assembly_summary.txt``).
    :param return_d2ids: when true, return only the dict mapping each domain
        name to the descendant taxids found under the given taxa.
    :return: ``domain2dids`` when ``return_d2ids`` is true; otherwise a tuple
        ``(domain2aids, collect_info)`` holding, per lower-cased domain, the
        first column of every summary row starting with "GC" whose sixth
        column is one of the descendant taxids, and the list of those rows.
    """
    phylum_names = [name for name in phylum_name.split(';') if name]
    ncbi = NCBITaxa()

    p2tid = ncbi.get_name_translator(phylum_names)

    tids = []
    for name in phylum_names:
        taxids = p2tid.get(name)
        if not taxids:
            print(f" '{name}'' not found. please check the name")
            continue
        tids.append(taxids[0])

    domain2dids = defaultdict(list)
    descend_ids = []
    for tid in tids:
        lineages = ncbi.get_lineage(tid)
        rank2taxid = {
            rank: taxid
            for taxid, rank in ncbi.get_rank(lineages).items()
        }
        names = ncbi.get_taxid_translator(lineages)
        # Raises KeyError when the lineage has no 'superkingdom' rank.
        domain = names[rank2taxid['superkingdom']]

        _descend_ids = ncbi.get_descendant_taxa(tid, intermediate_nodes=True)
        descend_ids += _descend_ids
        domain2dids[domain].extend(_descend_ids)
    print(f"in total, {len(descend_ids)} taxids were found. ")
    if return_d2ids:
        return domain2dids

    domain2aids = defaultdict(list)
    collect_info = []
    descend_ids = set(descend_ids)
    for domain in domain2dids:
        d = domain.lower()
        metadata = join(metadata_files_dir,
                        f"{dataset}_{d}_assembly_summary.txt")
        tqdm.write(
            f'read {metadata} which last modified at : {time.ctime(os.path.getmtime(metadata))}'
        )
        # Context manager so the summary file is closed after the scan.
        with open(metadata) as handle:
            for row in tqdm(handle):
                if row.startswith("GC"):
                    rows = row.split('\t')
                    if int(rows[5]) in descend_ids:
                        collect_info.append(row)
                        domain2aids[d].append(rows[0])
    return domain2aids, collect_info
コード例 #15
0
def check_ancestor(name: str, tax_id: int, rank: str = None) -> bool:
    """Tell whether the taxon called ``name`` is in the lineage of ``tax_id``.

    Args:
        name: scientific name of the candidate ancestor.
        tax_id: taxid whose lineage is inspected.
        rank: when given, the matching lineage node must also have this rank.

    Returns:
        True when a lineage node is one of the taxids of ``name`` (and has
        the requested rank, if any), False otherwise.

    Raises:
        ValueError: when ``name`` has no taxonomy id.
    """
    ncbi = NCBITaxa()
    ancestor_ids = ncbi.get_name_translator([name]).get(name, [])
    if not ancestor_ids:
        raise ValueError("No taxonomy id for {}".format(name))

    for node_id in ncbi.get_lineage(tax_id):
        if node_id not in ancestor_ids:
            continue
        # The rank is only looked up for nodes that already match by id.
        if rank is None or ncbi.get_rank([node_id]).get(node_id, '') == rank:
            return True
    return False
コード例 #16
0
def phylo_tree_2_ete_tree(tree, names_list):
    """Convert a tree to an ete ``Tree`` whose names are NCBI taxids.

    The tree is serialised to newick with ``Phylo.write``; every name of
    ``names_list`` is then replaced textually by its first taxid and single
    quotes are removed before parsing.

    Args:
        tree: tree accepted by ``Phylo.write``.
        names_list: names occurring in the tree; each must be known to the
            NCBI database (KeyError otherwise).

    Returns:
        ``Tree`` parsed from the rewritten newick string (format 1).
    """
    ncbi = NCBITaxa()
    buffer = StringIO()
    Phylo.write(tree, buffer, 'newick')
    newick = buffer.getvalue()

    name_to_taxids = ncbi.get_name_translator(names_list)
    for name in names_list:
        newick = newick.replace(name, str(name_to_taxids[name][0]))

    return Tree(newick.replace('\'', ''), format=1)
コード例 #17
0
def get_group_tree(group, rank):
    """Return the annotated tree of the taxa descending from ``group``.

    Args:
        group: name of the taxon; its first matching taxid is used.
        rank: not used by this function.

    Returns:
        The tree returned by ``NCBITaxa.get_descendant_taxa`` with
        ``collapse_subspecies=True`` and ``return_tree=True``.
    """
    ncbi = NCBITaxa()
    group_id = ncbi.get_name_translator([group])[group][0]
    return ncbi.get_descendant_taxa(group_id,
                                    collapse_subspecies=True,
                                    return_tree=True)
コード例 #18
0
def get_ncbi_taxa_rank(taxa_name):
    """Return the rank and taxid of ``taxa_name``.

    Args:
        taxa_name: scientific name to look up.

    Returns:
        tuple ``(rank, ncbi_taxid)``; both are "N/A" when the name is unknown
        to the database. When several taxids match, the last one is used.
    """
    ncbi = NCBITaxa()
    name2taxid = ncbi.get_name_translator([taxa_name])

    if not name2taxid:
        return ("N/A", "N/A")

    ncbi_taxid = name2taxid[taxa_name][-1]
    rank = ncbi.get_rank([ncbi_taxid])[ncbi_taxid]
    return (rank, ncbi_taxid)
コード例 #19
0
ファイル: utils.py プロジェクト: yemilawal/mob-suite
def NamesToTaxIDs(names):
    """Translate taxon names to taxids using the ete3 database file.

    The database at ``ETE3DBTAXAFILE`` is initialised first when it does not
    exist yet.

    Args:
        names: names passed to ``NCBITaxa.get_name_translator``.

    Returns:
        The translator result, or an empty dict when the database file is
        still missing after the initialisation attempt.
    """
    db_missing = not isETE3DBTAXAFILEexists(ETE3DBTAXAFILE)
    if db_missing:
        logging.info("Did not find taxa.sqlite in {}. Initializaing ete3 taxonomy database".format(ETE3DBTAXAFILE))
        initETE3Database(ETE3DBTAXAFILE)

    ncbi = NCBITaxa(dbfile=ETE3DBTAXAFILE)

    # Checked again after the handle is created, as the file may still be
    # absent.
    if isETE3DBTAXAFILEexists(ETE3DBTAXAFILE):
        return ncbi.get_name_translator(names)

    logging.error(
        "Tried ete3 init, but still was not able to find taxa.sqlite file for ete3 lib in {}. Aborting".format(
            ETE3DBTAXAFILE))
    return {}
コード例 #20
0
def get_taxonomy(updateBool, spName):
    """Return the lineage of the genus of ``spName``.

    Args:
        updateBool: when it is exactly True, the taxonomy database is
            updated first.
        spName: species name; the text before the first underscore is taken
            as the genus name.

    Returns:
        list of lineage taxids of the genus, without its first two entries.
    """
    ncbi = NCBITaxa()

    # optional database refresh
    if updateBool is True:
        ncbi.update_taxonomy_database()

    # keep only the genus part of the name
    genus, _, _ = spName.partition('_')

    genus_taxids = ncbi.get_name_translator([genus])[genus]
    return ncbi.get_lineage(genus_taxids[0])[2:]
コード例 #21
0
ファイル: Kraken-SBT.py プロジェクト: tbenavi1/Kraken-SBT
def get_taxonid_to_readfilenames(name_ftpdirpaths_filename):
    """Map taxon ids to the read files of their genomes.

    Each line of the input file holds a name followed by an ftp directory
    path, separated by the last space of the line. The read file name is
    derived from the last component of that path.

    Args:
        name_ftpdirpaths_filename: path of the name / ftp-directory file.

    Returns:
        defaultdict(list) mapping a taxon id to its read file names; one
        taxon id may map to several read files. When a name has several
        taxon ids the first one is used. Names unknown to the NCBI database
        are skipped.
    """
    ncbi = NCBITaxa()
    taxonid_to_readfilenames = defaultdict(list)

    def lookup(name):
        # List of taxon ids for the name, or None when it is unknown.
        return ncbi.get_name_translator([name]).get(name)

    with open(name_ftpdirpaths_filename) as handle:
        for line in handle:
            # Everything before the last space is the name, the rest is the
            # ftp directory path.
            name, _, ftpdirpath = line.strip().rpartition(' ')
            readfilename = './Bacteria_Genomes/' + ftpdirpath.split('/')[-1] + '_genomic.fna.gz'

            taxonids = lookup(name)
            if taxonids:
                taxonid_to_readfilenames[taxonids[0]].append(readfilename)
            elif name == 'Donghicola sp. JLT3646':
                # This name in assembly_summary.txt has been updated in NCBI.
                print('Changing', name, 'to Marivivens sp. JLT3646')
                name = 'Marivivens sp. JLT3646'
                taxonids = lookup(name)
                if taxonids:
                    taxonid_to_readfilenames[taxonids[0]].append(readfilename)
                else:
                    print(name, 'is not in the NCBI database')
            elif name == 'Mycobacterium intracellulare MOTT-64':
                # This name in assembly_summary.txt is absent from NCBI.
                print(name, 'is not in the NCBI database')
    return taxonid_to_readfilenames
コード例 #22
0
ファイル: ete_gui.py プロジェクト: dengzq1234/ete_gui
def run_ete_ncbiquery_py(query):
    """Render the NCBI topology of a comma-separated query as ASCII art.

    Args:
        query: comma-separated taxids and/or taxon names. Items that parse as
            integers are used as taxids; the others are stripped of leading
            whitespace and translated to all their matching taxids (KeyError
            for an unknown name).

    Returns:
        str: ASCII rendering of the topology annotated with ``sci_name`` and
        ``rank``.
    """
    ncbi = NCBITaxa()
    final_query = []
    for item in query.split(','):
        try:
            final_query.append(int(item))
        except ValueError:
            taxon_name = item.lstrip()
            final_query += ncbi.get_name_translator([taxon_name])[taxon_name]

    topology = ncbi.get_topology(final_query)
    return topology.get_ascii(attributes=["sci_name", "rank"])
コード例 #23
0
def filter_species(species_ids, group):
    """Keep the species whose lineage contains ``group``.

    Each species id and its lineage are printed while filtering.

    Args:
        species_ids: iterable of taxids.
        group: name of the taxon that must appear in the lineage; its first
            matching taxid is used.

    Returns:
        list of the ids from ``species_ids`` that descend from ``group``, in
        their original order.
    """
    ncbi = NCBITaxa()

    group_id = ncbi.get_name_translator([group])[group][0]

    filtered_ids = []
    for species_id in species_ids:
        lineage = ncbi.get_lineage(species_id)

        # Single-argument call form: same output on Python 2 and Python 3.
        print("%s %s" % (species_id, lineage))

        if group_id in lineage:
            filtered_ids.append(species_id)

    return filtered_ids
コード例 #24
0
def get_species_by_rank(input_tree, group, rank):
    """Build one ``Group`` per node of rank ``rank`` under ``group``.

    Args:
        input_tree: tree whose leaf names are taxids as strings.
        group: name of the taxon whose descendants are explored.
        rank: taxonomic rank of the nodes that define a group.

    Returns:
        dict mapping the taxid of each node of the requested rank to a
        ``Group`` holding all its descendant taxids (as strings) and the
        subset of them present among the leaves of ``input_tree``.
    """
    # names of the leaves of the input tree
    tree_taxa = [leaf.name for leaf in input_tree]

    ncbi = NCBITaxa()

    # resolve the group name to its first taxid
    group_id = ncbi.get_name_translator([group])[group][0]

    # annotated tree of everything below the group
    tree = ncbi.get_descendant_taxa(group_id,
                                    collapse_subspecies=True,
                                    return_tree=True)

    groups = {}
    for node in tree.traverse("levelorder"):
        if node.rank != rank:
            continue

        # Union of the descendants obtained with and without collapsing
        # subspecies.
        uncollapsed = list(
            ncbi.get_descendant_taxa(node.taxid,
                                     collapse_subspecies=False,
                                     return_tree=False))
        collapsed = list(
            ncbi.get_descendant_taxa(node.taxid,
                                     collapse_subspecies=True,
                                     return_tree=False))
        descendants = list(set(uncollapsed).union(collapsed))

        species = [str(taxid) for taxid in descendants]

        # keep only the species present in the input tree
        species_intree = [s for s in species if s in tree_taxa]

        groups[node.taxid] = Group(taxid=node.taxid,
                                   species=species,
                                   species_intree=species_intree)

    return groups
コード例 #25
0
ファイル: models.py プロジェクト: GuiSeSanz/myScripts
def AddmyID(modelIDList, ID, filepath):
    """Add a taxon to the model list and dump the resulting tree to a file.

    Args:
        modelIDList: list of taxids; the new taxid is appended in place.
        ID: taxid made of digits, or a taxon name to translate (its first
            matching taxid is used; KeyError for an unknown name).
        filepath: not used by this function.

    Returns:
        ``modelIDList``, after the new taxid has been appended.

    Side effects:
        Writes the ASCII rendering of ``model2Tree(modelIDList)`` to
        ``outTree.tmp`` in the current directory.
    """
    ncbi = NCBITaxa()
    if ID.isdigit():
        modelIDList.append(int(ID))
    else:
        # get_name_translator expects a list of names; a bare string would be
        # iterated character by character and the lookup below would fail.
        name2taxID = ncbi.get_name_translator([ID])
        modelIDList.append(int(name2taxID[ID][0]))

    tree = model2Tree(modelIDList)

    outfile = "outTree.tmp"
    with open(outfile, "w") as out:
        out.write(tree.get_ascii(attributes=["sci_name", "rank"]))

    return modelIDList
コード例 #26
0
    def __init__(self, id='', name='', ncbi_id=None, cross_references=None):
        """
        Resolve the taxon against the NCBI taxonomy database.

        With ``ncbi_id`` the name is taken from the database. Otherwise the
        words of ``name`` are dropped one at a time from the right until the
        remaining name is known to the database; the dropped words are kept
        as an additional name.

        Args:
            id (:obj:`str`, optional): identifier
            name (:obj:`str`, optional): name
            ncbi_id (:obj:`int`, optional): NCBI identifier
            cross_references (:obj:`list` of :obj:`CrossReference`, optional): list of cross references

        Raises:
            :obj:`ValueError`: if the database returns ``ncbi_id`` itself as
                the name of the taxon
        """

        self.id = id
        self.name = name
        self.id_of_nearest_ncbi_taxon = None
        self.distance_from_nearest_ncbi_taxon = None
        self.additional_name_beyond_nearest_ncbi_taxon = None
        self.cross_references = cross_references or []

        ncbi_taxa = NCBITaxa()

        if ncbi_id:
            self.id_of_nearest_ncbi_taxon = ncbi_id
            self.distance_from_nearest_ncbi_taxon = 0
            self.additional_name_beyond_nearest_ncbi_taxon = ''
            self.name = ncbi_taxa.translate_to_names([ncbi_id])[0]
            if self.name == ncbi_id:
                raise ValueError(
                    'The NCBI taxonomy database does not contain a taxon with id {}'
                    .format(ncbi_id))
            return

        rank_names = name.split(' ')
        n_words = len(rank_names)
        for i_rank in range(n_words):
            # Words kept for the lookup / words dropped from the right.
            split_at = n_words - i_rank
            partial_name = ' '.join(rank_names[:split_at])
            result = ncbi_taxa.get_name_translator([partial_name])
            if not result:
                continue

            self.id_of_nearest_ncbi_taxon = result[partial_name][0]
            self.distance_from_nearest_ncbi_taxon = i_rank
            self.additional_name_beyond_nearest_ncbi_taxon = ''.join(
                ' ' + word for word in rank_names[split_at:])
            self.name = ncbi_taxa.translate_to_names([self.id_of_nearest_ncbi_taxon])[0] \
                + self.additional_name_beyond_nearest_ncbi_taxon
            return

        # No prefix of the name is known: keep the name as given.
        self.name = name
コード例 #27
0
def get_taxonomic_lineage(base_species):
    """ Get the lineage of a species

        Args:
            base_species (:obj:`str`): a species (e.g. escherichia coli)

        Returns:
            :obj:`list` of :obj:`str`: names of the taxa of the lineage, in
            the reverse of the order returned by ``NCBITaxa.get_lineage``
    """

    ncbi = NCBITaxa()
    taxid = ncbi.get_name_translator([base_species])[base_species][0]
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    return [names[node_id] for node_id in reversed(lineage)]
コード例 #28
0
def findclade(namelist, ranks='family|genus'):
    """Yield, for each name, the names of its clades at the given ranks.

    Args:
        namelist: sequence of taxon names known to the NCBI database.
        ranks: rank names separated by '|'.

    Yields:
        tuple ``(name, clade_names)`` where ``clade_names`` has one entry per
        requested rank, '' when the lineage of the name has no such rank.
    """
    ncbi = NCBITaxa()
    name2taxid = ncbi.get_name_translator(namelist)
    lineages = ncbi.get_lineage_translator(
        [taxids[0] for taxids in name2taxid.values()])
    wanted_ranks = ranks.split('|')

    cladetaxids = []
    for name in namelist:
        lineage = lineages[name2taxid[name][0]]
        rank2clade = {}
        for taxid, rk in ncbi.get_rank(lineage).items():
            rank2clade[rk] = taxid
        # 0 stands for "no clade of that rank in the lineage".
        cladetaxids.append([rank2clade.get(rank, 0) for rank in wanted_ranks])

    taxid2clade = ncbi.get_taxid_translator(chain.from_iterable(cladetaxids))

    for name, taxidlist in zip(namelist, cladetaxids):
        yield name, [taxid2clade.get(taxid, '') for taxid in taxidlist]
コード例 #29
0
def normalize_target_taxa(target_taxa):
    """
    Receives a list of taxa IDs and/or taxa names and returns a set of expanded taxids numbers

    Every entry is resolved to a taxid (names use their first matching taxid)
    and replaced by its descendant taxa, subspecies included.
    """
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()
    expanded_taxa = set()

    for taxon in target_taxa:
        try:
            taxid = int(taxon)
        except ValueError:
            # Not a number: treat the entry as a taxon name.
            taxid = ncbi.get_name_translator([taxon])[taxon][0]
        else:
            # The translated name is not used; the lookup raises KeyError
            # when the translator result does not contain the taxid.
            _ = ncbi.get_taxid_translator([taxid])[taxid]

        expanded_taxa.update(
            ncbi.get_descendant_taxa(taxid, collapse_subspecies=False))

    return expanded_taxa
コード例 #30
0
def load_taxonomy_tree(otu_list):
    """Build a taxonomy tree for the given OTU names.

    Args:
        otu_list: taxon names to place in the tree.

    Returns:
        tuple ``(tree, notfound)``: the ``TreeNode`` built from the NCBI
        lineages of the names (every branch length set to 1) and the set of
        names from ``otu_list`` that are not tips of that tree.

    The process exits when a name has more than one (or no) NCBI taxid.
    """
    ncbi = NCBITaxa()
    # downloads the NCBI locally
    # TODO: it should maybe be done as a separate rule
    # ncbi.update_taxonomy_database()

    sp2taxid = ncbi.get_name_translator(otu_list)
    # The ETE docs scared me into checking if there are no or different matches
    # NOTE(review): sys.exit() without an argument reports success (status 0)
    # although these are error conditions - confirm whether callers rely on it.
    for sp, taxids in sp2taxid.items():
        if len(taxids) > 1:
            print("More than two NCBI taxonomy matches for:\n",
                  sp, taxids)
            sys.exit()
        if len(taxids) < 1:
            print("No NCBI taxonomy match for:\n", sp, taxids)
            sys.exit()

    # Lineage names per species, reusing the database handle opened above.
    lineages = {}
    for sp, taxids in sp2taxid.items():
        lineage = ncbi.get_lineage(taxids[0])
        names = ncbi.get_taxid_translator(lineage)
        lineages[sp] = [names[taxid] for taxid in lineage]
    tree = TreeNode.from_taxonomy(lineages.items())

    # Branches are required to have a length
    # TODO: Length should be parametrized
    for node in tree.postorder():
        node.length = 1

    # Lineages for species that are not found are silently not reported
    # They break the unifraq and have to be discarded from the dataframe
    # TODO: The notfound list should be reported!
    tips = {tip.name for tip in tree.tips()}
    notfound = {otu for otu in otu_list if otu not in tips}

    return tree, notfound
コード例 #31
0
def create_taxonomic_data_ete(species_name):
    """
    Build the species description dictionary from the ete3 NCBITaxa database.

    The lookup only uses NCBITaxa, so it is usable without an internet
    connection once the NCBITaxa database has been downloaded.

    Args:
        species_name (str): species name (must be with genus for example "Escherichia coli")

    Returns:
        species_informations (dict): 'description', 'organism', 'keywords' and
        'db_xref' entries for the species, or None when the name is not
        returned by the NCBITaxa name translator.
    """
    # Slashes in the name are replaced by underscores in the stored fields.
    safe_name = species_name.replace('/', '_')
    species_informations = {
        'description': safe_name + ' genome',
        'organism': safe_name,
        'keywords': [safe_name],
    }

    name_to_taxids = NCBITaxa().get_name_translator([species_name])
    if species_name not in name_to_taxids:
        logger.critical(
            '/!\\ Error with {} this taxa has not been found in ete3 NCBITaxa Database'
            .format(species_name))
        logger.critical(
            '/!\\ Check the name of the taxa and its presence in the NCBITaxa database.'
        )
        logger.critical(
            '/!\\ No genbank will be created for {}.'.format(species_name))
        return None

    # The last taxid listed for the name is the one recorded.
    last_taxid = name_to_taxids[species_name][-1]
    species_informations['db_xref'] = 'taxon:' + str(last_taxid)
    return species_informations
コード例 #32
0
ファイル: models.py プロジェクト: GuiSeSanz/myScripts
def model_organisms(inputfile):
    """Load a list of model organisms from a text file and return their taxids.

    The file holds one entry per line. When the first entry consists only of
    digits the file is treated as a list of taxids and the entries are
    returned as read (as strings). Otherwise every entry is treated as a name
    and translated with ``NCBITaxa.get_name_translator``, keeping the first
    taxid reported for each name.

    Args:
        inputfile: path of the text file to read.

    Returns:
        list: the taxids, or an empty list when the file contains no lines.

    Raises:
        KeyError: when a name is absent from the translator result.
    """
    # The context manager closes the file even if reading fails. Stripping
    # only line terminators keeps the last character of a final line that has
    # no trailing newline, which a plain ``line[:-1]`` would drop.
    with open(inputfile, "r") as infile:
        modelList = [line.rstrip("\r\n") for line in infile]

    if not modelList:
        return []

    if modelList[0].isdigit():
        # Single-argument print() behaves the same on Python 2 and 3.
        print("List of model IDs Loaded")
        return modelList

    print("List of model names Loaded")
    # The taxonomy database is only opened when names have to be translated.
    ncbi = NCBITaxa()
    name2taxID = ncbi.get_name_translator(modelList)
    return [name2taxID[model][0] for model in modelList]
コード例 #33
0
import sys

options, args = parser.parse_args()

if options.database == "yes":
    # Refreshing the taxonomy database is best-effort: report the failure and
    # carry on. Catching Exception (not a bare except) lets Ctrl-C and
    # SystemExit propagate.
    try:
        ncbi.update_taxonomy_database()
    except Exception as error:
        sys.stderr.write(
            "Warning: could not update the NCBI taxonomy database: %s\n" % error)

if options.input_species_filename is None:
    raise Exception('-s option must be specified, Species list in text format one species in each line')

# One species per line; underscores stand for spaces. Blank lines are skipped
# because an empty name can never be translated.
with open(options.input_species_filename) as f:
    species_name = [line.strip().replace('_', ' ') for line in f if line.strip()]

name2taxid = ncbi.get_name_translator(species_name)

# Names missing from the translator result would otherwise fail below with a
# KeyError that only names the first offender.
missing_names = [name for name in species_name if name not in name2taxid]
if missing_names:
    raise KeyError(
        "Species not found in the NCBI taxonomy database: %s" % ", ".join(missing_names))

# First taxid reported for every species, in input order.
taxid = [name2taxid[name][0] for name in species_name]

tree = ncbi.get_topology(taxid)

# Map taxid (as a string, matching the leaf names) back to the species name.
if options.treebest == "yes":
    inv_map = {str(v[0]): k.replace(" ", "") + "*" for k, v in name2taxid.items()}
else:
    inv_map = {str(v[0]): k for k, v in name2taxid.items()}


for leaf in tree:
    leaf.name = inv_map[leaf.name]

newickTree = tree.write(format=int(options.format))
コード例 #34
0
# begin iterating through the file and getting GenBank records
# NOTE(review): this fragment uses Python 2 constructs (`iterator.next()`,
# the `print` statement, `raw_input`) and stops at the `if` header on its
# last line, so the CDS handling itself is not visible here.
while 1:
    # get a SeqFeature object for the next GenBank record. When we run
    # out of records in the file, cur_entry will be None
    cur_entry = iterator.next()

    if cur_entry is None:
        break

    nid = cur_entry.id
    
    organism = cur_entry.annotations['organism']
    # name exception..
    # Resolve the organism name to its first taxid. A name missing from the
    # translator result raises KeyError and goes through the fallbacks below.
    try:
        taxid = ncbi.get_name_translator([organism])[organism][0]
    except KeyError:
        # First fallback: retry with the replacement name stored in
        # CorrectDict (a KeyError raised by this retry is not caught).
        if organism in CorrectDict:
            taxid = ncbi.get_name_translator([CorrectDict[organism]])[CorrectDict[organism]][0]
        else:
            # Second fallback: show the record annotations and ask the user
            # for a name; if that name is not found either, ask for the
            # numeric taxid directly.
            print cur_entry.annotations
            try:
                correct_organism = raw_input("name for %s\n" % organism)
                taxid = ncbi.get_name_translator([correct_organism])[correct_organism][0]
            except KeyError:
                taxid = int(raw_input("taxid"))
    #print "Printing cDNA info for %s" % nid
    # loop through all of the features for the entry
    for feature in cur_entry.features:
        # when we've got CDS features, parse the info out of them
        if feature.type == "CDS":
コード例 #35
0
ファイル: efecht.py プロジェクト: GuiSeSanz/myScripts







# Toggle for annotating every gene in geneList with its taxid and lineage ids.
NCBI = False
if NCBI:
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()
    #ncbi.update_taxonomy_database()
    taxIDlist = []
    for gene in geneList:
        # Keep the first taxid reported for the gene's organism name.
        name2taxID = ncbi.get_name_translator([gene.organism])
        gene.taxID = name2taxID[gene.organism][0]
        for i in ncbi.get_lineage(gene.taxID):
            gene.addlineageid(i)
        taxIDlist.append(gene.taxID)

    #taxid2name = ncbi.get_taxid_translator([9606, 9443])
    #print taxid2name
# Toggle for printing the topology of the collected taxids. When enabled it
# uses `ncbi` and `taxIDlist`, which only exist if the NCBI section above ran;
# the flag name is then rebound to the resulting tree.
tree = False
if tree:
    tree = ncbi.get_topology(taxIDlist)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(tree.get_ascii(attributes=["sci_name", "rank"]))
    

コード例 #36
0
ファイル: ete_ncbiquery.py プロジェクト: Ward9250/ete
def run(args):
    """Query the NCBI taxonomy for the search terms and print the result.

    Search terms that parse as integers are used as taxids; the others are
    translated as names (optionally with fuzzy matching). Exactly one output
    mode is executed, checked in this order: tree, descendants, info.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``search``, ``tree``, ``info``, ``descendants``, ``fuzzy``,
            ``collapse_subspecies``, ``rank_limit`` and ``full_lineage``.
            ``args.tree`` is set to True when none of ``tree``, ``info`` and
            ``descendants`` is set.

    Raises:
        SystemExit: with status -1 when no search term is provided.
    """
    # add lineage profiles/stats

    from ete3 import NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    # Used as an insertion-ordered set of taxids (values are always None).
    all_taxids = {}
    all_names = set()

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    # Every name maps to a list of taxids. Lists are unhashable, so each
    # taxid is registered individually instead of using the list as a key.
    for taxids in name2tax.values():
        for taxid in taxids:
            all_taxids[taxid] = None

    not_found_names = all_names - set(name2tax)
    if args.fuzzy and not_found_names:
        log.warning("%s unknown names", len(not_found_names))
        # NOTE(review): these two mappings are not defined anywhere in this
        # function and are never read here; they are kept as local dicts so
        # that a successful fuzzy match no longer fails on an undefined name.
        # Confirm no module-level dicts of the same name were intended.
        name2realname = {}
        name2score = {}
        resolved_names = set()
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim
                resolved_names.add(name)
        # Names resolved through fuzzy matching are no longer "not found".
        not_found_names -= resolved_names

    if not_found_names:
        log.warning("[%s] could not be translated into taxids!", ','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            # A dict keys view is not an iterator; take the single key via iter().
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" % (target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid,
                                         collapse_subspecies=args.collapse_subspecies,
                                         rank_limit=args.rank_limit,
                                         return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Keep the raw taxid as a feature before the node name is
            # rewritten to "<scientific name> - <taxid>".
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid,
                                                   collapse_subspecies=args.collapse_subspecies,
                                                   rank_limit=args.rank_limit)
            # Fall back to the taxid as text: str.join only accepts strings.
            print('\t'.join([str(taxid), translator.get(taxid, str(taxid)), ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))

    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''), named_lineage, lineage_string]))