Code Example #1
def extract_descendant_tax_ids(tax, include_subspecies=True):
    """Given either tax_id or tax_name, extract all the descendants' taxIDs.
    Subspecies are included by default, but can be disabled with
    include_subspecies=False.
    """

    ncbi = NCBITaxa()

    # species
    try:
        descendants = set(
            ncbi.get_descendant_taxa(tax, collapse_subspecies=True))
    except ValueError:
        return []

    # subspecies
    if include_subspecies:
        try:
            descendants |= set(
                ncbi.get_descendant_taxa(tax, collapse_subspecies=False))
        except ValueError:
            pass

    if Path("taxdump.tar.gz").exists():
        Path("taxdump.tar.gz").unlink()

    return list(descendants)
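A minimal usage sketch (hypothetical calls; assumes ete3 is installed and its local taxonomy database has already been built):

# Works with either a scientific name or a taxid (9606 = Homo sapiens).
ids = extract_descendant_taxa("Salamandridae")
ids_no_subsp = extract_descendant_taxa(9606, include_subspecies=False)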
Code Example #2
from ete3 import NCBITaxa


def get_species_by_rank(species_list, group, rank):
	
	ncbi = NCBITaxa()
	
	#convert group name to id 
	group_id = ncbi.get_name_translator([group])[group][0]

	#get an annotated tree
	tree = ncbi.get_descendant_taxa(group_id, collapse_subspecies=True, 
									return_tree=True)
	
	groups = []
	for node in tree.traverse("levelorder"):
		
		#if current rank is the requested rank
		if node.rank == rank:
			
			#get node's species ids 
			species = [str(leaf.taxid) for leaf in node.get_leaves()]
			
			#remove species not found in the species list  
			species_inlist = [s for s in species if s in species_list]
	
			if species_inlist:
				#create a group 
				group = Group(taxid = node.taxid, species = species_inlist)
				
				#add group to the list
				groups.append(group)
				
	return groups
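Group is not defined in this snippet; a minimal stand-in, assuming it is a plain record holding a taxid and its member species:

from collections import namedtuple

# Hypothetical stand-in for the Group type referenced above.
Group = namedtuple("Group", ["taxid", "species"])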
Code Example #3
from ete3 import NCBITaxa


def get_species_by_rank(group, rank):

    ncbi = NCBITaxa()

    group_id = ncbi.get_name_translator([group])[group][0]

    # get an annotated tree
    tree = ncbi.get_descendant_taxa(group_id,
                                    collapse_subspecies=True,
                                    return_tree=True)

    dic_ids = {}
    dic_names = {}
    for node in tree.traverse("levelorder"):

        #if the rank of the current node is the requested rank
        if node.rank == rank:
            #get its leaves
            leaves = node.get_leaves()
            #get their ids
            dic_ids[node.taxid] = [leaf.taxid for leaf in leaves]
            dic_names[node.sci_name] = [leaf.sci_name for leaf in leaves]

    print "# of {rank}: {num}".format(rank=rank, num=len(dic_ids))

    return dic_ids
Code Example #4
from ete3 import NCBITaxa


def get_species_by_rank(input_tree, group, rank):

    #get species of input tree
    tree_taxa = [leaf.name for leaf in input_tree]

    ncbi = NCBITaxa()

    #convert group name to id
    group_id = ncbi.get_name_translator([group])[group][0]

    #get an annotated tree
    tree = ncbi.get_descendant_taxa(group_id,
                                    collapse_subspecies=True,
                                    return_tree=True)

    groups = {}
    for node in tree.traverse("levelorder"):

        #if the rank of the current node is the requested rank
        if node.rank == rank:

            #get node's species and subspecies ids
            descen = list(
                ncbi.get_descendant_taxa(node.taxid,
                                         collapse_subspecies=False,
                                         return_tree=False))  #species
            descen2 = list(
                ncbi.get_descendant_taxa(node.taxid,
                                         collapse_subspecies=True,
                                         return_tree=False))  #species/varietas
            descendants = list(set(descen).union(descen2))

            #to string
            species = [str(descendant) for descendant in descendants]

            #remove species not found in the input tree
            species_intree = [s for s in species if s in tree_taxa]

            #create a group
            group = Group(taxid=node.taxid,
                          species=species,
                          species_intree=species_intree)

            #add group to dictionary
            groups[node.taxid] = group

    return groups
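A hypothetical invocation, assuming a three-field Group record (analogous to the stand-in above) and an ete3 tree whose leaf names are taxid strings:

from collections import namedtuple

from ete3 import PhyloTree

# Hypothetical stand-in for the Group type used by this variant.
Group = namedtuple("Group", ["taxid", "species", "species_intree"])

# 9606 = Homo sapiens, 9598 = Pan troglodytes
input_tree = PhyloTree("(9606,9598);")
groups = get_species_by_rank(input_tree, "Hominidae", "genus")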
Code Example #5
from ete3 import NCBITaxa


class TaxaRetriever(object):

    # tested
    def __init__(self, category):
        self.ncbi = NCBITaxa()
        self.species = list(
            self.ncbi.get_descendant_taxa(category, collapse_subspecies=True))
        self.ranks = self.ncbi.get_rank(self.species)
        # keep only species-rank taxids (filter() would be a lazy iterator in Python 3)
        self.taxas = [x for x in self.species if self.ranks[x] == 'species']
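A short usage sketch (hypothetical category name):

retriever = TaxaRetriever("Salamandridae")
print(len(retriever.taxas))  # number of species-rank descendants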
Code Example #6
def main():
    """Make queries against NCBI Taxa databases"""
    # Get commandline args
    args = get_args()

    # Instantiate the ete NCBI taxa object
    ncbi = NCBITaxa()

    if args.verbose > 1:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    # Update the database if required.
    if args.update is True:
        if args.verbose > 1:
            print(
                "Updating the taxonomy database. This may take several minutes..."
            )
        ncbi.update_taxonomy_database()

    # If a name was provided instead of a TaxID, convert and store it.
    if args.name:
        args.taxid = ncbi.get_name_translator([args.name])[args.name][0]

    if args.verbose > 0:
        tax_dict = {}
        # If a name was provided, simply add it to dict
        if args.name:
            tax_dict['Name'] = args.name
        # If not, do the opposite conversion to the above and store that
        else:
            tax_dict['Name'] = ncbi.get_taxid_translator(
                [args.taxid])[args.taxid]

        # Continue to populate the taxa dict with other information
        tax_dict['TaxID'] = args.taxid
        tax_dict['Rank'] = ncbi.get_rank([args.taxid])
        tax_dict['Lineage'] = ncbi.get_taxid_translator(
            ncbi.get_lineage(args.taxid))

        print("Information about your selected taxa:")
        pretty(tax_dict)

    # Main feature of the script is to get all taxa within a given group.
    descendent_taxa = ncbi.get_descendant_taxa(args.taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)
    print("Descendent taxa for TaxID: %s" % (args.taxid))

    # Under Python 3, zip is lazy (like izip); under Python 2, this list could be very
    # large and memory-intensive, so running the script with Python 3 is suggested.
    if args.verbose > 0:
        for dtn, dt in zip(descendent_taxa_names, descendent_taxa):
            print("%s\t%s" % (dtn, dt))

    if args.outfile:
        with open(args.outfile, 'w') as ofh:
            for id in descendent_taxa:
                ofh.write(str(id) + '\n')
Code Example #7
def from_name2ids(phylum_name, dataset='genbank', return_d2ids=False):
    """
    retrieve ids and metadata from genbank file
    :param phylum_name:
    :return:
    """
    phylum_names = [_ for _ in phylum_name.split(';') if _]
    # phylum_name = "Nitrospirae;"
    # phylum_tid = "40117"
    ncbi = NCBITaxa()

    p2tid = ncbi.get_name_translator(phylum_names)

    for _ in phylum_names:
        if not p2tid.get(_):
            print(f" '{_}'' not found. please check the name")
    tids = [p2tid.get(_, [None])[0] for _ in phylum_names if p2tid.get(_)]
    tid2name = {
        p2tid.get(_, [None])[0]: _
        for _ in phylum_names if p2tid.get(_)
    }

    domain2dids = defaultdict(list)
    descend_ids = []
    tid2dids = {}
    for tid in tids:
        lineages = ncbi.get_lineage(tid)
        ranks = ncbi.get_rank(lineages)
        ranks = {v: k for k, v in ranks.items()}
        names = ncbi.get_taxid_translator(lineages)
        domain = names[ranks['superkingdom']]

        _descend_ids = ncbi.get_descendant_taxa(tid, intermediate_nodes=True)
        tid2dids[tid2name[tid]] = len(_descend_ids)
        descend_ids += _descend_ids
        domain2dids[domain].extend(_descend_ids)
    print(f"in total, {len(descend_ids)} taxids were found. ")
    if return_d2ids:
        return domain2dids

    domain2aids = defaultdict(list)
    collect_info = []
    descend_ids = set(descend_ids)
    for domain, ids in domain2dids.items():
        d = domain.lower()
        metadata = join(metadata_files_dir,
                        f"{dataset}_{d}_assembly_summary.txt")
        tqdm.write(
            f'read {metadata} which last modified at : {time.ctime(os.path.getmtime(metadata))}'
        )
        for row in tqdm(open(metadata)):
            if row.startswith("GC"):
                rows = row.split('\t')
                if int(rows[5]) in descend_ids:
                    collect_info.append(row)
                    domain2aids[d].append(rows[0])
    return domain2aids, collect_info
Code Example #8
from ete3 import NCBITaxa


def get_group_tree(group, rank):

    ncbi = NCBITaxa()

    translator = ncbi.get_name_translator([group])
    group_id = translator[group][0]

    # get an annotated tree
    tree = ncbi.get_descendant_taxa(group_id,
                                    collapse_subspecies=True,
                                    return_tree=True)

    return tree
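The returned object is an ete3 tree annotated with sci_name, taxid and rank features, so it can be inspected directly; note the rank argument is unused by this snippet. A hedged usage sketch:

tree = get_group_tree("Salamandridae", "species")
print(tree.get_ascii(attributes=["sci_name", "rank"]))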
Code Example #9
from ete3 import NCBITaxa


def setup_database(force_update=False):
    """ Setup a local sqllite copy of the NCBI Taxonomy database. If :obj:`force_update` is `False`, then
    only download the content from NCBI and build the sqllite database, if a local database doesn't already
    exist. If :obj:`force_update` is `True`, then always download the content from NCBI and rebuild the
    sqllite copy of the database.

    Args:
        force_update (:obj:`bool`, optional):

            * :obj:`False`: only download the content for the database and build a local sqllite database
                if a local sqllite copy of the database doesn't already exist
            * :obj:`True`: always download the content for the database from NCBI and rebuild a local sqllite
                database
    """
    ncbi_taxa = NCBITaxa()
    if force_update:
        # force downloading of the latest content from NCBI and (re)building of the local sqlite database
        ncbi_taxa.update_taxonomy_database()
    else:
        # run an operation on the local sqlite database to trigger NCBITaxa to set up a local
        # database if one doesn't already exist
        ncbi_taxa.get_descendant_taxa('Homo')
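A usage sketch; both calls assume network access to NCBI for the initial download:

setup_database()                   # builds the local database only if it is missing
setup_database(force_update=True)  # always re-downloads and rebuilds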
Code Example #10
from typing import List

from ete3 import NCBITaxa


class SynTax:
    """Synopsis: SynTax class contains all the relevant taxonomy to mine"""
    def __init__(self):
        self.ncbi = NCBITaxa()

    def get_descendants(self, domain: str, taxon_rank: str) -> List[str]:
        """Synopsis: Fetch all the available taxids"""
        # Domain must be in title case
        taxids = self.ncbi.get_descendant_taxa(domain,
                                               rank_limit=taxon_rank,
                                               collapse_subspecies=True)
        taxa_names = (self.ncbi.get_taxid_translator([taxa])
                      for taxa in taxids)
        return [values for i in taxa_names for key, values in i.items()]
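A hedged usage sketch (the domain string must match an NCBI taxon name, e.g. 'Bacteria'):

syntax = SynTax()
family_names = syntax.get_descendants("Bacteria", "family")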
Code Example #11
from ete3 import NCBITaxa


def determine_unassigned_rank(taxid):
    """
    Given a taxid, will use ete3 to look at all its descendants. Based on what it finds, will infer what taxonomic
    level the taxid should be at. Useful for things that have 'no rank' according to NCBI.
    :param taxid: NCBI taxid, should be an integer
    :return: string that says what taxonomy level we're at, one of the options from tax_order
    """
    tax_order = ['kingdom', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    descendants = ncbi.get_descendant_taxa(taxid, intermediate_nodes=True)
    lowest_rank = 900
    for descendant in descendants:
        rank = ncbi.get_rank([descendant])
        if rank[descendant] in tax_order:
            rank_number = tax_order.index(rank[descendant])
            if rank_number < lowest_rank:
                lowest_rank = rank_number
    return tax_order[lowest_rank - 1]
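A hypothetical call, using a taxid whose NCBI rank is 'no rank' (1783272 is assumed here to be the 'Terrabacteria group'):

inferred = determine_unassigned_rank(1783272)
print(inferred)  # e.g. 'domain' if the highest-ranked descendants are phyla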
Code Example #12
def normalize_target_taxa(target_taxa):
    """
    Receives a list of taxa IDs and/or taxa names and returns a set of expanded taxids numbers
    """
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()
    expanded_taxa = set()

    for taxon in target_taxa:
        taxid = ""
        try:
            taxid = int(taxon)
        except ValueError:
            taxid = ncbi.get_name_translator([taxon])[taxon][0]
        else:
            taxon = ncbi.get_taxid_translator([taxid])[taxid]

        species = ncbi.get_descendant_taxa(taxid, collapse_subspecies=False)
        for sp in species:
            expanded_taxa.add(sp)

    return expanded_taxa
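Names and numeric IDs can be mixed in the input, e.g.:

taxids = normalize_target_taxa(["9606", "Pan troglodytes"])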
Code Example #13
from ete3 import NCBITaxa


class NCBIController:
    def __init__(self):
        self.ncbi = NCBITaxa()

    def translate(self, taxid):
        """
        :ret scientific name
        """
        return self.ncbi.get_taxid_translator([taxid])[taxid]

    def get_lineage(self, taxid, rank_lst=None):
        if rank_lst is None:
            rank_lst = [
                "superkingdom", "phylum", "class", "order", "family", "genus",
                "species"
            ]

        dct = {}
        try:
            for taxidLineage, rank in self.ncbi.get_rank(
                    self.ncbi.get_lineage(taxid)).items():
                if rank in rank_lst:
                    dct[rank] = taxidLineage
                    dct[rank + "_s"] = self.translate(taxidLineage)
            return dct
        except (KeyError, ValueError):
            #            print("ERROR: unknown taxid = {}".format(taxid))
            return dict()

    def get_descendant(self, taxid, rank):
        ret = []
        children = self.ncbi.get_descendant_taxa(taxid, rank_limit="genus")
        for k, v in self.ncbi.get_rank(children).items():
            if v == rank:
                ret.append(k)
        return ret
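A brief usage sketch (9604 is assumed to be the taxid of Hominidae):

ctrl = NCBIController()
print(ctrl.translate(9606))                # 'Homo sapiens'
print(ctrl.get_lineage(9606))              # rank -> taxid/name mapping
print(ctrl.get_descendant(9604, "genus"))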
Code Example #14
    }  # add more regexes here
    taxon_re = None

    print("Reading NCBI Taxa...")
    ncbi = NCBITaxa()
    print("Done...")

    if len(sys.argv) < 3:
        print("\nNeed exactly two parameters! Too few given...\n")
        print("Documentation:")
        print(__doc__)
        sys.exit(9)

    #root_taxon = 'Leptospira alexanderi'
    root_taxon = sys.argv[2]
    lineage = ncbi.get_descendant_taxa(root_taxon, intermediate_nodes=True)
    root_taxon_id = ncbi.get_name_translator([root_taxon])[root_taxon][0]
    lineage.append(root_taxon_id)
    names = ncbi.translate_to_names(lineage)
    seqs_by_taxon = dict()
    for name in names:
        seqs_by_taxon[name] = []
    if DEBUG: print("Total # of Taxons: %s " % (len(seqs_by_taxon)))
    if DEBUG: print("First 10 taxons: %s" % seqs_by_taxon.keys()[:10])

    FASTAFILE = sys.argv[1]
    FASTAFILE = os.path.expanduser(FASTAFILE)
    OUTFILE = os.path.splitext(FASTAFILE)[0] + '_' + root_taxon.replace(
        ' ', '_') + os.path.splitext(FASTAFILE)[1]

    if not os.path.isfile(FASTAFILE):
Code Example #15
for line in ResultsTuple:
    ResultTaxid = line[2]
    Pearson = line[5] + "___" + line[6]
    ResultTaxids.append(ResultTaxid)
    if ResultTaxid in AllTaxidDict:
        OldValue = int(AllTaxidDict[ResultTaxid])
        AllTaxidDict[ResultTaxid] = OldValue + 1
    if ResultTaxid in TaxidPearsonDict:
        pass
    else:
        TaxidPearsonDict[ResultTaxid] = Pearson
for k, v in AllTaxidDict.items():
    ChildrenCount = 0
    descendants = ncbi.get_descendant_taxa(str(k),
                                           collapse_subspecies=False,
                                           intermediate_nodes=True)
    taxid = list([k])
    name = ncbi.get_taxid_translator(taxid)
    if len(descendants) == 1:  #This is a leaf node
        PearsonSkewness = TaxidPearsonDict.get(k)
        Skewness = (PearsonSkewness.strip().split("___"))[0]
        HitDist = (PearsonSkewness.strip().split("___"))[1]
        Output.write(
            str(next(iter(name.values()))) + "," + str(k) + "," + str(v) +
            ",0," + str(Skewness) + "," + str(HitDist) + "\n")
    else:
        for i in descendants:
            if str(i) in AllTaxidDict:
                ChildrenCount = ChildrenCount + int(AllTaxidDict[str(i)])
                #print "Match: %s is a descendant of %s and it is in our dictionary" % (i,k)
Code Example #16
import argparse

from ete3 import NCBITaxa

ncbi = NCBITaxa()



if __name__ == '__main__':
	parser = argparse.ArgumentParser(description='')
	parser.add_argument('file_to_search', help = '')
	parser.add_argument('taxid', help = '')
	parser.add_argument('--r', action='store_true', help='')
	
	args = parser.parse_args()
	search_taxid = args.taxid
	search_file = args.file_to_search
	
	taxid_search_list = [int(search_taxid)]
	
	if args.r:
		taxid_search_list =  taxid_search_list + ncbi.get_descendant_taxa(search_taxid, intermediate_nodes=True)
	
	header_list = []
	seq_list = []
	g = open(search_file.split('.')[0] + str(search_taxid) + '.fasta', 'w')
	for line in open(search_file):
		line_list = line.split('\t')
		if int(line_list[1]) in taxid_search_list:
			g.write('>' + line_list[0] + '\n')
			g.write(line_list[2])
	g.close()
Code Example #17
File: functionality.py  Project: rsprit/vogfastOLD
def get_vogs(db: Session,
             response_body,
             id: Optional[Set[str]],
             pmin: Optional[int],
             pmax: Optional[int],
             smax: Optional[int],
             smin: Optional[int],
             function: Optional[Set[str]],
             consensus_function: Optional[Set[str]],
             mingLCA: Optional[int],
             maxgLCA: Optional[int],
             mingGLCA: Optional[int],
             maxgGLCA: Optional[int],
             ancestors: Optional[Set[str]],
             h_stringency: Optional[bool],
             m_stringency: Optional[bool],
             l_stringency: Optional[bool],
             virus_specific: Optional[bool],
             phages_nonphages: Optional[str],
             proteins: Optional[Set[str]],
             species: Optional[Set[str]],
             tax_id: Optional[Set[int]],
             inclusive: Optional[str] = 'i'
             ):
    """
    This function searches the VOG based on the given query parameters
    """
    if inclusive != 'i' and inclusive != 'u':
        raise HTTPException(status_code=404,
                            detail="The parameter for the intersection or union search has to be 'i' or 'u'.")

    result = db.query(response_body)
    arguments = locals()
    filters = []

    for key, value in arguments.items():  # type: str, any
        if value is not None:
            if key == "id":
                filters.append(getattr(models.VOG_profile, key).in_(value))

            if key == "consensus_function":
                for fct_d in value:
                    d = "%" + fct_d + "%"
                    filters.append(getattr(models.VOG_profile, key).like(d))

            if key == "function":
                for fct_d in value:
                    d = "%" + fct_d + "%"
                    filters.append(getattr(models.VOG_profile, key).like(d))

            if key == "smax":
                filters.append(getattr(models.VOG_profile, "species_count") < value + 1)

            if key == "smin":
                filters.append(getattr(models.VOG_profile, "species_count") > value - 1)

            if key == "pmax":
                filters.append(getattr(models.VOG_profile, "protein_count") < value + 1)

            if key == "pmin":
                filters.append(getattr(models.VOG_profile, "protein_count") > value - 1)

            if key == "proteins":
                for protein in value:
                    p = "%" + protein + "%"
                    filters.append(getattr(models.VOG_profile, key).like(p))

            if key == "species":
                if inclusive == 'i':
                    # THIS IS THE INTERSECTION SEARCH:
                    vog_ids = db.query().with_entities(models.Protein_profile.vog_id).join(models.Species_profile). \
                        filter(models.Species_profile.species_name.in_(species)).group_by(
                        models.Protein_profile.vog_id). \
                        having(func.count(models.Species_profile.species_name) == len(species)).all()
                else:
                    # UNION SEARCH below:
                    vog_ids = db.query().with_entities(models.Protein_profile.vog_id).join(models.Species_profile). \
                        filter(models.Species_profile.species_name.in_(species)).group_by(
                        models.Protein_profile.vog_id).all()
                vog_ids = {id[0] for id in vog_ids}  # convert to set
                filters.append(getattr(models.VOG_profile, "id").in_(vog_ids))

            if key == "maxgLCA":
                filters.append(getattr(models.VOG_profile, "genomes_total") < value + 1)

            if key == "mingLCA":
                filters.append(getattr(models.VOG_profile, "genomes_total") > value - 1)

            if key == "maxgGLCA":
                filters.append(getattr(models.VOG_profile, "genomes_in_group") < value + 1)

            if key == "mingGLCA":
                filters.append(getattr(models.VOG_profile, "genomes_in_group") > value - 1)

            if key == "ancestors":
                for anc in value:
                    a = "%" + anc + "%"
                    filters.append(getattr(models.VOG_profile, key).like(a))

            if key == "h_stringency":
                filters.append(getattr(models.VOG_profile, key).is_(value))

            if key == "m_stringency":
                filters.append(getattr(models.VOG_profile, key).is_(value))

            if key == "l_stringency":
                filters.append(getattr(models.VOG_profile, key).is_(value))

            if key == "virus_specific":
                filters.append(getattr(models.VOG_profile, key).is_(value))

            if key == "phages_nonphages":
                val = "%" + value + "%"
                filters.append(getattr(models.VOG_profile, key).like(val))

            if key == "tax_id":
                ncbi = NCBITaxa()
                # ncbi.update_taxonomy_database()

                try:
                    id_list = []
                    if inclusive == 'u':
                        # UNION SEARCH:
                        for id in tax_id:
                            id_list.extend(
                                ncbi.get_descendant_taxa(id, collapse_subspecies=False, intermediate_nodes=True))
                            id_list.append(id)
                        vog_ids = db.query().with_entities(models.Protein_profile.vog_id).join(
                            models.Species_profile). \
                            filter(models.Species_profile.taxon_id.in_(id_list)).group_by(
                            models.Protein_profile.vog_id). \
                            filter(models.Species_profile.taxon_id.in_(id_list)).group_by(
                            models.Protein_profile.vog_id).all()
                        vog_ids = {id[0] for id in vog_ids}  # convert to set
                        filters.append(getattr(models.VOG_profile, "id").in_(vog_ids))
                        print("ID LIST")
                        print(id_list)
                    else:
                        # INTERSECTION SEARCH:
                        for id in tax_id:
                            id_list.extend(
                                ncbi.get_descendant_taxa(id, collapse_subspecies=False, intermediate_nodes=True))
                            id_list.append(id)
                            vog_ids = db.query().with_entities(models.Protein_profile.vog_id).join(
                                models.Species_profile). \
                                filter(models.Species_profile.taxon_id.in_(id_list)).group_by(
                                models.Protein_profile.vog_id). \
                                filter(models.Species_profile.taxon_id.in_(id_list)).group_by(
                                models.Protein_profile.vog_id).all()
                            vog_ids = {id[0] for id in vog_ids}  # convert to set
                            filters.append(getattr(models.VOG_profile, "id").in_(vog_ids))

                except ValueError:
                    raise HTTPException(status_code=404, detail="The provided taxonomy ID is invalid.")

    result = result.filter(*filters)
    return result.all()
Code Example #18
    taxid_to_assemble = []
    # go through every line in the report file and pull all species level assignments with above MIN_READ_CUTOFF reads
    for line in open(file_name):
        line_list = line.split('\t')

        if line_list[3] == 'S' and int(line_list[2]) >= MIN_READ_CUTOFF:
            lineage = ncbi.get_lineage(line_list[4])
            # No eukaryotic assembly and no 'uncultured bacterium' assembly
            if 2759 not in lineage and line_list[4] != '77133':
                taxid_to_assemble.append(line_list[4])

    # for every taxid we assemble a list of reads that are at the given species rank or lower
    for taxid in taxid_to_assemble:
        taxid_search_list = [str(taxid)]
        taxid_search_list = taxid_search_list + [
            str(t)
            for t in ncbi.get_descendant_taxa(taxid, intermediate_nodes=True)
        ]
        list_of_reads_to_pull = []
        for a_line in open(base + '_assignments.txt'):
            a_line_list = a_line.split('\t')
            if a_line_list[1] in taxid_search_list:
                list_of_reads_to_pull.append(a_line_list[0])

        acc_num_list = []
        # go through all the sam files and grab the accession numbers that the reads we got in the last loop aligned to
        for db_num in db_list:
            for sam_line in open(base + '.hf.trimmed.fastq_' + db_num +
                                 '.sam'):
                sam_line_list = sam_line.split('\t')
                # only grab accession numbers that have 'complete genome' in the name to avoid assembling to partial segments
                if sam_line_list[
                        0] in list_of_reads_to_pull and 'complete_genome' in sam_line_list[
Code Example #19
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Remove taxids of unclassified organisms from accession2taxid.map
@ V.R.Marcelino
Created on Wed Jun 19 09:35:25 2019 (NCBItaxonomy updated)
"""

from ete3 import NCBITaxa
ncbi = NCBITaxa()

input_acc2taxid = "accession_taxid_nucl.map"

## taxids_two_exclude:

descendants_artificial = ncbi.get_descendant_taxa(28384)  # 14882 taxids

descendants_env_euks = ncbi.get_descendant_taxa(61964)  # 295 taxids

descendants_env_proks = ncbi.get_descendant_taxa(48479)  # 26083

descendants_unclassified_seq = ncbi.get_descendant_taxa(12908)  # 942

# join all lists
all_unclassified = descendants_artificial + descendants_env_euks + descendants_env_proks + descendants_unclassified_seq

# write lines that are not in the list to a new file:
new_acc2taxid = open('accession2taxid_clean.map', 'w')
with open(input_acc2taxid) as ass:
    next(ass)  #skip first line
    for line in ass:
Code Example #20
if __name__ == "__main__":
    OUTFILE = sys.argv[1]

    assert os.path.basename(OUTFILE) != 'taxonomy.json', os.path.basename(
        OUTFILE)
    assert os.path.isdir(os.path.dirname(OUTFILE))

    ncbi = NCBITaxa()

    # ncbi.update_taxonomy_database()

    insects = 'Insecta'
    insecta_taxid = ncbi.get_name_translator([insects])[insects][0]
    tree = ncbi.get_descendant_taxa(insecta_taxid,
                                    collapse_subspecies=True,
                                    return_tree=True)
    ranks = ['order', 'family', 'genus', 'species']
    for n in tree.traverse():
        if n.rank not in ranks:
            n.delete()

    def recurse(tree, depth=0, max_depth=4):
        out = [(tree.name, tree.rank, depth)]
        for subtree in tree.get_children():
            out.extend(recurse(subtree, depth + 1))
        return out

    valid_ids = []
    for name, r, depth in recurse(tree):
Code Example #21
         "Chlamydiae", "Chlorarachniophyceae", "Chloroflexi",
         "Choanoflagellida", "Cryptophyta", "Cryptomonadales",
         "Cyanobacteria", "Deinococcus-Thermus", "Discosea",
         "Euryarchaeota", "Firmicutes", "Fungi", "Gemmatimonadetes",
         "Glaucocystophyta", "Gonyaulacales", "Gymnodiniales",
         "Haptophyceae", "Heterolobosea", "Ichthyosporea",
         "Mamiellophyceae", "Metazoa", "Mycetozoa", "Noctilucales",
         "Oxyrrhinales", "Peridiniales", "Planctomycetes",
         "Pyrenomonadales", "Rhizaria", "Rhodophyta",
         "Rhodothermaeota", "Spirochaetes", "Stramenopiles",
         "Synergistetes", "Thaumarchaeota", "Thermotogae",
         "Viridiplantae"
 ]:
     # list of used taxa is incomplete!
     try:
         descendants = ncbi.get_descendant_taxa(
             shortname.split("_")[0])
         lineage = ncbi.get_lineage(descendants[0])[2:]
         names = ncbi.get_taxid_translator(lineage)
         rank = [names[taxid] for taxid in lineage]
         if "Proteobacteria" in rank:
             hightaxon = rank[2]
             if hightaxon == "delta/epsilon subdivisions":
                 hightaxon = rank[3]
             shortname = shortname.replace("_Proteobacteria", "")
         elif "FCB group" in rank:
             hightaxon = rank[3]
         elif "Opisthokonta" in rank:
             hightaxon = rank[2]
         elif "Terrabacteria group" in rank:
             hightaxon = rank[2]
         elif "Bacteria" in rank:
Code Example #22
def analysis():
    args = setting()
    cwd = args.workdir #os.getcwd()
    ncbi = NCBITaxa()
    home = str(Path.home())
    pathogens = args.pathogens_species.split(",")
    file_combined_fastq = os.path.join(os.getcwd(), args.fastq)
    if not os.path.isfile(file_combined_fastq):
        fastq_files = [os.path.join(file_combined_fastq, f) for f in listdir(file_combined_fastq) if isfile(join(file_combined_fastq, f)) and f.endswith("fastq")]
        k = file_combined_fastq.rfind("/")
        file_combined_fastq = file_combined_fastq[:k] + ".fastq" + file_combined_fastq[k + 1:]
        with open(file_combined_fastq, 'wb') as wfd:
            for file in fastq_files:
                with open(file, 'rb') as fd:
                    shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)

    reads_fastq = []
    if file_combined_fastq.endswith("fastq") or file_combined_fastq.endswith("fq"):
        for record in SeqIO.parse(file_combined_fastq, "fastq"):
            reads_fastq.append(str(record.id))
    elif file_combined_fastq.endswith("fasta") or file_combined_fastq.endswith("fa"):
        for record in SeqIO.parse(file_combined_fastq, "fasta"):
            reads_fastq.append(str(record.id))
    else:
        print("Not known reads file format")

    number_reads = len(reads_fastq)

    if args.host_specie == "" and args.pathogens_species == "":
        species = ""
    elif args.host_specie == "" and not args.pathogens_species == "":
        species = pathogens
    elif not args.host_specie == "" and args.pathogens_species == "":
        species = [args.host_specie]
    else:
        species = [args.host_specie] + pathogens

    species.sort()
    name_database = "_".join(species).replace(" ", "_")
    genome_db = os.path.join(cwd, name_database + ".fasta")
    genome_db_id = os.path.join(cwd, name_database + ".txt")
    all_genomes = False
    if "refseq" in args.NCBIdatabase:
        table_file = "assembly_summary_refseq.txt"
    if "assembly" in args.NCBIdatabase:
        all_genomes = True
        table_file = "assembly_summary_genbank.txt"
    if os.path.exists(os.path.join(cwd,table_file)):
        os.remove(os.path.join(cwd,table_file))
    cmd = WGET % table_file
    wget = sb.Popen(cmd, shell=True, stdout=sb.PIPE, stderr=sb.PIPE, cwd=cwd)
    wget.communicate()



    sys.stdout.write("### UPDATING THE DATABASE\n")
    # This part checks for a new version of taxdump.tar.gz; the code looks for a new version once a month
    ete = os.path.expanduser("~/.etetoolkit/taxa.sqlite.traverse.pkl")
    modified = os.path.getmtime(ete)
    modificationTime = time.strftime('%m', time.localtime(modified))
    today = datetime.date.today()
    month = today.strftime("%m")
    if modificationTime != month:
        ncbi.update_taxonomy_database()
    dict_species = {}

    # here we determine whether the pathogen is unknown or we have an idea of which pathogens to investigate
    with open(os.path.join(cwd, table_file), "r") as fh:
        descendants_all = []
        for specie in species:
            name2taxid = ncbi.get_name_translator([specie])
            if args.host_specie in specie:
                plant = name2taxid[specie]
            for key in name2taxid[specie]:
                descendants = ncbi.get_descendant_taxa(key, collapse_subspecies=True)
                for sstaxa in descendants:
                    descendants_all.append(str(sstaxa))
        for line in fh:
            if not line.startswith("#"):
                if line.split("\t")[6] in descendants_all:# and "subsp" in line:
                    ssname = " ".join([line.split("\t")[7].split(" ")[0], line.split("\t")[7].split(" ")[1]])
                    tax = line.split("\t")[6]
                    ftp = line.split("\t")[19]
                    genome = ftp.split("/")[-1] + "_genomic.fna.gz"
                    ftp_genome = os.path.join(ftp, genome)
                    path_genome = os.path.join(cwd, genome)
                    #species_assembly = " ".join([line.split("\t")[7]].split(" ")[0], [line.split("\t")[7]].split(" ")[1])
                    if ssname in dict_species:
                        dict_species[ssname] = dict_species[ssname] + [(ftp_genome, path_genome, tax, genome, ssname)]
                    else:
                        dict_species[ssname] = [(ftp_genome, path_genome, tax, genome, ssname)]
    db_file = os.path.join(home, ".db_monica." + name_database)
    if all_genomes:
        print("DOWNLOADING MULTIPLE GENOMES FOR THE SAME SPECIES")
        genomes_select = [name for specie in dict_species for name in dict_species[specie]]
    else:
        print("DOWNLOADING ONE GENOME FOR SPECIES")
        genomes_select = [dict_species[specie][-1] for specie in dict_species]
    print("I WILL DOWNLOAD %s GENOMES" % str(len(genomes_select)))
    if not os.path.exists(db_file) or not os.path.exists(genome_db):
        with open(genome_db, "w") as output_handle, open(genome_db_id, "w") as output_handle_id:
            with open(db_file, "w") as fh:
                for names in genomes_select:
                    ftp_genome, path_genome, tax, genome, ssname = names
                    if genome.startswith("GC"):
                        genome_used = cwd + genome + "\n"
                        fh.write(genome_used)
                        if not os.path.exists(path_genome):
                            cmd = WGET_GENOME % ftp_genome
                            wget_gen = sb.Popen(cmd, shell=True, stdout=sb.PIPE, stderr=sb.PIPE, cwd=cwd)
                            wget_gen.communicate()
                        with gzip.open(path_genome, "rt") as handle:
                            print("PARSING " + genome + " GENOME")
                            for record in SeqIO.parse(handle, "fasta"):
                                record.id = tax + "_" + str(record.id)
                                record.description = genome.split(".")[0]
                                SeqIO.write(record, output_handle, "fasta")
                                output_handle_id.write(str(record.name) + "%" + str(record.description) + "\n")

    sys.stdout.write("### PREPARING FOR MAPPING\n")
    genome_to_contig = {}
    with open(genome_db_id, "r") as fhtxt:
        for record in fhtxt:  # was: SeqIO.parse(genome_db, "fasta")
            line = record.split("%")
            genome_to_contig[line[0]] = line[1].rsplit()
    genome_to_species = {}
    with open(os.path.join(cwd, table_file), "r") as fh:
        for line in fh:
            line = line.rstrip().split("\t")
            genome = line[0].split(".")[0]
            if len(line) > 9 and not line[0].startswith("#"):
                subspecies = line[7].split(" ")[:2]
                subspecie = "_".join(subspecies) #+ " " + line[8].split("=")[1:]
                tribu = "_".join(line[8].split("=")[1:])
                genome_to_species[genome] = subspecie + "-" + tribu
    sam_output = file_combined_fastq + ".sam"
    cmd = MINIMAP % (str(args.threads), genome_db, file_combined_fastq, sam_output)
    sys.stdout.write("RUNNING MINIMAP2\n")
    minimap = sb.Popen(cmd, shell=True, cwd=cwd)
    minimap.communicate()
    reads_dict = {}
    count = 0
    with open(sam_output) as fh:
        for sam in fh:
            if sam != "" and not sam.startswith("@"):
                fields = sam.split("\t")
                if not fields[2] == "*":
                    for entry in fields:
                        if entry.startswith("MD"):
                            md = entry.split(":")[-1]
                            mismatch = len(re.findall("[A-Z]", md))
                            match = sum([int(number) for number in re.sub('[A-Z]|\^', ',', md).split(",") if number != "" and number.isdigit()])
                            if match > 0:
                                if mismatch > 0:
                                    iden = (match - mismatch) / match * 100
                                    if fields[0] in reads_dict:
                                        if iden == reads_dict[fields[0]][0]:
                                            if reads_dict[fields[0]][1].startswith(fields[2].split("_")[0]):
                                                continue
                                            else:
                                                count += 1
                                                reads_dict.pop(fields[0], None)
                                        elif iden > reads_dict[fields[0]][0]:
                                            reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                    else:
                                        reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                else:
                                    iden = 100
                                    if fields[0] in reads_dict:
                                        if iden == reads_dict[fields[0]][0]:
                                            if reads_dict[fields[0]][1].startswith(fields[2].split("_")[0]):
                                                continue
                                            else:
                                                count += 1
                                                reads_dict.pop(fields[0], None)
                                        elif iden > reads_dict[fields[0]][0]:
                                            reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                    else:
                                        reads_dict[fields[0]] = (iden, fields[2], fields[0])

    out_file = file_combined_fastq + ".reads.txt"
    with open(out_file, "w") as csv:
        for key in reads_dict:
            csv.write("\t".join([reads_dict[key][1], reads_dict[key][2]]) + " \n")
    print(count)
    count = {}
    number_reads_mapped = 0
    for read in reads_dict:
        match = reads_dict[read][1].split("_")
        if len(match) > 1:
            number_reads_mapped += 1
            if all_genomes:
                contig = match[1] #+ "_" + match[2]
            else:
                contig = match[1] + "_" + match[2]
            genome_map = genome_to_contig[contig]
            species_ss = genome_to_species[genome_map[0]]
            uniq_name = match[0] + "_" + species_ss
            if not uniq_name in count:
                count[uniq_name] = 1
            else:
                count[uniq_name] = count[uniq_name] + 1
    print("Name sample: " + file_combined_fastq)
    print("Number reads:" + str(number_reads))
    print("Number reads mapped:" + str(number_reads_mapped) + "\nPercentage of reads mapped:" + str(
        number_reads_mapped/number_reads * 100) + " %\n")
    header = []
    reads_mapped = []
    partial_tree = []
    for clade in types:
        header.append(clade[0])
        reads_mapped.append("")
    header.append("A")
    reads_mapped.append(str(number_reads-number_reads_mapped))
    total = [header] + [reads_mapped]
    tribu_dict = {}
    sorted_list = []
    for value in count:
        key = value.split("_")[0]
        if not str(key).startswith(str(plant[0])):
            sorted_list.append((value[1],(count[value]/number_reads_mapped*100)))
            lineage = ncbi.get_lineage(int(key))
            a = ncbi.get_rank(lineage)
            tribu = value.split("-")[1]
            tribu_dict["tribu"] = tribu
            tree = []
            for match in types:
                combination = [match[1]]
                if match[0] in tribu_dict:
                    combination.append("".join([tribu_dict[match[0]]]))
                else:
                    for tax in a:
                        if match[0].startswith(a[tax]) and match[0].endswith(a[tax]):
                            combination.append(ncbi.get_taxid_translator([int(tax)])[tax].replace(" ","_"))
                tree.append("".join(combination))
            tree.append(str(count[value]))
            partial_tree = partial_tree + [tree]
    partial_tree.sort()
    total = total + partial_tree
    out_file = file_combined_fastq + ".txt"
    with open(out_file, "w") as csv:
        for line in total:
            csv.write(",".join(line) + " \n")
    plot_circ(out_file, file_combined_fastq)
    print("done")
Code Example #23
def get_child_taxa(taxid):
    """get child taxids using """
    ncbi = NCBITaxa(dbfile=app.config['TAXA_SQLITE'])
    child_taxids = ncbi.get_descendant_taxa(int(taxid),
                                            intermediate_nodes=True)
    return child_taxids
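A hedged usage sketch, assuming a Flask-style app object whose config['TAXA_SQLITE'] points at a prebuilt taxa.sqlite file:

child_ids = get_child_taxa(9604)  # 9604 assumed to be Hominidae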
Code Example #25
File: makeETEFiles.py  Project: yw595/GutMicrobiota
#         genusNode = tree.search_nodes(name=str(name2taxid[genus][0]))[0]
#         speciesNode = tree.search_nodes(name=str(name2taxid[species][0]))[0]
#         dist = tree.get_distance(speciesNode, genusNode)
#         if minDist == -1:
#             minDist = dist
#             minDistSpecies = species
#         elif minDist > dist:
#             minDist = dist
#             minDistSpecies = species
#     print(genus+" "+minDistSpecies+" "+str(minDist))
#     print(genus+"\t"+minDistSpecies,file=fh)
# fh.close()

fh = open(
    '/home/ubuntu/MATLAB/GutMicrobiota/output/writeETEFiles/allDescendants.txt',
    'w')
for genus in ZhangZhaoGenera + ForslundHildebrandGenera:
    print(genus)
    genusNode = tree.search_nodes(name=str(name2taxid[genus][0]))[0]
    #descendants = genusNode.get_descendants()
    #descendantNames = []
    #for d in descendants:
    #    descendantNames.append(d.name)
    descendants = ncbi.get_descendant_taxa(str(name2taxid[genus][0]),
                                           intermediate_nodes=True)
    descendantNames = [str(name2taxid[genus][0])]
    for d in descendants:
        descendantNames.append(str(d))
    fh.write(genus + "\t" + ",".join(descendantNames) + "\n")
fh.close()
Code Example #26
                    '--version',
                    action='version',
                    version='%(prog)s v3.2')

# Getting arguments

args = parser.parse_args()
kaiju_file = args.kaiju_file
R1 = args.R1_file
R2 = args.R2_file
taxonomy_level = args.taxonomy_level

# Getting taxonomy database and taxonomy level

ncbi = NCBITaxa()
descendants = ncbi.get_descendant_taxa(taxonomy_level)

# Create filtered files names

# Input: SRR8771429.trimmed.5905288_00_R1.fastq
# Output: SRR8771429.trimmed.5905288_00_filtered.R1.fastq
# Output: SRR8771429.trimmed.5905288_00_unclassified.R1.fastq

filtered_R1 = R1[:-8] + "filtered.R1.fastq"
filtered_R2 = R2[:-8] + "filtered.R2.fastq"
unfiltered_R1 = R1[:-8] + "unclassified.R1.fastq"
unfiltered_R2 = R2[:-8] + "unclassified.R2.fastq"

# Create index for large fastq files - This process dramatically decreases the runtime and RAM usage when compared to dictionaries.

record_R1_dict = SeqIO.index(R1, "fastq")
Code Example #27
File: pdb_analysis.py  Project: bwbai/bpforms
# Escherichia coli
query_org_id = 562

# Gammaproteobacteria
# query_org_id = 1236

# Saccharomyces cerevisiae
# query_org_id = 4932

# Saccharomycetes
# query_org_id = 4891

# get org_ids for query organism and descendants
ncbi = NCBITaxa()
query_ids = ncbi.get_descendant_taxa(query_org_id, intermediate_nodes=True)
query_ids.append(query_org_id)

rank_distance = {'species':0,
                 'genus':1,
                 'family':2,
                 'order':3,
                 'class':4,
                 'phylum':5,
                 'kingdom':6,
                 'superkingdom':7,
                 'no rank':8}

class Entry(object):
    """ Simple class to hold information about a parsed PDB entry
Code Example #28
##Command line: "python3.6 Taxpull.py > taxids"

##Taxpull.py
from ete3 import NCBITaxa
ncbi = NCBITaxa()
descendants = ncbi.get_descendant_taxa('Mus')
print(descendants)
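To see scientific names instead of raw taxids, the result can be passed through translate_to_names (a small follow-up sketch):

print(ncbi.translate_to_names(descendants))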
Code Example #29
        print(
            "{} doesn't exist. Please create an index_db for your paired-end files by running 'create_index_db.py'"
            .format(R1[:-5] + "index"))

    try:
        index_R2 = SeqIO.index_db(R2[:-5] + "index")
    except ValueError:
        print(
            "{} doesn't exist. Please create an index_db for your R2_fastq_files by running 'create_index_db.py'"
            .format(R2[:-5] + "index"))
        sys.exit(1)

# Getting taxonomy database and taxonomy level

ncbi = NCBITaxa()
descendants = ncbi.get_descendant_taxa(taxonomy_level, intermediate_nodes=True)

# Getting user taxonomy level and append to descendants
name2taxid = ncbi.get_name_translator([taxonomy_level])
user_tax_id = list(name2taxid.values())[0][0]
descendants.append(user_tax_id)
'''	Just to confirm if user_taxid_id is within descendants
if user_tax_id in descendants:
	print("user_taxid in descendants = True")
else:
	print("user_taxid in descendants = False")
'''
### Create filtered files names

# Getting file informations (file ID?)
n = re.search(r"(?<=\_)[0-9]+(?=\.kraken)", kaiju_file)
Code Example #30
from ete3 import NCBITaxa
ncbi = NCBITaxa()

descendants = ncbi.get_descendant_taxa('Salamandridae')
print(ncbi.translate_to_names(descendants))

descendants = ncbi.get_descendant_taxa('Salamandridae',
                                       collapse_subspecies=True)
print(ncbi.translate_to_names(descendants))

tree = ncbi.get_descendant_taxa('Salamandridae',
                                collapse_subspecies=True,
                                return_tree=True)
print(tree.get_ascii(attributes=['sci_name', 'taxid']))

# ['Notophthalmus viridescens', 'Notophthalmus perstriatus', 'Notophthalmus meridionalis kallerti',
# 'Notophthalmus meridionalis meridionalis', 'Pleurodeles waltl waltl', 'Pleurodeles poireti',
# 'Pleurodeles nebulosus', 'Taricha granulosa', 'Taricha rivularis', 'Taricha torosa torosa',
# 'Taricha torosa sierrae', 'Taricha sp. AMNH A168420', 'Triturus cristatus',
# 'Triturus karelinii arntzeni', 'Triturus karelinii karelinii', 'Triturus carnifex carnifex',
# 'Triturus dobrogicus dobrogicus', 'Triturus dobrogicus macrosomus', 'Triturus marmoratus marmoratus',
# 'Triturus pygmaeus', 'Triturus macedonicus', 'Triturus cristatus x Triturus dobrogicus macrosomus',
# 'Triturus cristatus s.l. AH-2007', "Triturus cf. karelinii 'eastern'", "Triturus cf. karelinii 'western'",
# 'Triturus ivanbureschi', 'Triturus anatolicus', 'Cynops pyrrhogaster', 'Cynops ensicauda',
# 'Cynops orientalis', 'Cynops cyanurus chuxiongensis', 'Cynops cyanurus cyanurus', 'Cynops orphicus',
# 'Cynops fudingensis', 'Cynops glaucus', 'Euproctus montanus', 'Euproctus platycephalus',
# 'Tylototriton taliangensis', 'Tylototriton verrucosus pulcherrima', 'Tylototriton shanjing',
# 'Tylototriton kweichowensis',
# Tylototriton sp. MH-2011', 'Tylototriton pseudoverrucosus', 'Tylototriton yangi', 'Tylototriton uyenoi',
# 'Tylototriton shanorum', 'Tylototriton anguliceps', 'Tylototriton daweishanensis',
# 'Tylototriton podichthys', 'Tylototriton himalayanus', 'Tylototriton ngarsuensis',
Code Example #31
File: ete_ncbiquery.py  Project: Ward9250/ete
def run(args):
    # add lineage profiles/stats

    import re
    from ete3 import PhyloTree, NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    queries = []

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        queries.append(n)
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    all_taxids.update([(v, None) for v in list(name2tax.values())])

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" %sim

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" %','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            target_taxid = next(iter(all_taxids.keys()))
            log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit, return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                              intermediate_nodes=args.full_lineage,
                              rank_limit=args.rank_limit,
                              collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit)
            print('\t'.join([str(taxid), translator.get(taxid, taxid), ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))

    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in six.iteritems(translator):
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''), named_lineage, lineage_string]))