コード例 #1
0
ファイル: phylotree.py プロジェクト: Ward9250/ete
    def annotate_ncbi_taxa(self, taxid_attr='species', tax2name=None, tax2track=None, tax2rank=None, dbfile=None):
        """Annotate this node and all its descendants with NCBI taxonomy data.

        The annotation itself is performed by ``NCBITaxa.annotate_tree``; this
        method only opens the taxonomy database and forwards its arguments.
        Leaf nodes are expected to carry a feature (``species`` by default,
        see ``taxid_attr``) holding a valid taxid number.

        Every descendant node (internal nodes included) receives the
        following new features:

        `Node.spname`: scientific species name as encoded in the NCBI taxonomy database

        `Node.named_lineage`: the NCBI lineage track using scientific names

        `Node.taxid`: NCBI taxid number

        `Node.lineage`: same as named_lineage but using taxid codes.

        For internal nodes, the NCBI information refers to the first common
        lineage of the grouped species.

        :param taxid_attr: name of the node feature used to read the taxid
        number associated to each node.

        :param None tax2name: optional dictionary mapping taxid numbers to
        NCBI scientific names. Supplying it avoids database queries when
        annotating many trees that share the same set of taxids.

        :param None tax2track: optional dictionary mapping taxid numbers to
        NCBI lineage tracks (taxids). Supplying it avoids database queries
        when annotating many trees that share the same set of taxids.

        :param None tax2rank: optional dictionary mapping taxid numbers to
        NCBI rank names. Supplying it avoids database queries when annotating
        many trees that share the same set of taxids.

        :param None dbfile: if provided, this file is used as the local copy
        of the NCBI taxonomy database.

        :returns: tax2name (taxid -> scientific name), tax2lineage (taxid ->
        NCBI lineage track) and tax2rank (taxid -> rank name), as returned by
        ``NCBITaxa.annotate_tree``.

        """
        taxa_db = NCBITaxa(dbfile=dbfile)
        annotation_options = {
            'taxid_attr': taxid_attr,
            'tax2name': tax2name,
            'tax2track': tax2track,
            'tax2rank': tax2rank,
        }
        return taxa_db.annotate_tree(self, **annotation_options)
コード例 #2
0
ファイル: phylotree.py プロジェクト: Ward9250/ete
    def ncbi_compare(self, autodetect_duplications=True, cached_content=None):
        """Run ``NCBITaxa.get_broken_branches`` on this tree or its speciation trees.

        When some species occurs more than once among the cached leaves of
        this node, the tree is first split with ``get_speciation_trees`` and
        every resulting tree is checked; otherwise only this tree is checked.

        :param True autodetect_duplications: forwarded to
        ``get_speciation_trees``.

        :param None cached_content: node -> leaves cache; computed with
        ``get_cached_content()`` when missing or empty.

        :returns: None; the result of ``get_broken_branches`` is not used.
        """
        if not cached_content:
            cached_content = self.get_cached_content()

        own_leaves = cached_content[self]
        cached_species = set(leaf.species for leaf in own_leaves)

        if len(cached_species) == len(own_leaves):
            # One leaf per species: the tree can be compared as it is.
            target_trees = [self]
        else:
            print(cached_species)
            _, _, target_trees = self.get_speciation_trees(
                autodetect_duplications=autodetect_duplications,
                map_features=["taxid"])

        ncbi = NCBITaxa()
        for subtree in target_trees:
            ncbi.get_broken_branches(subtree, cached_content)
コード例 #3
0
ファイル: models.py プロジェクト: GuiSeSanz/myScripts
def AddmyID(modelIDList, ID, filepath):
    """Append one taxon to ``modelIDList`` and dump the resulting tree to disk.

    :param modelIDList: list of taxids; it is modified in place.
    :param ID: string holding either a numeric NCBI taxid or a taxon name
        that is translated to a taxid through the NCBI taxonomy.
    :param filepath: accepted for interface compatibility but not used; the
        tree is always written to ``outTree.tmp`` in the working directory.
    :returns: the same ``modelIDList`` object, with the new taxid appended.
    :raises KeyError: if ``ID`` is a name with no match in the taxonomy.
    """
    if ID.isdigit():
        modelIDList.append(int(ID))
    else:
        # get_name_translator expects a list of names; a bare string would be
        # iterated character by character and the lookup below would fail.
        name2taxID = NCBITaxa().get_name_translator([ID])
        if ID not in name2taxID:
            raise KeyError("No NCBI taxid found for name %r" % ID)
        modelIDList.append(int(name2taxID[ID][0]))

    tree = model2Tree(modelIDList)

    outfile = "outTree.tmp"
    # The context manager closes the file even if rendering the tree fails.
    with open(outfile, "w") as out:
        out.write(tree.get_ascii(attributes=["sci_name", "rank"]))

    return modelIDList
コード例 #4
0
def main():
    """Make queries against NCBI Taxa databases

    Reads the command-line options with ``get_args()``, optionally updates
    the local taxonomy database, converts the requested names to taxids and
    writes either taxon information or descendant taxa as tab-separated
    text to ``args.outfile`` (standard output when no file is given).
    """
    # Get commandline args
    args = get_args()

    # Instantiate the ete NCBI taxa object
    ncbi = NCBITaxa(dbfile=args.database)
    ## dbfile location
    if args.verbose > 1:
        sys.stderr.write('Taxa database is stored at {}\n'.format(ncbi.dbfile))

    # Update the database if required.
    if args.update is True:
        if args.verbose > 1:
            msg = 'Updating the taxonomy database. This may take several minutes...\n'
            sys.stderr.write(msg)
        ncbi.update_taxonomy_database()

    # If names were provided in taxid list, convert to taxids
    args.taxid = args.taxid.replace('"', '').replace("'", '').split(',')
    args.taxid = name2taxid(args.taxid, ncbi)

    # Output
    to_stdout = args.outfile is None
    outFH = sys.stdout if to_stdout else open(args.outfile, 'w')
    try:
        ## header
        if args.taxon_info:
            outFH.write('\t'.join(['name', 'taxid', 'rank', 'lineage']) + '\n')
        elif not args.just_taxids:
            outFH.write('\t'.join(['parent_taxid',
                                   'descendent_taxid',
                                   'descendent_name']) + '\n')
        ## body
        for taxid in args.taxid:
            if args.taxon_info:
                taxon_info(taxid, ncbi, outFH)
            else:
                desc_taxa(taxid, ncbi, outFH, args.just_taxids)
    finally:
        # Only close what was opened here: closing sys.stdout would break
        # any later output from the process.
        if to_stdout:
            outFH.flush()
        else:
            outFH.close()
コード例 #5
0
ファイル: check_hgt_trees.py プロジェクト: AnnaNenarokova/ngs
def get_tags_leaves(tree, taxid_dict):
    """Tag every leaf of ``tree`` as "dpapi", "bacteria" or "other".

    A leaf is "dpapi" when its name contains "DIPPA" or its taxid equals
    91374, "bacteria" when taxid 2 appears in its NCBI lineage, and "other"
    in every remaining case, including leaves missing from ``taxid_dict``
    (a message is printed for those).

    :param tree: object exposing ``iter_leaves()``; each leaf has ``name``.
    :param taxid_dict: mapping from leaf name to a taxid convertible by int().
    :returns: dict mapping each leaf name to its tag.
    """
    dpapi_taxid = 91374
    bacteria_taxid = 2
    ncbi_taxa = NCBITaxa()

    leaf_tags = {}
    for leaf in tree.iter_leaves():
        seqid = leaf.name
        if "DIPPA" in seqid:
            tag = "dpapi"
        elif seqid not in taxid_dict:
            print (seqid, "is not in taxid dict!")
            tag = "other"
        else:
            taxid = int(taxid_dict[seqid])
            if taxid == dpapi_taxid:
                tag = "dpapi"
            elif bacteria_taxid in ncbi_taxa.get_lineage(taxid):
                tag = "bacteria"
            else:
                tag = "other"
        leaf_tags[seqid] = tag
    return leaf_tags
コード例 #6
0
ファイル: models.py プロジェクト: GuiSeSanz/myScripts
def model_organisms(inputfile):
    """Load a list of model organisms and return their taxids.

    The file holds one entry per line. The first entry decides how the whole
    file is interpreted: if it is all digits the entries are taken as taxids
    and returned unchanged (as strings); otherwise they are taken as taxon
    names and translated through the NCBI taxonomy.

    :param inputfile: path of the text file to read.
    :returns: list of taxids; an empty list when the file has no entries.
    :raises KeyError: if a name has no match in the taxonomy.
    """
    # Strip only the line terminator (the old ``line[:-1]`` chopped a real
    # character off the last entry when the file had no trailing newline)
    # and ignore blank lines.
    with open(inputfile, "r") as infile:
        stripped = [line.rstrip("\r\n") for line in infile]
    modelList = [entry for entry in stripped if entry.strip()]

    if not modelList:
        return []

    if modelList[0].isdigit():
        # Single-argument print() behaves the same on Python 2 and 3.
        print("List of model IDs Loaded")
        return modelList

    print("List of model names Loaded")
    # The taxonomy database is only needed when names must be translated.
    name2taxID = NCBITaxa().get_name_translator(modelList)
    return [name2taxID[model][0] for model in modelList]
コード例 #7
0
ファイル: otu.py プロジェクト: rjansen1984/CSVNanopore
import sqlite3
import csv
import re
import random
from os import listdir
from os.path import isfile, join
from collections import Counter
from collections import defaultdict
from collections import OrderedDict
from ete3 import NCBITaxa
from operator import itemgetter
from datetime import datetime, date, time


# The import block above only pulls individual names out of ``os`` and
# ``os.path``; the module itself is needed for os.path.join / os.environ
# below, which otherwise raise NameError at import time.
import os

# Module-level list-valued mapping shared by the code below.
d = defaultdict(list)
# Handle on the local NCBI taxonomy database.
ncbi = NCBITaxa()
# Path of taxa.sqlite inside ~/.etetoolkit ('/' is used when HOME is unset).
DEFAULT_TAXADB = os.path.join(os.environ.get(
    'HOME', '/'), '.etetoolkit', 'taxa.sqlite')
DB_VERSION = 2


def get_input():
    """Get all user input and return all files and settings.
    
    Returns:
        Filepaths and all QC and classification files.
        Searchranks that will be added to the OTU table.
        Minimum qscore used for filtering the reads.
    """
    while True:
        mypath = input("Enter classification files path: ")
コード例 #8
0
ファイル: efecht.py プロジェクト: GuiSeSanz/myScripts
    return

#comapre2taxonomies(geneList[0].taxonomy, geneList[1].taxonomy)









# Hard-coded switch for the NCBI annotation step; it is False here, so the
# block below is skipped as written.
NCBI = False
if NCBI :
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()
    #ncbi.update_taxonomy_database()
    # Taxids collected for every gene, in geneList order.
    taxIDlist=[]
    # geneList is not defined in this fragment -- presumably built earlier in
    # the script; each gene exposes .organism and .addlineageid().
    for gene in geneList:
        # Translate the organism name and keep the first taxid returned.
        name2taxID = ncbi.get_name_translator([gene.organism])
        gene.taxID = name2taxID[gene.organism][0]
        # Record every taxid of the gene's lineage on the gene object.
        for i in ncbi.get_lineage(gene.taxID):
            
            gene.addlineageid(i)
        taxIDlist.append(gene.taxID)
        
    #taxid2name = ncbi.get_taxid_translator([9606, 9443])
    #print taxid2name
# Second hard-coded switch; when enabled, the name is rebound to the topology.
# NOTE(review): ncbi and taxIDlist only exist if the NCBI branch above ran, so
# enabling this switch alone would raise NameError.
tree = False
if tree :    
    tree = ncbi.get_topology(taxIDlist)
コード例 #9
0
def _list_czi_samples(folder):
    """Return the unique sample names of the ``.czi`` images in ``folder``."""
    filenames = glob.glob('{}/*.czi'.format(folder))
    # Removing the "_NNN.czi" part collapses numbered images of one sample
    # into a single name.
    return list(
        set(re.sub('_[0-9][0-9][0-9].czi', '', name) for name in filenames))


def main():
    """Measure biofilm images, with or without a probe-design reference.

    Without ``-p`` every sample found in ``input_folder`` is passed to
    ``measure_biofilm_images_no_reference``. With ``-p`` a taxon colour
    lookup table is built from the probe design file, written to
    ``<input_folder>/taxon_color_lookup.csv``, the reference transform and
    classifiers are loaded from ``--ref_clf``, and every sample is passed to
    ``measure_biofilm_images``.
    """
    # The text is the program description; passed positionally it would be
    # taken as ``prog`` and replace the program name in the usage line.
    parser = argparse.ArgumentParser(
        description='Design FISH probes for a complex microbial community')
    parser.add_argument(
        'input_folder',
        type=str,
        help='Input folder containing images of biological samples')
    parser.add_argument(
        '-p',
        '--probe_design_filename',
        dest='probe_design_filename',
        type=str,
        default='',
        help='Probe design CSV file (target_taxon and code columns); when '
        'omitted, images are measured without a reference')
    parser.add_argument(
        '-r',
        '--ref_clf',
        dest='ref_clf',
        type=str,
        default='',
        help='Path to the pickled reference transform '
        '(transform_biofilm_7b.pkl); the classifier file names are derived '
        'from it')
    # The old ``default=''`` combined with ``type=int`` made argparse fail
    # with "invalid int value" whenever -d was omitted, so the option was
    # already mandatory in practice; say so explicitly.
    parser.add_argument('-d',
                        '--dimension',
                        dest='dimension',
                        type=int,
                        required=True,
                        help='Dimension value forwarded to the image '
                        'measurement functions')
    parser.add_argument('-s',
                        '--subfolder',
                        dest='subfolder',
                        type=str,
                        default='F',
                        help="'T' to process every sub folder of the input "
                        "folder instead of the folder itself")
    parser.add_argument('-e',
                        '--epithelial',
                        dest='ep',
                        type=str,
                        default='F',
                        help="'T' to add an Epithelial entry to the taxon "
                        "lookup table")
    args = parser.parse_args()

    if args.probe_design_filename == '':
        samples = _list_czi_samples(args.input_folder)
        for i, s in enumerate(samples, 1):
            measure_biofilm_images_no_reference(s, args.dimension)
            # The counters used to sit inside the string literal and were
            # printed verbatim.
            print("Finished {} of {}".format(i, len(samples)))
        return

    probes = pd.read_csv(args.probe_design_filename, dtype={'code': str})
    ncbi = NCBITaxa()
    taxon_lookup = probes.loc[:, ['target_taxon', 'code']].drop_duplicates()
    # One evenly spaced H value in [0, 1) per taxon. linspace always returns
    # exactly the requested number of values; a float-step arange can return
    # one element too many and break the column assignment.
    taxon_lookup['H'] = np.linspace(0, 1, taxon_lookup.shape[0],
                                    endpoint=False)
    taxon_lookup['S'] = 1
    taxon_lookup['V'] = 1
    taxon_sciname = pd.DataFrame.from_dict(
        ncbi.get_taxid_translator(taxon_lookup.target_taxon.values),
        orient='index').reset_index()
    taxon_sciname.columns = ['target_taxon', 'sci_name']
    taxon_lookup = taxon_lookup.merge(taxon_sciname, on='target_taxon')
    # The CSV is written before the optional Epithelial row is appended, so
    # that row never appears in the file.
    taxon_lookup.to_csv('{}/taxon_color_lookup.csv'.format(args.input_folder))
    if args.ep == 'T':
        taxon_lookup.loc[taxon_lookup.shape[0]] = [
            '0', '0000000', 0, 0, 0.5, 'Epithelial'
        ]

    umap_transform = joblib.load(args.ref_clf)
    clf_umap = joblib.load(
        re.sub('transform_biofilm_7b.pkl',
               'transformed_biofilm_7b_svc.pkl', args.ref_clf))
    clf = joblib.load(
        re.sub('transform_biofilm_7b.pkl',
               'transformed_biofilm_7b_check_svc.pkl', args.ref_clf))

    if args.subfolder == 'T':
        folders = glob.glob('{}/*'.format(args.input_folder))
    else:
        folders = [args.input_folder]
    for folder in folders:
        for s in _list_czi_samples(folder):
            measure_biofilm_images(s, args.dimension, umap_transform,
                                   clf_umap, clf, taxon_lookup)
    return
コード例 #10
0
parser.add_argument('-v',
                    '--version',
                    action='version',
                    version='%(prog)s v3.2')

# Getting arguments

args = parser.parse_args()
kaiju_file = args.kaiju_file
R1 = args.R1_file
R2 = args.R2_file
taxonomy_level = args.taxonomy_level

# Getting taxonomy database and taxonomy level

ncbi = NCBITaxa()
descendants = ncbi.get_descendant_taxa(taxonomy_level)

# Create filtered files names

# Input: SRR8771429.trimmed.5905288_00_R1.fastq
# Output: SRR8771429.trimmed.5905288_00_filtered.R1.fastq
# Output: SRR8771429.trimmed.5905288_00_unclassified.R1.fastq

# [:-8] drops the trailing "R1.fastq" / "R2.fastq" of the input name.
filtered_R1 = R1[:-8] + "filtered.R1.fastq"
# The R2 output used to get the "R1" suffix as well, so both mates ended up
# with the same file name.
filtered_R2 = R2[:-8] + "filtered.R2.fastq"
unfiltered_R1 = R1[:-8] + "unclassified.R1.fastq"
unfiltered_R2 = R2[:-8] + "unclassified.R2.fastq"

# Create index for large fastq files - This process dramatically decreases the runtime and RAM usage when compared to dictionaries.
コード例 #11
0
__email__ = "*****@*****.**"

import sys
import argparse
from ete3 import NCBITaxa

#Display help and usage
parser = argparse.ArgumentParser(description="Incorrect number of command line arguments")
parser.add_argument('Sorted-LCA.csv')
parser.add_argument('Output.gv')
if len(sys.argv[1:]) == 0:
    parser.print_help()
    parser.exit()
args = parser.parse_args()

ncbi = NCBITaxa()

#The number of species you want to create the tree with
NumberOfSpecies = 10

#Read CSV results into list, remove all but the top 10 most abundant taxonomies
#The context manager closes the input file once it has been read.
with open(sys.argv[1]) as results_file:
    ResultsList = [line.strip().split(",") for line in results_file]
ResultsList = ResultsList[0:int(NumberOfSpecies) + 1] #Take first n items in list (+1 is to negate the header line)

#Open output file for writing (left open here: it is still written to by the code that follows)
Output = open(sys.argv[2], "w")

#Write header line in dot format
Output.write('digraph G {\n\tsize="8,5!";\n')

#Define lists, dicts and variables
コード例 #12
0
import optparse

from ete3 import NCBITaxa

import sys

# The taxonomy database is opened before option parsing, as in the original
# statement order.
ncbi = NCBITaxa()

parser = optparse.OptionParser()
parser.add_option('-s', '--species', dest="input_species_filename",
                  help='Species list in text format one species in each line')

parser.add_option('-f', '--format', type='choice', choices=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '100'], dest="format",
                  default='8', help='outpur format for tree')

parser.add_option('-t', '--treebest', type='choice', choices=['yes', 'no'], dest="treebest",
                  default='no', help='To be used in TreeBest')

parser.add_option('-d', '--database', type='choice', choices=['yes', 'no'], dest="database",
                  default='no', help='Update database')

options, args = parser.parse_args()

if options.database == "yes":
    try:
        ncbi.update_taxonomy_database()
    except Exception as err:
        # The update stays best-effort (the script carries on with the
        # existing local database), but the failure is reported instead of
        # being dropped, and Ctrl-C / SystemExit are no longer swallowed.
        sys.stderr.write('WARNING: could not update the NCBI taxonomy database: %s\n' % err)

if options.input_species_filename is None:
    raise Exception('-s option must be specified, Species list in text format one species in each line')

with open(options.input_species_filename) as f:
コード例 #13
0
def taxId2Species(taxid):
    """Translate a single NCBI taxid with ``NCBITaxa.get_taxid_translator``.

    :param taxid: the taxid to look up.
    :returns: whatever ``get_taxid_translator`` returns for the one-element
        list ``[taxid]``.
    """
    taxa_db = NCBITaxa()
    single_query = [taxid]
    return taxa_db.get_taxid_translator(single_query)
コード例 #14
0
ファイル: ete_lineage.py プロジェクト: jetjr/Bioinformatics
#!/usr/bin/env python

from ete3 import NCBITaxa
import sys
import os


args = sys.argv

if len(args) < 2:
    # A single formatted string prints the same text on Python 2 and 3.
    print("Usage: %s [IDs]" % args[0])
    sys.exit(1)

ncbi = NCBITaxa()

# Print the NCBI lineage of every taxid listed (one per line) in the input
# file; the file is closed when the loop ends.
with open(args[1]) as ids_file:
    for line in ids_file:
        taxid = line.strip()
        if not taxid:
            # A blank line has no taxid to look up.
            continue
        print(ncbi.get_lineage(taxid))
コード例 #15
0
ファイル: models.py プロジェクト: GuiSeSanz/myScripts
def model2Tree(modelIDList):
    """Build the NCBI taxonomy topology for ``modelIDList`` and print it.

    :param modelIDList: taxids handed to ``NCBITaxa.get_topology``.
    :returns: the tree returned by ``get_topology``.
    """
    ncbi = NCBITaxa()
    tree = ncbi.get_topology(modelIDList)
    # Single-argument print() is valid on both Python 2 and 3; the bare print
    # statement was a SyntaxError on Python 3.
    print(tree.get_ascii(attributes=["sci_name", "rank"]))

    return tree
#This script uses python module ete3 to query NCBI taxonomy hierarchy for corresponding NCBI taxID
```
The input file, ncbi_gi_taxid_file, resembles a tab-delimited two-column table,
where the first column is the NCBI gene id and the second column is the NCBI taxID.

The output file, output.txt, is a four-column table,
where the first column is the NCBI gene id, the second column is the NCBI taxID, the third column is the taxonomy rank (e.g., phylum), and the fourth column is the classification at that taxonomy rank (e.g., Proteobacteria).

The output does not contain a header. It is written in the long table format known from R, and can be converted to the traditional wide table format in R using "reshape2".
```

if len(sys.argv) == 1:
	sys.exit("USAGE: python %s <path/to/ncbi_gi_taxid_file> > <output.txt>" % sys.argv[0])

ncbi = NCBITaxa()
#ncbi.update_taxonomy_database()

fp = open('taxa-ids-not-found.txt', 'w')
hier = ["superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species"]

missing = []
for x in open(sys.argv[1]):
    dat = x.rstrip().split('\t')[-1]
    try:
        lineage = ncbi.get_lineage(dat)
	names = ncbi.get_taxid_translator(lineage)
	ranks = ncbi.get_rank(lineage)
	
	new_ranks = {}
	for keys in ranks:
コード例 #17
0
ファイル: emapper.py プロジェクト: jhcepas/eggnog-mapper
def parse_args(parser):
    """Parse the command line and validate the combination of options.

    :param parser: argument parser; its ``parse_args()`` and ``error()``
        methods are used.
    :returns: the parsed namespace, adjusted as follows: ``cpu`` 0 becomes
        the machine's CPU count, ``servermode`` forces ``usemem``,
        ``annotate_hits_table`` forces ``no_search`` on and ``no_annot``
        off, and ``--guessdb`` fills in ``db``.
    :raises emapperException: when a required database file is missing.

    Invalid option combinations are reported through ``parser.error`` and
    ``--version`` prints the version and exits.
    """
    args = parser.parse_args()

    # Single-argument print() calls behave identically on Python 2 and 3.
    if args.version:
        print(get_version())
        sys.exit(0)

    if not args.no_annot and not pexists(EGGNOGDB_FILE):
        print(colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red'))
        raise emapperException()

    if args.mode == 'diamond' and not pexists(EGGNOG_DMND_DB):
        print(colorify('DIAMOND database data/eggnog_proteins.dmnd not present. Use download_eggnog_database.py to fetch it', 'red'))
        raise emapperException()

    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # No --servermode available for diamond
    if args.mode == 'diamond' and args.servermode:
        parser.error('--mode [diamond] and --servermode are mutually exclusive')

    # Output file required unless running in servermode
    if not args.servermode and not args.output:
        parser.error('An output project name is required (-o)')

    # Servermode implies using mem-based databases
    if args.servermode:
        args.usemem = True

    # Direct annotation implies no searches
    if args.annotate_hits_table:
        args.no_search = True
        args.no_annot = False

    # Check inputs for running sequence searches
    if not args.no_search and not args.servermode:
        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # HMM
        if args.mode == 'hmmer':
            if not args.db and not args.guessdb:
                parser.error('HMMER mode requires specifying a target database (i.e. -d, --guessdb ))')
            if args.db and args.guessdb:
                parser.error('-d and --guessdb options are mutually exclusive')

            if args.guessdb:
                from ete3 import NCBITaxa
                ncbi = NCBITaxa()
                lineage = ncbi.get_lineage(args.guessdb)
                # Walk the lineage from the taxid itself up to the root and
                # take the first taxid that is a known level.
                for tid in reversed(lineage or []):
                    if tid in TAXID2LEVEL:
                        print("%s %s" % (tid, TAXID2LEVEL[tid]))
                        args.db = TAXID2LEVEL[tid]
                        break
                else:
                    # Without a match args.db would stay unset even though
                    # HMMER mode requires a target database.
                    parser.error('Could not guess a target database from --guessdb %s: '
                                 'no taxid in its lineage is a known level' % args.guessdb)
        # DIAMOND
        elif args.mode == 'diamond':
            #if args.db or args.guessdb:
            #    parser.error('diamond mode does not require -d or --guessdb options')
            pass

    return args
コード例 #18
0
#!/usr/bin/python3
from ete3 import NCBITaxa

ncbi = NCBITaxa()
diamond_path = "/home/anna/bioinformatics/diplonema/dpapi_genome_diamond.tsv"
out_path = "/home/anna/bioinformatics/diplonema/dpapi_genome_diamond_annotation.tsv"

taxids = []

with open(diamond_path) as input_f:
    for line in input_f:
        newtaxid = line.split("\t")[1]
        taxids.append(newtaxid)

taxids_nr = list(set(taxids))
tax_names = ncbi.get_taxid_translator(taxids_nr)

input_f = open(diamond_path, "r")
output_f = open(out_path, 'w')

for line in input_f:
    line_split = line.rstrip().split("\t")
    id = line_split[0]
    taxid = line_split[1]
    evalue = line_split[2]
    if taxid == "0":
        name = "None"
        is_bacteria = 0
    else:
        name = tax_names[int(taxid)]
        is_bacteria = 1 if 2 in ncbi.get_lineage(taxid) else 0
コード例 #19
0
def test_raise_taxdict_level():
    """Check that ``_raise_taxdict_level`` re-keys a taxid -> value dict.

    The two input taxids are expected to be replaced by taxids 1224 and
    1117 while their values are kept unchanged.
    """
    ncbi = NCBITaxa()
    original_level = {246200: 181.8, 190047: 259.8}
    expected = {1224: 181.8, 1117: 259.8}
    raised = vica.minhash._raise_taxdict_level(original_level, ncbi)
    eq_(expected, raised)
コード例 #20
0
from ete3 import NCBITaxa

# On first use NCBITaxa downloads the NCBI taxonomy database and stores a
# parsed copy in `~/.etetoolkit/taxa.sqlite`, which may take some minutes.
ncbi = NCBITaxa()
print("ncbi.dbfile", ncbi.dbfile)

# One genus name per line in the first snakemake input.
with open(snakemake.input[0], 'r', encoding='utf8') as fh:
    genus_list = fh.read().strip().split('\n')

genus_to_taxid = ncbi.get_name_translator(genus_list)
tax_id_vals = genus_to_taxid.values()

# Each name maps to a list of taxids; flatten them into a single list.
flat_taxids = []
for subls in tax_id_vals:
    for genus_id in subls:
        flat_taxids.append(genus_id)

tree = ncbi.get_topology(flat_taxids, intermediate_nodes=True)

# Per the original author's note, `get_ascii()` prints the taxa above genus
# without any separation between them; asking for an extra attribute avoids
# that, and `dist` was judged the least invasive one. The '1.0,' text it
# introduces is then replaced by '-'.
ascii_tree = tree.get_ascii(attributes=["dist", "sci_name"])
ascii_tree = ascii_tree.replace('1.0,', '-')
with open(snakemake.output[0], mode='w', encoding='utf8') as fh:
    fh.write(ascii_tree + '\n')
コード例 #21
0
                        help="""
                        save path of Colorbar for the heatmap with matplotlib
                        """)        
    

    args = parser.parse_args()
    infile = args.infile
    mode = args.mode
    newick = args.newick

    if newick:
        t = PhyloTree(args.newick)      
        species2taxid = dict([ line.split()[0], line.strip().split()[1] ] for line in open(infile))
        taxids = set(species2taxid.values())
    else:
        ncbi = NCBITaxa()
        taxids = set([ line.strip() for line in open(infile) ])


    if args.taxoncolors:
        taxon2color = dict([int(line.split()[0]), line.split()[1]] for line in open(args.taxoncolors))

    tNCBI = ncbi.get_topology(taxids, intermediate_nodes=True)
    tNCBI = tNCBI.search_nodes(name="2759")[0]
    ncbi.annotate_tree(tNCBI, taxid_attr="name")
    tax2node = dict([node.taxid, node] for node in tNCBI.traverse())

    if args.no_intermediate_nodes:
        for node in tNCBI.get_descendants():
            if len(node.children) == 1:
                node.delete()
コード例 #22
0
def parse_args(parser):
    """Parse the command line and validate the combination of options.

    :param parser: argument parser; its ``parse_args()`` and ``error()``
        methods are used.
    :returns: the parsed namespace, adjusted as follows: ``cpu`` 0 becomes
        the machine's CPU count, ``servermode`` forces ``usemem``,
        ``annotate_hits_table`` forces ``no_search`` on and ``no_annot``
        off, ``go_evidence`` is replaced by a set of evidence codes (or
        None) with ``go_excluded`` set alongside it, and ``--guessdb``
        fills in ``db``.
    :raises emapperException: when a required database file is missing.
    :raises ValueError: for an unknown ``--go_evidence`` value.

    Invalid option combinations are reported through ``parser.error`` and
    ``--version`` prints the version and exits.
    """
    args = parser.parse_args()

    # Single-argument print() calls behave identically on Python 2 and 3.
    if args.version:
        print(get_version())
        sys.exit(0)

    if args.data_dir:
        set_data_path(args.data_dir)

    if not args.no_annot and not pexists(get_eggnogdb_file()):
        print(colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red'))
        raise emapperException()

    if args.mode == 'diamond':
        dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
        if not pexists(dmnd_db):
            print(colorify('DIAMOND database %s not present. Use download_eggnog_database.py to fetch it' % dmnd_db, 'red'))
            raise emapperException()

    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # No --servermode available for diamond
    if args.mode == 'diamond' and args.servermode:
        parser.error('--mode [diamond] and --servermode are mutually exclusive')

    # Output file required unless running in servermode
    if not args.servermode and not args.output:
        parser.error('An output project name is required (-o)')

    # Servermode implies using mem-based databases
    if args.servermode:
        args.usemem = True

    # Direct annotation implies no searches
    if args.annotate_hits_table:
        args.no_search = True
        args.no_annot = False

    # Sets GO evidence bases
    if args.go_evidence == 'experimental':
        args.go_evidence = set(["EXP","IDA","IPI","IMP","IGI","IEP"])
        args.go_excluded = set(["ND", "IEA"])

    elif args.go_evidence == 'non-electronic':
        args.go_evidence = None
        args.go_excluded = set(["ND", "IEA"])
    else:
        raise ValueError('Invalid --go_evidence value')

    # Check inputs for running sequence searches
    if not args.no_search and not args.servermode:
        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # HMM
        if args.mode == 'hmmer':
            if not args.db and not args.guessdb:
                parser.error('HMMER mode requires specifying a target database (i.e. -d, --guessdb ))')
            if args.db and args.guessdb:
                parser.error('-d and --guessdb options are mutually exclusive')

            if args.guessdb:
                from ete3 import NCBITaxa
                ncbi = NCBITaxa()
                lineage = ncbi.get_lineage(args.guessdb)
                # Walk the lineage from the taxid itself up to the root and
                # take the first taxid that is a known level.
                for tid in reversed(lineage or []):
                    if tid in TAXID2LEVEL:
                        print("%s %s" % (tid, TAXID2LEVEL[tid]))
                        args.db = TAXID2LEVEL[tid]
                        break
                else:
                    # Without a match args.db would stay unset even though
                    # HMMER mode requires a target database.
                    parser.error('Could not guess a target database from --guessdb %s: '
                                 'no taxid in its lineage is a known level' % args.guessdb)
        # DIAMOND
        elif args.mode == 'diamond':
            #if args.db or args.guessdb:
            #    parser.error('diamond mode does not require -d or --guessdb options')
            pass

    return args
コード例 #23
0
import sys
import cPickle
from pandas import DataFrame
from collections import Counter, defaultdict
import os
from glob import glob
from sys import exit
from ete3 import SeqGroup
from ete3 import NCBITaxa

ncbi = NCBITaxa()

path = sys.argv[1] + "*clustalo"
infiles = glob(path)
print "%s infiles" % len(infiles)

valid_cols = 0
spvariants = defaultdict(Counter)
refaas = []

for infile in infiles:
#for infile in glob("formatted_MG_seqs.faa.final_tree.fa"):
    print infile
    if os.stat(infile).st_size == 0:
        continue
    alg = SeqGroup(infile)
    alg_matrix = []
    labels = []    
    
    for name, seq, _ in alg:
        # Replace trailing gaps with # and * for stop
コード例 #24
0
ファイル: lineage.py プロジェクト: andrewwhwang/fastaq2phylo
from argparse import ArgumentParser
from Bio import Entrez
from ete3 import NCBITaxa
import sys
ncbi = NCBITaxa()
Entrez.email = "*****@*****.**"
parser = ArgumentParser()
parser.add_argument('-file', help="txt file to be parsed")
parser.add_argument('-dbType', help="type of database")
parser.add_argument('-filenum', help="output name")
args = parser.parse_args()

filename = args.file
dbType = args.dbType
filenum  = args.filenum

# filename = 'C:/Users/Andrew.Hwang/Desktop/fastaq2phylo/output/blastout.txt'
# dbType = 'nt'
# filenum  = "0"

memory = {}
writeLines = []
with open(filename, 'r') as f:
    for line in f:
        line_arr = line.split("\t")
        ID=line_arr[1]
        pos=int(round(100 * int(line_arr[2]) / int(line_arr[4])))
        pos2=int(round(100 * int(line_arr[3]) / int(line_arr[4])))
        if dbType == 'nt':
            lineage = ncbi.get_lineage(ID)
            names = ncbi.get_taxid_translator(lineage)
コード例 #25
0
##Command line: "python3.6 Taxpull.py > taxids"

##Taxpull.py
from ete3 import NCBITaxa
# Look up the descendant taxa of 'Mus' in the local NCBI taxonomy and print
# them; the command line noted above redirects this output into `taxids`.
ncbi = NCBITaxa()
target_taxon = 'Mus'
descendants = ncbi.get_descendant_taxa(target_taxon)
print(descendants)
コード例 #26
0
from intermine.webservice import Service

from ete3 import NCBITaxa
# Handle on the local NCBI taxonomy database; used by filterRanks below.
ncbi = NCBITaxa()
#ncbi.update_taxonomy_database()

# Query the "Organism" class of the Phytozome PhytoMine web service.
service = Service("https://phytozome.jgi.doe.gov/phytomine/service")
query = service.new_query("Organism")
query.add_view("annotationVersion", "assemblyVersion", "commonName", "genus",
               "name", "proteomeId", "shortName", "species", "taxonId",
               "version")
# Organism fields, in the order they appear in the output header.
k = [
    "proteomeId", "commonName", "name", "shortName", "annotationVersion",
    "assemblyVersion", "genus", "species", "taxonId", "version"
]
# Taxonomic ranks reported after the organism fields; filterRanks returns one
# entry per rank in this order.
t = [
    "superkingdom", "kingdom", "phylum", "class", "subclass", "order",
    "family", "genus", "species"
]
# Tab-separated header: organism fields, then ranks, then the full lineage.
print("\t".join(k + t + ["full_lineage"]))


def filterRanks(L):
    """Return one scientific name per rank listed in the module-level ``t``.

    ``L`` is an iterable of taxids. Each taxid is indexed by its NCBI rank
    (a later taxid with the same rank replaces an earlier one), then the
    ranks in ``t`` are walked in order: a rank that is present yields the
    name of its taxid, a rank that is absent yields ``'NA'``.
    """
    taxid_by_rank = {}
    for taxid in L:
        taxid_by_rank[ncbi.get_rank([taxid])[taxid]] = taxid

    names = []
    for rank in t:
        if rank in taxid_by_rank:
            translated = ncbi.get_taxid_translator([taxid_by_rank[rank]])
            names.append(list(translated.values())[0])
        else:
            names.append('NA')
    return names

コード例 #27
0
                        line2 = line2.strip()
                        genome_name1 = line2.split("\t")[1]
                        bacteria_name = line2.split("\t")[0]
                    #    genome_name1 = genome_name1.split(".")[0]   
			dic_genome_bacteria[genome_name1]=bacteria_name

with open(sys.argv[4],'r') as tax_id_fi:
                for line3 in tax_id_fi:
                        line3 = line3.strip() 
                        tax_id = line3.split("\t")[2]
                        species_name = line3.split("\t")[3]
                        bacteria_name2 = line3.split("\t")[0]
			dic_taxid_bacteria[bacteria_name2]=tax_id
			dic_taxid_name[tax_id]=species_name

ncbi = NCBITaxa()

with open(sys.argv[1],'r') as blast_file:
                for line in blast_file:
                        line = line.strip()
                        blastn_gene_name =line.split("\t")[0]
			identity = line.split("\t")[2]
                        genome_name = line.split("\t")[1]
			dic_blastn_identity[blastn_gene_name]=identity
                        dic_blastn[blastn_gene_name]=genome_name
			if blastn_gene_name in dic_cu:
				lineage = ncbi.get_lineage(dic_taxid_bacteria[dic_genome_bacteria[dic_blastn[blastn_gene_name]]])
				names = ncbi.get_taxid_translator(lineage)
				tax_seq = [names[taxid] for taxid in lineage]
				if len(tax_seq)>9:
					tax_seq2 = [tax_seq[4],tax_seq[8],tax_seq[9]]
コード例 #28
0
    331111, 262724, 300852, 83333, 83333, 331111, 83333, 562, 83333, 1351,
    83333, 562, 83334, 93061, 93062, 559292, 285006, 1280, 10665, 83333, 663,
    83333, 83333, 7460, 246196, 759272, 5693, 559292, 4932, 1773, 300852, 574,
    562, 331111, 1247190, 5811, 5722, 300852, 287, 262724, 562, 274, 246196,
    1772, 300852, 9739, 83333, 284590, 559292, 284590, 4932, 9823, 9606, 9606,
    9823, 562, 7460, 9986, 194966, 679895, 5702, 562, 9606, 262724, 274, 1351,
    300852, 562, 300852, 562, 262724, 9986, 9606, 9615, 6039, 209285, 311400,
    287, 272844, 273057, 83333, 224308, 69014, 1293037, 2287, 562, 1223565,
    1144670, 1217649, 1977881, 480119, 1217710, 1310637, 421052, 470, 1310678,
    52133, 1144663, 1960940, 1144670, 1217649, 1977881, 480119, 1217710, 470,
    1310637, 421052, 1144663, 1960940, 1310678, 52133, 1217649, 1977881,
    480119, 1310637, 421052, 470, 1310678, 1144670, 1217710, 1144663, 1960940,
    52133, 1144670, 480119, 1217710, 1217649, 470, 1310637, 421052, 1144663,
    1977881, 474186, 3702, 575584
]
ncbi = NCBITaxa()
# ncbi.update_taxonomy_database()

unique_taxa = list(set(unique_taxa))

with open('unique_taxa.txt', 'w') as infile:
    for i in unique_taxa:
        infile.write(str(i) + "\n")

    infile.close()

taxid2name = ncbi.get_taxid_translator(unique_taxa)
b = ncbi.get_name_translator(['Bacteria'])['Bacteria'][0]
a = ncbi.get_name_translator(['Archaea'])['Archaea'][0]
e = ncbi.get_name_translator(['Eukaryota'])['Eukaryota'][0]
v = ncbi.get_name_translator(['Viruses'])['Viruses'][0]
コード例 #29
0
ファイル: phylowalk_m8parser.py プロジェクト: czbiohub/idinf
parser.add_argument('-o', '-outfile', help='output filepath', type=str)
args = parser.parse_args()

num_cores = multiprocessing.cpu_count()

### set up MySQL connections ###

# connect to NCBITaxonomy DB to get gi to taxid mappings
mysql_cn = pymysql.connect(host='localhost',
                           port=3306,
                           user='******',
                           passwd='balamuthia',
                           db='NCBI_Taxonomy')

# initialize the NCBI database
ncbi = NCBITaxa()

### MAIN FUNCTIONALITY ###

# read in .m8 file (GSNAPL output) to pandas dataframe, assign to column names
print(date() + "  Begin reading in .m8 dataframe")
df = pd.read_csv(args.i,
                 sep='\t',
                 header=None,
                 names=[
                     'qseqid', 'sseqid', 'pident', 'length', 'mismatch',
                     'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue',
                     'bitscore'
                 ])
print(date() + '  Finished reading in .m8 dataframe')
コード例 #30
0
ファイル: workflow.py プロジェクト: alienzj/EukCC
    def inferLineage(self, places):
        """
        Annotate every placement with the lineage shared by its sister leaves.

        For each entry of ``places`` the tax ids of its ``"sisters"`` are
        looked up in the refpkg ``seq_info`` table and their NCBI lineages
        are fetched. The consensus is the leading run of the first lineage
        whose tax ids occur in every fetched lineage. The placement dict then
        receives ``"lineage"`` (names joined by ``_``, spaces replaced by
        dots), ``"taxidlineage"`` (tax ids joined by ``_``) and ``"taxid"``
        (the last tax id of the consensus). Unless ``self.cfg["fullineage"]``
        is set, the two lineage strings are restricted to the main ranks.

        Returns ``None`` straight away when ``self.cfg["touch"]`` is truthy,
        otherwise an empty tuple.
        """
        if self.cfg["touch"]:
            return

        taxonomy = NCBITaxa()

        # seqname -> tax_id, as listed in the reference package
        seqinfo_path = self.config.pkgfile("concat.refpkg", "seq_info")
        seq2taxid = {}
        for row in base.readCSV(seqinfo_path):
            seq2taxid[row["seqname"]] = row["tax_id"]

        main_ranks = (
            "superkingdom",
            "kingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        )

        logging.debug("Infering lineages now")
        for placement in places:
            # lineages of all sister leaves whose tax id could be resolved
            sister_lineages = []
            for sister in placement["sisters"]:
                try:
                    sister_lineages.append(
                        taxonomy.get_lineage(seq2taxid[sister]))
                except ValueError as err:
                    logging.warning(err)

            # NOTE(review): if no sister lineage was resolved this raises
            # IndexError -- confirm callers never pass such a placement.
            reference = sister_lineages[0]
            shared = set(reference).intersection(*sister_lineages[1:])

            # walk the first lineage until it leaves the shared set
            consensus = []
            for taxid in reference:
                if taxid in shared:
                    consensus.append(taxid)
                else:
                    break

            deepest = consensus[-1]

            if not self.cfg["fullineage"]:
                # keep only the tax ids that sit on one of the main ranks
                taxid_by_rank = dict(
                    (rank, taxid)
                    for (taxid, rank) in taxonomy.get_rank(consensus).items())
                wanted = [taxid_by_rank.get(rank, "NA") for rank in main_ranks]
                consensus = [taxid for taxid in consensus if taxid in wanted]

            names = taxonomy.translate_to_names(consensus)
            placement["lineage"] = "_".join(names).replace(" ", ".")
            placement["taxidlineage"] = "_".join(
                [str(taxid) for taxid in consensus])
            placement["taxid"] = deepest

        return ()
コード例 #31
0
#!/usr/bin/python

Usage = """
Print taxid's lineage and ranks
by default prints to the stdout
Usage:
  taxid_ranks.py taxid > ouput.txt

Arun Seetharam
[email protected]
taxid_ranks.py -version 1.0
04/13/2017
"""
from ete3 import NCBITaxa
import sys
ncbi = NCBITaxa()
if len(sys.argv)<2:
    print Usage
else:
    cmdargs = str(sys.argv)
    lineage = ncbi.get_lineage((sys.argv[1]))
    names = ncbi.get_taxid_translator(lineage)
    for taxid in lineage:
        print [ncbi.get_rank([taxid])], [names[taxid]]        
#    print [names[taxid] for taxid in lineage]
#    print [ncbi.get_rank([taxid]) for taxid in lineage]
#    print [ncbi.get_rank([name]) for name in names]
コード例 #32
0
#!/Users/zoliq/anaconda3/bin/python3

from distutils.log import error
import os,re,time,argparse
from Bio import SeqIO,Entrez
from ete3 import NCBITaxa
#http://etetoolkit.org/docs/2.3/tutorial/tutorial_ncbitaxonomy.html
ncbi = NCBITaxa()
Entrez.email = '*****@*****.**'
#Entrez.api_key = os.environ["API_KEY"]
#update at times:
#ncbi.update_taxonomy_database()


def assembly_methods(blastline):
	# Infer the assembler and protein predictor from blast input or faa file
	seqname = blastline.split()[0].replace(">", "")
	assembler, predictor = "NA", "NA"
	if "::" in seqname:
		#format >TRINITY_DN7724_c0_g1::TRINITY_DN7724_c0_g1_i1::g.1::m.1 type:3prime_partial len:132 gc:universal TRINITY_DN7724_c0_g1_i1:113-505(+)
		predictor = "transdecoder-old" #old!
	elif re.search(r"\.p\d+", seqname):
		#format >c0_g2_i1.p1 type:3prime_partial len:319 gc:universal c0_g2_i1:175-1128(+)
		predictor = "transdecoder-new"

	if seqname.startswith("TRINITY"):
		assembler = "trinity-new"
		if seqname.split("_")[-2].startswith("i"):
			#format >TRINITY_DN61_c0_g2_i2_1 # 3 # 644 # -1 # ID=1_1;partial=10;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.372
			#prodigal appends a _number to seqid
			predictor = "prodigal"
コード例 #33
0
    else:
        #set_env()
        try:
            os.mkdir(OUTDIR)
        except OSError:
            print("Be careful : The directory taxonDB_data exists")
        TAXON_DT = {}

        DOC = search_in_database(PARAM.r, PARAM.dbName, PARAM.taxid)

        if DOC and DOC["_id"] != PARAM.taxid:
            DOC = None

        LIST_GCF = list(set([PARAM.gcf] + DOC["GCF"])) if DOC else [PARAM.gcf]

        ncbi = NCBITaxa()

        name = ncbi.get_taxid_translator([int(PARAM.taxid)])
        if not name:
            raise Exception("No correspondance for " + PARAM.taxid +
                            " in ete3 NCBITaxa")

        name = name[int(PARAM.taxid)]

        LIST_NAME = list(set([name] + DOC["names"])) if DOC else [name]

        tmp_taxon_dt = init_taxondt(LIST_GCF, PARAM.user, PARAM.taxid,
                                    PARAM.fasta, PARAM.gcf, LIST_NAME, name)
        if (tmp_taxon_dt != 1):
            TAXON_DT[PARAM.taxid] = tmp_taxon_dt
コード例 #34
0
#!/usr/bin/env python

# This script takes tab-separated UniProt IDs and taxids and finds the first (most recent)
# common ancestor (FCA) and the number of steps from the FCA to Eukaryota

# It produces 4 output files (summary report,ontology.tab, trees.tab)
#and a general newick file for taxa from all proteins

#This script uses ete3 for dealing with trees and graphs
import sys
from ete3 import Tree
from ete3 import NCBITaxa
from ete3 import PhyloTree

ncbi = NCBITaxa()
tax_dict = dict()
tree_dict = dict()
ontology = dict()
#creating output files
outputFile = open('trees.tab', 'w')
outputFile2 = open('summary_report.tab', 'w')
outputFile2.write('uni_id' + '\t' + 'FCA_id' + '\t' + 'FCA_name' + '\t' +
                  'steps_from_Eukaryota' + '\n')
outputFile3 = open('ontology.tab', 'w')
#Here I open the file that Matt script creates and loops in each line and get the taxids
with open('SP_by_taxa.tab', 'r') as fo:
    for line in fo:
        line = line.rstrip()
        (uniprotid, taxids) = line.split('\t')
        one_taxid = taxids.split(
            ',')  # divide the list of taxids to diff taxids 'strings'
コード例 #35
0
    def tax_id(lyst):
        """Resolve search terms to NCBI taxids and tabulate selected ranks.

        Each entry of ``lyst`` is looked up through Entrez (network access)
        to obtain a taxid and its taxonomy record; the local ete3 taxonomy is
        then used to fetch the names at a fixed set of ranks. The rows are
        returned as a pandas DataFrame whose first row is the header.

        NOTE(review): ``values``, ``pd`` and ``sys`` are not defined in this
        block; presumably they come from the enclosing scope -- confirm.
        NOTE(review): the table is only built under the nested
        ``if __name__ == '__main__'`` guard; when that guard is false ``out``
        is never assigned and the final ``pd.DataFrame(out)`` raises
        UnboundLocalError.
        """
        from Bio import Entrez

        def get_tax_id(species):
            """Return the first taxid that an Entrez taxonomy search finds
            for ``species``.

            Spaces in the name are replaced by ``+`` before searching.
            """
            species = species.replace(' ', "+").strip()
            search = Entrez.esearch(term=species, db="taxonomy", retmode="xml")
            record = Entrez.read(search)
            # NOTE(review): this parses as
            #   species != 'Not assigned' or ('root' and record['IdList'] != [])
            # and `species` no longer contains spaces at this point, so the
            # first operand is always true. An empty IdList therefore raises
            # IndexError below instead of falling through to return None.
            if species != 'Not assigned' or 'root' and record['IdList'] != []:
                return record['IdList'][0]

        def get_tax_data(taxid):
            """Fetch and parse the Entrez taxonomy record for ``taxid``."""
            search = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")
            return Entrez.read(search)

        Entrez.email = "*****@*****.**"
        if not Entrez.email:
            print("you must add your email address")
            sys.exit(2)
        # This literal list is overwritten by `lyst` right below, so it has
        # no effect on the result.
        species_list = [
            'Terrabacteria group', 'Helicobacter pylori 26695',
            'Thermotoga maritima MSB8', 'Deinococcus radiodurans R1',
            'Treponema pallidum subsp. pallidum str. Nichols',
            'Aquifex aeolicus VF5', 'Archaeoglobus fulgidus DSM 4304'
        ]
        species_list = lyst
        taxid_list = []  # Initiate the lists to store the data to be parsed in
        data_list = []
        lineage_list = []

        print('parsing taxonomic data...'
              )  # message declaring the parser has begun

        for species in species_list:
            print('\t' + species)  # progress messages

            taxid = get_tax_id(species)  # Apply your functions
            data = get_tax_data(taxid)
            if 'LineageEx' in data[0]:
                # Only the phylum entry of the extended lineage is kept.
                lineage = {
                    d['Rank']: d['ScientificName']
                    for d in data[0]['LineageEx'] if d['Rank'] in ['phylum']
                }
            else:
                # NOTE(review): `lineage` is not reset on this path, so the
                # append below reuses the previous species' value (or raises
                # NameError if this is the first species).
                print('ERROR:', species, 'not found in dictionary')
            taxid_list.append(
                taxid)  # Append the data to lists already initiated
            data_list.append(data)
            lineage_list.append(lineage)

        print('complete!')
        print()
        print('TaxId\'s:')
        print(taxid_list)
        print()

        from ete3 import NCBITaxa

        ncbi = NCBITaxa()

        def get_desired_ranks(taxid, desired_ranks):
            """Map ``'<rank>_id'`` to the taxid found at each desired rank.

            Ranks that do not occur in the lineage of ``taxid`` map to the
            string ``'<not present>'``.
            """
            lineage = ncbi.get_lineage(taxid)
            names = ncbi.get_taxid_translator(lineage)
            # `names` is a dict; iterating it yields its keys (the taxids).
            lineage2ranks = ncbi.get_rank(names)
            ranks2lineage = dict(
                (rank, taxid) for (taxid, rank) in lineage2ranks.items())
            return {
                '{}_id'.format(rank): ranks2lineage.get(rank, '<not present>')
                for rank in desired_ranks
            }

        if __name__ == '__main__':
            taxids = taxid_list
            desired_ranks = [
                'superkingdom', 'kingdom', 'class', 'family'
            ]  #, 'genus']  #['kingdom', 'phylum', 'class', 'order', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species', 'subspecies']
            # One row per taxid: [taxid as str, name at each desired rank...]
            results = list()
            for taxid in taxids:
                results.append(list())
                results[-1].append(str(taxid))
                ranks = get_desired_ranks(taxid, desired_ranks)
                for key, rank in ranks.items():
                    # `rank` is the taxid at that rank, or the placeholder.
                    if rank != '<not present>':
                        results[-1].append(
                            list(ncbi.get_taxid_translator([rank
                                                            ]).values())[0])
                    else:
                        results[-1].append(rank)

            #generate the header
            header = ['reads', 'Original search', 'Original_query_taxid']
            header.extend(desired_ranks)
            out = []
            #            print('\t'.join(header))
            out.append(header)

            #print the results
            # Rows are matched to `values` and `lyst` purely by position.
            for result, reads, term in zip(results, values, lyst):
                cnt = 0
                for i in result:
                    # Any field mentioning "bacter" forces slot 2 to
                    # 'Bacteria'; with the rank order above, slot 2 appears to
                    # be the 'kingdom' column -- TODO confirm intent.
                    if 'bacter' in i or 'Bacter' in i:
                        result[2] = 'Bacteria'
                    # Blank out the placeholder for missing ranks.
                    if i == '<not present>':
                        result[cnt] = ''
                    cnt += 1

                temp = [reads, term]
                temp.extend(result)

                out.append(temp)
        out = pd.DataFrame(out)
        return out
コード例 #36
0
               "Humphaplotropis culaishanensis" : "Humphaplotropis culaishanensis (nomen nudum)",
               "Paraglypturus tonganus" : "Paraglypturus tonganus (nomen nudum)",
               "Hoploplana elisabelloi" : "Hoploplana elisabelloi (nomen nudum)",
               "Palpitomonas bilix Eukaryota." : "Palpitomonas bilix",
               "Eukaryota sp. BB2 Eukaryota." : "Eukaryota sp. BB2",
               "Ancoracysta twista Eukaryota." : "Ancoracysta twista"
    }

# --- load a parser and iterator for our GenBank file
gb_handle = gzip.open(sys.argv[1], "r")
# -- a parser that will give you back SeqFeature objects
feature_parser = GenBank.FeatureParser()
iterator = GenBank.Iterator(gb_handle, feature_parser)

# load taxonomy for taxids
ncbi = NCBITaxa()

# output using prefix
out_1 = open("%s.cdna.fasta" % sys.argv[2], "w")
out_2 = open("%s.codons.tab" % sys.argv[2], "w")

strands = []
stop_codons = []
prot_ids = []
excluded = []
missing_id = []

# begin iterating through the file and getting GenBank records
while 1:
    # get a SeqFeature object for the next GenBank record. When we run
    # out of records in the file, cur_entry will be None
コード例 #37
0
parser = argparse.ArgumentParser()
parser.add_argument("--taxons",
                    "-t",
                    type=str,
                    help="File containing the list of species.")
parser.add_argument("--output",
                    "-o",
                    type=str,
                    help="Name of the output file in newick")
args = parser.parse_args()

if not args.output:
    args.output = "species_tree.nw"

# Setting up a local copy of the NCBI taxonomy database and upgrade it
ncbi = NCBITaxa()
#ncbi.update_taxonomy_database()

# Load the species names
try:
    with open(args.taxons, 'r') as taxFile:
        listTaxa = taxFile.readlines()
        listTaxa = [x.strip() for x in listTaxa]
        listTaxa = [x.split(" ") for x in listTaxa]
        listTaxa = list(set(itertools.chain(*listTaxa)))
        listTaxa = [x.replace("_", " ") for x in listTaxa]
except FileNotFoundError:
    print("File does not exist")
    sys.exit(1)

# Retrieve TaxId from species names
コード例 #38
0
def test_pick_higher_level():
    """_pick_higher_level must map taxid 9606 to taxid 7711."""
    taxonomy = NCBITaxa()
    eq_(7711, vica.minhash._pick_higher_level(9606, taxonomy))
コード例 #39
0
from ete3 import NCBITaxa
#http://etetoolkit.org/docs/2.3/tutorial/tutorial_ncbitaxonomy.html
ncbi = NCBITaxa()
from Bio import SeqIO

goodones = {"Panarthropoda"}
species = set()
goodscafs = {}
distribution = {"Panarthropoda": 0}
with open("besthits_NR.taxified.out") as infile, open("highertaxa.txt", "w") as outfile:
	table = infile.read().split("\n")
	for line in table:
		if len(line.split("\t")) != 1:
			line = line.split("\t")
			if line[1] != "N/A":
				taxid = line[1]
				lineage = ncbi.get_lineage(taxid)[2:]
				names = ncbi.get_taxid_translator(lineage)
				rank = [names[taxid] for taxid in lineage]
				# if "Eukaryota" in rank:
				# 	rank.remove("Eukaryota")
				if taxid not in species:
					species.add(taxid)
					#orgn = ncbi.get_taxid_translator([taxid])[int(taxid)]
					#print("{}\t{}".format(orgn, "_".join(rank)))
				if "Panarthropoda" in rank:
					orgn = ncbi.get_taxid_translator([taxid])[int(taxid)]
					outfile.write("{}\t{}\t{}\n".format(line[0], orgn, "_".join(rank)))
					goodscafs[line[0]] = orgn
					distribution["Panarthropoda"] += 1
				elif "Metazoa" in rank:
コード例 #40
0
def generate_consensus(input_blast_filename, input_fasta_filename, similarity,
                       outdir, target_rank, ud):
    """Attach lineage ranks to BLAST hits and write per-taxon outputs.

    The tab-separated BLAST table is read, every distinct ``staxids`` value
    is expanded to the ranks superkingdom..species via
    ``get_lineage_at_desired_ranks``, and the result is merged back onto the
    hits. The merged table is written to ``<outdir>/blast_lineage.tab``,
    ``write_taxon_fasta`` is applied to each group of hits, and the taxon
    counts are written to ``<outdir>/taxon_abundance.csv``.

    :param input_blast_filename: headerless, tab-separated BLAST output with
        the 14 columns named below.
    :param input_fasta_filename: FASTA file holding the query sequences.
    :param similarity: forwarded unchanged to ``write_taxon_fasta``.
    :param outdir: directory receiving the output files.
    :param target_rank: rank column to group and count by. ``'strain'`` is
        special: hits are grouped by ``'species'`` and abundance is counted
        on the ``'strain'`` column returned by ``retrieve_cluster``.
    :param ud: forwarded unchanged to ``write_taxon_fasta``.
    :returns: the merged hits/lineage DataFrame.
    """
    # read in blast result
    blast_result = pd.read_table(input_blast_filename, header=None)
    blast_result.columns = [
        'molecule_id', 'reference_id', 'pid', 'qcovhsp', 'length', 'mismatch',
        'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore',
        'staxids'
    ]

    # initiate an instance of ncbi taxonomy database
    # (not used directly below; kept because constructing it is what the
    # original did before any lineage lookup -- presumably to make sure the
    # local database exists)
    ncbi = NCBITaxa()

    # retrieve lineage information for each full length 16S molecule
    desired_ranks = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species'
    ]
    ranks = pd.DataFrame(columns=['staxids'] + desired_ranks)
    blast_result_staxids = blast_result['staxids'].unique()
    ranks['staxids'] = blast_result_staxids
    for i, taxid in enumerate(blast_result_staxids):
        # multi-valued entries look like "id1;id2": keep the first id
        if not str(taxid).isdigit():
            taxid = taxid.split(';')[0]
        # DataFrame.ix was removed in pandas 1.0; the row labels here are a
        # default 0..n-1 range, so positional .iloc addresses the same cells.
        ranks.iloc[i, 1:len(desired_ranks) + 1] = get_lineage_at_desired_ranks(
            taxid, desired_ranks)

    # merge lineage information with PacBio 16S blast results
    blast_lineage = blast_result.merge(ranks, on='staxids', how='left')

    seq_dict = SeqIO.to_dict(SeqIO.parse(input_fasta_filename, 'fasta'))

    blast_lineage_filename = outdir + '/blast_lineage.tab'
    blast_lineage.to_csv(blast_lineage_filename, sep='\t', index=False)

    is_strain = target_rank == 'strain'

    # strains have no rank column of their own, so they are grouped by species
    group_column = 'species' if is_strain else target_rank
    blast_lineage.groupby([group_column]).apply(write_taxon_fasta,
                                                taxon=target_rank,
                                                seq_dict=seq_dict,
                                                similarity=similarity,
                                                outdir=outdir,
                                                ud=ud)

    # retrieve_cluster is called for every target rank, as before; its result
    # is only needed for the strain-level abundance table.
    blast_lineage_strain = retrieve_cluster(blast_lineage_filename, outdir,
                                            'F')
    if is_strain:
        abundance_source = blast_lineage_strain['strain']
    else:
        abundance_source = blast_lineage[target_rank]
    taxon_abundance = abundance_source.value_counts().reset_index()
    taxon_abundance.columns = ['taxid', 'counts']
    taxon_abundance.to_csv(outdir + '/taxon_abundance.csv', index=False)
    return blast_lineage
コード例 #41
0
def contig_tax(annot_df, min_prot, prop_annot, tax_thres):
	'''Build a per-contig taxonomy table from ViPhOG annotations.

	Takes the annotation table generated by viral_contig_maps.py and, for
	each contig, proposes a taxon at the genus, subfamily, family and order
	ranks based on the labels of its annotated proteins.

	:param annot_df: annotation table; the "Contig" and "Label" columns are read.
	:param min_prot: minimum number of labelled rows a contig must have.
	:param prop_annot: minimum labelled fraction of the contig's rows.
	:param tax_thres: minimum share of the labelled rows the most frequent
		taxon must reach to be reported.
	:returns: DataFrame with columns contig_ID, genus, subfamily, family,
		order. A rank cell holds the winning taxon name (ties joined with
		"-"), the top ratio as a string when it is below tax_thres, or ""
		when nothing could be assigned.
	'''

	ncbi = NCBITaxa()
	tax_rank_order = ["genus", "subfamily", "family", "order"]
	contig_list = list(annot_df["Contig"].value_counts().index)
	df_rows = []

	def get_tax_rank(label):
		'''Return the NCBI rank of label, or "" when it cannot be resolved.'''
		try:
			tax_id = ncbi.get_name_translator([label])[label]
			return ncbi.get_rank(tax_id)[tax_id[0]]
		except Exception:
			# unknown names and database errors both mean "no rank"
			return ""

	def ancestor_at_rank(label, rank):
		'''Return the name of label's ancestor at rank, or None if there is none.'''
		try:
			name2taxid = ncbi.get_name_translator([label])
			label_lineage = ncbi.get_lineage(name2taxid[label][0])
			lineage_names = ncbi.get_taxid_translator(label_lineage)
			lineage_ranks = ncbi.get_rank(label_lineage)
			for taxid, taxid_rank in lineage_ranks.items():
				if taxid_rank == rank:
					return lineage_names[taxid]
		except Exception:
			# a label that cannot be resolved simply contributes no hit
			pass
		return None

	def summarize(tax_hits, total_annot_prot):
		'''Turn the hit counts of one rank into the value of its table cell.'''
		if not tax_hits:
			return ""
		top_count = max(tax_hits.values())
		annot_ratio = top_count / total_annot_prot
		if annot_ratio < tax_thres:
			return str(annot_ratio)
		max_tax = [name for name, count in tax_hits.items() if count == top_count]
		if len(max_tax) > 1:
			return "-".join(max_tax)
		return max_tax[0]

	for contig in contig_list:
		assigned_taxa = [contig]
		contig_df = annot_df[annot_df["Contig"] == contig]
		filtered_df = contig_df[contig_df["Label"].notnull()]
		filtered_df = filtered_df.reset_index(drop = True)
		total_annot_prot = len(filtered_df)
		if total_annot_prot < max(min_prot, prop_annot * len(contig_df)):
			# too few labelled proteins: leave every rank empty
			assigned_taxa.extend([""] * len(tax_rank_order))
		else:
			filtered_df["Rank"] = filtered_df["Label"].apply(get_tax_rank)
			for item in tax_rank_order:
				tax_hits = {}
				for _, row in filtered_df.iterrows():
					if row["Rank"] == item:
						hit = row["Label"]
					elif item == "genus":
						# genus counts only labels that are themselves genera
						hit = None
					else:
						# other ranks also count labels whose lineage passes
						# through a taxon of that rank
						hit = ancestor_at_rank(row["Label"], item)
					if hit is not None:
						tax_hits[hit] = tax_hits.get(hit, 0) + 1
				assigned_taxa.append(summarize(tax_hits, total_annot_prot))
		df_rows.append(assigned_taxa)
	final_df = pd.DataFrame(df_rows, columns = ["contig_ID", "genus", "subfamily", "family", "order"])
	return final_df
コード例 #42
0
# Write the accession versions whose taxid lineage contains `target_rank`.
from ete3 import NCBITaxa

ncbi = NCBITaxa()

infile = "accesionsetc/40117.prot.accession2taxid"
pre = "accesionsetc/"

# taxid that must appear in a record's lineage, and the label used to name
# the output file
target_rank = 1783270
target_name = 'FCB'

# Text mode so the tab split works on str under Python 3 as well; both files
# are closed by the with-statement even if a lookup fails unexpectedly.
with open(infile, "r") as data, open(target_name + '_Acc.txt', 'w') as outfile:
    next(data, None)  # skip the header line (tolerate an empty file)
    for line in data:
        entry = line.strip().split("\t")
        Taxon = entry[2]
        Version = entry[1]
        if Version.lower() == "na":
            continue
        try:
            rank_list = ncbi.get_lineage(Taxon)
        except ValueError:
            # taxid is not a number or is unknown to the local taxonomy
            continue
        # get_lineage yields nothing for an empty taxid field
        if rank_list and target_rank in rank_list:
            outfile.write(Version + "\t" + str(Taxon) + "\n")
コード例 #43
0
def tdb_from_hits(hits, minPerc=50, testing=False):
    '''
    Determines the lowest taxonomic level with at least minPerc certainty

    For every hit:
        reconstruct the lineage (kingdom, phylum, class, ect.)
        add a count to every rank in the lineage

    For every rank:
        see if the number of hits matching one taxa at that rank is above the minPerc
        the denominator for this equation is the number of hits that have a phyla rank

    * Note: this is complicated because some lower ranks don't have higher ranks
        For example, species [Eubacterium] rectale (taxID 39491) has no genus
        Also, species [artifical construct] (taxID 32630) has no anything but species

    :param hits: table with a 'taxID' column; entries equal to 0 are ignored.
    :param minPerc: minimum tax_confidence (in percent) for a level to qualify
        as the best hit.
    :param testing: unused; kept for interface compatibility.
    :returns: DataFrame with one row per level and the columns tax_ID,
        tax_confidence, tax_level, taxonomy, best_hit and full_tax. When no
        level reaches minPerc, best_hit and full_tax are False on every row.
    '''

    from ete3 import NCBITaxa
    ncbi = NCBITaxa()

    Levels = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species'
    ]

    def lineage_ranks(taxid):
        '''Return (lineage, taxid -> rank) for taxid.'''
        lin = ncbi.get_lineage(taxid)
        return lin, ncbi.get_rank(lin)

    # generate nested dictionary for levels: level -> {taxid: hit count}
    countDic = {level: {} for level in Levels}

    # fill in nested dictionary
    for t in hits['taxID'].tolist():
        if t == 0:
            continue

        # This try / except thing is trying to catch sporatic errors of:
        # sqlite3.OperationalError: disk I/O error
        # so the lookup is retried once after a short pause.
        try:
            lin, lin2rank = lineage_ranks(t)
        except Exception:
            time.sleep(1)
            lin, lin2rank = lineage_ranks(t)

        for i in lin:
            rank = lin2rank[i]
            if rank in countDic:
                countDic[rank][i] = countDic[rank].get(i, 0) + 1

    # make the table
    total = sum(countDic['phylum'].values())
    table = {
        'tax_ID': [],
        'tax_confidence': [],
        'tax_level': [],
        'taxonomy': []
    }

    for level in Levels:
        dic = countDic[level]

        if not dic:
            table['tax_ID'].append(None)
            table['tax_confidence'].append(0)
            table['tax_level'].append(level)
            table['taxonomy'].append('unk')
            continue

        # most frequent taxid at this level (first one wins on ties)
        top = max(dic, key=dic.get)
        count = dic[top]

        lin, lin2rank = lineage_ranks(top)
        lin2name = ncbi.get_taxid_translator(lin)
        rank2taxid = {v: k for k, v in lin2rank.items()}
        tax = lin2name[rank2taxid[level]]

        # No hit with a phylum means there is no denominator; report zero
        # confidence rather than dividing by zero.
        confidence = (count / total) * 100 if total else 0

        table['tax_ID'].append(top)
        table['tax_confidence'].append(confidence)
        table['tax_level'].append(level)
        table['taxonomy'].append(tax)

    tdb = pd.DataFrame(table)

    # find and mark the best hit: the lowest level that reaches minPerc
    confident = tdb['tax_ID'][tdb['tax_confidence'] >= minPerc].tolist()
    if confident:
        best = confident[-1]
        tdb['best_hit'] = [True if i == best else False for i in tdb['tax_ID']]
    else:
        # nothing is confident enough, so no row is the best hit
        tdb['best_hit'] = False

    # get the full taxonomy for the best hit
    tdb['full_tax'] = [lineage_from_taxId(t) if b else False for t, b in zip(\
            tdb['tax_ID'], tdb['best_hit'])]

    return tdb
コード例 #44
0
#!/usr/bin/env python

import click
from ete3 import NCBITaxa
import pandas as pd

from micone import Lineage

NCBI = NCBITaxa()
TAX_LEVELS = Lineage._fields
TAX_MAP = {
    "Kingdom": "superkingdom",
    "Phylum": "phylum",
    "Class": "class",
    "Order": "order",
    "Family": "family",
    "Genus": "genus",
    "Species": "species",
}


def get_lineage(species_name):
    taxid = NCBI.get_name_translator([species_name])[species_name][0]
    lineage_taxids = NCBI.get_lineage(taxid)
    lineage_names = NCBI.get_taxid_translator(lineage_taxids)
    lineage_ranks = {v: k for k, v in NCBI.get_rank(lineage_taxids).items()}
    lineage_dict = dict()
    for tax_level in TAX_LEVELS:
        try:
            rank_taxid = lineage_ranks[TAX_MAP[tax_level]]
            rank_name = lineage_names[rank_taxid]
コード例 #45
0
#!/usr/bin/env python3
"""Read taxids from stdin, one per line, and print each with its named lineage."""

import sys
from ete3 import NCBITaxa

ncbi = NCBITaxa()

for raw_line in sys.stdin:
    query = raw_line.strip()
    track = ncbi.get_lineage(query)
    translator = ncbi.get_taxid_translator(track)

    # The first lineage entry is left out of the printed path.
    labels = []
    for node in track[1:]:
        labels.append(translator[node])

    print("{}\t{}".format(query, "; ".join(labels)))
コード例 #46
0
def main(argv):
    """Summarise marker read counts per taxon and write two reports.

    NOTE: ``argv`` is accepted but never read; every path is taken from
    ``sys.argv`` directly:

    * ``sys.argv[1]`` -- tab-separated file; column 0 is a marker sequence
      id, column 1 its taxid.
    * ``sys.argv[2]`` -- passed as the only argument to ``NCBITaxa(...)``.
    * ``sys.argv[3]`` -- tab-separated file keyed by its first column (looked
      up by tree node name); column 1 and the last column are comma-separated
      lists.
    * ``sys.argv[4]`` -- per-sequence count table with one header line and the
      columns: sequence, read count, correct bases, incorrect bases, total
      bases, subject length, coverage.
    * ``sys.argv[5]`` -- output: indented taxonomy tree report.
    * ``sys.argv[6]`` -- output: per-taxon table, sorted by the percentage of
      observed markers (descending).

    If the count table has no data rows, an explanatory message is written to
    both output files and the process exits through ``sys.exit()``.

    Raises:
        KeyError: a counted sequence is missing from ``sys.argv[1]``, or a
            printed tree node is missing from ``sys.argv[3]``.
        ZeroDivisionError: a taxon has zero total bases or zero subject length.
    """

    def first_value(mapping):
        # Every NCBI lookup below is made with a single id, so the answer is
        # the first value of the returned dict (IndexError if it is empty).
        return list(mapping.values())[0]

    # taxid_seqs: {taxid: [seq1, seq2]} -- every marker sequence of a taxid
    # seq_taxids: {seq: taxid}
    taxid_seqs = {}
    seq_taxids = {}
    with open(sys.argv[1]) as marker_file:
        for line in marker_file:
            fields = line.strip('\n').split('\t')
            seq, taxid = fields[0], fields[1]
            taxid_seqs.setdefault(taxid, []).append(seq)
            seq_taxids[seq] = taxid

    # initialize NCBI taxdb
    ncbi = NCBITaxa(sys.argv[2])

    # seq_counts[seq] = [readcount, correct_bases, total_bases, seqlen, coverage]
    seq_counts = {}
    # Observed taxids as ints, in first-seen order.  The companion set makes
    # the de-duplication effective: taxids are strings in seq_taxids, so they
    # must be converted before being compared with the stored ints.
    seen_taxids = []
    seen_lookup = set()
    counter = 0
    with open(sys.argv[4]) as countfile:
        countfile.readline()  # skip the header line
        for line in countfile:
            counter += 1
            fields = line.strip('\n').split('\t')
            seq = fields[0]
            seq_counts[seq] = [
                int(fields[1]),    # read count
                int(fields[2]),    # correct bases
                int(fields[4]),    # total bases
                int(fields[5]),    # subject length
                float(fields[6]),  # coverage
            ]
            taxid = int(seq_taxids[seq])
            if taxid not in seen_lookup:
                seen_lookup.add(taxid)
                seen_taxids.append(taxid)

    if counter == 0:
        message = "Empty read count file. Likely no aligned reads in sample."
        print(message)
        # both outputs still have to be written
        for out_path in (sys.argv[5], sys.argv[6]):
            with open(out_path, 'w') as f:
                f.write(message + '\n')
        sys.exit()

    # create NCBI taxon tree of observed taxa and extend it with the lineage
    # of its root
    tree = ncbi.get_topology(seen_taxids)
    tree_root = tree.get_tree_root().name
    lineage = ncbi.get_lineage(tree_root)
    full_taxids = seen_taxids + lineage
    full_tree = ncbi.get_topology(full_taxids, intermediate_nodes=True)

    # full_seq_taxids: {taxid: [[specific buscos], [specific + inherited buscos]]}
    full_seq_taxids = {}
    with open(sys.argv[3]) as busco_file:
        for line in busco_file:
            fields = line.split('\t')
            full_seq_taxids[fields[0]] = [
                fields[1].split(','),
                fields[-1].strip('\n').split(','),
            ]

    # taxid_counts: {taxid: [[marker, readcount, correct_bases, total_bases, seqlen, coverage]]}
    taxid_counts = {}
    for seq, stats in seq_counts.items():
        taxid_counts.setdefault(seq_taxids[seq], []).append([seq] + stats)

    # taxon_coverage[taxon] = [observed_markers, readcounts, total_bases,
    #                          percentage_markers, marker_coverage, percent_id]
    taxon_coverage = {}
    for tax, entries in taxid_counts.items():
        mc = len(entries)
        counts = sum(entry[1] for entry in entries)
        correct = sum(entry[2] for entry in entries)
        total_bases = sum(entry[3] for entry in entries)
        subj_len = sum(entry[4] for entry in entries)
        percent_identity = round((correct / total_bases) * 100, 2)
        overall_coverage = round((total_bases / subj_len) * 100, 2)
        marker_percentage = round(mc / len(taxid_seqs[tax]) * 100, 2)
        taxon_coverage[tax] = [
            mc, counts, total_bases, marker_percentage, overall_coverage,
            percent_identity
        ]

    # per-taxon table, highest percentage of observed markers first
    marker_sorted = sorted(taxon_coverage.keys(),
                           reverse=True,
                           key=lambda x: taxon_coverage[x][3])
    with open(sys.argv[6], 'w') as dest:
        dest.write(
            "Name\tObserved_markers\tRead_counts\tPercent_observed_markers\tTotal_marker_coverage\tPercent_identity\n"
        )
        for tax in marker_sorted:
            name = first_value(ncbi.get_taxid_translator([tax]))
            mc, counts, _, marker_percentage, overall_coverage, \
                percent_identity = taxon_coverage[tax]
            dest.write('\t'.join([
                name,
                str(mc),
                str(counts),
                str(marker_percentage) + '%',
                str(overall_coverage) + '%',
                str(percent_identity) + '%',
            ]) + '\n')

    # pool the marker entries of every descendant into each internal node;
    # leaves without any counts are remembered so they are not printed
    orphan_children = set()
    for node in full_tree.traverse():
        if node.is_leaf():
            if node.name not in taxid_counts:
                orphan_children.add(node.name)
            continue
        pooled = taxid_counts.setdefault(node.name, [])
        for desc in node.iter_descendants():
            for entry in taxid_counts.get(desc.name, ()):
                if entry not in pooled:
                    pooled.append(entry)

    # print the tree; the indentation is tracked per parent so that siblings
    # visited after a deeper subtree return to their parent's column
    currspaces = 0
    currparent = ''
    seen_parents = {}
    with open(sys.argv[5], 'w') as dest:
        dest.write(
            "Markers_Obs\tTotal_Markers\tPercent_Makers_Obs\tPercent_ID\tMarker_read_count\tRank\tName\n"
        )
        for node in full_tree.traverse("preorder"):
            if node.name in orphan_children:
                continue
            rank = first_value(ncbi.get_rank([node.name]))
            name = first_value(ncbi.get_taxid_translator([node.name]))
            if node.is_root():
                currspaces = 0
            elif currparent == '':
                currparent = node.up.name
                currspaces += 4
            elif currparent != node.up.name:
                currparent = node.up.name
                if currparent in seen_parents:
                    currspaces = seen_parents[currparent]
                else:
                    currspaces += 4
                    seen_parents[currparent] = currspaces
            if node.name in taxon_coverage:
                pid = str(taxon_coverage[node.name][5]) + '%'
            else:
                pid = "NA"
            entries = taxid_counts[node.name]
            buscos = len(entries)
            seqs = sum(entry[1] for entry in entries)
            total_buscos = len(full_seq_taxids[node.name][1])
            percent = round((buscos / total_buscos) * 100, 2)
            dest.write('\t'.join([
                str(buscos),
                str(total_buscos),
                str(percent) + '%',
                pid,
                str(seqs),
                rank,
                ' ' * currspaces + name,
            ]) + '\n')
コード例 #47
0
def parse_args(parser):
    """Parse the command line and validate / normalise option combinations.

    The ``print`` calls are written so that they are valid, and emit the same
    text, on both Python 2 and Python 3.

    :param parser: an argparse-style parser exposing ``parse_args()`` and
        ``error(message)``.

    :returns: the parsed namespace, after these adjustments: ``cpu == 0`` is
        replaced by the machine's CPU count, ``servermode`` forces ``usemem``,
        ``annotate_hits_table`` forces ``no_search=True`` / ``no_annot=False``
        and ``guessdb`` may set ``db``.

    :raises emapperException: when a required database file is missing.

    Invalid option combinations are reported through ``parser.error``; with
    ``--version`` the version is printed and the process exits with status 0.
    """
    args = parser.parse_args()

    if args.version:
        print(get_version())
        sys.exit(0)

    if not args.no_annot and not pexists(EGGNOGDB_FILE):
        print(colorify(
            'Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it',
            'red'))
        raise emapperException()

    if args.mode == 'diamond' and not pexists(EGGNOG_DMND_DB):
        print(colorify(
            'DIAMOND database data/eggnog_proteins.dmnd not present. Use download_eggnog_database.py to fetch it',
            'red'))
        raise emapperException()

    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # No --servermode available for diamond
    if args.mode == 'diamond' and args.servermode:
        parser.error(
            '--mode [diamond] and --servermode are mutually exclusive')

    # Output file required unless running in servermode
    if not args.servermode and not args.output:
        parser.error('An output project name is required (-o)')

    # Servermode implies using mem-based databases
    if args.servermode:
        args.usemem = True

    # Direct annotation implies no searches
    if args.annotate_hits_table:
        args.no_search = True
        args.no_annot = False

    # Check inputs for running sequence searches
    if not args.no_search and not args.servermode:
        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # HMM
        if args.mode == 'hmmer':
            if not args.db and not args.guessdb:
                parser.error(
                    'HMMER mode requires specifying a target database (i.e. -d, --guessdb ))'
                )
            if args.db and args.guessdb:
                parser.error('-d and --guessdb options are mutually exclusive')

            if args.guessdb:
                from ete3 import NCBITaxa
                ncbi = NCBITaxa()
                lineage = ncbi.get_lineage(args.guessdb)
                # Walk the lineage in reverse and use the first taxid that
                # has an entry in TAXID2LEVEL.
                for tid in reversed(lineage):
                    if tid in TAXID2LEVEL:
                        # Same output as the Python 2 statement
                        # `print tid, TAXID2LEVEL[tid]`.
                        print("%s %s" % (tid, TAXID2LEVEL[tid]))
                        args.db = TAXID2LEVEL[tid]
                        break
        # DIAMOND
        elif args.mode == 'diamond':
            # diamond mode currently accepts -d / --guessdb without complaint
            pass

    return args
コード例 #48
0
#!/usr/bin/env python
# Originally from Magpy. Altered to fit this pipeline

import sys

from ete3 import NCBITaxa

# get NCBI taxonomy object
ncbi = NCBITaxa()

# Abort when the script is started without any command-line argument.
# NOTE(review): the paths below come from `snakemake`, not from sys.argv --
# confirm this guard is still wanted.
if len(sys.argv) == 1:
	print("Please provide a filename")
	sys.exit()


# input / output locations supplied by the `snakemake` object
# NOTE(review): no open() call is visible here; readline() below is invoked
# directly on snakemake.input[0] -- confirm it is a file object, not a path.
checkm_file = snakemake.input[0]
outfile = snakemake.output[0]

# read the first line of the input
# NOTE(review): the original comment said "skip three lines", but only one
# readline() call is visible here.
row1 = checkm_file.readline()

# print titles for the output
titles = ["name",
		"nprots",
		"nhits",
		"nfull",
		"genus",
		"ngenus",
		"species",
		"nspecies",
コード例 #49
0
ファイル: ete_ncbiquery.py プロジェクト: Ward9250/ete
def run(args):
    """Resolve the search terms to NCBI taxids and print the requested report.

    Each entry of ``args.search`` that parses as an integer is used as a
    taxid; every other entry is treated as a taxon name and translated through
    ``NCBITaxa.get_name_translator`` (and, when ``args.fuzzy`` is set, through
    ``get_fuzzy_name_translation`` for the names that were not found).

    Exactly one report is produced, checked in this order:

    * ``args.tree`` (the default when no report flag is set) -- dump a tree:
      the descendant tree of the taxid when there is only one, otherwise the
      topology connecting all of them.
    * ``args.descendants`` -- one line per taxid with its descendants.
    * ``args.info`` -- one line per translated taxid with its lineage.

    Logs an error and exits with status -1 when ``args.search`` is empty.
    """
    # add lineage profiles/stats

    import re
    from ete3 import PhyloTree, NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    queries = []
    # Filled only for names resolved through fuzzy matching; they must exist
    # before the fuzzy loop below assigns into them.
    name2realname = {}
    name2score = {}

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        queries.append(n)
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    for hit in name2tax.values():
        # A name may translate to a single taxid or to a collection of them
        # (a list cannot be used as a dict key), so accept both forms.
        if isinstance(hit, (list, tuple, set)):
            for tid in hit:
                all_taxids[tid] = None
        else:
            all_taxids[hit] = None

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" %sim
        # Names rescued by the fuzzy search are no longer untranslated.
        not_found_names = all_names - set(name2tax.keys())

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" %','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            # dict.keys() is not an iterator, so wrap the dict in iter().
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit, return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                              intermediate_nodes=args.full_lineage,
                              rank_limit=args.rank_limit,
                              collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Keep the bare taxid as a feature before the node name is
            # rewritten to "<scientific name> - <taxid>".
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit)
            # str(): the fallback for an untranslated taxid is the int itself,
            # which str.join() would reject.
            print('\t'.join([str(taxid), str(translator.get(taxid, taxid)), ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))

    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in six.iteritems(translator):
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''), named_lineage, lineage_string]))