Example #1
1
  def test_get_topology(self):
    """Check the leaf names of topologies built by ``NCBITaxa.get_topology``.

    Two taxid lists are submitted; for each resulting tree the sorted leaf
    names must equal the expected list of taxid strings.
    """
    taxa_db = NCBITaxa(dbfile=DATABASE_PATH)
    first_tree = taxa_db.get_topology([9606, 7507, 9604])
    second_tree = taxa_db.get_topology([9606, 7507, 678])

    # 9604 is requested but is not expected among the leaves -- presumably
    # because it is an ancestor of 9606 in the taxonomy (TODO confirm).
    first_leaves = sorted(first_tree.get_leaf_names())
    self.assertEqual(first_leaves, ["7507", "9606"])

    # Leaf names are strings, so the expected order is lexicographic.
    second_leaves = sorted(second_tree.get_leaf_names())
    self.assertEqual(second_leaves, ["678", "7507", "9606"])
Example #2
0
def get_desired_ranks(taxid, desired_ranks):
    """Return, for each requested rank, the lineage taxid holding that rank.

    The lineage of ``taxid`` is fetched from the NCBI taxonomy, every lineage
    member is mapped to its rank, and that mapping is inverted so it can be
    queried by rank name.

    :param taxid: taxid whose lineage is inspected.
    :param desired_ranks: iterable of rank names to look up.
    :returns: list parallel to ``desired_ranks``; a rank that does not occur
        in the lineage is reported as the string ``'0'``.
    """
    taxa_db = NCBITaxa()
    lineage_taxids = taxa_db.get_lineage(taxid)
    taxid_to_name = taxa_db.get_taxid_translator(lineage_taxids)
    # A dict is handed to get_rank here; presumably only its keys (the
    # taxids) are consumed -- verify against the NCBITaxa API.
    taxid_to_rank = taxa_db.get_rank(taxid_to_name)
    # When several lineage members share a rank, the last one iterated wins.
    rank_to_taxid = {rank: member for member, rank in taxid_to_rank.items()}
    return [rank_to_taxid.get(wanted, '0') for wanted in desired_ranks]
Example #3
0
    def annotate_ncbi_taxa(self, taxid_attr='species', tax2name=None,
                           tax2track=None, tax2rank=None, dbfile=None):
        """Add NCBI taxonomy annotation to all descendant nodes.

        This is a thin wrapper: a ``NCBITaxa`` handle is opened on ``dbfile``
        and the work is delegated to its ``annotate_tree`` method, whose
        result is returned unchanged.

        Leaf nodes are expected to carry a feature (``species`` by default,
        see ``taxid_attr``) encoding a valid taxid number.

        All descendant nodes (including internal nodes) are annotated with
        the following new features:

        `Node.spname`: scientific species name as encoded in the NCBI
        taxonomy database

        `Node.named_lineage`: the NCBI lineage track using scientific names

        `Node.taxid`: NCBI taxid number

        `Node.lineage`: same as named_lineage but using taxid codes.

        Note that for internal nodes, NCBI information will refer to the
        first common lineage of the grouped species.

        :param taxid_attr: name of the node feature that holds the taxid
            number of each node.

        :param None tax2name: optional dictionary mapping taxid numbers to
            NCBI scientific names. Supplying it avoids database queries when
            annotating many trees containing the same set of taxids.

        :param None tax2track: optional dictionary mapping taxid numbers to
            NCBI lineage tracks (taxids); same purpose as ``tax2name``.

        :param None tax2rank: optional dictionary mapping taxid numbers to
            NCBI rank names; same purpose as ``tax2name``.

        :param None dbfile: if provided, this file is used as the local copy
            of the NCBI taxonomy database.

        :returns: tax2name (a dictionary translating taxid numbers into
            scientific names), tax2lineage (a dictionary translating taxid
            numbers into their NCBI lineage track) and tax2rank (a dictionary
            translating taxid numbers into rank names).

        """
        annotate_options = dict(taxid_attr=taxid_attr,
                                tax2name=tax2name,
                                tax2track=tax2track,
                                tax2rank=tax2rank)
        return NCBITaxa(dbfile=dbfile).annotate_tree(self, **annotate_options)
def get_family(taxid):
    """Return the family-level taxid found in the lineage of ``taxid``.

    :param taxid: taxid whose lineage is searched.
    :returns: the first lineage member (in the iteration order of the rank
        mapping) whose rank is ``'family'``; the string ``"None"`` -- not the
        ``None`` object -- when the lineage holds no family.
    """
    taxa_db = NCBITaxa()
    rank_by_taxid = taxa_db.get_rank(taxa_db.get_lineage(taxid))
    family_members = (member for member, rank in rank_by_taxid.items()
                      if rank == 'family')
    return next(family_members, "None")
    def get_rank(self, rank_level):
        """Return the taxid at ``rank_level`` in the lineage of ``self.tax_id``.

        :param rank_level: rank name to search for (compared with ``==``
            against the ranks reported for the lineage).
        :returns: the first matching lineage taxid, or the string ``"N/A"``
            when no lineage member has that rank.
        """
        taxa_db = NCBITaxa()
        rank_by_taxid = taxa_db.get_rank(taxa_db.get_lineage(self.tax_id))
        for member, member_rank in rank_by_taxid.items():
            if member_rank == rank_level:
                return member
        return "N/A"
Example #6
0
def get_rank(taxid, rank_level):
    """Return the taxid at ``rank_level`` in the lineage of ``taxid``.

    :param taxid: taxid whose lineage is searched.
    :param rank_level: rank name to search for.
    :returns: the first lineage taxid carrying that rank, or the string
        ``"None"`` (not the ``None`` object) when there is no such member.
    """
    taxa_db = NCBITaxa()
    lineage_taxids = taxa_db.get_lineage(taxid)
    rank_by_taxid = taxa_db.get_rank(lineage_taxids)
    matching = [member for member in rank_by_taxid
                if rank_by_taxid[member] == rank_level]
    return matching[0] if matching else "None"
Example #7
0
    def test_ncbiquery(self):
        """Translate two species between taxid and scientific name.

        The species names had been mangled to ``H**o sapiens`` /
        ``h**o sapiens``, which can never match the taxonomy database; they
        are restored to the real name of taxid 9606.
        """
        ncbi = NCBITaxa(dbfile=DATABASE_PATH)

        # Taxids are submitted as strings but the result is keyed by int.
        id2name = ncbi.get_taxid_translator(["9606", "7507"])
        self.assertEqual(id2name[7507], "Mantis religiosa")
        self.assertEqual(id2name[9606], "Homo sapiens")

        # The lower-case query is intentional: the result is expected to be
        # keyed by the name exactly as it was submitted.
        name2id = ncbi.get_name_translator(["Mantis religiosa", "homo sapiens"])
        # NOTE(review): some ete releases return a list of taxids per name
        # instead of a single int -- confirm against the installed version.
        self.assertEqual(name2id["Mantis religiosa"], 7507)
        self.assertEqual(name2id["homo sapiens"], 9606)
Example #8
0
def taxo_msa(outfile='taxo_msa.svg', taxids=None, annotation='', msa=None, title='', width=2000):
    """
    Visualize MSA together with a taxonomy tree.

    A topology is built from ``taxids``, an extra leaf named 'annotation' is
    attached to it, and every leaf is drawn with its sequence (or the
    annotation string) aligned to the right of the tree.

    outfile - path handed to the tree's ``render`` method
    taxids - list of taxids in the same order as seqs in msa
    annotation - string drawn on the 'annotation' row; when empty, a blank
                 row as long as the first sequence of msa is drawn instead
    msa - indexable collection of records exposing ``.seq`` and ``.id``
    title - text placed in the title area of the figure
    width - width handed to ``render``
    """
    if taxids is None:
        taxids = []
    if msa is None:
        msa = []
    # A real list (not a lazy map object) so it can be traversed repeatedly.
    taxids = [int(taxid) for taxid in taxids]

    # Row of the FIRST occurrence of every taxid, mirroring list.index but
    # without rescanning the list for each rendered leaf.
    row_of_taxid = {}
    for row, taxid in enumerate(taxids):
        row_of_taxid.setdefault(taxid, row)

    ncbi = NCBITaxa()
    t = ncbi.get_topology(taxids, intermediate_nodes=False)
    annotation_node = t.add_child(name='annotation')
    annotation_node.add_feature('sci_name', 'annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()

    def make_seq_face(sequence):
        # One "seq" motif spanning the whole string.
        return SeqMotifFace(sequence, [[0, len(sequence), "seq", 10, 10, None, None, None]], scale_factor=1)

    def layout(node):
        # Only a handful of major ranks get a label on top of their branch.
        if getattr(node, "rank", None) in ('order', 'class', 'phylum', 'kingdom'):
            rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
            node.add_face(rank_face, column=0, position="branch-top")

        if not node.is_leaf():
            return

        sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
        node.add_face(sciname_face, column=0, position="branch-right")

        if node.name == 'annotation':
            # Header row: the annotation string, or blanks of alignment width.
            s = annotation if annotation else ' ' * len(msa[0].seq)
            label = ' ' + 'SEQ_ID'
        else:
            # Leaf names are taxids; map them back to the matching MSA row.
            record = msa[row_of_taxid[int(node.name)]]
            s = str(record.seq)
            label = ' ' + record.id

        add_face_to_node(make_seq_face(s), node, 0, position="aligned")
        add_face_to_node(TextFace(label), node, column=1, position="aligned")

    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render(outfile, w=width, dpi=300, tree_style=ts)
Example #9
0
def getNcbiTaxonomy():
	"""Write the named NCBI lineage of every organism to OUTPUT_FILE.

	Each name in ORGANISM_NAMES_LIST is translated to its taxid(s); for every
	taxid one tab-separated line holding the scientific names along its
	lineage is written. Names are processed in the order of
	ORGANISM_NAMES_LIST.
	"""
	ncbi = NCBITaxa()
	nameToTaxIdList = ncbi.get_name_translator(ORGANISM_NAMES_LIST)
	with open(OUTPUT_FILE, "w") as outputFile:
		for name in ORGANISM_NAMES_LIST:
			# A name missing from the translator result (presumably one
			# the taxonomy could not resolve -- verify against the
			# NCBITaxa API) is skipped instead of raising KeyError and
			# leaving a half-written file behind.
			for eachId in nameToTaxIdList.get(name, ()):
				lineage = ncbi.get_lineage(str(eachId))
				names = ncbi.get_taxid_translator(lineage)
				outputFile.write("\t".join([names[taxid] for taxid in lineage]) + "\n")
Example #10
0
    def annotate_ncbi_taxa(self, taxid_attr="species", tax2name=None, tax2track=None, tax2rank=None, dbfile=None):
        """Annotate every descendant node with NCBI taxonomy information.

        The annotation itself is performed by ``NCBITaxa.annotate_tree``;
        this method only opens the taxonomy database (``dbfile``) and
        forwards its arguments, returning whatever ``annotate_tree`` returns.

        Leaf nodes are expected to contain a feature (``species`` by default,
        configurable through ``taxid_attr``) encoding a valid taxid number.

        All descendant nodes (including internal nodes) are annotated with the
        following new features:

        `Node.spname`: scientific species name as encoded in the NCBI taxonomy
        database

        `Node.named_lineage`: the NCBI lineage track using scientific names

        `Node.taxid`: NCBI taxid number

        `Node.lineage`: same as named_lineage but using taxid codes.

        Note that for internal nodes, NCBI information will refer to the first
        common lineage of the grouped species.

        :param taxid_attr: the name of the feature used to read the taxid
            number associated to each node.

        :param None tax2name: A dictionary where keys are taxid numbers and
            values are their NCBI scientific names. Optional; it avoids
            database queries when annotating many trees containing the same
            set of taxids.

        :param None tax2track: A dictionary where keys are taxid numbers and
            values are their NCBI lineage tracks (taxids). Optional, same
            purpose as ``tax2name``.

        :param None tax2rank: A dictionary where keys are taxid numbers and
            values are their NCBI rank names. Optional, same purpose as
            ``tax2name``.

        :param None dbfile: If provided, the file is used as a local copy of
            the NCBI taxonomy database.

        :returns: tax2name (a dictionary translating taxid numbers into
            scientific name), tax2lineage (a dictionary translating taxid
            numbers into their corresponding NCBI lineage track) and tax2rank
            (a dictionary translating taxid numbers into rank names).

        """
        taxa_db = NCBITaxa(dbfile=dbfile)
        annotations = taxa_db.annotate_tree(
            self,
            taxid_attr=taxid_attr,
            tax2name=tax2name,
            tax2track=tax2track,
            tax2rank=tax2rank,
        )
        return annotations
Example #11
0
    def ncbi_compare(self, autodetect_duplications=True, cached_content=None):
        """Run ``NCBITaxa.get_broken_branches`` on this tree.

        When the entries cached for this node contain repeated species, the
        tree is first split with ``get_speciation_trees`` and every resulting
        tree is checked; otherwise the tree itself is checked. The results of
        ``get_broken_branches`` are discarded and nothing is returned.

        :param autodetect_duplications: forwarded to ``get_speciation_trees``.
        :param cached_content: precomputed node content; when falsy it is
            obtained from ``self.get_cached_content()``.
        """
        if not cached_content:
            cached_content = self.get_cached_content()
        members = cached_content[self]
        cached_species = {n.species for n in members}

        if len(cached_species) == len(members):
            # Every entry has a distinct species: nothing to split.
            target_trees = [self]
        else:
            print(cached_species)
            _, _, target_trees = self.get_speciation_trees(
                autodetect_duplications=autodetect_duplications,
                map_features=["taxid"])

        taxa_db = NCBITaxa()
        for tree in target_trees:
            taxa_db.get_broken_branches(tree, cached_content)
Example #12
0
    def ncbi_compare(self, autodetect_duplications=True, cached_content=None):
        """Check this tree (or its speciation trees) for broken NCBI branches.

        The species found among the entries cached for this node are
        collected. If some species is repeated, the species set is printed
        and the tree is decomposed with ``get_speciation_trees``; each target
        tree is then handed to ``NCBITaxa.get_broken_branches``. The method
        itself returns ``None``.

        :param autodetect_duplications: passed on to ``get_speciation_trees``.
        :param cached_content: node content cache; computed with
            ``get_cached_content`` when not supplied (or falsy).
        """
        cached_content = cached_content or self.get_cached_content()
        own_content = cached_content[self]
        cached_species = set(node.species for node in own_content)

        has_repeated_species = len(cached_species) != len(own_content)
        if has_repeated_species:
            print(cached_species)
            speciation = self.get_speciation_trees(
                autodetect_duplications=autodetect_duplications, map_features=["taxid"]
            )
            ntrees, ndups, target_trees = speciation
        else:
            target_trees = [self]

        checker = NCBITaxa()
        for candidate in target_trees:
            checker.get_broken_branches(candidate, cached_content)
Example #13
0
  def test_ncbiquery(self):
    """Exercise name/taxid translation and descendant queries of NCBITaxa.

    The species-name literals had been mangled to ``H**o sapiens`` /
    ``h**o sapiens`` and could never match the database; they are restored
    to the real name of taxid 9606.
    """
    ncbi = NCBITaxa(dbfile=DATABASE_PATH)

    # Taxids are submitted as strings but the result is keyed by int.
    id2name = ncbi.get_taxid_translator(['9606', '7507'])
    self.assertEqual(id2name[7507], 'Mantis religiosa')
    self.assertEqual(id2name[9606], 'Homo sapiens')

    # Each name maps to a LIST of taxids; the lower-case query is expected
    # to come back under the key exactly as submitted.
    name2id = ncbi.get_name_translator(['Mantis religiosa', 'homo sapiens'])
    self.assertEqual(name2id['Mantis religiosa'], [7507])
    self.assertEqual(name2id['homo sapiens'], [9606])

    # A name may resolve to several taxids; their order is not asserted.
    name2id = ncbi.get_name_translator(['Bacteria'])
    self.assertEqual(set(name2id['Bacteria']), set([2, 629395]))

    # Descendants of 9605, including intermediate nodes.
    out = ncbi.get_descendant_taxa("9605", intermediate_nodes=True)
    self.assertEqual(set(out), set([1425170, 741158, 63221, 9606]))

    # Descendants of 9605 without intermediate nodes.
    out = ncbi.get_descendant_taxa("9605", intermediate_nodes=False)
    self.assertEqual(set(out), set([1425170, 741158, 63221]))

    # Descendants of 9605 with the traversal limited at species rank.
    out = ncbi.get_descendant_taxa("9605", intermediate_nodes=False, rank_limit="species")
    self.assertEqual(set(out), set([9606, 1425170]))
Example #14
0
def bio_tree(names):
    """Return an ASCII drawing of the NCBI topology spanning ``names``.

    Every name is searched in the Entrez ``taxonomy`` database; all taxids
    found are collected and handed to ``NCBITaxa.get_topology``.

    :param names: iterable of search terms (organism names).
    :returns: the string produced by ``get_ascii`` with the ``sci_name``
        attribute shown for every node.
    """
    from Bio import Entrez
    from ete2 import NCBITaxa

    # NOTE(review): this looks like a redacted placeholder -- set a real
    # contact address before querying NCBI.
    Entrez.email = '*****@*****.**'
    ncbi = NCBITaxa()
    ids = []
    for name in names:
        handle = Entrez.esearch(db='taxonomy', term=name)
        try:
            # Let Biopython parse the XML reply instead of scraping <Id>
            # lines by hand (str.strip removes a character set, not tags).
            record = Entrez.read(handle)
        finally:
            handle.close()
        ids.extend(int(taxid) for taxid in record['IdList'])
    scientific_tree = ncbi.get_topology(ids)
    return scientific_tree.get_ascii(attributes=['sci_name'])
def get_desired_ranks(taxid, desired_ranks):
    """ Look up the lineage members of a taxid at the requested ranks.

        Args:
            taxid: Taxon ID whose lineage is queried.
            desired_ranks: Rank names for which the lineage taxid is wanted.

        Returns:
            List with one entry per requested rank, in the same order: the
            lineage taxid carrying that rank, or the string '0' when the
            lineage has no member of that rank.

    """
    taxa_db = NCBITaxa()
    lineage_taxids = taxa_db.get_lineage(taxid)
    taxid_to_name = taxa_db.get_taxid_translator(lineage_taxids)
    taxid_to_rank = taxa_db.get_rank(taxid_to_name)

    # Invert taxid -> rank; for a rank shared by several lineage members the
    # last one iterated is kept.
    rank_to_taxid = {}
    for member, member_rank in taxid_to_rank.items():
        rank_to_taxid[member_rank] = member

    found = []
    for wanted in desired_ranks:
        found.append(rank_to_taxid.get(wanted, '0'))
    return found
def parseVSearchOutputAgainstNCBI(vsearch_out, database, output_file,
                                  min_coverage, min_similarity):
    """Resolves vsearch matches in a vsearch output file to NCBI taxonomy.

    Reads a whitespace-separated vsearch output file, keeps the lines that
    pass the thresholds, looks up the taxid of each hit in the ``gi_taxid``
    table of ``database`` and writes the line, extended with a
    ``phylum:class:order:family:genus:species`` string, to ``output_file``.

    :param vsearch_out: Path of the vsearch output file. Column 1 is used as
        the GI to look up, columns 2 and 4 are compared with the thresholds.
    :param database: Path of the SQLite database holding the ``gi_taxid``
        table.
    :param output_file: Where to write the annotated lines.
    :param min_coverage: Threshold applied to column 4 (converted to float).
    :param min_similarity: Threshold applied to column 2 (converted to float).
    """
    min_simm = float(min_similarity)
    min_coverage = float(min_coverage)
    ncbi = NCBITaxa()
    conn = sqlite3.connect(database)

    # Bound parameter: text read from the vsearch file never becomes SQL.
    query = "select taxid from gi_taxid where gi=?"

    def getTaxFromId(taxId,
                     taxonomy=("species", "genus", 'family', 'order', 'class',
                               'phylum')):
        """Return the names at the ``taxonomy`` ranks joined by ':',
        highest rank first; ranks absent from the lineage stay empty."""
        myTaxonomy = dict((rank, "") for rank in taxonomy)
        lineage = ncbi.get_lineage(int(taxId))
        # One query for the whole lineage instead of two per lineage member.
        ranks = ncbi.get_rank(lineage)
        names = ncbi.get_taxid_translator(lineage)
        for lin in lineage:
            rank = ranks.get(lin)
            if rank in myTaxonomy:
                myTaxonomy[rank] = names.get(lin, "")

        return ":".join(myTaxonomy[x] for x in reversed(taxonomy))

    try:
        c = conn.cursor()
        with open(output_file, 'w') as out, open(vsearch_out, 'r') as hits:
            for line in hits:
                data = line.split()
                if not data:
                    # Blank line (e.g. trailing newline at end of file).
                    continue

                # NOTE(review): a line is kept when EITHER threshold is
                # exceeded; "minimum" thresholds suggest both may have been
                # intended -- confirm before changing.
                if float(data[4]) > min_coverage or float(data[2]) > min_simm:
                    # int() keeps the numeric comparison the interpolated
                    # literal used to give.
                    hit = c.execute(query, (int(data[1]),)).fetchone()
                    if hit:
                        taxonomy = getTaxFromId(hit[0])
                        data.append(taxonomy)
                        printVerbose("\t".join(data))
                        out.write("\t".join(data))
                        out.write("\n")
                    else:
                        printErrorMissingID(out, data[1])
    finally:
        conn.close()
Example #17
0
    def __init__(self, data_path, workbench=None, genomes=None, taxDb=None):
        """Set up the data directory layout and the taxonomy handle.

        :param data_path: root directory of the data; a ``metadata``
            sub-directory is created inside it when missing.
        :param workbench: stored as-is on the instance.
        :param genomes: initial list of genomes; a new empty list is created
            for each instance when omitted.
        :param taxDb: taxonomy database object to use; when falsy a new
            ``NCBITaxa()`` is created.
        """
        self.data_path = data_path
        self.workbench = workbench
        self.metadata_path = pjoin(self.data_path, "metadata")
        if not os.path.exists(self.metadata_path):
            os.makedirs(self.metadata_path)

        self.metadata_file = pjoin(self.metadata_path, "metadata.csv")

        # Only open the default taxonomy database when none was supplied.
        self.taxDb = taxDb if taxDb else NCBITaxa()

        # A mutable default argument would be shared by every instance
        # created without ``genomes``; build a fresh list instead.
        self.genomes = [] if genomes is None else genomes
Example #18
0
def main():
    """Write one tab-separated line per CAZy record to the output file.

    Each line holds the FASTA header, the family (second ``|``-separated
    header field), the gene length looked up by family (1000 when unknown)
    and the dot-joined taxon path of the protein accession (first header
    field), or ``unclassified`` when a ``KeyError`` occurs while resolving it.
    """
    args = parser.parse_args()
    cazy_path, taxid_map_path = args.cazy_fp, args.p2taxid_fp
    out_path, lengths_path = args.output_fp, args.len_fp

    taxa_db = NCBITaxa()

    # Protein accession -> taxid map, kept in memory.
    taxid_by_protein = read_NCBI_prot_to_taxid_gz(taxid_map_path)
    # Family -> length table.
    length_by_family = read_hmm_len_fp(lengths_path)
    # (header, sequence) pairs of the CAZy FASTA file.
    records = fasta_iter(cazy_path)

    row_template = '{0}\t{1}\t{2}\t{3}\n'
    with open(out_path, 'w') as sink:
        for header, seq in records:
            header_fields = header.split('|')
            acc = header_fields[0]
            fam = header_fields[1]

            try:
                gene_length = length_by_family[fam]
            except KeyError:
                gene_length = 1000

            # A KeyError from either the accession lookup or the taxon-path
            # helper marks the record as unclassified.
            try:
                taxon_path = get_taxon_path(taxid_by_protein[acc], taxa_db)
                taxonomy = '.'.join(taxon_path)
            except KeyError:
                taxonomy = 'unclassified'

            sink.write(row_template.format(header, fam, gene_length, taxonomy))
Example #19
0
def blast2summary_dict(db, blastpath):  # (Path, Path) -> list[dict]
    """Reading in a blast output file, lookup all seqids to get taxids with a single blastdbcmd.
  Then, lookup the taxonomy using ETE2 via the taxid, and add that info to the blast info.

    :param db: database handed to ``get_taxid`` together with the seqids.
    :param blastpath: path of the tab-separated blast output file.
    :returns: the result of ``dictmap`` over a taxid -> row mapping, each row
        merged with its taxonomy. Rows are keyed by taxid, so when several
        rows share a taxid only the last one is kept.
    """
    fieldnames = [
        'qseqid', 'sseqid', 'pid', 'alnlen', 'gapopen',
        'qstart', 'qend', 'sstart', 'send', 'evalue',
        'bitscore'
    ]
    # Read everything up front so the file handle can be closed right away.
    with open(blastpath) as blast_file:
        rows = list(csv.DictReader(blast_file,
                                   delimiter='\t',
                                   fieldnames=fieldnames))
    # Materialised because the seqids are traversed twice below.
    seqids = list(map(get('sseqid'), rows))
    taxids = get_taxid(db, seqids)
    # The GI is taken to be the second '|'-separated field of the seqid.
    gis = (s.split('|')[1] for s in seqids)
    matches = dict(
        (taxids[gi], row) for gi, row in zip(gis, rows) if gi in taxids)
    ncbi = NCBITaxa(
    )  # downloads database and creates SQLite database if needed
    return dictmap(lambda tid, row: merge(row, taxonomy(ncbi, tid)), matches)
Example #20
0
def check_taxa_db_age(dbLocation):
    """Refresh the NCBI taxonomy database when it is missing or stale.

    :param dbLocation: path of the taxonomy database file whose timestamp
        is inspected.
    :returns: a status message telling whether the database was up to date,
        updated because it was older than 30 days, or downloaded because the
        file could not be found.
    """
    ncbi = NCBITaxa()

    try:
        # NOTE(review): getctime is the metadata-change time on Unix, not
        # the creation time -- confirm this is the intended notion of "age".
        filetime = datetime.fromtimestamp(path.getctime(dbLocation))
    except OSError:
        # Only a missing/unreadable file lands here; a failure of the update
        # itself is no longer swallowed and retried under a misleading
        # "didn't exist" message.
        ncbi.update_taxonomy_database()
        return "<> NCBITaxa Database didn't exist, downloaded it <>"

    one_month_ago = datetime.now() - timedelta(days=30)
    if filetime < one_month_ago:
        # File older than 1 month, update it:
        logInfo = '<> NCBITaxa Database older than 1 month, updating it <>'
        ncbi.update_taxonomy_database()
    else:
        logInfo = '<> NCBITaxa Database up to date <>'

    return logInfo
Example #21
0
def blast2summary_dict(db, blastpath, ete2_db):
    """Reading in a blast output file, lookup all seqids to get taxids with a single blastdbcmd.
  Then, lookup the taxonomy using ETE2 via the taxid, and add that info to the blast info.

    :param db: database handed to ``get_taxid`` together with the seqids.
    :param blastpath: path of the tab-separated blast output file; columns
        are named after ``blast_columns``.
    :param ete2_db: first positional argument of ``NCBITaxa`` (presumably
        the taxonomy database file -- TODO confirm).
    :returns: a lazy iterator of dicts, one per row whose GI has a known
        taxid, restricted to the keys contained in ``csv_fields``.
    :raises ValueError: when a seqid has no '|'-separated GI field.
    """
    # Read everything up front so the file handle can be closed right away.
    with open(blastpath) as blast_file:
        rows = list(csv.DictReader(blast_file,
                                   delimiter='\t',
                                   fieldnames=blast_columns))
    # Materialised because the seqids are traversed twice below.
    seqids = list(map(get('sseqid'), rows))
    taxids = get_taxid(db, seqids)

    def get_gi(s):
        fields = s.split('|')
        if len(fields) > 1:
            return fields[1]
        else:
            raise ValueError("Seq ID %s is missing GI fields and '|'" % s)

    gis = imap(get_gi, seqids)
    ncbi = NCBITaxa(
        ete2_db)  # downloads database and creates SQLite database if needed
    # One dict per row (not keyed by taxid), so rows sharing a taxid are all
    # kept; rows whose GI has no taxid are dropped.
    matches = [
        assoc(row, 'taxid', taxids[gi]) for gi, row in zip(gis, rows)
        if gi in taxids
    ]
    items = [merge(row1, taxonomy(ncbi, row1['taxid'])) for row1 in matches]
    res = imap(partial(keyfilter, csv_fields.__contains__), items)
    return res
	#ete2 module
	ete2_error = '''
	ERROR: This program requires the Python module ete2.
	Please install it and try running the program again.
	'''
	try:
		from ete2 import NCBITaxa
	except:
		print ete2_error
		sys.exit(0)
	from rpy2.robjects.packages import importr

	print 'Loading NCBI taxonomic data...'
	print
	ncbi= NCBITaxa()


	#------------------------------------------------
	# Checking/installing/loading ontoCAT (BioconductoR) and PSI-MI
	#------------------------------------------------

	# ontoCAT installed here as variable for later use
	try:
		ontoCAT = importr('ontoCAT')
	except RRuntimeError:
		print '''
        This program requires ontoCAT (BioconductoR package) to run.

        Currently installing ontoCat to your machine...
Example #23
0
def run(args):
    """Resolve the search terms to taxids and dump a taxonomy tree or a table.

    Integer search terms are used as taxids, everything else is translated
    as a name (with optional fuzzy matching). With ``args.taxonomy`` the
    topology spanning all taxids is annotated and dumped; otherwise, with
    ``args.info``, one tab-separated line per taxid is printed. When neither
    flag is set, ``args.taxonomy`` is switched on.

    :param args: parsed command-line options; the attributes read are
        ``taxonomy``, ``info``, ``search``, ``fuzzy``, ``full_lineage``,
        ``rank_limit`` and ``collapse_subspecies``.
    """
    from ete2 import NCBITaxa

    if not args.taxonomy and not args.info:
        args.taxonomy = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    # Filled by the fuzzy search below; they were referenced without ever
    # being created, which raised NameError on the first fuzzy match.
    name2realname = {}
    name2score = {}

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    for hit in name2tax.values():
        # Depending on the ete release a name maps to a single taxid or to
        # a list of taxids (TODO confirm); a list cannot be a dict key.
        if isinstance(hit, (list, tuple, set)):
            for taxid in hit:
                all_taxids[taxid] = None
        else:
            all_taxids[hit] = None

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(
                name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim

    if args.taxonomy:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        t = ncbi.get_topology(list(all_taxids),
                              intermediate_nodes=args.full_lineage,
                              rank_limit=args.rank_limit,
                              collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Keep the raw taxid as a feature before the node name is
            # rewritten to "<scientific name> - <taxid>".
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t,
             features=[
                 "taxid", "name", "rank", "bgcolor", "sci_name",
                 "collapse_subspecies", "named_lineage"
             ])
    elif args.info:
        # Single-argument print(...) behaves the same as a statement and as
        # a function call.
        print('# ' + '\t'.join(
            ["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([
                str(taxid), name,
                ranks.get(taxid, ''), named_lineage, lineage_string
            ]))
import sys
import os
import urllib2
import gzip
import biom
import shutil
from numpy import random as np_rand
from ete2 import NCBITaxa
from scripts.loggingwrapper import LoggingWrapper as logger
try:
    from configparser import ConfigParser
except ImportError:
    from ConfigParser import ConfigParser

# Module-level taxonomy handle, created once when the module is imported.
ncbi = NCBITaxa()
# Rank names listed from the most specific (species) up to superkingdom.
RANKS = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
# NOTE(review): presumably the highest rank considered by the code using
# RANKS -- its consumers are not visible here, confirm.
MAX_RANK = 'family'
# Starts unset; presumably assigned a logger elsewhere -- confirm.
_log = None

"""
Reads a BIOM file and creates map of OTU: lineage, abundance
BIOM file format needs to have a taxonomy field in metadata which contains the taxonomy in the format:
RANK__SCINAME; LOWERRANK_LOWERSCINAME
"""
def read_taxonomic_profile(biom_profile, config, no_samples = None):
    table = biom.load_table(biom_profile)
    ids = table.ids(axis="observation")
    samples = table.ids()

    if no_samples is None:
        no_samples = len(samples)
Example #25
0
#This function is DEPRECATED
def check_tax_id_clade(clade_top_tax_id,check_tax_id):
    """Checks if given tax_id is inside a clade formed by taxa described by its top taxid

    The first two '|'-separated columns of the NCBI nodes dump are loaded as
    a directed graph and the depth-first tree rooted at ``clade_top_tax_id``
    is searched for ``check_tax_id``.

    :param clade_top_tax_id: taxid at the top of the clade.
    :param check_tax_id: taxid to look for inside that clade.
    :returns: True when ``check_tax_id`` is a node of the clade.
    """
    nodes=pd.read_table(PATH_to_NCBI_nodes_dmp,sep='|',usecols=[0,1],header=None)
    taxonomy=nx.DiGraph()
    #Load all taxonomy as graph. Edges run from column 1 to column 0
    #(presumably parent -> child, as in nodes.dmp -- confirm). Columns are
    #selected by label because DataFrame.ix no longer exists in pandas.
    taxonomy.add_edges_from(zip(nodes[1],nodes[0]))
    clade=nx.dfs_tree(taxonomy,clade_top_tax_id)
    if(not clade.nodes()):
        #An empty tree still has to contain its own top taxid
        clade.add_node(clade_top_tax_id)
    return(clade.has_node(check_tax_id))


ncbi = NCBITaxa()


def subsample_taxids(taxids,rank='species'):
    """
    For a given set of taxids leaves only one representative per selected rank
    Eg. for a set of subspecies - leave only species.
    """
    rank_dict={'superkingdom':0,'kingdom':1,'phylum':2,'class':3,'superorder':4,'order':5,'suborder':6,'infraorder':7,'parvorder':8,'superfamily':9,'family':10,'subfamily':11,'genus':12,'subgenus':13,'species':14,'subspecies':15}

    tree = ncbi.get_topology(taxids,intermediate_nodes=True)
    #We have now a phylogenetic tree with all annotations for our taxids.
    subsampled_taxids=set()
    #We are iterating through the taxids and for every we are determining only one representative from this group.
    #These representatives will be the same for the taxids in one group - and hence subsampling will happen.
    for t in taxids:
Example #26
0
import csv
from os import listdir
from os.path import isfile, join
from collections import Counter
from collections import defaultdict
from ete2 import NCBITaxa


# Script-level state. Note that the two raw_input() calls below prompt the
# user as soon as this module is executed.
d = defaultdict(list)
taxids = []
mypath = raw_input("Enter path to csv files (i.e. /home/user/csv/files/): ")
# Regular files directly inside mypath (sub-directories are not descended).
allfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# Substring match: any file name CONTAINING '.csv' is kept, not only names
# that end with it.
onlyfiles = [s for s in allfiles if '.csv' in s]
filecount = len(onlyfiles)
# The answer is kept as a string; rank_dict below is keyed by "1".."6".
level = raw_input("1: phylum ----> 2: class ----> 3: order ----> 4: family ----> 5: genus ----> 6: species\nEnter rank number: ")
ncbi = NCBITaxa()
# Default database location: $HOME/.etetoolkit/taxa.sqlite ('/' is used when
# HOME is not set).
# NOTE(review): `os` itself is not imported in the lines visible here --
# confirm it is imported elsewhere.
DEFAULT_TAXADB = os.path.join(os.environ.get('HOME', '/'), '.etetoolkit', 'taxa.sqlite')
DB_VERSION = 2
# Menu choice (as typed by the user) -> rank name.
rank_dict = {"1": "phylum", "2": "class", "3": "order", "4": "family", "5": "genus", "6": "species"}


def is_taxadb_up_to_date(dbfile=DEFAULT_TAXADB):
    """
    Check if a valid and up-to-date taxa.sqlite database exists
    If dbfile is not specified, DEFAULT_TAXADB is assumed
    :param dbfile:
    :return:
    """
    db = sqlite3.connect(dbfile)
    try:
        r = db.execute('SELECT version FROM stats;')
Example #27
0
from ete2 import NCBITaxa
from ete2 import Tree, TreeStyle, AttrFace

ncbi = NCBITaxa()

# Read one query name per line; the context manager closes the file, which
# the bare open() call never did.
# NOTE: `input` shadows the builtin of the same name; the name is kept so
# anything else referring to this module-level variable keeps working.
with open("db/example_input", "r") as name_file:
    input = [l.rstrip("\n") for l in name_file]

# Translate the names to taxids and build the topology spanning them.
taxid = ncbi.get_name_translator(input)
tree  = ncbi.get_topology(taxid.values())

#print tree.get_ascii(attributes=["sci_name", "rank", "taxid"])

# custom layout: adds "rank" on top of branches, and sci_name as tip names
def my_layout(node):
    """Layout function: rank label above branches, scientific name at tips.

    A node with a truthy ``rank`` attribute gets a small "rank" face on top
    of its branch; a leaf additionally gets its "sci_name" drawn to the right
    of the branch. Nothing is returned.
    """
    has_rank = bool(getattr(node, "rank", None))
    if has_rank:
        node.add_face(AttrFace("rank", fsize=7, fgcolor="indianred"),
                      column=0, position="branch-top")
    if node.is_leaf():
        node.add_face(AttrFace("sci_name", fsize=9, fgcolor="steelblue"),
                      column=0, position="branch-right")

# Render the tree with the custom layout; the default leaf names are
# switched off because my_layout draws "sci_name" at the tips itself.
ts = TreeStyle()
ts.layout_fn = my_layout
ts.show_leaf_name = False

# Writes the figure to tree.pdf in the current working directory.
tree.render("tree.pdf", tree_style=ts)
import numpy as np
import pandas as pd
import cPickle as pickle
from ete2 import NCBITaxa
from pprint import pprint
import os.path
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC


sys.path.append('/Volumes/MDBD/Dropbox/work/MYSOFT/ALIGNMENT_TOOLS/')
# Entrez.email = "*****@*****.**" 
from hist_ss import get_core_lendiff
rank_dict={'superkingdom':0,'kingdom':1,'phylum':2,'class':3,'superorder':4,'order':5,'suborder':6,'infraorder':7,'parvorder':8,'superfamily':9,'family':10,'subfamily':11,'genus':12,'subgenus':13,'species':14,'subspecies':15}

ncbi = NCBITaxa()

def check_hist_length(seq,hist_type,hist_var=None,dev_percent=10):
    """
    This simple check compares the length of sequence provided
    to a range of curated sequences in histone DB +- dev_percent %.
    """
    if(os.path.isfile('int_data/cur_length.csv')):
        cur_df=pd.read_csv('int_data/cur_length.csv')
    else:
        hist_df=pd.read_csv('inp_data/seqs.csv') #Histone types info
        fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences
        #construct df with length
        cur_df=hist_df[(hist_df['curated']==True)]
        cur_df['length']=cur_df['gi'].map(lambda x: len(fasta_dict[str(x)].seq))
        # print cur_df.groupby(['hist_type','hist_var']).agg([np.max,np.min])
Example #29
0
def taxo_seq_architecture(seqreclist=[],
                          outfile='taxo_arch.svg',
                          taxids=[],
                          annotation='',
                          title='',
                          width=2000):
    """
    Draw sequence feature architectures next to an NCBI taxonomy tree.

    seqreclist - list of sequence records; each record is expected to carry
    its features in Biopython SeqFeature format.

    Features of type "domain" are drawn as labelled boxes. Features of type
    "motif" are drawn as "seq" segments; a motif that overlaps "seq" segments
    added earlier for the same record is split into the pieces lying outside
    those segments.

    taxids - taxids in the same order as the records in seqreclist; when
    empty, they are obtained by applying get_taxid_from_gbrec to every record.

    outfile, title and width are handed to the tree renderer. annotation is
    only consulted for a leaf named 'annotation', which this function does
    not add to the tree (the annotation row is currently disabled).
    """
    palette = [
        'red', 'green', 'yellow', 'lightblue', 'cyan', 'magenta', 'orange',
        'pink', 'lightgreen'
    ]

    def get_color(key):
        # Pick a palette entry from the hash of the feature name.
        return palette[hash(key) % 9]

    if len(taxids) == 0:
        taxids = [get_taxid_from_gbrec(rec) for rec in seqreclist]

    ncbi = NCBITaxa()
    taxids = [int(tid) for tid in taxids]

    tree = ncbi.get_topology(taxids, intermediate_nodes=False)
    tree.sort_descendants(attr='sci_name')
    style = TreeStyle()

    def layout(node):
        rank = getattr(node, "rank", None)
        if rank and rank in ('order', 'class', 'phylum', 'kingdom'):
            node.add_face(AttrFace("sci_name", fsize=7, fgcolor="indianred"),
                          column=0,
                          position="branch-top")

        if not node.is_leaf():
            return
        node.add_face(AttrFace("sci_name", fsize=9, fgcolor="steelblue"),
                      column=0,
                      position="branch-right")

        if node.name == 'annotation':
            # The annotation row is currently disabled: the text is computed
            # but no face is attached for it.
            if annotation:
                text = annotation
            else:
                text = ' ' * max(len(rec.seq) for rec in seqreclist)
            return

        # Leaf names are taxids; the record at the same position in
        # seqreclist belongs to this leaf.
        record = seqreclist[taxids.index(int(node.name))]
        seq = str(record.seq)
        motifs = []
        for feat in record.features:
            if feat.type == 'domain':
                motifs.append([
                    feat.location.start, feat.location.end, "[]", None, 10,
                    "blue",
                    get_color(feat.qualifiers['name']),
                    "arial|8|black|%s" % feat.qualifiers['name']
                ])
            elif feat.type == 'motif':
                start = feat.location.start
                end = feat.location.end
                # "seq" segments placed earlier that intersect this motif;
                # earlier segments always stay on top.
                blockers = [
                    m for m in motifs
                    if m[2] == 'seq' and m[0] < end and m[1] > start
                ]
                if not blockers:
                    motifs.append([
                        feat.location.start, feat.location.end, "seq", 10,
                        10, "black",
                        get_color(feat.qualifiers['name']), None
                    ])
                    continue

                # Scan the motif position by position and emit one segment
                # for every stretch not covered by a blocker.
                seg_open = False
                seg_closed = False
                for pos in range(start, end + 1):
                    if not seg_open:
                        # A segment may start only outside every blocker.
                        covered = any(m[0] <= pos < m[1] for m in blockers)
                        if not covered:
                            seg_start = pos
                            seg_open = True

                    if seg_open and not seg_closed:
                        # A segment ends where a blocker begins or where the
                        # motif itself ends.
                        hits_blocker = any(pos == m[0] for m in blockers)
                        if hits_blocker or pos == end:
                            seg_end = pos
                            seg_closed = True

                    if seg_open and seg_closed:
                        motifs.append([
                            seg_start, seg_end, "seq", 10, 10, "black",
                            get_color(feat.qualifiers['name']), None
                        ])
                        seg_open = False
                        seg_closed = False

        seq_face = SeqMotifFace(seq,
                                motifs,
                                scale_factor=1,
                                seq_format="[]")
        seq_face.overlaping_motif_opacity = 1.0
        add_face_to_node(seq_face, node, 0, position="aligned")
        add_face_to_node(TextFace(' ' + record.id + '         '),
                         node,
                         column=1,
                         position="aligned")

    style.layout_fn = layout
    style.show_leaf_name = False
    style.title.add_face(TextFace(title, fsize=20), column=0)
    tree.render(outfile, w=width, dpi=300, tree_style=style)
Example #30
0
def taxo_msa(outfile='taxo_msa.svg',
             taxids=[],
             annotation='',
             msa=[],
             title='',
             width=2000):
    """
    Draw a multiple sequence alignment next to an NCBI taxonomy tree.

    taxids - taxids in the same order as the sequences in msa.

    An extra leaf named 'annotation' is added to the tree. It shows the
    annotation string when one is given, otherwise a blank string as long as
    the first sequence of msa, and is labelled 'SEQ_ID'.

    outfile, title and width are handed to the tree renderer.
    """
    ncbi = NCBITaxa()
    taxids = [int(tid) for tid in taxids]

    tree = ncbi.get_topology(taxids, intermediate_nodes=False)
    extra_leaf = tree.add_child(name='annotation')
    extra_leaf.add_feature('sci_name', 'annotation')
    tree.sort_descendants(attr='sci_name')
    style = TreeStyle()

    def attach_sequence(node, text):
        # Show the whole string as a single "seq" motif in the aligned column.
        face = SeqMotifFace(
            text, [[0, len(text), "seq", 10, 10, None, None, None]],
            scale_factor=1)
        add_face_to_node(face, node, 0, position="aligned")

    def layout(node):
        rank = getattr(node, "rank", None)
        if rank and rank in ('order', 'class', 'phylum', 'kingdom'):
            node.add_face(AttrFace("sci_name", fsize=7, fgcolor="indianred"),
                          column=0,
                          position="branch-top")

        if not node.is_leaf():
            return
        node.add_face(AttrFace("sci_name", fsize=9, fgcolor="steelblue"),
                      column=0,
                      position="branch-right")

        if node.name != 'annotation':
            # Leaf names are taxids; the sequence at the same position in
            # msa belongs to this leaf.
            record = msa[taxids.index(int(node.name))]
            attach_sequence(node, str(record.seq))
            add_face_to_node(TextFace(' ' + record.id),
                             node,
                             column=1,
                             position="aligned")
        else:
            if annotation:
                text = annotation
            else:
                text = ' ' * len(msa[0].seq)
            attach_sequence(node, text)
            add_face_to_node(TextFace(' ' + 'SEQ_ID'),
                             node,
                             column=1,
                             position="aligned")

    style.layout_fn = layout
    style.show_leaf_name = False
    style.title.add_face(TextFace(title, fsize=20), column=0)
    tree.render(outfile, w=width, dpi=300, tree_style=style)
Example #31
0
def run(args):
    """Look up the terms in ``args.search`` in the NCBI taxonomy and report them.

    Every search term that parses as an integer is used as a taxid. Any other
    term is treated as a name and translated with
    ``NCBITaxa.get_name_translator``; when ``args.fuzzy`` is set, names
    without an exact match are retried with ``get_fuzzy_name_translation``.

    Exactly one report is produced, chosen in this order:

    * ``args.tree`` (forced on when no mode is selected): the descendants
      tree of a single taxid, or the topology connecting several taxids, is
      annotated and handed to ``dump``.
    * ``args.descendants``: one tab-separated line per taxid listing its
      descendant taxids and names.
    * ``args.info``: one tab-separated line per translated taxid with its
      rank, named lineage and taxid lineage.

    The process exits with status -1 when ``args.search`` is empty.
    """
    from ete2 import NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    # Bookkeeping for fuzzy matches. These must exist before the fuzzy loop
    # below stores into them; without them the first fuzzy hit raised a
    # NameError.
    name2realname = {}
    name2score = {}

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    for hit in name2tax.values():
        # NOTE(review): depending on the ete2 version the translator appears
        # to map a name either to one taxid or to a list of taxids; accept
        # both so a list value is not used as a dictionary key.
        if isinstance(hit, (list, tuple, set)):
            for tid in hit:
                all_taxids[tid] = None
        else:
            all_taxids[hit] = None

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" %sim

    # Only report names that are still unresolved after fuzzy matching.
    still_missing = [name for name in not_found_names if name not in name2tax]
    if still_missing:
        log.warn("[%s] could not be translated into taxids!" %','.join(still_missing))

    if args.tree:
        if len(all_taxids) == 1:
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid,
                                         collapse_subspecies=args.collapse_subspecies,
                                         rank_limit=args.rank_limit,
                                         return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Node names hold the taxid at this point; keep it as a feature
            # before the name is replaced by a readable label.
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid,
                                                   collapse_subspecies=args.collapse_subspecies,
                                                   rank_limit=args.rank_limit)
            # Fall back to the taxid as text: join() only accepts strings.
            print('\t'.join([str(taxid), translator.get(taxid, str(taxid)), ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))

    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)

        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''), named_lineage, lineage_string]))
Example #32
0
def taxo_seq_architecture(seqreclist=[],
                          outfile='taxo_arch.svg',
                          taxids=[],
                          annotation='',
                          title='',
                          width=2000):
    """
    Draw sequence feature architectures next to an NCBI taxonomy tree.

    seqreclist - list of sequence records; each record is expected to carry
    its features in Biopython SeqFeature format.

    Features of type "domain" are drawn as labelled boxes. Features of type
    "motif" are drawn as "seq" segments; a motif that overlaps "seq" segments
    added earlier for the same record is split into the pieces lying outside
    those segments.

    taxids - taxids in the same order as the records in seqreclist; when
    empty, they are obtained by applying get_taxid_from_gbrec to every record.

    outfile, title and width are handed to the tree renderer. annotation is
    only consulted for a leaf named 'annotation', which this function does
    not add to the tree (the annotation row is currently disabled).
    """
    palette = [
        'red', 'green', 'yellow', 'lightblue', 'cyan', 'magenta', 'orange',
        'pink', 'lightgreen'
    ]

    def get_color(key):
        # Pick a palette entry from the hash of the feature name.
        return palette[hash(key) % 9]

    if len(taxids) == 0:
        taxids = [get_taxid_from_gbrec(rec) for rec in seqreclist]

    ncbi = NCBITaxa()
    taxids = [int(tid) for tid in taxids]

    tree = ncbi.get_topology(taxids, intermediate_nodes=False)
    tree.sort_descendants(attr='sci_name')
    style = TreeStyle()

    def layout(node):
        rank = getattr(node, "rank", None)
        if rank and rank in ('order', 'class', 'phylum', 'kingdom'):
            node.add_face(AttrFace("sci_name", fsize=7, fgcolor="indianred"),
                          column=0,
                          position="branch-top")

        if not node.is_leaf():
            return
        node.add_face(AttrFace("sci_name", fsize=9, fgcolor="steelblue"),
                      column=0,
                      position="branch-right")

        if node.name == 'annotation':
            # The annotation row is currently disabled: the text is computed
            # but no face is attached for it.
            if annotation:
                text = annotation
            else:
                text = ' ' * max(len(rec.seq) for rec in seqreclist)
            return

        # Leaf names are taxids; the record at the same position in
        # seqreclist belongs to this leaf.
        record = seqreclist[taxids.index(int(node.name))]
        seq = str(record.seq)
        motifs = []
        for feat in record.features:
            if feat.type == 'domain':
                motifs.append([
                    feat.location.start, feat.location.end, "[]", None, 10,
                    "blue",
                    get_color(feat.qualifiers['name']),
                    "arial|8|black|%s" % feat.qualifiers['name']
                ])
            elif feat.type == 'motif':
                start = feat.location.start
                end = feat.location.end
                # "seq" segments placed earlier that intersect this motif;
                # earlier segments always stay on top.
                blockers = [
                    m for m in motifs
                    if m[2] == 'seq' and m[0] < end and m[1] > start
                ]
                if not blockers:
                    motifs.append([
                        feat.location.start, feat.location.end, "seq", 10,
                        10, "black",
                        get_color(feat.qualifiers['name']), None
                    ])
                    continue

                # Scan the motif position by position and emit one segment
                # for every stretch not covered by a blocker.
                seg_open = False
                seg_closed = False
                for pos in range(start, end + 1):
                    if not seg_open:
                        # A segment may start only outside every blocker.
                        covered = any(m[0] <= pos < m[1] for m in blockers)
                        if not covered:
                            seg_start = pos
                            seg_open = True

                    if seg_open and not seg_closed:
                        # A segment ends where a blocker begins or where the
                        # motif itself ends.
                        hits_blocker = any(pos == m[0] for m in blockers)
                        if hits_blocker or pos == end:
                            seg_end = pos
                            seg_closed = True

                    if seg_open and seg_closed:
                        motifs.append([
                            seg_start, seg_end, "seq", 10, 10, "black",
                            get_color(feat.qualifiers['name']), None
                        ])
                        seg_open = False
                        seg_closed = False

        seq_face = SeqMotifFace(seq,
                                motifs,
                                scale_factor=1,
                                seq_format="[]")
        seq_face.overlaping_motif_opacity = 1.0
        add_face_to_node(seq_face, node, 0, position="aligned")
        add_face_to_node(TextFace(' ' + record.id + '         '),
                         node,
                         column=1,
                         position="aligned")

    style.layout_fn = layout
    style.show_leaf_name = False
    style.title.add_face(TextFace(title, fsize=20), column=0)
    tree.render(outfile, w=width, dpi=300, tree_style=style)
Example #33
0
def main():
    """Annotate a HUMAnN2 gene table with NCBI taxonomic lineages.

    Reads the table at ``args.input_fp`` through
    ``read_humann2_genetable_generator`` and, for every record carrying a
    taxon, resolves a lineage via ``clean_humann2_taxon``,
    ``get_best_ncbi_id`` and ``get_taxon_path``. Lineages are cached per
    taxon string. When no NCBI id is found the lineage stays equal to the
    rank headers, which marks the taxon as unknown.

    Two optional outputs are written:

    * ``args.tdt_out_fp``: a tab-separated table with a header line and one
      line per record that has a taxon, with the lineage appended as columns.
    * ``args.h2gt_out_fp``: a gene table in which the taxon part of each
      stratified gene is replaced by the dot-joined lineage (or 'unknown').
    """
    args = parser.parse_args()

    input_fp = args.input_fp
    tdt_out_fp = args.tdt_out_fp
    h2gt_out_fp = args.h2gt_out_fp
    rank_headers = args.rank_headers
    ranks = args.ranks

    ncbi = NCBITaxa()

    rank_headers = rank_headers.split(',')
    ranks = ranks.split(',')

    # taxon string -> resolved lineage, so each taxon is looked up only once
    tax_dict = {}

    input_f = open(input_fp)
    tdt_out_f = None
    h2gt_out_f = None
    try:
        if tdt_out_fp:
            tdt_out_f = open(tdt_out_fp, 'w')
        if h2gt_out_fp:
            h2gt_out_f = open(h2gt_out_fp, 'w')

        h2gt = read_humann2_genetable_generator(input_f)

        first_h = True
        first_t = True
        for gene, header, line, tax in h2gt:
            # Default lineage: the rank headers themselves (= unknown taxon).
            lineage = list(rank_headers)

            if tax and tax not in tax_dict:
                family, genus, species = clean_humann2_taxon(tax)
                best_id = get_best_ncbi_id(family, genus, species, ncbi)

                if best_id is not None:
                    lineage = get_taxon_path(best_id,
                                             ncbi,
                                             ranks=ranks,
                                             rank_headers=rank_headers)

                tax_dict[tax] = lineage

            if tax:
                lineage = tax_dict[tax]

            if tdt_out_f is not None:
                if first_t:
                    first_t = False
                    tdt_out_f.write('Gene Family\t{0}\t{1}\n'.format(
                        '\t'.join(header), '\t'.join(ranks)))
                # Not `elif`: the record that triggers the header must still
                # be written when it carries a taxon, as in the gene-table
                # output below.
                if tax:
                    tdt_out_f.write('{0}\t{1}\t{2}\n'.format(
                        gene, '\t'.join(line), '\t'.join(lineage)))

            if h2gt_out_f is not None:
                if first_h:
                    h2gt_out_f.write('# Gene Family\t{0}\n'.format(
                        '\t'.join(header)))
                    first_h = False
                if tax:
                    if lineage == rank_headers:
                        h2gt_out_f.write('{0}|{1}\t{2}\n'.format(
                            gene, 'unknown', '\t'.join(line)))
                    else:
                        h2gt_out_f.write('{0}|{1}\t{2}\n'.format(
                            gene, '.'.join(lineage), '\t'.join(line)))
                else:
                    h2gt_out_f.write('{0}\t{1}\n'.format(gene,
                                                         '\t'.join(line)))
    finally:
        # Close whatever was opened, also on errors. Testing the handles
        # (initialised to None) avoids the NameError the old final
        # `if h2gt_out_f:` raised when no gene-table output was requested.
        for handle in (input_f, tdt_out_f, h2gt_out_f):
            if handle is not None:
                handle.close()
Example #34
0
def get_name(taxid):
    """Return the entry the NCBI taxid translator holds for ``taxid``.

    A fresh ``NCBITaxa`` handle is created on every call. The translator
    result is indexed with ``taxid`` exactly as passed in, so a lookup error
    is raised when it has no entry under that key.
    """
    return NCBITaxa().get_taxid_translator([taxid])[taxid]
Example #35
0
from pandas import DataFrame
from Bio import SeqIO
from pandas import Index
from ete2 import NCBITaxa

# Hard-coded locations of the genome directory and the raw IMG inputs.
data_path = "/home/moritz/people/MoreData/genomes/img_od1s"
img_fasta = "/home/moritz/people/MoreData/raw_imgs/od1s.fasta"
img_xls = "/home/moritz/people/MoreData/raw_imgs/od1s.xls"
# Prefix prepended to every genome ID to form its key in `metadata` and
# `seq_dict`.
name = "parcu_from_img_"
taxDb = NCBITaxa()

# Despite its .xls extension the table is parsed as tab-separated text, with
# the first row as header and the first column as index.
contigs = DataFrame.from_csv(img_xls, sep="\t", header=0, index_col=0)
# First element of the first translator value for 'Candidatus Parcubacteria'.
# Indexing `.values()` directly relies on Python 2, where it returns a list.
manual_taxo = taxDb.get_name_translator(['Candidatus Parcubacteria'
                                         ]).values()[0][0]
# One record per distinct 'Genome ID'. Every genome gets the same
# species_taxid (manual_taxo); long_name is the 'Genome' value of the first
# row having that genome ID.
metadata = {
    name + str(g): {
        'IMG_ID': g,
        'name': name + str(g),
        'species_taxid': manual_taxo,
        'long_name': contigs.loc[contigs['Genome ID'] == g]['Genome'].iloc[0]
    }
    for g in set(contigs['Genome ID'])
}

# genome key -> list of FASTA records, initially empty for every genome
seq_dict = {k: [] for k in metadata}

# Records are assigned by position: the i-th FASTA record goes to the genome
# named in the i-th row of `contigs`.
# NOTE(review): this assumes the FASTA file and the table list the contigs in
# the same order -- confirm.
with open(img_fasta, "r") as file:
    for i, c in enumerate(SeqIO.parse(file, "fasta")):
        seq_dict[name + str(contigs.iloc[i]['Genome ID'])] += [c]
# `hsv`, `sat`, `hue` and `colors_` are defined above this excerpt.
# Third component of every entry in `hsv`.
val = [color[2] for color in hsv]

# np.lexsort treats the LAST key as the primary one: order by hue, then sat,
# then val.
ind = np.lexsort((val, sat, hue))
sorted_colors = [colors_[i] for i in ind]
colors_final = []

# Keep only the colour of each (name, colour) pair, in sorted order.
for i, (name, color) in enumerate(sorted_colors):
    colors_final.append(color)

# Shuffle in place, which discards the sorted order built above.
import random
random.shuffle(colors_final)
colors_mapping = {}

# Set up the NCBI database handle from an explicit path
# (presumably the taxonomy sqlite file to use -- confirm against NCBITaxa).
from ete2 import NCBITaxa
ncbi = NCBITaxa('/dfs/scratch0/manans/.etetoolkit/taxa.sqlite')

# Read the .nemb file: each line is split on single spaces; the first field
# is kept as an identifier and the remaining fields are parsed as floats.
EMBEDDING_FILE = 'emb/n2v-avg.nemb'
histograms = []   # one list of floats per line, parallel to species_ids
species_ids = []  # first field of every line, kept as a string

with open(EMBEDDING_FILE, 'r') as tf:
    for line in tf:
        ls = line.split(' ')
        species_ids.append(ls[0])
        v = []
        # NOTE(review): the last field still carries the newline, which
        # float() tolerates; a trailing space before it would yield a field
        # that float() rejects.
        for n in ls[1:]:
            v.append(float(n))
        histograms.append(v)
Example #37
0
from ete2 import NCBITaxa
from ete2 import Tree, TreeStyle, AttrFace

ncbi = NCBITaxa()

# Read one entry per line, stripping the trailing newline. The context manager
# closes the file as soon as the list is built instead of leaving the handle
# open for the rest of the script.
with open("db/example_input", "r") as handle:
    input = [l.rstrip("\n") for l in handle]

# Translate the entries with the NCBI name translator, then build a topology
# tree from the translator's values.
taxid = ncbi.get_name_translator(input)
tree = ncbi.get_topology(taxid.values())

#print tree.get_ascii(attributes=["sci_name", "rank", "taxid"])


# custom layout: adds "rank" on top of branches, and sci_name as tip names
def my_layout(node):
    """Attach text faces to ``node``.

    A node with a truthy ``rank`` attribute gets that attribute drawn at the
    "branch-top" position; a leaf additionally gets its ``sci_name`` drawn at
    the "branch-right" position.
    """
    if getattr(node, "rank", None):
        node.add_face(AttrFace("rank", fsize=7, fgcolor="indianred"),
                      column=0,
                      position="branch-top")

    # Only leaves carry the scientific-name label.
    if not node.is_leaf():
        return
    node.add_face(AttrFace("sci_name", fsize=9, fgcolor="steelblue"),
                  column=0,
                  position="branch-right")


# Tree style that delegates per-node decoration to my_layout and turns off the
# built-in leaf-name display.
ts = TreeStyle()
ts.layout_fn = my_layout
ts.show_leaf_name = False

# Render the tree to tree.pdf using the style configured above.
tree.render("tree.pdf", tree_style=ts)