def _split_genus_species(sci_names):
    """Return (genera, species) lists derived from scientific names.

    Square brackets are stripped before splitting.  The genus is the first
    word; the species label is the first two words (or just the genus when
    the name has a single word).
    """
    genera = []
    species = []
    for sci_name in sci_names:
        words = sci_name.replace('[', '').replace(']', '').split()
        genera.append(words[0])
        species.append(' '.join(words[:2]))
    return genera, species


def _write_distribution(worksheet, start_row, labels, header):
    """Write a label / count / fraction table and return the last row used.

    The header goes on ``start_row`` and the data rows follow directly
    below it, so the header is never overwritten.
    """
    from collections import Counter

    counts = Counter(labels)
    total = len(labels)
    for col, title in enumerate(header):
        worksheet.write(start_row, col, title)
    row = start_row
    for label, count in counts.items():
        row += 1
        worksheet.write(row, 0, label)
        worksheet.write(row, 1, count)
        worksheet.write(row, 2, count / total)
    return row


def _write_hit_sheet(workbook, sheet_name, titles, rows, n_cols):
    """Write one worksheet: the hit table followed by genus/species tables."""
    worksheet = workbook.add_worksheet(sheet_name)
    for col in range(n_cols):
        worksheet.write(0, col, titles[col])
    for row_idx, row in enumerate(rows, start=1):
        for col in range(n_cols):
            worksheet.write(row_idx, col, row[col])

    # Column 14 holds the scientific name of the hit.
    genera, species = _split_genus_species([row[14] for row in rows])
    # Same vertical spacing as before: a few empty rows between tables.
    last_row = _write_distribution(worksheet, len(rows) + 4, genera,
                                   ['Genus', 'Count', 'Percentage'])
    _write_distribution(worksheet, last_row + 4, species,
                        ['Species', 'Count', 'Percentage'])


def funGenusLocalBlast(sFastaFileName, sGBKFileName, dbName):
    """BLAST an assembly against its own genus and write an Excel report.

    The contigs of ``sFastaFileName`` are searched with megablast against
    the remote ``nt`` database, restricted (via an Entrez query) to the
    genus read from the first ORGANISM line of the GenBank file.  The hits
    are passed through ``funReadBlast`` and ``funANICalc`` and written to
    ``<fasta name without its last 6 characters>_Genus_Blast.xlsx`` with
    two sheets: ``NT_Genus`` (all rows) and ``NT_Summary_Genus`` (the top
    row per contig).  Each sheet ends with genus and species count tables.

    Parameters
    ----------
    sFastaFileName : str
        Path to the assembly; the code strips the last 6 characters, so a
        ``.fasta`` suffix is expected.
    sGBKFileName : str
        Path to the GenBank annotation, or ``"N/A"``.
        NOTE(review): with ``"N/A"`` the placeholder species ``'N/A-N/A'``
        has no second word, so the genus lookup below raises IndexError.
    dbName : str
        Unused; the search always targets ``nt``.
    """
    from Bio.Blast.Applications import NcbiblastnCommandline
    from Bio import SeqIO
    from Bio.SeqUtils import GC
    import subprocess
    import xlsxwriter
    from funReadBlast import funReadBlast
    from funANICalc import funANICalc
    from ete3 import NCBITaxa

    columnTitleRow = [
        "FDAARGOS_ID",  #0
        "Num_Contig",  #1
        "Assembly_Size",  #2
        "N_50",  #3
        "Largest_Contig_Size",  #4
        "Contig_ID",  #5
        "Contig_Length",  #6
        "Contig_GC",  #7
        "Proposed Organism",  #8
        "Blast_Hit",  #9
        "ACCESSION",  #10
        "Score",  #11
        "Percent_Query_Identity",  #12
        "Percent_Query_Coverage",  #13
        "Scientific_Name",  #14
        "Query_ANI_Coverage",  #15
        "Subject_ANI_Coverage",  #16
        "Query_ANI_Length",  #17
        "Subject_ANI_Length",  #18
        "Query_ANI_HD",  #19
        "Subject_ANI_HD",  #20
        "Query_ANI_Identity",  #21
        "Subject_ANI_Identity",  #22
        "Query_ANI_SE",  #23
        "Subject_ANI_SE"
    ]

    sFileName = sFastaFileName[0:-6] + '_Genus_Blast.xlsx'
    sARGOSID = sFastaFileName.split("/")[-1][0:-6]

    # Import Fasta sequence from assembly file
    lSeqRecord = list(SeqIO.parse(sFastaFileName, "fasta"))

    # Import annotation: one entry per ORGANISM line of the GenBank file.
    all_species = []
    if sGBKFileName == "N/A":
        for _ in lSeqRecord:
            all_species.append('N/A-N/A')
    else:
        # `with` guarantees the handle is closed (the original `f.close`
        # lacked parentheses and never closed it).
        with open(sGBKFileName, 'r', errors='ignore') as gbk_file:
            for line in gbk_file:
                if "ORGANISM" in line:
                    print(line)
                    all_species.append(line)

    # Calculate contig statistics
    lSize = [len(seq_record.seq) for seq_record in lSeqRecord]
    lGC = [GC(seq_record.seq) for seq_record in lSeqRecord]
    nTotalAssemblySize = sum(lSize)
    nNumContig = len(lSize)
    nLargestContig = max(lSize)

    # N50: walk the contigs from largest to smallest until the running
    # total exceeds half of the assembly size.
    nThreshold = 0.5 * nTotalAssemblySize
    lTempSize = sorted(lSize, reverse=True)
    nSize = 0
    count = 0
    while nSize <= nThreshold:
        nSize = nSize + lTempSize[count]
        out = count
        count = count + 1
    nN50 = lTempSize[out]

    # Run Blast, restricted to the genus taxid.
    ncbi = NCBITaxa()
    Genus = all_species[0].split()[1]
    name2taxid = ncbi.get_name_translator([Genus])
    sOrganism = "\"txid" + str(name2taxid[Genus][0]) + " [ORGN]\""
    sOutFileName = sARGOSID + ".txt"
    blastn_cline = NcbiblastnCommandline(
        task="megablast",
        query=sFastaFileName,
        db="nt",
        evalue=0.001,
        max_target_seqs=5,
        outfmt="\"6 "
        "qseqid "
        "qlen "
        "sscinames "
        "sacc "
        "stitle "
        "length "
        "score "
        "pident "
        "qcovs\"",
        entrez_query=sOrganism,
        remote=1,
        out=sOutFileName)
    process = subprocess.Popen(
        "export BLASTDB=/Users/yi.yan/Documents/db/:$BLASTDB"
        + "&&/usr/local/ncbi/blast/bin/"
        + str(blastn_cline),
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    # Wait for BLAST to finish; its output is read from sOutFileName.
    process.communicate()

    tblComplete = funReadBlast(sOutFileName, all_species, sARGOSID,
                               nNumContig, nTotalAssemblySize, nN50,
                               nLargestContig, lGC, lSize)

    # Run ANI
    FinalTbl = funANICalc(tblComplete, lSeqRecord, 'nt')

    # Rows sorted by contig length (col 6) then score (col 11), descending.
    def sort_key(row):
        return (row[6], row[11])

    s = sorted(FinalTbl, key=sort_key, reverse=True)

    # Summary: first (i.e. best-ranked) row of every contig, found in one
    # pass instead of a list.index() call per contig.
    first_row_by_contig = {}
    for row in s:
        first_row_by_contig.setdefault(row[5], row)
    SummaryTbl = sorted(first_row_by_contig.values(), key=sort_key,
                        reverse=True)

    n_cols = len(s[0])
    workbook = xlsxwriter.Workbook(sFileName)
    _write_hit_sheet(workbook, 'NT_Genus', columnTitleRow, s, n_cols)
    _write_hit_sheet(workbook, 'NT_Summary_Genus', columnTitleRow,
                     SummaryTbl, n_cols)
    workbook.close()
Get the protein sequences for a list of genera. The module uses a text file with a number of TaxIDs to download the protein sequences belonging to the respective genera. The taxonomic lineage will be added to the protein header. """ import logging import os import re import urllib.parse import urllib.request from ete3 import NCBITaxa from mptk import general_functions logger = logging.getLogger("pies.use_amplicon") NCBI = NCBITaxa() def get_taxid(input_file): """ Return a list of tax IDs based on tax names. Each line of the input_file has a tax name on each line. The function `get_taxid` returns a list with tax IDs with the same length. Parameters ---------- input_file: file with tax names on each line Returns -------
def lineage_extractor(query_taxid, TaxInfo_object):
    """Fill the rank attributes of ``TaxInfo_object`` from an NCBI lineage.

    For every taxon in the lineage of ``query_taxid`` whose rank is one of
    the eight ranks below, the matching attribute (e.g. ``Genus``) is set
    to the taxon name and ``<attribute>_TaxId`` to its id.  Name attributes
    that are still ``None`` afterwards receive an ``unk_*`` placeholder.
    The same object is returned.
    """
    # (NCBI rank, attribute on TaxInfo_object, placeholder when missing)
    rank_table = (
        ('superkingdom', 'Superkingdom', 'unk_sk'),
        ('kingdom', 'Kingdom', 'unk_k'),
        ('phylum', 'Phylum', 'unk_p'),
        ('class', 'Class', 'unk_c'),
        ('order', 'Order', 'unk_o'),
        ('family', 'Family', 'unk_f'),
        ('genus', 'Genus', 'unk_g'),
        ('species', 'Species', 'unk_s'),
    )
    attribute_for_rank = {rank: attr for rank, attr, _ in rank_table}

    taxa_db = NCBITaxa()
    lineage_ids = taxa_db.get_lineage(query_taxid)
    rank_by_id = taxa_db.get_rank(lineage_ids)
    name_by_id = taxa_db.get_taxid_translator(lineage_ids)

    # get known data
    for tax_id, rank in rank_by_id.items():
        attribute = attribute_for_rank.get(rank)
        if attribute is None:
            continue
        setattr(TaxInfo_object, attribute, name_by_id[tax_id])
        setattr(TaxInfo_object, attribute + "_TaxId", tax_id)

    # fill in the blanks
    for _, attribute, placeholder in rank_table:
        if getattr(TaxInfo_object, attribute) is None:
            setattr(TaxInfo_object, attribute, placeholder)

    return TaxInfo_object
#q = 50 #d = 0.2 #p = 0.05 #st = 99 #gt = 98 #ft = 95 #ot = 80 #ct = 0 #pt = 0 #du = 'kma' ##### Checks: # Run implicitly ete3.NCBITaxa.__init__() to check for valid taxonomy database NCBITaxa() # Warning if RefDatabase is unknown if ref_database not in ("UNITE", "RefSeq","nt"): print (""" Reference database (-r) must be either UNITE, RefSeq or nt. the input is case sensitive and the default is nt.""") sys.exit("Try again.") ##### Read input files and output a pandas dataframe print ("") print ("Reading file %s" %(f)) print ("") df = pd.read_csv(f, sep='\t', index_col=0, encoding='latin1')
import sys
import time
import random
import os
import pandas as pd
import numpy as np
import sqlite3
import sqlalchemy
from ete3 import NCBITaxa

ncbi_taxdump_names = ["taxdump_2019-01-01.tar.gz", "taxdump_2019-06-01.tar"]
ncbi_dbs = []

# One NCBITaxa handle per archived taxdump.  The sqlite file is named after
# the dump (archive suffix swapped for ".sqlite"); the dump file is only
# passed to NCBITaxa when that sqlite file does not exist yet.
for _dump_name, _archive_suffix in zip(ncbi_taxdump_names, (".tar.gz", ".tar")):
    _sqlite_path = _dump_name.replace(_archive_suffix, ".sqlite")
    if os.path.exists(_sqlite_path):
        ncbi_dbs.append(NCBITaxa(_sqlite_path))
    else:
        ncbi_dbs.append(NCBITaxa(_sqlite_path, taxdump_file=_dump_name))
del _dump_name, _archive_suffix, _sqlite_path

# Last entry: the default ete3 taxonomy database.
ncbi_new = NCBITaxa()
#ncbi_new.update_taxonomy_database()
ncbi_dbs.append(ncbi_new)
from lca_functions import *
def get_tax_lineage(taxonid, source, tax_rank_id=None):
    """Return taxonomy lineage information

    Results are cached in the module-level ``LINEAGES`` dict, so each
    ``taxonid`` is resolved at most once.  Depending on ``source`` the
    lineage comes from the local ete3 taxdump database (``"taxdump"``),
    from a pre-built ``tax_rank_id`` table (``"taxit"``), or - for any
    other value - from the NCBI taxonomy database through Biopython's
    Entrez interface.

    Parameters
    -------------
    taxonid : string
        Taxonomic id of the species
    source : string
        Source to be used to collect the info about the taxonid
    tax_rank_id: dict
        Taxonomic rank and id; only read when ``source == "taxit"``

    Returns
    -------------
    lineage: dict
        Species lineage (rank -> name)
    """
    if taxonid in LINEAGES:
        return LINEAGES[taxonid]

    # Avoid a shared mutable default argument.
    if tax_rank_id is None:
        tax_rank_id = {}

    if source == "taxdump":
        ncbi_taxdump = NCBITaxa()
        lineage_ids = ncbi_taxdump.get_lineage(taxonid)
        ranks = ncbi_taxdump.get_rank(lineage_ids)
        names = ncbi_taxdump.get_taxid_translator(lineage_ids)
        LINEAGES[taxonid] = {ranks[i]: names[i] for i in lineage_ids}
        return LINEAGES[taxonid]

    if source == "taxit":
        LINEAGES[taxonid] = {
            level: tax_rank_id[tax_rank_id[taxonid][level]]["tax_name"]
            for level in TAX_LEVELS
        }
        return LINEAGES[taxonid]

    import time

    # Remote lookup: retry until NCBI returns a non-empty record.
    # NOTE(review): a taxonid for which NCBI returns nothing is retried
    # forever - consider bounding the number of attempts.
    while True:
        data = ""
        try:
            Entrez.email = "*****@*****.**"
            handle = Entrez.efetch(id=taxonid, db="taxonomy", retmode="xml")
            try:
                data = Entrez.read(handle)
            finally:
                # Close the handle even when parsing fails.
                handle.close()
        except Exception as e:
            with open(LOG, "a") as log:
                print("Error when searching information about {}: {}".format(
                    taxonid, e), file=log)
        if data:
            break
        # Pause between attempts instead of hammering the NCBI servers.
        time.sleep(1)

    lineage = {
        d["Rank"]: d["ScientificName"]
        for d in data[0]["LineageEx"]
    }
    lineage[data[0]["Rank"]] = data[0]["ScientificName"]
    LINEAGES[taxonid] = lineage
    return LINEAGES[taxonid]
def create_CAMI_profile(data_file, sample_id):
    """
    CSV Parser for converting information to the CAMI profiling format.

    Input:
        data_file: csv file with the columns "sample", "Assignment" and
            "percentage_of_total_reads"
        sample_id: value of the "sample" column to select
    Output:
        (header, output_list): the CAMI profile header as one string and
        a list with one tab-separated profile line per selected row
        (taxid, rank, taxid path, name path, percentage).  For a sample
        without rows the list is empty and the @Ranks field is blank.
    """
    dataframe = pd.read_csv(data_file)
    subset = dataframe[dataframe["sample"] == sample_id]
    taxa = subset["Assignment"]
    total_percentages = subset["percentage_of_total_reads"]

    ncbi = NCBITaxa()

    rank_list_list = []  #save all taxonomies to find the longest
    #I use the longest, because virus taxonomy is diverse...
    output_list = []  #stores the CAMI profiles as strings

    # Walk names and percentages in lockstep so that rows sharing a name
    # keep their own percentage.
    for name, percentage in zip(taxa, total_percentages.values):
        #remove names that have some addition in brackets,
        # like " (segment 1)"
        ncbi_name = name.split(' (', 1)[0]

        #ncbi.get_name_translator() returns a dictionary { 'taxon' : [id]}
        taxid_nr = ncbi.get_name_translator([ncbi_name])[ncbi_name][0]

        #ncbi.get_rank() requires a list of IDs, and returns a dictionary:
        # {id: 'rank'}
        rank = ncbi.get_rank([taxid_nr])[taxid_nr]

        #ncbi.get_lineage_translator() requires a list of IDs, and returns
        # a dictionary {leaf_id: [root_id, node_id, leaf_id]}
        tax_path = ncbi.get_lineage_translator([taxid_nr])[taxid_nr][1:]

        # One query per lineage; the returned dictionaries are unordered,
        # so the order is restored by indexing with tax_path.
        if tax_path:
            name_by_id = ncbi.get_taxid_translator(tax_path)
            rank_by_id = ncbi.get_rank(tax_path)
        else:
            name_by_id = {}
            rank_by_id = {}
        tax_path_sn = [name_by_id[node] for node in tax_path]
        rank_list_list.append([rank_by_id[node] for node in tax_path])

        tax_path_string = '|'.join(map(str, tax_path))
        tax_path_sn_string = '|'.join(tax_path_sn)

        output_line = "%s\t%s\t%s\t%s\t%s" % (taxid_nr, rank,
                                                tax_path_string,
                                                tax_path_sn_string,
                                                percentage)
        output_list.append(output_line)

    # The path differs between branches, so report the longest one.
    longest_ranks = max(rank_list_list, key=len) if rank_list_list else []
    longest_taxonomy = '|'.join(longest_ranks)

    #Read the specification for details about this header:
    #https://github.com/bioboxes/rfc/blob/60263f34c57bc4137deeceec4c68a7f9f810f6a5/data-format/profiling.mkd
    header = """# Taxonomic Profiling Output
@SampleID:%s
@Version:0.9.3
@Ranks:%s\t#the longest path in this sample: virus taxonomy is messy
@TaxonomyID:ncbi-taxonomy_2018-05-25
@@TAXID\tRANK\tTAXPATH\tTAXPATHSN\tPERCENTAGE
""" % (sample_id, longest_taxonomy)

    return (header, output_list)
def tdb_from_hits(hits, minPerc=50, testing=False):
    '''
    Determines the lowest taxonomic level with at least minPerc certainty

    For every hit:
        reconstruct the lineage (kingdom, phylum, class, ect.)
        add a count to every rank in the lineage

    For every rank:
        see if the number of hits matching one taxa at that rank is above the minPerc
        the denominator for this equation is the number of hits that have a phyla rank

    * Note: this is complicated because some lower ranks don't have higher ranks
        For example, species [Eubacterium] rectale (taxID 39491) has no genus
        Also, species [artifical construct] (taxID 32630) has no anything but species

    Parameters: hits is indexed with hits['taxID'] and must provide
    .tolist(); taxID 0 is skipped.  minPerc is the confidence threshold in
    percent.  testing is unused.

    Returns a DataFrame with one row per level and the columns tax_ID,
    tax_confidence, tax_level, taxonomy, best_hit and full_tax.  When no
    hit has a phylum every confidence is 0, and when no level reaches
    minPerc no row is marked as best hit.
    '''
    import time

    from ete3 import NCBITaxa
    ncbi = NCBITaxa()

    Levels = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species'
    ]

    # generate nested dictionary for levels: level -> {taxid: hit count}
    countDic = {level: {} for level in Levels}

    # fill in nested dictionary
    for t in hits['taxID'].tolist():
        if t == 0:
            continue

        # This try / except thing is trying to catch sporatic errors of:
        # sqlite3.OperationalError: disk I/O error
        try:
            lin = ncbi.get_lineage(t)
            name2rank = ncbi.get_rank(lin)
        except Exception:
            # One retry after a short pause; a second failure propagates.
            time.sleep(1)
            lin = ncbi.get_lineage(t)
            name2rank = ncbi.get_rank(lin)

        for i in lin:
            rank = name2rank[i]
            if rank in countDic:
                countDic[rank][i] = countDic[rank].get(i, 0) + 1

    # make the table
    total = sum(countDic['phylum'].values())
    table = {
        'tax_ID': [],
        'tax_confidence': [],
        'tax_level': [],
        'taxonomy': []
    }
    for level in Levels:
        dic = countDic[level]
        if not dic:
            table['tax_ID'].append(None)
            table['tax_confidence'].append(0)
            table['tax_level'].append(level)
            table['taxonomy'].append('unk')
            continue

        # Most frequent taxon at this level (first one wins on ties).
        top_taxid = max(dic, key=dic.get)
        count = dic[top_taxid]

        lin = ncbi.get_lineage(top_taxid)
        lin2name = ncbi.get_taxid_translator(lin)
        name2rank = ncbi.get_rank(lin)
        rank2name = {v: k for k, v in name2rank.items()}
        tax = lin2name[rank2name[level]]

        # No hit with a phylum means there is no denominator: report 0
        # instead of dividing by zero.
        confidence = (count / total) * 100 if total else 0

        table['tax_ID'].append(top_taxid)
        table['tax_confidence'].append(confidence)
        table['tax_level'].append(level)
        table['taxonomy'].append(tax)

    tdb = pd.DataFrame(table)

    # find and mark the best hit: the last (lowest) level at or above minPerc
    confident = tdb['tax_ID'][tdb['tax_confidence'] >= minPerc].tolist()
    if confident:
        best = confident[-1]
        tdb['best_hit'] = [bool(i == best) for i in tdb['tax_ID']]
    else:
        tdb['best_hit'] = [False] * len(tdb)

    # get the full taxonomy for the best hit
    tdb['full_tax'] = [
        lineage_from_taxId(t) if b else False
        for t, b in zip(tdb['tax_ID'], tdb['best_hit'])
    ]

    return tdb
# BOILER PLATE ------------------------------------------------------------------------------------------------------------
import sqlite3
from ete3 import NCBITaxa

ncbi_taxa = NCBITaxa()
db = sqlite3.connect("/u/home/c/cloeffle/scratch/sql/S_aureus.db")
cur = db.cursor()
db.execute(
    "CREATE TABLE TAX_INFO(DBNAME TEXT, GIVENTAXID INT, GENUSTAXID INT, SPECIESTAXID INT, STRAINTAXID INT);"
)
# commit() lives on the connection; sqlite3 cursors have no such method.
db.commit()


# FUNCTIONS ---------------------------------------------------------------------------------------------------------------
# Takes a list of lists and prints each list
# used in debugging
def look_at(entry):
    for i in entry:
        # Parenthesised single argument prints the same on Python 2 and 3.
        print(i)
    return


# manually handle a taxid where taxid lineage could not be obtained using NCBITaxa
# mainly used for debugging
def handle(taxid):
    entry = []
    print("This is the taxid: " + str(taxid))
    int(taxid)
def get_lineage(taxId):
    """Return what ``NCBITaxa.get_lineage`` reports for ``taxId``.

    A new NCBITaxa handle is opened on every call.
    """
    return NCBITaxa().get_lineage(taxId)
def get_name(taxId):
    """Return the name ete3 translates ``taxId`` to.

    The first value of the translator dictionary is returned; ``None``
    when the dictionary is empty.
    """
    translated = NCBITaxa().get_taxid_translator([taxId])
    return next(iter(translated.values()), None)
def get_rank(taxId):
    """Return the rank ete3 reports for ``taxId``.

    The first value of the rank dictionary is returned; ``None`` when the
    dictionary is empty.
    """
    rank_by_id = NCBITaxa().get_rank([taxId])
    return next(iter(rank_by_id.values()), None)
def taxId2Species(taxid):
    """Return the result of ``NCBITaxa.get_taxid_translator([taxid])``.

    The translator's return value is passed through unchanged.
    """
    taxa_db = NCBITaxa()
    id_to_name = taxa_db.get_taxid_translator([taxid])
    return id_to_name
def extract_taxa(mpwt_taxon_file, taxon_output_file, tree_output_file, taxonomy_level="phylum"):
    """From NCBI taxon ID, extract taxonomy rank and create a tree file

    Reads a tab-separated file (header skipped; column 0 = organism id,
    column 1 = taxid), writes one taxonomy row per organism and dumps the
    ASCII topology of all taxids.

    Args:
        mpwt_taxon_file (str): mpwt taxon file for species in sbml folder
        taxon_output_file (str): path to taxonomy output file
        tree_output_file (str): path to tree output file
        taxonomy_level (str): taxonomy level, must be: phylum, class, order, family, genus or species.
    """
    rank_names = ["phylum", "class", "order", "family", "genus", "species"]

    ncbi = NCBITaxa()

    # Position of the requested level inside rank_names (KeyError if the
    # level is not one of the six supported ranks).
    level_position = {
        'phylum': 0,
        'class': 1,
        'order': 2,
        'family': 3,
        'genus': 4,
        'species': 5
    }
    taxonomy_index = level_position[taxonomy_level]

    taxon_ids = []
    taxon_count = {}
    output_rows = []

    with open(mpwt_taxon_file, "r") as taxon_file:
        reader = csv.reader(taxon_file, delimiter="\t")
        next(reader)
        for record in reader:
            taxon_ids.append(record[1])

            lineage = ncbi.get_lineage(record[1])
            rank_by_taxid = ncbi.get_rank(lineage)
            name_by_taxid = ncbi.get_taxid_translator(lineage)
            name_by_rank = {
                rank: name_by_taxid[taxid]
                for taxid, rank in rank_by_taxid.items()
            }
            ranks = [name_by_rank.get(rank, "unknown") for rank in rank_names]

            # "unknown" passes through the clean-up unchanged.
            taxon = ranks[taxonomy_index].replace(' ', '_').replace('.', '')

            # Running number per taxon; repeated "unknown" entries get an
            # empty suffix instead of a number.
            if taxon not in taxon_count:
                taxon_count[taxon] = 1
            elif taxon != "unknown":
                taxon_count[taxon] += 1
            else:
                taxon_count[taxon] = ""

            numbered_taxon = taxon + '__' + str(taxon_count[taxon])
            output_rows.append([record[0], record[1], numbered_taxon] + ranks)

    with open(taxon_output_file, "w") as taxonomy_file:
        writer = csv.writer(taxonomy_file, delimiter="\t")
        writer.writerow(["organism_id", "taxid", "taxon_number"] + rank_names)
        writer.writerows(output_rows)

    tree = ncbi.get_topology(taxon_ids)

    with open(tree_output_file, "w") as tree_file:
        tree_file.write(tree.get_ascii(attributes=["sci_name", "rank"]))
def parse_args(parser):
    """Parse the command line and validate/normalise the options.

    Calls ``parser.parse_args()``, checks that the required databases
    exist, rejects incompatible option combinations through
    ``parser.error`` and rewrites several attributes in place
    (``cpu``, ``usemem``, ``no_search``, ``no_annot``, ``go_evidence``,
    ``go_excluded``, ``db``).

    Returns the argument namespace.  Raises ``emapperException`` when a
    required database file is missing and ``ValueError`` for an
    unsupported ``--go_evidence`` value; exits with status 0 after
    printing the version when ``--version`` is given.
    """
    args = parser.parse_args()

    if args.version:
        print get_version()
        sys.exit(0)

    if args.data_dir:
        set_data_path(args.data_dir)

    # The annotation database is needed unless annotation is disabled.
    if not args.no_annot and not pexists(get_eggnogdb_file()):
        print colorify(
            'Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it',
            'red')
        raise emapperException()

    if args.mode == 'diamond':
        # An explicit --dmnd_db takes precedence over the default location.
        dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
        if not pexists(dmnd_db):
            print colorify(
                'DIAMOND database %s not present. Use download_eggnog_database.py to fetch it'
                % dmnd_db, 'red')
            raise emapperException()

    # cpu == 0 means "use every available core".
    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # No --servermode available for diamond
    if args.mode == 'diamond' and args.servermode:
        parser.error(
            '--mode [diamond] and --servermode are mutually exclusive')

    # Output file required unless running in servermode
    if not args.servermode and not args.output:
        parser.error('An output project name is required (-o)')

    # Servermode implies using mem-based databases
    if args.servermode:
        args.usemem = True

    # Direct annotation implies no searches
    if args.annotate_hits_table:
        args.no_search = True
        args.no_annot = False

    # Sets GO evidence bases: the option string is replaced by the set of
    # accepted evidence codes (None = no whitelist) plus an exclusion set.
    if args.go_evidence == 'experimental':
        args.go_evidence = set(["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"])
        args.go_excluded = set(["ND", "IEA"])
    elif args.go_evidence == 'non-electronic':
        args.go_evidence = None
        args.go_excluded = set(["ND", "IEA"])
    else:
        raise ValueError('Invalid --go_evidence value')

    # Check inputs for running sequence searches
    if not args.no_search and not args.servermode:
        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # HMM
        if args.mode == 'hmmer':
            if not args.db and not args.guessdb:
                parser.error(
                    'HMMER mode requires specifying a target database (i.e. -d, --guessdb ))'
                )
            if args.db and args.guessdb:
                parser.error('-d and --guessdb options are mutually exclusive')

            if args.guessdb:
                # Walk the lineage of the given taxid from the most
                # specific taxon upwards and pick the first one that has
                # an entry in TAXID2LEVEL.
                from ete3 import NCBITaxa
                ncbi = NCBITaxa()
                lineage = ncbi.get_lineage(args.guessdb)
                for tid in reversed(lineage):
                    if tid in TAXID2LEVEL:
                        print tid, TAXID2LEVEL[tid]
                        args.db = TAXID2LEVEL[tid]
                        break
        # DIAMOND
        elif args.mode == 'diamond':
            #if args.db or args.guessdb:
            #    parser.error('diamond mode does not require -d or --guessdb options')
            pass

    return args
def connect_ncbitaxa():
    """Return an NCBITaxa handle opened on ``./taxa.sqlite``.

    The path is relative to the current working directory.
    """
    return NCBITaxa("./taxa.sqlite")
def get_child_taxa(taxid):
    """Return the descendant taxids of ``taxid``, intermediate nodes included.

    The taxonomy database is the sqlite file configured under
    ``app.config['TAXA_SQLITE']``; ``taxid`` is converted with ``int()``
    before the lookup.
    """
    taxa_db = NCBITaxa(dbfile=app.config['TAXA_SQLITE'])
    return taxa_db.get_descendant_taxa(int(taxid), intermediate_nodes=True)
def __init__(self, file_loc, input_type, artifact_threshold=0, verbose=0, extension=None, include_strains=False):
    """
    Creates OTUdata object for manipulation/transformation

    Parameters
    ------------
    file_loc: str, location/file name of the input file (taxa profiling output file)
    input_type: str, what type of taxa profiling tool was used to generate the file;
        one of 'kaiju', 'old kaiju', 'old kaiju v2' (case-insensitive)
    artifact_threshold: passed unchanged to the reader function
    verbose: stored on the instance as ``self.verbose``
    extension: when given, the file id is derived with
        ``process_file_name_with_known_extension``; otherwise with
        ``process_file_name``
    include_strains: stored on the instance as ``self.include_strains``

    Raises
    ------------
    ValueError: if input_type is not one of the supported types
    """
    self.file_id = ''
    self.include_strains = include_strains
    self.verbose = verbose
    self.ncbi = NCBITaxa()
    self.cumulated = False
    self.basic_ranks = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species'
    ]
    self.ranks = {rank: set() for rank in self.basic_ranks}
    self.superkingdom = {
        'virus': set(),
        'bacteria': set(),
        'eukaryote': set(),
        'archaea': set()
    }

    if extension is None:
        self.file_id = self.process_file_name(file_loc)
    else:
        self.file_id = self.process_file_name_with_known_extension(
            file_loc, extension)

    # Reader function for each supported profiler output format.
    readers = {
        'kaiju': read_kj,
        'old kaiju': read_old_kj,
        'old kaiju v2': read_old_kjV2,
    }
    reader = readers.get(input_type.lower())
    if reader is None:
        # Fail here with a clear error; continuing would only surface
        # later as a missing ``otufile`` attribute.
        message = "[ERROR] only 'kaiju','old kaiju','old kaiju v2' input type supported"
        print(message)
        raise ValueError(message)
    self.otufile = reader(file_loc, artifact_threshold)

    #root_id = [-1, 1, 131567]
    root_id = [-1]
    for root in root_id:
        if root in self.otufile:
            del self.otufile[root]

    ### these two are problematic ncbi taxa id that have been deleted, I am now replacing them with their updated taxa id
    fix_taxa = get_dict()
    for k, v in fix_taxa.items():
        if k in self.otufile:
            self.otufile[v] = self.otufile[k]
            del self.otufile[k]

    self.update_ranks()
    self.update_superkingdom()
def run(args):
    """Query the NCBI taxonomy for the terms in ``args.search``.

    Numeric search terms are used as taxids; all other terms are
    translated from names (optionally with fuzzy matching when
    ``args.fuzzy`` is set).  Exactly one of three outputs is produced:

    * ``args.tree`` (default when nothing is selected): dump a tree - the
      descendants tree for a single taxid, otherwise the topology joining
      all taxids;
    * ``args.descendants``: print one line per taxid with its descendants;
    * ``args.info``: print one line per taxid with its lineage.

    Exits with status -1 when no search term is given.
    """
    # add lineage profiles/stats
    from ete3 import NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    # Filled only for fuzzy matches (previously undefined names).
    name2realname = {}
    name2score = {}

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names; a name may map to a list of taxids or a single one
    name2tax = ncbi.get_name_translator(all_names)
    for translated in name2tax.values():
        if isinstance(translated, (list, tuple, set)):
            for tid in translated:
                all_taxids[tid] = None
        else:
            all_taxids[translated] = None

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(
                name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" %
                 ','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            # dict views are not iterators: wrap in iter() before next().
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" % (target_taxid))
            t = ncbi.get_descendant_taxa(
                target_taxid,
                collapse_subspecies=args.collapse_subspecies,
                rank_limit=args.rank_limit,
                return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %
                     (len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # The node name holds the taxid until it is relabelled below.
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t,
             features=[
                 "taxid", "name", "rank", "bgcolor", "sci_name",
                 "collapse_subspecies", "named_lineage"
             ])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        print('# ' + '\t'.join([
            "Taxid", "Sci.Name", "Rank", "descendant_taxids",
            "descendant_names"
        ]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(
                taxid,
                collapse_subspecies=args.collapse_subspecies,
                rank_limit=args.rank_limit)
            print('\t'.join([
                str(taxid),
                # Falls back to the (integer) taxid, hence the str().
                str(translator.get(taxid, taxid)),
                ranks.get(taxid, ''),
                '|'.join(map(str, descendants)),
                '|'.join(map(str, ncbi.translate_to_names(descendants)))
            ]))
    elif args.info:
        print('# ' + '\t'.join(
            ["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([
                str(taxid), name,
                ranks.get(taxid, ''), named_lineage, lineage_string
            ]))
#!/usr/bin/env python import sys import argparse import logging from collections import Counter, defaultdict from ete3 import NCBITaxa # Constants ncbi = NCBITaxa( dbfile="/apps/etetoolkit/taxa.sqlite") # location of ete3 database # Parse and check arguments parser = argparse.ArgumentParser( description='Parse eggNOG file and determine OGs') parser.add_argument( '-min_occurence', metavar="PERCENT", action="store", type=float, default=0, help= "Minimum occurence (percent of genomes where gene is present); should be 0-100, default=0" ) parser.add_argument( '-min_uniqueness', metavar="PERCENT", action="store", type=float, default=0, help=
def contig_tax(annot_df, ncbi_db, min_prot, prop_annot, tax_thres):
    '''This function takes the annotation table generated by viral_contig_maps.py
    and generates a table that provides the taxonomic lineage of each viral contig,
    based on the corresponding ViPhOG annotations

    Parameters:
        annot_df: DataFrame with at least the columns "Contig" and "Label"
        ncbi_db: path of the ete3 taxonomy sqlite file
        min_prot: minimum number of labelled proteins a contig needs
        prop_annot: minimum labelled fraction of the contig's rows
        tax_thres: minimum share of the labelled proteins the most
            frequent taxon must reach to be assigned

    Returns a DataFrame with the columns contig_ID, genus, subfamily,
    family and order.  Each rank cell holds "" (no evidence), the winning
    taxon (ties joined with "-"), or - when the best taxon stays below
    tax_thres - its ratio as a string.
    '''
    ncbi = NCBITaxa(dbfile=ncbi_db)
    tax_rank_order = ["genus", "subfamily", "family", "order"]
    contig_list = list(annot_df["Contig"].value_counts().index)
    df_rows = []

    def get_tax_rank(label):
        """Return the NCBI rank of a label, or "" when it cannot be resolved."""
        try:
            tax_id = ncbi.get_name_translator([label])[label]
            tax_rank = ncbi.get_rank(tax_id)[tax_id[0]]
        except Exception:
            # Best effort: unknown labels simply get no rank.
            tax_rank = ""
        return tax_rank

    def ancestor_at_rank(label, rank):
        """Return the name of the label's ancestor at ``rank``, or None."""
        name2taxid = ncbi.get_name_translator([label])
        label_lineage = ncbi.get_lineage(name2taxid[label][0])
        lineage_names = ncbi.get_taxid_translator(label_lineage)
        lineage_ranks = ncbi.get_rank(label_lineage)
        for tax_id, tax_rank in lineage_ranks.items():
            if tax_rank == rank:
                return lineage_names[tax_id]
        return None

    def consensus(tax_hits, total_annot_prot):
        """Turn the per-taxon counts of one rank into the reported cell."""
        if not tax_hits:
            return ""
        top_count = max(tax_hits.values())
        annot_ratio = top_count / total_annot_prot
        if annot_ratio < tax_thres:
            return str(annot_ratio)
        max_tax = [key for key, value in tax_hits.items()
                   if value == top_count]
        if len(max_tax) > 1:
            return "-".join(max_tax)
        return max_tax[0]

    for contig in contig_list:
        assigned_taxa = [contig]
        contig_df = annot_df[annot_df["Contig"] == contig]
        filtered_df = contig_df[contig_df["Label"].notnull()]
        filtered_df = filtered_df.reset_index(drop=True)
        total_annot_prot = len(filtered_df)

        if total_annot_prot < max(min_prot, prop_annot * len(contig_df)):
            # Too few labelled proteins: leave every rank empty.
            assigned_taxa.extend([""] * len(tax_rank_order))
            df_rows.append(assigned_taxa)
            continue

        filtered_df["Rank"] = filtered_df["Label"].apply(get_tax_rank)
        for item in tax_rank_order:
            tax_hits = {}
            for label, label_rank in zip(filtered_df["Label"],
                                         filtered_df["Rank"]):
                if label_rank == item:
                    taxon = label
                elif item == "genus":
                    # Genus only counts labels that are genera themselves.
                    continue
                else:
                    # Higher ranks also count the ancestor of labels that
                    # sit at another rank; unresolvable labels are skipped.
                    try:
                        taxon = ancestor_at_rank(label, item)
                    except Exception:
                        continue
                    if taxon is None:
                        continue
                tax_hits[taxon] = tax_hits.get(taxon, 0) + 1
            assigned_taxa.append(consensus(tax_hits, total_annot_prot))
        df_rows.append(assigned_taxa)

    final_df = pd.DataFrame(
        df_rows,
        columns=["contig_ID", "genus", "subfamily", "family", "order"])
    return final_df
def main():
    """Download the reference data and build the BLAST, mash and ete3 databases.

    A ``.lock`` file inside the database directory serialises concurrent
    initialisations: if the lock already exists this process only waits for it
    to disappear (or to become older than 1000 s) and returns without doing
    any work.

    Returns:
        0 on success, or after waiting for another process's lock.

    Raises:
        SystemExit: with code -1 when the lock file cannot be created, when
            every download mirror fails, or when building a database fails.
    """
    args = arguments()
    database_directory = os.path.abspath(args.database_directory)

    if not os.path.exists(database_directory):
        # makedirs also creates missing parent directories.
        os.makedirs(database_directory)
    else:
        logger.info("Database directory folder already exists at {}".format(database_directory))

    # Helper function to simplify adding database_directory to everything
    prepend_db_dir = functools.partial(os.path.join, database_directory)

    lockfilepath = os.path.join(database_directory, ".lock")
    status_file = prepend_db_dir('status.txt')

    if not os.path.exists(lockfilepath):
        try:
            open(file=lockfilepath, mode="w").close()
            logger.info("Placed lock file at {}".format(lockfilepath))
        except Exception as e:
            logger.error("Failed to place a lock file at {}. Database diretory can not be accessed. Wrong path?".format(lockfilepath))
            logger.error("{}".format(e))
            sys.exit(-1)
    else:
        while os.path.exists(lockfilepath):
            try:
                elapsed_time = time.time() - os.path.getmtime(lockfilepath)
            except OSError:
                # The other process removed the lock between the two calls.
                break
            logger.info("Lock file found at {}. Waiting for other processes to finish database init ...".format(lockfilepath))
            logger.info("Elapsed time {} min. Will continue processing after 16 min mark.".format(int(elapsed_time/60)))
            if elapsed_time >= 1000:
                logger.info("Elapsed time {} min. Assuming previous process completed all init steps. Continue ...".format(int(elapsed_time/60)))
                try:
                    # If the previous process failed, nothing is running and
                    # more than 16 min passed since the lock was created.
                    os.remove(lockfilepath)
                except OSError:
                    # The file was removed by another process in the meantime.
                    pass
                break
            time.sleep(60)  # recheck every 1 min if lock file was removed
        logger.info("Lock file no longer exists. Assuming init process completed successfully")
        return 0

    logger.info('Initializing databases...this will take some time')
    # Find available threads and use the maximum number available for mash sketch but cap it at 32
    num_threads = min(multiprocessing.cpu_count(), 32)

    zip_file = prepend_db_dir('data.tar.gz')
    plasmid_database_fasta_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas')
    repetitive_fasta_file = prepend_db_dir('repetitive.dna.fas')
    mash_db_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas.msh')

    logger.info('Downloading databases...this will take some time')
    last_error = None
    for db_mirror in config['db_mirrors']:
        try:
            logger.info('Trying mirror {}'.format(db_mirror))
            download_to_file(db_mirror, zip_file)
            break
        except Exception as e:
            # Remember the failure and fall through to the next mirror.
            last_error = e
            logger.error("Download from mirror {} failed with error {}".format(db_mirror, str(e)))
    else:
        # Reached only when no mirror succeeded (or none is configured).
        logger.error("Download failed with error {}. Removing lock file".format(str(last_error)))
        os.remove(lockfilepath)
        sys.exit(-1)

    logger.info("Downloading databases successful, now building databases at {}".format(database_directory))
    extract(zip_file, database_directory)

    gz_paths = [prepend_db_dir(f) for f in os.listdir(database_directory) if f.endswith('.gz')]
    for gz_path in gz_paths:
        extract(gz_path, database_directory)

    # Initialize blast and mash databases
    try:
        logger.info('Building repetitive mask database')
        blast_runner = BlastRunner(repetitive_fasta_file, database_directory)
        blast_runner.makeblastdb(repetitive_fasta_file, 'nucl', logger)

        logger.info('Building complete plasmid database')
        blast_runner = BlastRunner(plasmid_database_fasta_file, database_directory)
        blast_runner.makeblastdb(plasmid_database_fasta_file, 'nucl', logger, True)

        logger.info('Sketching complete plasmid database')
        mObj = mash()
        mObj.mashsketch(plasmid_database_fasta_file, mash_db_file, num_threads=num_threads)
    except Exception as e:
        logger.error('Downloading databases failed, please check your internet connection and retry')
        logger.error("Process failed with error {}. Removing lock file".format(e))
        os.remove(lockfilepath)
        sys.exit(-1)

    try:
        logger.info("Init ete3 library ...")
        ete3taxadbpath = os.path.abspath(os.path.join(database_directory, "taxa.sqlite"))
        ncbi = NCBITaxa()
        ncbi.dbfile = ete3taxadbpath
        ncbi.update_taxonomy_database()
    except Exception as e:
        logger.error("Init of ete3 library failed with error {}. Removing lock file".format(e))
        os.remove(lockfilepath)
        sys.exit(-1)

    try:
        os.remove(os.path.join(os.getcwd(), "taxdump.tar.gz"))
        logger.info("Removed residual taxdump.tar.gz as ete3 is not doing proper cleaning job.")
    except OSError:
        # Nothing to clean up.
        pass

    with open(status_file, 'w') as f:
        download_date = datetime.datetime.today().strftime('%Y-%m-%d')
        f.write("Download date: {}. Removing lock file.".format(download_date))

    try:
        os.remove(lockfilepath)
    except OSError:
        logger.warning("Lock file is already removed by some other process.")

    logger.info("MOB init completed successfully")
    return 0
def get_taxa_info() -> NCBITaxa:
    """Return an ``NCBITaxa`` handle backed by ``DEFAULT_TAXADB``.

    ``TAXA_DUMP`` is passed as the taxdump file only when the database file
    does not exist yet; otherwise no dump file is given.
    """
    if os.path.exists(DEFAULT_TAXADB):
        dump_source = None
    else:
        dump_source = TAXA_DUMP
    return NCBITaxa(dbfile=DEFAULT_TAXADB, taxdump_file=dump_source)
def analysis():
    """Map reads against downloaded reference genomes and tabulate hits per taxon.

    Steps performed, in order:
      1. Resolve the reads file (a directory of ``*fastq`` files is first
         concatenated into one file) and collect the read ids.
      2. Download the NCBI assembly summary table and, for every assembly
         whose taxid descends from one of the requested species, download the
         genome and append it to a combined fasta database.
      3. Run the ``MINIMAP`` command template and parse the SAM output,
         keeping for each read the alignment with the best MD-tag identity.
      4. Count reads per ``<taxid>_<species>-<tribu>`` key, write
         ``<reads>.reads.txt`` and ``<reads>.txt`` and call ``plot_circ``.

    Relies on names that are not defined inside this function (``setting``,
    ``WGET``, ``WGET_GENOME``, ``MINIMAP``, ``types``, ``plot_circ``).
    """
    args = setting()
    cwd = args.workdir #os.getcwd()
    ncbi = NCBITaxa()
    home = str(Path.home())
    pathogens = args.pathogens_species.split(",")
    file_combined_fastq = os.path.join(os.getcwd(), args.fastq)
    # When the path is not a regular file it is treated as a directory whose
    # *fastq files are concatenated into "<parent>.fastq<dirname>".
    if not os.path.isfile(file_combined_fastq):
        fastq_files = [os.path.join(file_combined_fastq, f) for f in listdir(file_combined_fastq) if isfile(join(file_combined_fastq, f)) and f.endswith("fastq")]
        k = file_combined_fastq.rfind("/")
        file_combined_fastq = file_combined_fastq[:k] + ".fastq" + file_combined_fastq[k + 1:]
        with open(file_combined_fastq, 'wb') as wfd:
            for file in fastq_files:
                with open(file, 'rb') as fd:
                    shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)
    # Collect read ids; only their number is used further down.
    reads_fastq = []
    if file_combined_fastq.endswith("fastq") or file_combined_fastq.endswith("fq"):
        for record in SeqIO.parse(file_combined_fastq, "fastq"):
            reads_fastq.append(str(record.id))
    elif file_combined_fastq.endswith("fasta") or file_combined_fastq.endswith("fa"):
        for record in SeqIO.parse(file_combined_fastq, "fasta"):
            reads_fastq.append(str(record.id))
    else:
        print("Not known reads file format")
    number_reads = len(reads_fastq)
    # Build the list of species to look up from the host and pathogen options.
    if args.host_specie == "" and args.pathogens_species == "":
        species = ""
    elif args.host_specie == "" and not args.pathogens_species == "":
        species = pathogens
    elif not args.host_specie == "" and args.pathogens_species == "":
        species = [args.host_specie]
    else:
        species = [args.host_specie] + pathogens
    # NOTE(review): when both options are empty `species` is a str, which has
    # no .sort() -- confirm that case cannot reach this line.
    species.sort()
    name_database = "_".join(species).replace(" ", "_")
    genome_db = os.path.join(cwd, name_database + ".fasta")
    genome_db_id = os.path.join(cwd, name_database + ".txt")
    all_genomes = False
    # NOTE(review): table_file stays unbound when NCBIdatabase contains
    # neither "refseq" nor "assembly".
    if "refseq" in args.NCBIdatabase:
        table_file = "assembly_summary_refseq.txt"
    if "assembly" in args.NCBIdatabase:
        all_genomes = True
        table_file = "assembly_summary_genbank.txt"
    # Always fetch a fresh copy of the assembly summary table.
    if os.path.exists(os.path.join(cwd,table_file)):
        os.remove(os.path.join(cwd,table_file))
    cmd = WGET % table_file
    wget = sb.Popen(cmd, shell=True, stdout=sb.PIPE, stderr=sb.PIPE, cwd=cwd)
    wget.communicate()
    sys.stdout.write("### UPDATING THE DATABASE\n")
    # Refresh the ete3 taxonomy when the month of the cached file's mtime
    # differs from the current month (the file must already exist, otherwise
    # getmtime raises).
    ete = os.path.expanduser("~/.etetoolkit/taxa.sqlite.traverse.pkl")
    modified = os.path.getmtime(ete)
    modificationTime = time.strftime('%m', time.localtime(modified))
    today = datetime.date.today()
    month = today.strftime("%m")
    if modificationTime != month:
        ncbi.update_taxonomy_database()
    dict_species = {}
    # Collect every descendant taxid of the requested species, then pick the
    # assemblies of the summary table that belong to one of them.
    with open(os.path.join(cwd, table_file), "r") as fh:
        descendants_all = []
        for specie in species:
            name2taxid = ncbi.get_name_translator([specie])
            # NOTE(review): an empty host_specie is "in" every string, so
            # `plant` would then end up as the taxids of the last species.
            if args.host_specie in specie:
                plant = name2taxid[specie]
            for key in name2taxid[specie]:
                descendants = ncbi.get_descendant_taxa(key, collapse_subspecies=True)
                for sstaxa in descendants:
                    descendants_all.append(str(sstaxa))
        for line in fh:
            if not line.startswith("#"):
                # Column 6 is matched against the descendant taxids, column 7
                # supplies the two-word species name, column 19 the download URL.
                if line.split("\t")[6] in descendants_all:# and "subsp" in line:
                    ssname = " ".join([line.split("\t")[7].split(" ")[0], line.split("\t")[7].split(" ")[1]])
                    tax = line.split("\t")[6]
                    ftp = line.split("\t")[19]
                    genome = ftp.split("/")[-1] + "_genomic.fna.gz"
                    ftp_genome = os.path.join(ftp, genome)
                    path_genome = os.path.join(cwd, genome)
                    #species_assembly = " ".join([line.split("\t")[7]].split(" ")[0], [line.split("\t")[7]].split(" ")[1])
                    if ssname in dict_species:
                        dict_species[ssname] = dict_species[ssname] + [(ftp_genome, path_genome, tax, genome, ssname)]
                    else:
                        dict_species[ssname] = [(ftp_genome, path_genome, tax, genome, ssname)]
    db_file = os.path.join(home, ".db_monica." + name_database)
    # Either every assembly of each species, or only the last one listed.
    if all_genomes:
        print("DOWNLOADING MULTIPLE GENOMES FOR THE SAME SPECIES")
        genomes_select = [name for specie in dict_species for name in dict_species[specie]]
    else:
        print("DOWNLOADING ONE GENOME FOR SPECIES")
        genomes_select = [dict_species[specie][-1] for specie in dict_species]
    print("I WILL DOWNLOAD %s GENOMES" % str(len(genomes_select)))
    # (Re)build the combined fasta only when the marker file or the fasta is missing.
    if not os.path.exists(db_file) or not os.path.exists(genome_db):
        with open(genome_db, "w") as output_handle, open(genome_db_id, "w") as output_handle_id:
            with open(db_file, "w") as fh:
                for names in genomes_select:
                    ftp_genome, path_genome, tax, genome, ssname = names
                    if genome.startswith("GC"):
                        genome_used = cwd + genome + "\n"
                        fh.write(genome_used)
                        if not os.path.exists(path_genome):
                            cmd = WGET_GENOME % ftp_genome
                            wget_gen = sb.Popen(cmd, shell=True, stdout=sb.PIPE, stderr=sb.PIPE, cwd=cwd)
                            wget_gen.communicate()
                        with gzip.open(path_genome, "rt") as handle:
                            print("PARSING " + genome + " GENOME")
                            for record in SeqIO.parse(handle, "fasta"):
                                # Prefix each sequence id with its taxid so the
                                # taxon can be recovered from the SAM reference name.
                                record.id = tax + "_" + str(record.id)
                                record.description = genome.split(".")[0]
                                SeqIO.write(record, output_handle, "fasta")
                                output_handle_id.write(str(record.name) + "%" + str(record.description) + "\n")
    sys.stdout.write("### PREPARING FOR MAPPING\n")
    # Maps the text before "%" to the whitespace-split text after it, as
    # written to genome_db_id above.
    genome_to_contig = {}
    with open(genome_db_id, "r") as fhtxt:
        for record in fhtxt: #txt SeqIO.parse(genome_db, "fasta"):
            line = record.split("%")
            genome_to_contig[line[0]] = line[1].rsplit()
    # Maps the accession without version to "<Genus_species>-<text after '=' in column 8>".
    genome_to_species= {}
    with open(os.path.join(cwd, table_file), "r") as fh:
        for line in fh:
            line = line.rstrip().split("\t")
            genome = line[0].split(".")[0]
            if len(line) > 9 and not line[0].startswith("#"):
                subspecies = line[7].split(" ")[:2]
                subspecie = "_".join(subspecies) #+ " " + line[8].split("=")[1:]
                tribu = "_".join(line[8].split("=")[1:])
                genome_to_species[genome] = subspecie + "-" + tribu
    sam_output = file_combined_fastq + ".sam"
    cmd = MINIMAP % (str(args.threads), genome_db, file_combined_fastq, sam_output)
    sys.stdout.write("RUNNING MINIMAP2\n")
    minimap = sb.Popen(cmd, shell=True, cwd=cwd)
    minimap.communicate()
    # reads_dict: read name -> (identity, reference name, read name) of the
    # best alignment seen so far. `count` tallies reads dropped because two
    # references with different taxid prefixes tied on identity.
    reads_dict = {}
    count = 0
    with open(sam_output) as fh:
        for sam in fh:
            if sam != "" and not sam.startswith("@"):
                fields = sam.split("\t")
                if not fields[2] == "*":
                    for entry in fields:
                        if entry.startswith("MD"):
                            # Identity from the MD tag: digits are summed as
                            # matches, every uppercase letter counts as a mismatch.
                            md = entry.split(":")[-1]
                            mismatch = len(re.findall("[A-Z]", md))
                            match = sum([int(number) for number in re.sub('[A-Z]|\^', ',', md).split(",") if number != "" and number.isdigit()])
                            if match > 0:
                                # Both branches below apply the same update
                                # rule; they differ only in how iden is set.
                                if mismatch > 0:
                                    iden = (match - mismatch) / match * 100
                                    if fields[0] in reads_dict:
                                        if iden == reads_dict[fields[0]][0]:
                                            if reads_dict[fields[0]][1].startswith(fields[2].split("_")[0]):
                                                continue
                                            else:
                                                count += 1
                                                reads_dict.pop(fields[0], None)
                                        elif iden > reads_dict[fields[0]][0]:
                                            reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                    else:
                                        reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                else:
                                    iden = 100
                                    if fields[0] in reads_dict:
                                        if iden == reads_dict[fields[0]][0]:
                                            if reads_dict[fields[0]][1].startswith(fields[2].split("_")[0]):
                                                continue
                                            else:
                                                count += 1
                                                reads_dict.pop(fields[0], None)
                                        elif iden > reads_dict[fields[0]][0]:
                                            reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                    else:
                                        reads_dict[fields[0]] = (iden, fields[2], fields[0])
    # One "<reference>\t<read>" line per retained read.
    out_file = file_combined_fastq + ".reads.txt"
    with open(out_file, "w") as csv:
        for key in reads_dict:
            csv.write("\t".join([reads_dict[key][1], reads_dict[key][2]]) + " \n")
    print(count)
    # `count` is reused from here on as a dict: unique taxon key -> read count.
    count = {}
    number_reads_mapped = 0
    for read in reads_dict:
        match = reads_dict[read][1].split("_")
        if len(match) > 1:
            number_reads_mapped += 1
            if all_genomes:
                contig = match[1] #+ "_" + match[2]
            else:
                contig = match[1] + "_" + match[2]
            genome_map = genome_to_contig[contig]
            species_ss = genome_to_species[genome_map[0]]
            uniq_name = match[0] + "_" + species_ss
            if not uniq_name in count:
                count[uniq_name] = 1
            else:
                count[uniq_name] = count[uniq_name] + 1
    print("Name sample: " + file_combined_fastq)
    print("Number reads:" + str(number_reads))
    # NOTE(review): divides by number_reads, which is 0 for an unrecognised
    # or empty reads file.
    print("Number reads mapped:" + str(number_reads_mapped) + "\nPercentage of reads mapped:" + str( number_reads_mapped/number_reads * 100) + " %\n")
    # First two output rows: one header cell per entry of `types` plus "A",
    # and a row that carries only the number of unmapped reads.
    header = []
    reads_mapped = []
    partial_tree = []
    for clade in types:
        header.append(clade[0])
        reads_mapped.append("")
    header.append("A")
    reads_mapped.append(str(number_reads-number_reads_mapped))
    total = [header] + [reads_mapped]
    tribu_dict = {}
    sorted_list = []
    for value in count:
        key = value.split("_")[0]
        # Skip keys whose taxid starts with the first host taxid.
        # NOTE(review): `plant` is only bound inside the species loop above.
        if not str(key).startswith(str(plant[0])):
            # NOTE(review): value is a str, so value[1] is its second
            # character; sorted_list is not read again in this function.
            sorted_list.append((value[1],(count[value]/number_reads_mapped*100)))
            lineage = ncbi.get_lineage(int(key))
            a = ncbi.get_rank(lineage)
            tribu = value.split("-")[1]
            tribu_dict["tribu"] = tribu
            # One cell per entry of `types`: match[1] followed by the tribu
            # (when match[0] is a key of tribu_dict) or by the lineage name
            # whose rank both starts and ends match[0].
            tree = []
            for match in types:
                combination = [match[1]]
                if match[0] in tribu_dict:
                    combination.append("".join([tribu_dict[match[0]]]))
                else:
                    for tax in a:
                        if match[0].startswith(a[tax]) and match[0].endswith(a[tax]):
                            combination.append(ncbi.get_taxid_translator([int(tax)])[tax].replace(" ","_"))
                tree.append("".join(combination))
            tree.append(str(count[value]))
            partial_tree = partial_tree + [tree]
    partial_tree.sort()
    total = total + partial_tree
    # Comma-separated table handed to plot_circ.
    out_file = file_combined_fastq + ".txt"
    with open(out_file, "w") as csv:
        for line in total:
            csv.write(",".join(line) + " \n")
    plot_circ(out_file, file_combined_fastq)
    print("done")
def build_tree(seqs, taxa2acc, red_factor, root, log):
    """Build an NCBI taxonomy tree for the sequences in ``seqs``.

    The ETE taxonomy DB should be up to date before calling this; ETE only
    downloads it automatically when it is missing altogether.

    Args:
        seqs: Mapping accession -> dict with the keys ``taxid``, ``fn``,
            ``seqlen`` and ``offset`` (entries without ``taxid`` are skipped).
        taxa2acc: Mapping taxid -> iterable of accessions.
        red_factor: When truthy, keep only every ``red_factor``-th leaf.
        root: When truthy, keep only leaves having ``root`` in their
            ``named_lineage``.
        log: Optional writable stream for diagnostics.

    Returns:
        Tuple ``(tree, seq_count, node_count)``.
    """
    ncbi = NCBITaxa()

    taxa = []
    for record in seqs.values():
        try:
            taxa.append(record['taxid'])
        except KeyError:
            continue

    # Retry until every remaining taxid is known to the ETE DB; each unknown
    # taxid reported through KeyError is dropped from the list.
    while True:
        try:
            t = ncbi.get_topology(taxa, intermediate_nodes=True)
        except KeyError as e:
            taxid_not_found = int(e.args[0])
            taxa.remove(taxid_not_found)
            if log:
                print(
                    '[prophyle_ncbi_tree] ERROR: TaxID ' + str(taxid_not_found) +
                    ' not found in ETE DB (try updating it)', file=log
                )
        else:
            break

    # [Issue #153] Ignore internal nodes with fasta associated till we find a solution for it
    if log:
        internal_with_fasta = sum(
            len([acc for acc in taxa2acc[node.taxid] if acc in seqs.keys()])
            for node in t.traverse('postorder')
            if not node.is_leaf() and node.taxid in taxa
        )
        print(
            '[prophyle_ncbi_tree] ' + str(internal_with_fasta) + ' sequences' +
            ' ignored because associated to internal node (see issue #153)', file=log
        )

    leaves_taxa = [leaf.taxid for leaf in t if leaf.taxid in taxa2acc]
    t = ncbi.get_topology(leaves_taxa, intermediate_nodes=True)

    if red_factor:
        red_taxa = [leaf.taxid for position, leaf in enumerate(t) if position % red_factor == 0]
        t = ncbi.get_topology(red_taxa, intermediate_nodes=True)

    if root:
        taxa_to_keep = [leaf.taxid for leaf in t if root in leaf.named_lineage]
        t = ncbi.get_topology(taxa_to_keep, intermediate_nodes=True)

    node_count = len(t.get_descendants()) + 1
    seq_count = 0

    for node in t.traverse('postorder'):
        node.name = node.taxid
        if not node.is_leaf():
            continue
        first = True
        for acc in taxa2acc[node.taxid]:
            try:
                s = seqs[acc]
                # One accession copy per '@'-separated offset of the record.
                copies = s['offset'].count('@') + 1
                if first:
                    accession = '@'.join([acc] * copies)
                    path = s['fn']
                    base_len = s['seqlen']
                    infasta_offset = s['offset']
                    first = False
                else:
                    accession += ('@' + acc) * copies
                    path += '@' + s['fn']
                    base_len += '@' + s['seqlen']
                    infasta_offset += '@' + s['offset']
                seq_count += 1
            except KeyError:
                continue
        node.add_features(path=path, base_len=base_len, infasta_offset=infasta_offset, accession=accession)

    if not hasattr(t, 'taxid'):
        t.add_features(taxid=0)
    t.name = t.taxid
    return t, seq_count, node_count
def plot_phylum_counts(domain_id, rank='phylum', colapse_low_species_counts=4, remove_unlassified=True):
    '''
    Build an ete3 tree and tree style showing, per leaf taxon at ``rank``, the
    number of genomes, the number of genomes with ``domain_id`` and the
    resulting percentage.

    1. get the stored tree for ``rank`` (table pfam.phylogeny)
    2. read genome counts per leaf (table pfam.leaf2n_genomes_<rank>)
    3. read counts with the domain (get_domain_taxonomy)
    4. decorate each kept leaf with text faces and a stacked bar

    :param domain_id: identifier shown in the header and passed to
        get_domain_taxonomy
    :param rank: suffix of the pfam tables that are queried
    :param colapse_low_species_counts: leaves with at most this many genomes
        are pruned
    :param remove_unlassified: when true, leaves whose label contains
        "unclassified" are pruned as well
    :return: tuple (tree, tss) of the decorated Tree and its TreeStyle

    # merge eukaryotes into 5 main clades
    # merge virus as a single clade
    ATTENTION: no-rank groups and no-rank species...
    '''
    import MySQLdb
    import os
    from chlamdb.biosqldb import manipulate_biosqldb
    from ete3 import NCBITaxa, Tree, TextFace, TreeStyle, StackedBarFace
    ncbi = NCBITaxa()
    # The password is read from the environment; KeyError if SQLPSW is unset.
    sqlpsw = os.environ['SQLPSW']
    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="interpro")  # name of the data base
    cursor = conn.cursor()
    # NOTE(review): `rank` is interpolated into SQL text; it must be trusted.
    sql = 'select * from pfam.leaf2n_genomes_%s' % rank
    cursor.execute(sql, )
    leaf_taxon2n_species = manipulate_biosqldb.to_dict(cursor.fetchall())
    leaf_taxon2n_species_with_domain = get_domain_taxonomy(domain_id, rank)
    sql = 'select phylogeny from pfam.phylogeny where rank="%s"' % (rank)
    cursor.execute(sql, )
    tree = Tree(cursor.fetchall()[0][0], format=1)
    sql = 'select * from pfam.taxid2label_%s' % rank
    cursor.execute(sql, )
    taxon_id2scientific_name_and_rank = manipulate_biosqldb.to_dict(
        cursor.fetchall())
    # Re-key by str so lookups with str(node.name) work.
    taxon_id2scientific_name_and_rank = {
        str(k): v
        for k, v in taxon_id2scientific_name_and_rank.items()
    }
    tss = TreeStyle()
    tss.draw_guiding_lines = True
    tss.guiding_lines_color = "blue"
    # Select the leaves to keep: optionally drop "unclassified" labels, and
    # drop leaves with too few genomes.
    keep = []
    for lf in tree.iter_leaves():
        # n genomes
        if remove_unlassified:
            label = taxon_id2scientific_name_and_rank[str(lf.name)][0]
            if 'unclassified' in label:
                continue
        n_genomes = int(leaf_taxon2n_species[lf.name])
        if n_genomes > colapse_low_species_counts:
            keep.append(lf.name)
    print('number of leaves:', len(keep))
    tree.prune(keep)
    # Column headers. NOTE(review): headers are added at columns 0-3 while
    # the aligned leaf faces below use columns 1-5 -- confirm the alignment.
    header_list = ['Rank', 'N genomes', 'N with %s' % domain_id, 'Percentage']
    for col, header in enumerate(header_list):
        n = TextFace('%s' % (header))
        n.margin_top = 0
        n.margin_right = 1
        n.margin_left = 20
        n.margin_bottom = 1
        n.rotation = 270
        n.hz_align = 2
        n.vt_align = 2
        n.inner_background.color = "white"
        n.opacity = 1.
        tss.aligned_header.add_face(n, col)
    for lf in tree.iter_leaves():
        # n genomes
        n_genomes = int(leaf_taxon2n_species[lf.name])
        if n_genomes <= colapse_low_species_counts:
            continue
        n = TextFace(' %s ' % str(leaf_taxon2n_species[lf.name]))
        n.margin_top = 1
        n.margin_right = 1
        n.margin_left = 0
        n.margin_bottom = 1
        n.fsize = 7
        n.inner_background.color = "white"
        n.opacity = 1.
        lf.add_face(n, 2, position="aligned")
        # n genomes with domain
        m = TextFace(' %s ' % str(leaf_taxon2n_species_with_domain[lf.name]))
        m.margin_top = 1
        m.margin_right = 1
        m.margin_left = 0
        m.margin_bottom = 1
        m.fsize = 7
        m.inner_background.color = "white"
        m.opacity = 1.
        lf.add_face(m, 3, position="aligned")
        # rank
        ranks = ncbi.get_rank([lf.name])
        try:
            r = ranks[max(ranks.keys())]
        except:
            # Falls back to '-' when the lookup above raises.
            r = '-'
        # fsize=14 passed here is overridden by n.fsize = 7 below.
        n = TextFace(' %s ' % r, fsize=14, fgcolor='red')
        n.margin_top = 1
        n.margin_right = 1
        n.margin_left = 0
        n.margin_bottom = 1
        n.fsize = 7
        n.inner_background.color = "white"
        n.opacity = 1.
        lf.add_face(n, 1, position="aligned")
        # percent with target domain
        percentage = (float(leaf_taxon2n_species_with_domain[lf.name]) /
                      float(leaf_taxon2n_species[lf.name])) * 100
        m = TextFace(' %s ' % str(round(percentage, 2)))
        # This first fsize value is overwritten by m.fsize = 7 below.
        m.fsize = 1
        m.margin_top = 1
        m.margin_right = 1
        m.margin_left = 0
        m.margin_bottom = 1
        m.fsize = 7
        m.inner_background.color = "white"
        m.opacity = 1.
        lf.add_face(m, 4, position="aligned")
        b = StackedBarFace([percentage, 100 - percentage],
                           width=100,
                           height=10,
                           colors=["#7fc97f", "white"])
        b.rotation = 0
        b.inner_border.color = "grey"
        b.inner_border.width = 0
        b.margin_right = 15
        b.margin_left = 0
        lf.add_face(b, 5, position="aligned")
        # Scientific-name face; built before lf.name is rewritten because the
        # lookup key is the original leaf name.
        n = TextFace('%s' % taxon_id2scientific_name_and_rank[str(lf.name)][0],
                     fgcolor="black",
                     fsize=9)  # , fstyle = 'italic'
        lf.name = " %s (%s)" % (taxon_id2scientific_name_and_rank[str(
            lf.name)][0], str(lf.name))
        n.margin_right = 10
        lf.add_face(n, 0)
    # Leaf names are drawn through the faces above instead.
    tss.show_leaf_name = False
    # Label selected higher-rank nodes on top of their branch.
    for node in tree.traverse("postorder"):
        try:
            r = taxon_id2scientific_name_and_rank[str(node.name)][1]
        except:
            # On failure `r` keeps whatever value it had before this node.
            pass
        try:
            if r in ['phylum', 'superkingdom', 'class', 'subphylum'
                     ] or taxon_id2scientific_name_and_rank[str(
                         node.name)][0] in ['FCB group']:
                hola = TextFace(
                    "%s" %
                    (taxon_id2scientific_name_and_rank[str(node.name)][0]))
                node.add_face(hola, column=0, position="branch-top")
        except:
            pass
    return tree, tss
from ete3 import NCBITaxa

# Module-level NCBITaxa handle, created once when this module is imported.
ncbiTaxa = NCBITaxa()
def get_rank_summary_statistics(rank='phylum'):
    '''
    Store a rank-limited NCBI phylogeny and per-leaf genome counts in MySQL.

    The taxids listed in pfam.refseq_ref_repres_genomes are turned into an
    NCBI topology limited to ``rank``.  The function then
      * stores taxid, scientific name and rank of every tree node in
        pfam.taxid2label_<rank>,
      * detaches the children of a fixed list of clades so that these clades
        become leaves,
      * stores, for every leaf, the number of listed genomes having that leaf
        in their lineage in pfam.leaf2n_genomes_<rank>,
      * stores the newick string of the tree in pfam.phylogeny.

    The MySQL password is read from the environment variable SQLPSW.

    :param rank: taxonomic rank limiting the topology; it is also used as the
        suffix of the table names, so it must be a trusted identifier
    :return: None
    '''
    import MySQLdb
    import os
    from ete3 import NCBITaxa

    ncbi = NCBITaxa()
    sqlpsw = os.environ['SQLPSW']
    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="pfam")  # name of the data base
    try:
        cursor = conn.cursor()

        # Table names cannot be bound as parameters, so `rank` is formatted
        # into the statement text; all values are passed as parameters.
        cursor.execute('create table if not exists pfam.phylogeny (rank varchar(400), phylogeny TEXT)')
        conn.commit()
        cursor.execute('CREATE table if not exists pfam.leaf2n_genomes_%s(taxon_id INT, n_genomes INT)' % rank)
        conn.commit()

        cursor.execute('select taxid from pfam.refseq_ref_repres_genomes')
        taxid_list = [row[0] for row in cursor.fetchall()]

        tree = ncbi.get_topology(taxid_list, rank_limit=rank)
        taxon_id_list = [int(node.name) for node in tree.traverse("postorder")]
        taxon_id2scientific_name = ncbi.get_taxid_translator(taxon_id_list)

        cursor.execute(
            'CREATE table if not exists pfam.taxid2label_%s(taxon_id INT, scientific_name TEXT, rank TEXT)' % rank)

        taxon_id2rank = {}
        for taxon in taxon_id2scientific_name:
            ranks = ncbi.get_rank([taxon])
            # '-' marks taxa for which no rank is returned.
            taxon_id2rank[taxon] = ranks[max(ranks.keys())] if ranks else '-'

        insert_label = 'insert into taxid2label_{} values(%s, %s, %s)'.format(rank)
        for taxon in taxon_id2scientific_name:
            # Parameter binding keeps names containing quotes intact.
            cursor.execute(
                insert_label,
                (taxon, taxon_id2scientific_name[taxon], taxon_id2rank[taxon]))
        conn.commit()

        collapse = [
            'Opisthokonta', 'Alveolata', 'Amoebozoa', 'Stramenopiles',
            'Viridiplantae', 'Rhodophyta', 'Trypanosomatidae', 'Viruses',
            'unclassified Bacteria', 'Leptospiraceae',
            'unclassified Gammaproteobacteria',
            'unclassified Alphaproteobacteria',
            'unclassified Epsilonproteobacteria',
            'unclassified Deltaproteobacteria',
            'unclassified Cyanobacteria (miscellaneous)',
            'unclassified Firmicutes sensu stricto',
            'unclassified Actinobacteria (class) (miscellaneous)',
            'unclassified Tissierellia', 'Dehalogenimonas'
        ]

        # Collapse the listed clades by detaching all of their children.
        for node in tree.traverse("postorder"):
            name = taxon_id2scientific_name[int(node.name)]
            to_detach = []
            if name in collapse:
                to_detach.extend(node.children)
                print('ok-------------------', node.name)
            for child in to_detach:
                child.detach()

        leaves_list = [leaf.name for leaf in tree.iter_leaves()]

        # Look up every genome lineage once instead of once per leaf.
        lineages = [ncbi.get_lineage(taxon) for taxon in taxid_list]

        leaf_taxon2n_species = {}
        for leaf_taxon in leaves_list:
            print('leaf', leaf_taxon)
            leaf_id = int(leaf_taxon)
            leaf_taxon2n_species[leaf_taxon] = sum(
                1 for lineage in lineages if leaf_id in lineage)

        insert_count = 'insert into pfam.leaf2n_genomes_{} values(%s, %s)'.format(rank)
        for leaf_taxon, n_genomes in leaf_taxon2n_species.items():
            cursor.execute(insert_count, (int(leaf_taxon), n_genomes))
        conn.commit()

        cursor.execute('insert into pfam.phylogeny values(%s, %s)',
                       (rank, tree.write(format=1)))
        conn.commit()
    finally:
        # Release the connection even when a statement fails.
        conn.close()
import os
import sys
from ete3 import NCBITaxa

# Taxonomy handle; it is not used by the statements visible below.
ncbi = NCBITaxa()
# Example usage kept for reference:
# lineage = ncbi.get_lineage(9606)
# print (ncbi.get_rank(lineage))
# print (ncbi.get_taxid_translator(lineage))

# First argument: a file with one value per line; the stripped lines become
# the keys of acc_list (the value 1 is only a presence marker).
acc_list = {}
with open(sys.argv[1]) as f:
    for line in f:
        val = line.strip()
        acc_list[val] = 1

# Rank names of interest, used as a set-like lookup table.
levels = {
    'superkingdom': 1,
    'phylum': 1,
    'class': 1,
    'order': 1,
    'family': 1,
    'genus': 1,
    'species': 1
}

# Second argument: a tab-separated file; lines starting with '#' are skipped.
with open(sys.argv[2]) as f:
    for line in f:
        if line.startswith('#'):
            continue
        # NOTE(review): the split fields are not used further in the visible
        # code -- the remainder of the loop body appears to be missing.
        val = line.strip().split('\t')
def __init__(self, category):
    """Collect the species-rank taxa below ``category``.

    Args:
        category: Taxon passed to ``NCBITaxa.get_descendant_taxa``.

    Attributes set:
        ncbi: The ``NCBITaxa`` handle used for the lookups.
        species: List of descendant taxa of ``category`` (subspecies
            collapsed).
        ranks: Rank mapping for ``species`` as returned by ``get_rank``.
        taxas: List of the entries of ``species`` whose rank is 'species'.
    """
    self.ncbi = NCBITaxa()
    self.species = list(
        self.ncbi.get_descendant_taxa(category, collapse_subspecies=True))
    self.ranks = self.ncbi.get_rank(self.species)
    # A list rather than a lazy filter object: on Python 3 a filter iterator
    # is exhausted after a single pass, leaving the attribute empty afterwards.
    self.taxas = [
        taxid for taxid in self.species if self.ranks[taxid] == 'species'
    ]