def get_parent_taxa(self):
    """ Get parent taxa

    Returns:
        :obj:`list` of :obj:`Taxon`: list of parent taxa
    """
    if self.id_of_nearest_ncbi_taxon is None:
        return None

    cls = self.__class__
    ncbi_taxa = NCBITaxa()
    lineage = [cls(ncbi_id=id)
               for id in ncbi_taxa.get_lineage(self.id_of_nearest_ncbi_taxon)]

    if self.additional_name_beyond_nearest_ncbi_taxon:
        base_name = ncbi_taxa.translate_to_names([self.id_of_nearest_ncbi_taxon])[0]
        names = self.additional_name_beyond_nearest_ncbi_taxon[1:].split(' ')
        for i_rank in range(len(names)):
            # accumulate one additional name component per synthetic rank
            lineage.append(cls(name=base_name + ''.join(' ' + n for n in names[0:i_rank + 1])))

    # exclude the taxon itself; keep only its ancestors
    return lineage[0:-1]
def main():
    """Make queries against NCBI Taxa databases"""

    # Get command-line args
    args = get_args()

    # Instantiate the ete3 NCBI taxa object
    ncbi = NCBITaxa()
    if args.verbose > 1:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    # Update the database if required
    if args.update is True:
        if args.verbose > 1:
            print("Updating the taxonomy database. This may take several minutes...")
        ncbi.update_taxonomy_database()

    # If a name was provided instead of a TaxID, convert and store it
    if args.name:
        args.taxid = ncbi.get_name_translator([args.name])[args.name][0]

    if args.verbose > 0:
        tax_dict = {}
        # If a name was provided, simply add it to the dict
        if args.name:
            tax_dict['Name'] = args.name
        # If not, do the opposite conversion to the above and store that
        else:
            tax_dict['Name'] = ncbi.get_taxid_translator([args.taxid])[args.taxid]

        # Continue to populate the taxa dict with other information
        tax_dict['TaxID'] = args.taxid
        tax_dict['Rank'] = ncbi.get_rank([args.taxid])
        tax_dict['Lineage'] = ncbi.get_taxid_translator(ncbi.get_lineage(args.taxid))

        print("Information about your selected taxa:")
        pretty(tax_dict)

    # Main feature of the script is to get all taxa within a given group
    descendent_taxa = ncbi.get_descendant_taxa(args.taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)
    print("Descendant taxa for TaxID: %s" % (args.taxid))

    # Under Python 3, zip is lazy (like Python 2's izip); under Python 2 this
    # list could be very large and memory-intensive, so run with Python 3
    if args.verbose > 0:
        for dtn, dt in zip(descendent_taxa_names, descendent_taxa):
            print("%s\t%s" % (dtn, dt))

    if args.outfile:
        with open(args.outfile, 'w') as ofh:
            for id in descendent_taxa:
                ofh.write(str(id) + '\n')
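# Hedged usage sketch for the script above. get_args() is not shown, so the
# flag spellings below are assumptions inferred only from the attributes the
# script reads (update, name, taxid, verbose, outfile); the script name is a
# placeholder:
#
#   python query_taxa.py --name "Homo sapiens" --verbose 1 --outfile ids.txt
#   python query_taxa.py --taxid 9606 --update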
def get_taxonid_to_name(tree):
    ncbi = NCBITaxa()
    taxonid_to_name = {}
    for node in tree.traverse():
        taxonid = int(node.name)
        taxonid_to_name[taxonid] = ncbi.translate_to_names([taxonid])[0]
    return taxonid_to_name
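# Minimal usage sketch for get_taxonid_to_name (assumes every node name in
# the tree, including internal nodes, is a numeric NCBI taxid; the newick
# string and ids below are illustrative):
from ete3 import Tree

example = Tree("(9606,9598)9604;", format=8)  # format 8 reads all node names
print(get_taxonid_to_name(example))
# e.g. {9604: 'Hominidae', 9606: 'Homo sapiens', 9598: 'Pan troglodytes'}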
def __init__(self, id='', name='', ncbi_id=None, cross_references=None):
    """
    Args:
        id (:obj:`str`, optional): identifier
        name (:obj:`str`, optional): name
        ncbi_id (:obj:`int`, optional): NCBI identifier
        cross_references (:obj:`list` of :obj:`CrossReference`, optional): list of cross references
    """
    self.id = id
    self.name = name
    self.id_of_nearest_ncbi_taxon = None
    self.distance_from_nearest_ncbi_taxon = None
    self.additional_name_beyond_nearest_ncbi_taxon = None
    self.cross_references = cross_references or []

    ncbi_taxa = NCBITaxa()

    if ncbi_id:
        self.id_of_nearest_ncbi_taxon = ncbi_id
        self.distance_from_nearest_ncbi_taxon = 0
        self.additional_name_beyond_nearest_ncbi_taxon = ''
        self.name = ncbi_taxa.translate_to_names([ncbi_id])[0]
        # ete3 returns the input id itself when the taxon is unknown
        if self.name == ncbi_id:
            raise ValueError(
                'The NCBI taxonomy database does not contain a taxon with id {}'.format(ncbi_id))
    else:
        rank_names = name.split(' ')
        for i_rank in range(len(rank_names)):
            # try progressively shorter prefixes of the name until one is
            # found in the NCBI taxonomy
            partial_name = ' '.join(rank_names[0:len(rank_names) - i_rank])
            result = ncbi_taxa.get_name_translator([partial_name])
            if result:
                self.id_of_nearest_ncbi_taxon = result[partial_name][0]
                self.distance_from_nearest_ncbi_taxon = i_rank
                self.additional_name_beyond_nearest_ncbi_taxon = ''.join(
                    ' ' + n for n in rank_names[len(rank_names) - i_rank:])
                self.name = ncbi_taxa.translate_to_names([self.id_of_nearest_ncbi_taxon])[0] \
                    + self.additional_name_beyond_nearest_ncbi_taxon
                return
        self.name = name
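# Hypothetical usage of the Taxon class sketched above, together with the
# get_parent_taxa method shown earlier in this section (the name and the
# expected values are illustrative; requires the ete3 taxonomy database):
taxon = Taxon(name='Escherichia coli')
print(taxon.id_of_nearest_ncbi_taxon)          # 562
print(taxon.distance_from_nearest_ncbi_taxon)  # 0
for parent in taxon.get_parent_taxa():
    print(parent.name)                         # root ... Bacteria ... Escherichia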
def main(InputMSA, output):
    ncbi = NCBITaxa()
    #ncbi.update_taxonomy_database()
    headers, seqs = readAlg(InputMSA)
    sys.stdout.write("Annotating headers for %d sequences..." % len(headers))

    # open the output once so earlier records are not truncated
    with open(output, 'w') as output_fasta:
        for i in range(len(headers)):
            head_terms = read_header(headers[i])
            lin = ncbi.get_lineage(head_terms["taxid"])
            #sp_name = ncbi.translate_to_names([tid])
            lin_name = ncbi.translate_to_names(lin)
            output_fasta.write(">%s|%s|%s\n%s\n" % (head_terms["header"], lin_name[-1],
                                                    ", ".join(lin_name[1:]), seqs[i]))
    sys.stdout.write("Done\n")
def main():
    """main"""
    args = get_args()
    infile = args.infile
    email = args.email
    out_file = args.outfile

    Entrez.email = email
    with open(infile, 'r') as in_f:
        acc_list = in_f.read().splitlines()
    acc = ','.join(acc_list)
    handle = Entrez.esummary(db="nuccore", id=acc)
    records = Entrez.read(handle)
    handle.close()

    ncbi = NCBITaxa()
    with open(out_file, 'w') as out_f:
        for record in records:
            acc_version = record["AccessionVersion"]
            tax_id = record["TaxId"]
            lineage = ncbi.get_lineage(tax_id)
            name = ncbi.translate_to_names(lineage)
            print("{}\t{}".format(acc_version, '\t'.join(name)), file=out_f)
def run(args):
    # add lineage profiles/stats
    import re
    from ete3 import PhyloTree, NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    queries = []
    name2realname = {}
    name2score = {}

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        queries.append(n)
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    all_taxids.update([(v, None) for v in list(name2tax.values())])

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" % ','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            target_taxid = next(iter(all_taxids.keys()))
            log.info("Dumping NCBI descendants tree for %s" % (target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid,
                                         collapse_subspecies=args.collapse_subspecies,
                                         rank_limit=args.rank_limit,
                                         return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank",
                                "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid,
                                                   collapse_subspecies=args.collapse_subspecies,
                                                   rank_limit=args.rank_limit)
            print('\t'.join([str(taxid),
                             translator.get(taxid, str(taxid)),
                             ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))
    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank",
                                "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in six.iteritems(translator):
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''),
                             named_lineage, lineage_string]))
class TaxIDExpander(object):
    def __init__(self, taxdump_filename: str = None, taxdb_filename: str = None) -> None:
        """Constructor for TaxIDExpander

        Args:
            taxdump_filename (str): if specified, refers to a local copy of the NCBI taxdump.tar.gz file
            taxdb_filename (str): if specified, will be used to look for a db containing the NCBI database to load.
                If both taxdump_filename and taxdb_filename are set, save to taxdb_filename.
        """
        if taxdump_filename is not None:
            taxdump_path = Path(taxdump_filename)
            if not (taxdump_path.exists() and taxdump_path.is_file()):
                raise ValueError(f'{taxdump_filename} must be a readable file')
            if taxdb_filename is not None:
                # we have both a taxdump file and a taxdb file:
                # load from the taxdump file and save to the taxdb file
                self.ncbi = NCBITaxa(taxdump_file=taxdump_filename, dbfile=taxdb_filename)
            else:
                # we have a taxdump file and no taxdb file:
                # load from the taxdump file and let ete3 save to its default location
                self.ncbi = NCBITaxa(taxdump_file=taxdump_filename)
        else:
            if taxdb_filename is not None:
                # we have a taxdb file and no taxdump file:
                # load the database from the taxdb file
                taxdb_path = Path(taxdb_filename)
                if not (taxdb_path.exists() and taxdb_path.is_file()):
                    raise ValueError(f'{taxdb_filename} must be a readable file')
                self.ncbi = NCBITaxa(dbfile=taxdb_filename)
            else:
                # we have neither a taxdump file nor a taxdb file:
                # ete3 loads the database over the network (caching it locally)
                # and saves the taxdb to its default location
                self.ncbi = NCBITaxa()

    def get_lineage(self, taxid: str,
                    only_standard_ranks: bool = False) -> List[Tuple[str, str]]:
        """Return the lineage for a given taxonomy ID

        Raises ValueError if the taxonomy ID is not found.

        Args:
            taxid (str): NCBI taxonomy ID
            only_standard_ranks (bool): if True, only return the superkingdom, phylum,
                class, order, family, genus and species ranks

        Returns:
            list of tuples where the tuples have members (taxon rank, taxon name)
        """
        lineage_ids = self.ncbi.get_lineage(taxid)
        names = self.ncbi.get_taxid_translator(lineage_ids)
        ranks = self.ncbi.get_rank(lineage_ids)
        standard_ranks = set(['superkingdom', 'phylum', 'class', 'order',
                              'family', 'genus', 'species'])
        lineage = []
        for id in lineage_ids:
            rank = ranks[id]
            if only_standard_ranks and rank not in standard_ranks:
                continue
            lineage.append((rank, names[id]))
        return lineage

    def get_scientific_name(self, taxid: str) -> str:
        results = self.ncbi.translate_to_names([taxid])
        if not results:
            return 'UNKNOWN'
        return results[0]
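# A minimal sketch of how TaxIDExpander might be driven (taxid 9606 is just
# an example; with no arguments, the first call may download the NCBI
# taxonomy database to its default location):
expander = TaxIDExpander()
print(expander.get_scientific_name(9606))  # 'Homo sapiens'
for rank, name in expander.get_lineage(9606, only_standard_ranks=True):
    print(rank, name)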
from ete3 import NCBITaxa

ncbi = NCBITaxa()
descendants = ncbi.get_descendant_taxa('Salamandridae')
print(ncbi.translate_to_names(descendants))

descendants = ncbi.get_descendant_taxa('Salamandridae', collapse_subspecies=True)
print(ncbi.translate_to_names(descendants))

tree = ncbi.get_descendant_taxa('Salamandridae', collapse_subspecies=True, return_tree=True)
print(tree.get_ascii(attributes=['sci_name', 'taxid']))

# Expected first print (truncated):
# ['Notophthalmus viridescens', 'Notophthalmus perstriatus', 'Notophthalmus meridionalis kallerti',
#  'Notophthalmus meridionalis meridionalis', 'Pleurodeles waltl waltl', 'Pleurodeles poireti',
#  'Pleurodeles nebulosus', 'Taricha granulosa', 'Taricha rivularis', 'Taricha torosa torosa',
#  'Taricha torosa sierrae', 'Taricha sp. AMNH A168420', 'Triturus cristatus',
#  'Triturus karelinii arntzeni', 'Triturus karelinii karelinii', 'Triturus carnifex carnifex',
#  'Triturus dobrogicus dobrogicus', 'Triturus dobrogicus macrosomus', 'Triturus marmoratus marmoratus',
#  'Triturus pygmaeus', 'Triturus macedonicus', 'Triturus cristatus x Triturus dobrogicus macrosomus',
#  'Triturus cristatus s.l. AH-2007', "Triturus cf. karelinii 'eastern'", "Triturus cf. karelinii 'western'",
#  'Triturus ivanbureschi', 'Triturus anatolicus', 'Cynops pyrrhogaster', 'Cynops ensicauda',
#  'Cynops orientalis', 'Cynops cyanurus chuxiongensis', 'Cynops cyanurus cyanurus', 'Cynops orphicus',
#  'Cynops fudingensis', 'Cynops glaucus', 'Euproctus montanus', 'Euproctus platycephalus',
#  'Tylototriton taliangensis', 'Tylototriton verrucosus pulcherrima', 'Tylototriton shanjing',
#  'Tylototriton kweichowensis', 'Tylototriton sp. MH-2011', 'Tylototriton pseudoverrucosus',
#  'Tylototriton yangi', 'Tylototriton uyenoi', 'Tylototriton shanorum', 'Tylototriton anguliceps',
#  'Tylototriton daweishanensis', 'Tylototriton podichthys', 'Tylototriton himalayanus',
#  'Tylototriton ngarsuensis',
def main(in_file, out_file, in_type, out_filetype):
    ncbi = NCBITaxa()
    records = []
    record = {'taxid': None, 'gi': None, 'id': None}

    with open(in_file, 'r') as fh:
        queries = fh.read().splitlines()
    queries = [x.replace("_", " ") for x in queries if x]

    output = None
    if out_file is not None and out_filetype == 'JSON':
        # open once; re-opening with 'w' inside the loop would truncate
        # previously written records
        output = open(out_file, 'w')

    try:
        if in_type == "taxid":
            queries = list(ncbi.get_taxid_translator(queries).values())
        taxons = ncbi.get_name_translator(queries)
        # validate that at least the first query resolved
        record['taxid'] = taxons[queries[0]][0]
    except (KeyError, IndexError):
        print("Unable to read keys.")
        print("Are you using the right type ('-t') option?")
        print("Default is 'sciname', but 'taxid' is available if using NCBI taxonomy IDs.")
        exit(-1)

    count = 0
    for q in queries:
        # build a fresh record per query (reusing one dict would make every
        # entry in `records` point at the same object)
        record = {'taxid': taxons[q][0], 'gi': count, 'id': q, 'tax_path': []}
        count = count + 1
        lineage = ncbi.get_lineage(record['taxid'])
        # map each lineage taxid to its name so names stay aligned with ids
        lineage_names = dict(zip(lineage, ncbi.translate_to_names(lineage)))
        lineage_ranks = ncbi.get_rank(lineage)
        lineage.pop()    # drop the query taxon itself
        lineage.pop(0)   # drop the root node
        print(lineage_names)
        for l in lineage:
            tax_path_entry = {}
            tax_path_entry['taxid'] = l
            tax_path_entry['rank_name'] = lineage_names[l]
            tax_path_entry['rank'] = lineage_ranks[l]
            lin = ncbi.get_lineage(l)
            tax_path_entry['parent_taxid'] = lin[-2]
            record['tax_path'].append(tax_path_entry)
        if out_file is not None:
            if out_filetype == 'JSON':
                output.write(json.dumps(record) + '\n')
        else:
            sys.stdout.write(json.dumps(record) + '\n')
        records.append(record)

    if out_filetype == 'CSV':
        with open(out_file, 'w') as csvfile:
            fieldnames = ['ID', 'GI', 'TAXID']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for record in records:
                writer.writerow({'ID': record['id'],
                                 'GI': record['gi'],
                                 'TAXID': record['taxid']})
if refaa != '-' and frac > 0.66:
    valid_cols += 1
    variants = a[a[col] != refaa][col].to_dict()
    refaas.append(refaa)
    for sp, var in variants.items():
        sp = sp.split(".")[0]
        spvariants[sp].update([(refaa, var)])
#if valid_cols > 500:
#    break

refaacounter = Counter(refaas)
for sp, varcounter in spvariants.items():
    try:
        sp_name = ncbi.translate_to_names([int(sp.split(".")[0])])[0]
    except ValueError:
        sp_name = "oxymonad-%s" % sp
    for varc in varcounter:
        ratio = varcounter[varc] / float(refaacounter[varc[0]])
        if ratio > 0.25:
            print(sp, sp_name, "%s\t%s/%s" % ("->".join(varc), varcounter[varc],
                                              refaacounter[varc[0]]))
#     #print(varcounter.most_common(1)[0], varcounter[('W', 'X')])
#     most_common = varcounter.most_common(1)[0]
#     ratio = (most_common[1] / float(refaacounter[most_common[0][0]]))
#     #if varcounter.most_common(1)[0][1] > 10 and "-" not in varcounter.most_common(1)[0][0]:
#     if ratio > 0.33:  # and "-" not in most_common[0]:
#         print(sp, ncbi.translate_to_names([int(sp.split(".")[0])])[0])
#         for varc in varcounter:
#             #if varcounter[varc] > 2:  #/float(refaacounter[varc[0]]) > 0.2:
class preprocess():
    def __init__(self, organism, input, name, outdir, reference, paired,
                 input2, log, verbose, map, outlier, trim, kraken, db,
                 taxon_id, n_results):
        self.organism = organism
        self.input = input
        self.name = name
        self.outdir = outdir
        self.reference = reference
        self.paired = paired
        self.input2 = input2
        self.log = log
        self.verbose = verbose
        self.outlier = outlier
        self.map = map
        self.trim = trim
        self.kraken = kraken
        self.db = db
        self.taxid = taxon_id
        self.n_results = n_results
        self.logger = logging.getLogger()
        self.outlier_file = open(outdir + '/outlier_list.txt', 'a')
        self.ncbi = NCBITaxa()
        self.descendants = set(
            self.ncbi.get_descendant_taxa(self.taxid, intermediate_nodes=True))

    """Shell execution"""

    def runCommand(self, command, directory, write_output):
        process = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   cwd=directory)
        out, err = process.communicate()
        if out:
            if write_output:
                return out
            self.logger.info("Standard output: \n" + out.decode('utf-8') + "\n")
        if err:
            self.logger.info("Standard error: \n" + err.decode('utf-8') + "\n")

    """Running Refseq_masher"""

    def refseq_masher(self):
        self.ifVerbose("Running Refseq_masher matching")
        self.runCommand(['refseq_masher', 'matches', '-o', self.name + '.match',
                         '--output-type', 'tab', self.input,
                         '-n', str(self.n_results + 5)],
                        os.path.join(self.outdir, 'mash'),
                        write_output=False)

    """Running Kraken"""

    def run_kraken(self):
        self.ifVerbose("Running Kraken")
        # local flag renamed from `gzip` to avoid shadowing the gzip module
        gzip_flag = ""
        if self.input[-3:] == ".gz":
            gzip_flag = "--gzip-compressed"
        if self.paired:
            self.runCommand(['kraken', '--db', self.db, '--paired',
                             '--output', self.name + '.kraken',
                             '--fastq-input', "%s" % gzip_flag,
                             self.input, self.input2],
                            os.path.join(self.outdir, 'kraken'),
                            write_output=False)
        else:
            self.runCommand(['kraken', '--db', self.db,
                             '--output', self.name + '.kraken',
                             '--fastq-input', "%s" % gzip_flag, self.input],
                            os.path.join(self.outdir, 'kraken'),
                            write_output=False)

    """Parse Kraken results"""

    def parse_kraken_results(self):
        self.ifVerbose("Parsing Kraken results")
        kraken = {}
        # Store the classification for each read
        with open(os.path.join(self.outdir, 'kraken/%s.kraken' % self.name), 'r') as classification:
            for line in classification:
                classified, read_id, tax_id, length, details = line.strip().split("\t")
                kraken[read_id] = tax_id
        # Classify each read
        kraken_class = {}
        with open(os.path.join(self.outdir, 'kraken/%s.log' % self.name), 'w') as log:
            for read_id, tax_id in kraken.items():
                if int(tax_id) == 0:
                    kraken_class[read_id] = "unclassified"
                elif int(tax_id) in self.descendants or int(tax_id) == int(self.taxid):
                    kraken_class[read_id] = "target"
                else:
                    kraken_class[read_id] = "other"
                    log.write("%s was trimmed because it was classified as %s (%s)\n"
                              % (read_id,
                                 self.ncbi.translate_to_names([int(tax_id)])[0],
                                 tax_id))
        return kraken_class

    """Trim fastq reads not belonging to the target organism"""

    def kraken_trim(self):
        kraken = self.parse_kraken_results()
        # Write new fastq files
        files = [self.input, self.input2]
        for fastq_in in files:
            with gzip.open(fastq_in, 'r') as f_in:
                fastq_out = os.path.split(fastq_in)[1]
                if fastq_out[-3:] == ".gz":
                    # Drop .gz from the filename
                    fastq_out = fastq_out[:-3]
                with open(os.path.join(self.outdir, 'kraken_trim/%s' % fastq_out), 'w') as f_out:
                    self.ifVerbose("Trimming reads from %s that do not belong to the target organism"
                                   % fastq_out)
                    for line in f_in:
                        # Split the ID on whitespace, strip a trailing "/1" or "/2"
                        # if present, and drop the leading '@'
                        read_id = line.decode('utf-8').split(" ")[0].split("/")[0][1:]
                        if read_id in kraken and kraken[read_id] != "other":
                            f_out.write(line.decode('utf-8'))
                            for i in range(3):
                                f_out.write(f_in.readline().decode('utf-8'))
                        else:
                            for i in range(3):
                                f_in.readline()
            # Zip output files
            self.runCommand(['gzip', os.path.join(self.outdir, 'kraken_trim/%s' % fastq_out)],
                            None, write_output=False)

    """Run Trim Galore to preprocess fastq files"""

    def trim_galore(self):
        self.ifVerbose("Trimming fastq files using Trim Galore")
        if self.paired:
            self.runCommand(['trim_galore', '--fastqc_args',
                             "\"--outdir " + os.path.join(self.outdir, "trimmed_fastq/fastqc") + "\"",
                             '--gzip', '-o', os.path.join(self.outdir, "trimmed_fastq"),
                             '--paired', self.input, self.input2],
                            directory=None, write_output=False)
            self.input = os.path.join(os.path.join(self.outdir, "trimmed_fastq"),
                                      self.name + "_1_val_1.fq.gz")
            self.input2 = os.path.join(os.path.join(self.outdir, "trimmed_fastq"),
                                       self.name + "_2_val_2.fq.gz")
        else:
            self.runCommand(['trim_galore', '--fastqc', '--gzip', '-o',
                             os.path.join(self.outdir, "trimmed_fastq"), self.input],
                            directory=None, write_output=False)
            self.input = os.path.join(os.path.join(self.outdir, "trimmed_fastq"),
                                      self.name + "_val.fq.gz")

    """Mapping with SMALT"""

    def smalt_map(self):
        self.ifVerbose("Mapping reads to reference using SMALT")
        if self.paired:
            self.runCommand(['smalt', 'map', '-i', '1000', '-j', '20', '-l', 'pe',
                             '-o', self.name + ".BAM", 'reference',
                             self.input, self.input2],
                            os.path.join(self.outdir, 'mapping'), write_output=False)
        else:
            self.runCommand(['smalt', 'map', '-o', self.name + ".BAM",
                             'reference', self.input],
                            os.path.join(self.outdir, 'mapping'), write_output=False)

    """Mapping with BWA"""

    def bwa_map(self):
        self.ifVerbose("Mapping reads to reference using BWA")
        with open(os.path.join(self.outdir, 'mapping/%s.SAM' % self.name), 'wb') as sam:
            with open(os.path.join(self.outdir, 'mapping/%s.BAM' % self.name), 'wb') as bam:
                # output is captured via write_output; shell redirection tokens
                # ('>') don't work in an exec-style argument list
                if self.paired:
                    sam_output = self.runCommand(['bwa', 'mem', 'reference',
                                                  self.input, self.input2],
                                                 os.path.join(self.outdir, 'mapping'),
                                                 write_output=True)
                else:
                    sam_output = self.runCommand(['bwa', 'mem', 'reference', self.input],
                                                 os.path.join(self.outdir, 'mapping'),
                                                 write_output=True)
                sam.write(sam_output)
                bam_output = self.runCommand(['samtools', 'view', '-Sb', self.name + ".SAM"],
                                             os.path.join(self.outdir, 'mapping'),
                                             write_output=True)
                bam.write(bam_output)

    """Sort BAM files using Samtools"""

    def samtools(self):
        self.ifVerbose("Sorting BAM files using Samtools")
        self.runCommand(['samtools', 'sort', '-o', self.name + '_sorted.BAM',
                         self.name + '.BAM'],
                        os.path.join(self.outdir, 'mapping'), write_output=False)

    """Checking mapping quality with Qualimap"""

    def qualimap(self):
        self.ifVerbose("Running Qualimap BAM QC")
        self.runCommand(['qualimap', 'bamqc', '-bam',
                         os.path.join(self.outdir, 'mapping/' + self.name + '_sorted.BAM'),
                         '-outformat', 'HTML',
                         '-outdir', os.path.join(self.outdir, "qualimap/" + self.name)],
                        directory=None, write_output=False)

    """Parse the report file obtained from Refseq_masher or Qualimap"""

    def parser(self, refseq, qualimap):
        if self.outlier:
            outlier_flag = False
            class_flag = False
            distance_flag = False
            map_flag = False
            quality_flag = False
            n_count = 0
            if refseq:
                self.ifVerbose("Parsing Refseq_masher report")
                with open(os.path.join(os.path.join(self.outdir, 'mash'),
                                       self.name + '.match')) as csvfile:
                    for row in csv.DictReader(csvfile, delimiter='\t'):
                        if (int(row['taxid']) != int(self.taxid)
                                and int(row['taxid']) not in self.descendants):
                            outlier_flag = True
                            class_flag = int(row['taxid'])
                            if n_count >= self.n_results:
                                break
                            else:
                                n_count += 1
                        elif float(row['distance']) > 0.05:
                            outlier_flag = True
                            distance_flag = True
                            break
            if qualimap:
                self.ifVerbose("Parsing Qualimap report")
                report = open(self.outdir + '/qualimap/' + self.name + '/genome_results.txt')
                for line in report:
                    if "number of mapped reads" in line:
                        mapped_percentage = line.split()[-1].strip('()%')
                    if "mean mapping quality" in line:
                        mean_mapping_quality = line.split()[-1]
                if float(mapped_percentage) < 90:
                    outlier_flag = True
                    map_flag = True
                elif float(mean_mapping_quality) < 10:
                    outlier_flag = True
                    quality_flag = True
            if outlier_flag:
                self.ifVerbose("%s is an outlier" % self.name)
                if class_flag:
                    self.outlier_file.write("%s\tClassified as %s (%d)\n"
                                            % (self.name,
                                               self.ncbi.translate_to_names([class_flag])[0],
                                               class_flag))
                if distance_flag:
                    self.outlier_file.write("%s\tDistance is greater than 0.05\n" % self.name)
                if map_flag:
                    self.outlier_file.write("%s\tMapping percentage is lower than 90%%\n" % self.name)
                if quality_flag:
                    self.outlier_file.write("%s\tMean mapping quality is lower than 10\n" % self.name)

    """Record an outlier sample"""

    def outlier(self):
        self.ifVerbose("%s is an outlier" % self.name)
        # self.runCommand(['cp', self.input, os.path.join(self.outdir, 'outliers')], directory=None, write_output=False)
        # if self.paired:
        #     self.runCommand(['cp', self.input2, os.path.join(self.outdir, 'outliers')], directory=None, write_output=False)
        self.outlier_file.write("%s\n" % self.name)

    def ifVerbose(self, msg):
        if self.verbose:
            self.logger.info(msg)
def main():
    if len(sys.argv) != 4:
        mes = 'Usage: python {} "phylum,genus" <id2taxonid.table> <id2taxonid2taxon.table>\n'
        sys.stderr.write(mes.format(os.path.basename(sys.argv[0])))
        sys.exit(1)

    REQUIRED_RANK = [i.strip() for i in sys.argv[1].strip().split(',')]
    infile = sys.argv[2]
    outfile = sys.argv[3]
    if infile == '-':
        infile = '/dev/stdin'
    if outfile == '-':
        outfile = '/dev/stdout'

    ncbi = NCBITaxa()
    rank_lis_needed = ['superkingdom', 'phylum', 'class', 'order',
                       'family', 'genus', 'species']

    with open(infile) as fp, open(outfile, 'w') as fw:
        cnt = 0
        for line in fp:
            line = line.rstrip()
            id, taxonid = line.split()
            try:
                lineage_num = ncbi.get_lineage(taxonid)
            except ValueError as e:
                mes = '*** Invalid TaxonID (not in NCBI Taxonomy db): {}\n'
                sys.stderr.write(mes.format(taxonid))
                continue
            lineage_tax = ncbi.translate_to_names(lineage_num)
            rank_dict = ncbi.get_rank(lineage_num)
            st_rank = set(rank_dict.values())
            if not set(REQUIRED_RANK).issubset(st_rank):
                mes = ('*** Irregular taxon format with TaxonID ({}) '
                       '(skipped): \n{}\n{}\n')
                sys.stderr.write(mes.format(taxonid, repr(lineage_tax), repr(st_rank)))
                cnt += 1
                continue

            n_temp = -1
            index_temp = -1
            lis_lineage_taxon = []
            for n, idd in enumerate(lineage_num):
                rank = rank_dict[idd]
                if rank in set(rank_lis_needed):
                    index = rank_lis_needed.index(rank)
                    if index != index_temp + 1:
                        skipped = index - 1 - index_temp
                        # add skipped levels
                        while 1:
                            lis_lineage_taxon.append('Other')
                            skipped -= 1
                            if skipped == 0:
                                break
                        n_temp += 1
                        #assert n_temp < n, 'n_temp: {}; n: {}'.format(n_temp, n)
                    lis_lineage_taxon.append(lineage_tax[n])
                    n_temp = n
                    index_temp = index

            l1 = len(lis_lineage_taxon)
            l2 = len(rank_lis_needed)
            if l1 != l2:
                lis_lineage_taxon.extend(['Other'] * (l2 - l1))
            assert len(lis_lineage_taxon) == len(rank_lis_needed)
            fw.write('{}\t{}\t{}\n'.format(id, taxonid, '\t'.join(lis_lineage_taxon)))

    mes = ('*** Number of taxonid with irregular taxonomy '
           '(without {} info): {}\n')
    sys.stderr.write(mes.format(', '.join(REQUIRED_RANK), cnt))
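# Hypothetical invocation matching the usage string above ('-' stands in for
# stdin/stdout; the script and file names are placeholders):
#
#   python taxonid2taxon.py "phylum,genus" id2taxonid.table id2taxonid2taxon.table
#   cat id2taxonid.table | python taxonid2taxon.py "phylum,genus" - -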
print("Reading NCBI Taxa...") ncbi = NCBITaxa() print("Done...") if len(sys.argv) < 2: print("\nNeed exactly two parameters! None given...\n") print("Documentation:") print(__doc__) sys.exit(9) #root_taxon = 'Leptospira alexanderi' root_taxon = sys.argv[2] lineage = ncbi.get_descendant_taxa(root_taxon, intermediate_nodes=True) root_taxon_id = ncbi.get_name_translator([root_taxon])[root_taxon][0] lineage.append(root_taxon_id) names = ncbi.translate_to_names(lineage) seqs_by_taxon = dict() for name in names: seqs_by_taxon[name] = [] if DEBUG: print("Total # of Taxons: %s " % (len(seqs_by_taxon))) if DEBUG: print("First 10 taxons: %s" % seqs_by_taxon.keys()[:10]) FASTAFILE = sys.argv[1] FASTAFILE = os.path.expanduser(FASTAFILE) OUTFILE = os.path.splitext(FASTAFILE)[0] + '_' + root_taxon.replace( ' ', '_') + os.path.splitext(FASTAFILE)[1] if not os.path.isfile(FASTAFILE): raise OSError(2, 'No such file or directory:', FASTAFILE) else: with open(FASTAFILE, 'r') as f:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import json
from sys import argv

from ete3 import Tree
from ete3 import NCBITaxa

ncbi = NCBITaxa()
ncbi.update_taxonomy_database()
descendants = ncbi.get_descendant_taxa(argv[1], collapse_subspecies=False)  # 2759 - Eukarya
names = ncbi.translate_to_names(descendants)


def Back(linn):
    linn = re.sub(r"^la ", "", linn, flags=re.UNICODE)
    #linn = r"la " + linn  #- final
    linn = re.sub(r"\bku'a la\b", "x", linn, flags=re.UNICODE)
    #linn = re.sub(r"\bx\b", "ku'a la", linn, flags=re.UNICODE)  #- special words
    linn = re.sub(r"c$", "", linn, flags=re.UNICODE)
    linn = re.sub(r"c(?!\')\b", "", linn, flags=re.UNICODE)
    linn = re.sub(r"y$", "", linn, flags=re.UNICODE)
    linn = re.sub(r"y(?!\')\b", "", linn, flags=re.UNICODE)
    #linn = re.sub(r"([aeiou])$", "\g<1>c", linn, flags=re.UNICODE)
    #linn = re.sub(r"([aeiou]) ", "\g<1>c ", linn, flags=re.UNICODE)
    #linn = re.sub(r"c(?![\'])(\b|$)", "cyc", linn, flags=re.UNICODE)
    linn = re.sub(r"kau'", "q", linn, flags=re.UNICODE)  # treat 'q'
    #- add consonant to the end
def main():
    # =============== #
    #   PARAMETERS    #
    # =============== #
    args = docopt(__doc__)
    ai_features = args['--aifeatures']
    output_dir = get_outdir(args['--output'])
    fasta_inputfile = args['--fastafile']
    blast_inputfile = args['--blastfile']
    groups_yaml = args['--tax_groups']
    config_yaml = args['--config_file']
    ortho_groups = args['--ortho_groups']
    cfg_file = args['--cfg']

    # =============== #
    #      MAIN       #
    # =============== #

    """ 0. Setting up """
    print("[+] Setting up")

    # Check that the required programs are in PATH
    check_programs("blastdbcmd", "mafft")

    # Create folders
    fasta_folder = os.path.join(output_dir, "fastagroups")
    get_outdir(fasta_folder)
    mafft_folder = os.path.join(output_dir, "mafftgroups")
    get_outdir(mafft_folder)
    tmp_folder = os.path.join(output_dir, "tmp")
    get_outdir(tmp_folder)

    # Load the proteome into memory
    record_dict = SeqIO.index(fasta_inputfile, "fasta")

    # Create taxonomic groups
    orgtag = "@StudiedOrganism"
    stream = open(groups_yaml, 'r')
    toi_egp = yaml.safe_load(stream)
    stream.close()
    if not cfg_file:
        cfg_file = os.path.join(sys.path[0], "depot", "taxonomy.yaml")
    stream = open(cfg_file, 'r')
    config_groups = yaml.safe_load(stream)
    stream.close()
    stream = open(config_yaml, 'r')
    config_opts = yaml.safe_load(stream)
    stream.close()

    threads = config_opts["max_threads"]
    trim = config_opts["trimal"]
    ai_cutoff = config_opts["ai_cutoff"]
    percent_identity = config_opts["percent_identity"]
    cutoffextend = config_opts["cutoffextend"]
    min_num_hits = config_opts["min_num_hits"]
    percentage_similar_hits = config_opts["percentage_similar_hits"]
    mode = config_opts["mode"]
    mafft_options = config_opts["mafft_options"]
    trimal_options = config_opts["trimal_options"]

    if trim:
        check_programs("trimal")
        trim_folder = os.path.join(output_dir, "trim")
        get_outdir(trim_folder)
    else:
        trim_folder = ""

    # Setting up NCBI Taxonomy
    ncbi = NCBITaxa()

    """ 1. Select HGT """
    query_dict_set = {}
    queries_info = {}
    with open(ai_features, 'r', encoding='utf8') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        for row in reader:
            # Get the column indices from the header row
            i_query = row.index('query name')
            i_notoi = row.index('donor')
            i_toi = row.index('recipient')
            i_ai = row.index('AI')
            i_hgt = row.index('HGTindex')
            i_nbhits = row.index('query hits number')
            break
        for row in reader:
            L_notoi = row[i_notoi].rstrip('\n').rsplit(':', 4)
            L_toi = row[i_toi].rstrip('\n').rsplit(':', 4)
            if (row[i_notoi] != '::::'):  # Skip hits with only TOI
                if (float(row[i_nbhits]) >= min_num_hits
                        and float(L_notoi[2]) <= percent_identity
                        and float(row[i_ai]) >= ai_cutoff):
                    notoi_pos = int(L_notoi[1])
                    if (row[i_toi] == '::::'):
                        toi_pos = 0
                    else:
                        toi_pos = int(L_toi[1])
                    # Select at least 50 hits
                    last_pos = min(max(max(toi_pos, notoi_pos) + cutoffextend, 50),
                                   int(row[i_nbhits]))
                    queries_info[row[i_query]] = {'pos': last_pos}
                    query_dict_set[row[i_query]] = set()
    print("[!] Selected " + str(len(query_dict_set)) + " HGT candidates")

    """ 2. Parse Blast """
    print("[+] Parsing Blast file and grouping similar queries")
    extract_hit_id_set = set()
    with open_file(blast_inputfile) as fhr_bl:
        for line in fhr_bl:
            if ('#' not in line):
                L_hitqline = line.rstrip('\n').split('\t')
                query_id = L_hitqline[0]
                if query_id in queries_info.keys():  # Queries that pass the initial selection
                    if (len(query_dict_set[query_id]) <= queries_info[query_id]["pos"]):
                        # assign the hit id before validating it
                        query_hit_id = L_hitqline[1]
                        if "@" in query_id:
                            sys.exit("@ symbol is not allowed: " + query_id)
                        if "@" in query_hit_id:
                            sys.exit("@ symbol is not allowed: " + query_hit_id)
                        extract_hit_id_set.add(query_hit_id)
                        query_dict_set[query_id].add(query_hit_id)

    # GK
    # Group hits
    G = nx.Graph()
    if ortho_groups:
        num_groups = 0
        with open_file(ortho_groups) as fhr_og:
            for line in fhr_og:
                num_groups += 1
                members = line.split()
                for i in range(1, len(members), 1):
                    G.add_node(members[i])
                    if i > 1:
                        G.add_edge(members[i], members[i - 1])
        print("[!] Found " + str(num_groups) + " groups")
    else:
        for protein_id, hits in query_dict_set.items():
            G.add_node(protein_id)
            for protein_id_other, hitsc in query_dict_set.items():
                if protein_id != protein_id_other:
                    u = len(set.intersection(hits, hitsc))
                    m = min(len(hits), len(hitsc))
                    if (u / m) >= percentage_similar_hits:
                        G.add_edge(protein_id, protein_id_other)
        print("[!] Formed " + str(len(list(nx.connected_components(G)))) + " groups")

    """ 3. Extract hits """
    print("[+] Extracting hits from DB")
    extract_id_path = os.path.join(tmp_folder, "extract_id.txt")
    fhw_extract_id = open(extract_id_path, 'w')
    fhw_extract_id.write('\n'.join(extract_hit_id_set) + '\n')
    fhw_extract_id.close()
    setnrfa_path = os.path.join(tmp_folder, "setnr.fa")
    fhw_setnrfa = open(setnrfa_path, 'w')
    setnrlog_path = os.path.join(tmp_folder, "setnr.log")
    if mode == "nr":
        blastdbcmd_command = ('blastdbcmd -db ' + config_opts["nr_db_path"]
                              + ' -dbtype prot -entry_batch ' + extract_id_path
                              + ' -target_only -outfmt ">%a@%T\n%s"'
                              + ' -logfile ' + setnrlog_path
                              + ' -out ' + setnrfa_path)
        subprocess.call(blastdbcmd_command, shell=True)
    else:
        # GK: this is specific to SwissProt for now; UniProt needs testing in the future
        if mode == "sp":
            db_re = re.compile("OX=\d*")
        elif mode == "ur90":
            db_re = re.compile("TaxID=\d*")
        else:
            sys.exit(mode + " is not a valid mode")
        with open_file(config_opts["sp_fasta_path"]) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                if record.id in extract_hit_id_set:
                    ox, taxid = db_re.search(record.description).group().split("=")
                    fhw_setnrfa.write(">" + record.id + "@" + taxid + "\n")
                    seq = str(record.seq)
                    fhw_setnrfa.write(seq + "\n")
    fhw_setnrfa.close()

    # Load hits into memory
    hits_dict = {}
    record_to_taxid = {}
    with open_file(setnrfa_path) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            id, taxid = record.id.rstrip('\n').split('@')
            record_to_taxid[id] = taxid
            hits_dict[id] = str(record.seq)

    """ 4. Write fasta """
    print("[+] Writing fasta files")
    taxonomy_nexus_path = os.path.join(tmp_folder, "taxonomy_nexus.txt")
    taxonomy_nexus = open(taxonomy_nexus_path, 'w')
    groups_tsv_path = os.path.join(output_dir, "groups.tsv")
    groups_tsv = open(groups_tsv_path, 'w')
    group_id = 1
    final_number_of_candidates = 0
    final_number_of_groups = 0
    group_dict = {}
    number_of_lost_taxids = 0
    number_of_lost_records = 0
    for subgraph in nx.connected_components(G):
        num_of_seqs_in_group = 0
        grouped_hits = []
        queries_to_fasta = []
        queries_to_group = []
        for q in subgraph:
            queries_to_fasta.append(q)
            queries_to_group.append(q)
            grouped_hits.extend(query_dict_set[q])
        grouped_hits = set(grouped_hits)
        if len(grouped_hits) >= min_num_hits:
            group_name_file = "gp" + str(group_id) + '.fa'
            gp_pathname = os.path.join(fasta_folder, group_name_file)
            fw_gp = open(gp_pathname, 'w')
            final_number_of_groups += 1
            for q in queries_to_fasta:
                final_number_of_candidates += 1
                fw_gp.write('>' + q + orgtag + '\n' + str(record_dict[q].seq) + '\n')
                num_of_seqs_in_group += 1
            for record_id in grouped_hits:
                if record_id in hits_dict:
                    taxid = record_to_taxid[record_id]
                    try:
                        ncbi.get_lineage(taxid)
                        taxid_found = True
                    except ValueError:  # ete3 raises ValueError for unknown taxids
                        taxid_found = False
                    if not taxid or not taxid_found:
                        # TODO: actually print a file containing lost taxids
                        number_of_lost_taxids += 1
                        selectname = "Unknown"
                    else:
                        lnode = set(ncbi.get_lineage(taxid))
                        lname = ncbi.translate_to_names(lnode)  # This does not output them in order
                        #print(ncbi.get_rank(lnode))  # maybe try this: {1: 'no rank', 2: 'superkingdom'}
                        taxonomy_nexus.write(str(record_id) + "\t" + str(lname) + "\n")
                        egp_hit = list(lnode.intersection(set(toi_egp["EGP"].keys())))
                        toi_hit = list(lnode.intersection(set(toi_egp["TOI"].keys())))
                        cfg_hit = list(lnode.intersection(set(config_groups["Other"].keys())))
                        kdom_hit = list(lnode.intersection(set(config_groups["Kingdom"].keys())))
                        if egp_hit:
                            selectname = "EGP-" + toi_egp["EGP"][egp_hit[0]]
                        elif toi_hit:
                            selectname = "TOI-" + toi_egp["TOI"][toi_hit[0]]
                        elif cfg_hit:
                            selectname = config_groups["Other"][cfg_hit[0]]
                        elif kdom_hit:
                            selectname = config_groups["Kingdom"][kdom_hit[0]]
                        else:
                            selectname = "Unknown"
                    fw_gp.write(">" + record_id + "@" + selectname + "\n")
                    fw_gp.write(hits_dict[record_id] + "\n")
                    num_of_seqs_in_group += 1
                else:
                    # TODO: actually print a file containing lost ids
                    number_of_lost_records += 1
            groups_tsv.write(group_name_file + '\t' + str(num_of_seqs_in_group)
                             + '\t' + '\t'.join(queries_to_group) + '\n')
            group_dict[group_name_file] = num_of_seqs_in_group
            fw_gp.close()
            group_id += 1
    groups_tsv.close()
    print("[!] Skipped " + str(number_of_lost_records) + " hits and "
          + str(number_of_lost_taxids) + " taxids.")

    """ 5. Align fasta """
    print("[+] Aligning fasta files")
    jobs = threads
    p = Pool(jobs)
    job_list = []
    for group_name, value in sorted(group_dict.items(), key=itemgetter(1), reverse=True):
        g_list = [group_name, fasta_folder, mafft_folder, trim, trim_folder,
                  mafft_options, trimal_options]
        job_list.append(g_list)
    i = 0
    for i, _ in enumerate(p.imap_unordered(run_mafft, job_list), 1):
        progress(i, 1, len(job_list))
    print("[!] Finished with " + str(final_number_of_candidates) + " HGT candidates in "
          + str(final_number_of_groups) + " groups")
def inferLineage(self, places):
    """
    Infer the lineage from the location of placement:
    look at the leaves and their tax ids, and at the lineages of all of these
    """
    if self.cfg["touch"]:
        return
    ncbi = NCBITaxa()
    # fetch the file and load the tax information
    seqinfo = self.config.pkgfile("concat.refpkg", "seq_info")
    taxids = {}
    si = base.readCSV(seqinfo)
    # make dictionary
    for r in si:
        taxids[r["seqname"]] = r["tax_id"]

    # for each placement:
    logging.debug("Inferring lineages now")
    for p in places:
        # get the GCA names
        children = p["sisters"]
        # fetch lineages for all
        lngs = []
        for c in children:
            try:
                lngs.append(ncbi.get_lineage(taxids[c]))
            except ValueError as e:
                logging.warning(e)

        # find common elements:
        common = set(lngs[0])
        for l in lngs[1:]:
            common = common & set(l)

        # common lineage
        lng = []
        for v in lngs[0]:
            if v not in common:
                break
            # add common elements
            lng.append(v)
        nodetaxid = lng[-1]

        # now we can make it pretty
        if not self.cfg["fullineage"]:
            # limit to the desired ranks
            desired_ranks = [
                "superkingdom", "kingdom", "phylum", "class",
                "order", "family", "genus", "species",
            ]
            lineage2ranks = ncbi.get_rank(lng)
            ranks2lineage = dict((rank, taxid) for (taxid, rank) in lineage2ranks.items())
            ranks = {"{}_id".format(rank): ranks2lineage.get(rank, "NA")
                     for rank in desired_ranks}
            lng = [i for i in lng if i in ranks.values()]

        # get translator and make the string
        named = ncbi.translate_to_names(lng)
        # save to the placed object
        p["lineage"] = "_".join(named)
        # replace spaces in names with dots
        p["lineage"] = p["lineage"].replace(" ", ".")
        p["taxidlineage"] = "_".join([str(x) for x in lng])
        p["taxid"] = nodetaxid
    return
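# Standalone sketch of the rank-filtering idea used above, outside the class
# (taxid 9606 is illustrative; requires the ete3 taxonomy database):
from ete3 import NCBITaxa

ncbi = NCBITaxa()
lineage = ncbi.get_lineage(9606)
ranks = ncbi.get_rank(lineage)
wanted = {"superkingdom", "kingdom", "phylum", "class", "order",
          "family", "genus", "species"}
filtered = [t for t in lineage if ranks[t] in wanted]
print("_".join(ncbi.translate_to_names(filtered)).replace(" ", "."))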
host_gb = out.decode("utf-8").rstrip().replace('"', '').replace("[u'", "")
cleaned_host_gb = host_gb.split(' (', 1)[0]
if cleaned_host_gb in ("mosquito", "mosquitoes"):
    cleaned_host_gb = "Culicoidea"
if cleaned_host_gb != '':
    liste.extend([ids, cleaned_host_gb])
    liste_host.append(liste)
    name2taxid = ncbi.get_name_translator([cleaned_host_gb])
    fieldnames.append(host_gb)
    if cleaned_host_gb in name2taxid:
        host_id = int(name2taxid[cleaned_host_gb][0])
        lineage = ncbi.get_lineage(host_id)
        fieldnames.append(host_id)
if cleaned_host_gb != '' and cleaned_host_gb in name2taxid:
    lineage2name = ncbi.translate_to_names(lineage)
    lineage2ranks = ncbi.get_rank(lineage)
    Dic_lineage2name = dict(zip(lineage, lineage2name))
    for (taxid, rank) in lineage2ranks.items():
        if rank in desired_ranks:
            if rank not in ranks2lineage:
                ranks2lineage[rank] = [taxid]
            else:
                ranks2lineage[rank].append(taxid)
    ranks2names = {}
    for rank in ranks2lineage:
        ranks2names[rank] = ""
        for i in range(len(ranks2lineage[rank])):
            ranks2names[rank] += Dic_lineage2name[ranks2lineage[rank][i]] + ";"
        ranks2names[rank] = ranks2names[rank][:-1]
def update_leafcdsdict_fromxr(self, xrpathstr):
    # fields=['pdbids', 'ecs', 'subfam', 'extragbs'], searchby='gbacc'
    mds = xr.open_dataset(xrpathstr)
    ncbitaxa = NCBITaxa()
    # For each taxonomic rank, map accessions that have a known taxid at that
    # rank to the rank's name; every other accession becomes 'Unknown'
    for rank in ['species', 'class', 'phylum', 'genus', 'superkingdom']:
        knownra = mds.taxra[np.isnan(mds.taxra.loc[:, rank].values) == False]
        known_accs = knownra.dbseq.values
        known_names = ncbitaxa.translate_to_names(knownra.loc[:, rank].values)
        knowndict = {ka: kn for ka, kn in zip(known_accs, known_names)}
        self.leaf_cds_dict[rank] = []
        for gbacc in self.leaf_cds_dict['gbacc']:
            if gbacc in known_accs:
                self.leaf_cds_dict[rank].append(knowndict[gbacc])
            else:
                self.leaf_cds_dict[rank].append('Unknown')