def createTree(arr):
    """
    Create a tree using the provided list of tuples.

    In the tree, a node may contain multiple child nodes, but it can only
    have one parent node. Every taxid met (lineage entries and leaves) is
    registered once in a ``catalog`` dict; when a taxid is met again, the
    parent already recorded for it is compared with the parent implied by
    the current tuple.

    :param arr: list of tuples (taxid, weight, lineage), where lineage is a
        list of taxid. ``lineage[0]`` is attached directly under the root
        (taxid 0) and ``lineage[-1]`` is used as the parent of ``taxid``.
    :return: Tree instance built from the root node, the catalog and the
        weight of the root node
    :raises ValueError: if a taxid would get a parent different from the one
        already recorded for it
    """
    taxadb = TaxID()
    # artificial root with taxid 0 and weight 0
    root = Node(0, 0, _get_name_for_taxid(0, taxadb))
    # every node created so far, keyed by the value used to create it
    catalog = {}
    for bucket in arr:
        taxid = int(bucket[0])
        name = _get_name_for_taxid(taxid, taxadb)
        leaf = Node(taxid, bucket[1], name)
        # cursor walking down the lineage, starting from the root
        # NOTE(review): the name `iter` shadows the builtin of the same name
        iter = root
        for i in range(len(bucket[2])):
            lin = bucket[2][i]
            if catalog.get(lin) is None:
                # first time this taxid is met: create it with weight 0
                # below the current cursor
                node = Node(lin, 0, _get_name_for_taxid(lin, taxadb))
                iter.addChildNode(node)
                catalog[lin] = node
                iter = node
            else:
                # below we check the compatibility of the data
                # each node can only have one parent node
                eid = catalog[lin].parentNode.taxid
                if i == 0:
                    # the first lineage entry must hang directly off the root
                    if eid != root.taxid:
                        m = "node #{} got two parent nodes #{} and #{}".format(
                            lin, eid, root.taxid)
                        raise ValueError(m)
                else:
                    # any other entry must hang off the previous lineage entry
                    if eid != bucket[2][i - 1]:
                        m = "node #{} got two parent nodes #{} and #{}".format(
                            lin, eid, bucket[2][i - 1])
                        raise ValueError(m)
                iter = catalog[lin]
        if catalog.get(leaf.taxid) is None:
            iter.addChildNode(leaf)
            catalog[leaf.taxid] = leaf
        else:
            # NOTE(review): the catalog is probed with leaf.taxid above but
            # indexed with the raw bucket[0] here -- this presumably assumes
            # bucket[0] is already an int; confirm against the callers.
            # NOTE(review): bucket[2][-1] raises IndexError when the lineage
            # is empty.
            eid = catalog[bucket[0]].parentNode.taxid
            if eid != bucket[2][-1]:
                m = "node #{} got two parent nodes #{} and #{}".format(
                    bucket[0], eid, bucket[2][-1])
                raise ValueError(m)
            iter = catalog[bucket[0]]
        # NOTE(review): reached for new and existing leaves alike, so `iter`
        # is the parent node in the first case and the existing node in the
        # second -- confirm that addChildNode does not already account for
        # the weight of the child.
        iter.updateWeight(leaf.weight)
    return Tree(root, catalog, root.weight)
def harvest(outputs, taxadb):
    """Harvest results for every sample named in ``outputs`` and print them as JSON.

    ``outputs`` is a list of command line tokens. Each sample is given either
    as a single ``SAMPLE_ID=RUNDIR`` token, or as a ``SAMPLE_ID=`` token
    followed by a separate ``RUNDIR`` token. Every sample must be a member of
    ``BENCHMARKS["samples"]``. A run directory starting with ``s3:`` requires
    the ``AWS_PROFILE`` environment variable to be set; any other run
    directory must contain an ``outputs.json`` file.

    When ``taxadb`` is truthy it is used as the ``dbname`` of a sqlite
    ``TaxID`` handle, which is then passed on to ``harvest_sample``;
    otherwise the falsy value is passed through unchanged.

    All checks are ``assert`` statements, so a violation raises
    ``AssertionError``. Nothing is returned: the collected results are
    written to stdout with ``json.dumps(..., indent=2)``.
    """
    pending = []

    # process command line args
    i = 0
    token_count = len(outputs)
    while i < token_count:
        if outputs[i].endswith("="):
            # "SAMPLE_ID=" with the run directory in the following token
            assert i + 1 < token_count, f"missing path after {outputs[i]}"
            sample, rundir = outputs[i][:-1], outputs[i + 1]
            i += 2
        else:
            # "SAMPLE_ID=RUNDIR" in a single token
            pieces = outputs[i].split("=")
            assert len(pieces) == 2, f"invalid SAMPLE_ID=RUNDIR pair: {outputs[i]}"
            sample, rundir = pieces
            i += 1

        assert sample in BENCHMARKS["samples"], f"unknown sample {sample}"

        if not rundir.startswith("s3:"):
            manifest = Path(rundir) / "outputs.json"
            assert manifest.is_file(), f"couldn't find outputs.json in {rundir}"
        else:
            assert os.environ.get(
                "AWS_PROFILE", False
            ), f"set environment AWS_PROFILE to read from {rundir}"

        pending.append((sample, rundir))

    if taxadb:
        taxadb = TaxID(dbtype="sqlite", dbname=taxadb)

    # harvest each supplied sample
    harvested = {}
    for sample, rundir in pending:
        assert sample not in harvested, f"repeated sample {sample}"
        run_outputs = read_outputs_json(rundir)
        harvested[sample] = harvest_sample(sample, run_outputs, taxadb)
        harvested[sample]["outputs"] = run_outputs

    print(json.dumps(harvested, indent=2))
def main():
    """
    Main program function

    Reads the command line arguments, extracts the Genbank accession ids
    from the BLAST output file, splits them into batches of at most 999 ids
    and passes the batches, together with the accession and taxid database
    handles, to ``write_results`` which writes the taxonomy file.
    """
    # Get arguments
    args = get_arguments()

    # Step 1
    print("STEP 1: Extracting Genbank IDS from BLAST output...")
    accession = extract_genbank_id(args.blast_output_file)
    # Batches of at most 999 ids; presumably sized to stay within the
    # per-query limit of the database layer -- TODO confirm.
    chunks = [accession[i:i + 999] for i in range(0, len(accession), 999)]
    print("Found {0} ids !".format(len(accession)))

    # Step 2
    print("STEP 2: Writing results to file '{0}'...".format(
        args.taxonomy_file))
    accession_db = AccessionID(dbtype='sqlite', dbname=args.taxadb_file)
    tax_db = TaxID(dbtype='sqlite', dbname=args.taxadb_file)
    # Only the batches are handed over: the full, unbatched accession list
    # is deliberately never sent to the database in a single request.
    write_results(chunks, accession_db, tax_db, args.taxonomy_file)
    print("DONE !")
#!/usr/bin/env python3
import sys
from os.path import expanduser
from taxadb.taxid import TaxID

# Reads one taxid per line from stdin and prints "<taxid>\t<lineage>", where
# the lineage names are joined with "; ". The database is expected at
# ~/.taxadb/taxadb.sqlite.
handler = TaxID(dbtype="sqlite", dbname=expanduser("~") + "/.taxadb/taxadb.sqlite")
for taxid in sys.stdin:
    taxid = taxid.strip()
    lineage = handler.lineage_name(taxid, reverse=True)
    # lineage_name() may give back None (e.g. for a taxid that is not in the
    # database); treat that like an empty lineage instead of crashing on it.
    lineage_s = "; ".join(lineage) if lineage else ""
    print("{}\t{}".format(taxid, lineage_s))
def main():
    """Filter a Uniref FASTA file, keeping only records of the requested clades.

    Command line options:
      -i/--input_fasta   FASTA file to read
      -o/--output_fasta  FASTA file to create
      -c/--clades        comma-separated clade names to keep
      -db/--taxadb       path to the taxadb sqlite3 file

    For every header line (starting with '>') the ``TaxID=<digits>`` field
    is looked up with ``TaxID.lineage_name``. The record (header and the
    lines that follow it) is written to the output when one of the requested
    clades appears in that lineage. Records without a ``TaxID=`` field, or
    whose lineage is unknown, are dropped. A per-clade count of exported
    records is printed at the end.

    Raises:
        Exception: if the taxadb file does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Filter a Uniref FASTA file by taxonomy')

    parser.add_argument('-i', '--input_fasta', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o', '--output_fasta', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-c', '--clades', type=str, required=True,
                        help='Comma-separated string of clades to be included.')
    parser.add_argument('-db', '--taxadb', type=str, required=True,
                        help='Path to the taxadb sqlite3 file')
    args = parser.parse_args()

    if not os.path.exists(args.taxadb):
        raise Exception("ERROR: Couldn't find taxadb specified")

    taxid = TaxID(dbtype='sqlite', dbname=args.taxadb)
    clades = args.clades.split(',')
    record_count = 0
    print_every = 1000
    clade_counter = {clade: 0 for clade in clades}

    # Remembers, for each taxonomy ID already looked up, the clade it matched
    # (or None when its records are dropped). Storing the clade rather than a
    # bare flag lets cached hits be counted too, and caching None avoids
    # repeating the database lookup for unknown IDs.
    id_cache = {}

    # raw string: '\d' inside a normal string literal is an invalid escape
    taxid_pattern = re.compile(r'TaxID=(\d+)')

    keep_entry = False

    # context managers close both files even when an error interrupts the run
    with open(args.input_fasta) as fin, open(args.output_fasta, 'wt') as fout:
        for line in fin:
            if line[0] == '>':
                record_count += 1

                if record_count % print_every == 0:
                    print("{0} records processed ...".format(record_count),
                          flush=True)

                matched_clade = None
                m = taxid_pattern.search(line)

                if m:
                    tax_id = m.group(1)

                    if tax_id in id_cache:
                        matched_clade = id_cache[tax_id]
                    else:
                        lineage = taxid.lineage_name(tax_id, reverse=True)

                        if lineage is not None:
                            # first requested clade present in the lineage
                            matched_clade = next(
                                (c for c in clades if c in lineage), None)

                        id_cache[tax_id] = matched_clade

                keep_entry = matched_clade is not None

                if keep_entry:
                    # count every exported record, not only the first record
                    # seen for each taxonomy ID
                    clade_counter[matched_clade] += 1

            # sequence lines inherit the decision taken for their header
            if keep_entry:
                fout.write(line)

    print("Number of entries exported by clade:")
    for clade in clade_counter:
        print("\t{0}: {1}".format(clade, clade_counter[clade]))
#!/usr/bin/env python3
import sys
from taxadb.taxid import TaxID

# For each command line argument, convert it to an int, pass it to
# TaxID.sci_name() and print the result on its own line.
taxid = TaxID()
for raw_id in sys.argv[1:]:
    print(taxid.sci_name(int(raw_id)))
def main():
    """Filter a Uniref FASTA file, keeping only records of the requested clades.

    Command line options:
      -i/--input_fasta   FASTA file to read
      -o/--output_fasta  FASTA file to create
      -c/--clades        comma-separated clade names to keep
      -db/--taxadb       path to the taxadb sqlite3 file

    For every header line (starting with '>') the ``TaxID=<digits>`` field
    is looked up with ``TaxID.lineage_name``. The record (header and the
    lines that follow it) is written to the output when one of the requested
    clades appears in that lineage. Records without a ``TaxID=`` field, or
    whose lineage is unknown, are dropped. A per-clade count of exported
    records is printed at the end.

    Raises:
        Exception: if the taxadb file does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Filter a Uniref FASTA file by taxonomy')

    parser.add_argument('-i', '--input_fasta', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o', '--output_fasta', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-c', '--clades', type=str, required=True,
                        help='Comma-separated string of clades to be included.')
    parser.add_argument('-db', '--taxadb', type=str, required=True,
                        help='Path to the taxadb sqlite3 file')
    args = parser.parse_args()

    if not os.path.exists(args.taxadb):
        raise Exception("ERROR: Couldn't find taxadb specified")

    taxid = TaxID(dbtype='sqlite', dbname=args.taxadb)
    clades = args.clades.split(',')
    record_count = 0
    print_every = 1000
    clade_counter = {clade: 0 for clade in clades}

    # Remembers, for each taxonomy ID already looked up, the clade it matched
    # (or None when its records are dropped). Storing the clade rather than a
    # bare flag lets cached hits be counted too, and caching None avoids
    # repeating the database lookup for unknown IDs.
    id_cache = {}

    # raw string: '\d' inside a normal string literal is an invalid escape
    taxid_pattern = re.compile(r'TaxID=(\d+)')

    keep_entry = False

    # context managers close both files even when an error interrupts the run
    with open(args.input_fasta) as fin, open(args.output_fasta, 'wt') as fout:
        for line in fin:
            if line[0] == '>':
                record_count += 1

                if record_count % print_every == 0:
                    print("{0} records processed ...".format(record_count),
                          flush=True)

                matched_clade = None
                m = taxid_pattern.search(line)

                if m:
                    tax_id = m.group(1)

                    if tax_id in id_cache:
                        matched_clade = id_cache[tax_id]
                    else:
                        lineage = taxid.lineage_name(tax_id, reverse=True)

                        if lineage is not None:
                            # first requested clade present in the lineage
                            matched_clade = next(
                                (c for c in clades if c in lineage), None)

                        id_cache[tax_id] = matched_clade

                keep_entry = matched_clade is not None

                if keep_entry:
                    # count every exported record, not only the first record
                    # seen for each taxonomy ID
                    clade_counter[matched_clade] += 1

            # sequence lines inherit the decision taken for their header
            if keep_entry:
                fout.write(line)

    print("Number of entries exported by clade:")
    for clade in clade_counter:
        print("\t{0}: {1}".format(clade, clade_counter[clade]))
# Download TinySeq XML format file from NCBI BioProject 33175: # https://www.ncbi.nlm.nih.gov/bioproject?db=bioproject&Cmd=DetailsSearch&Term=33175%5Buid%5D from bs4 import BeautifulSoup as BS soup = BS(open('./sequence.fasta.xml'), 'lxml') # You'll need to build the taxa taxadb database from taxadb.taxid import TaxID db = TaxID(dbtype='sqlite', dbname='taxadb.sqlite') import sys ''' soup.prettify()[:2100] == <html> <body> <tseqset> <tseq> <tseq_seqtype value="nucleotide"> </tseq_seqtype> <tseq_accver> NR_170543.1 </tseq_accver> <tseq_sid> gnl|REF_SSU16S|KU507537:1-1439 </tseq_sid> <tseq_taxid> 1849015 </tseq_taxid> <tseq_orgname> Pseudoarcobacter acticola </tseq_orgname> <tseq_defline>
#!/usr/bin/env python3
from taxadb.taxid import TaxID
import fileinput

# For each line delivered by fileinput.input(), strip the trailing
# whitespace, pass the remaining text to TaxID.sci_name() and print the
# result on its own line.
taxid = TaxID()
for raw_line in fileinput.input():
    query = raw_line.rstrip()
    print(taxid.sci_name(query))