def main():
    """Filter a UniRef FASTA file, keeping entries from the requested clades.

    Command-line arguments:
        -i / --input_fasta   FASTA file to read.
        -o / --output_fasta  FASTA file to create.
        -c / --clades        Comma-separated clade names to keep.
        -db / --taxadb       Path to the taxadb sqlite3 file.

    Each header line (starting with '>') is searched for ``TaxID=<digits>``.
    The lineage of that ID is looked up through ``TaxID.lineage_name`` and the
    entry (header plus following sequence lines) is written to the output if
    any requested clade appears in the lineage.  Headers without a TaxID, or
    whose lineage lookup returns None, are dropped.  A per-clade count of
    exported entries is printed at the end.

    Raises:
        Exception: if the taxadb file does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Filter a Uniref FASTA file by taxonomy')
    parser.add_argument('-i', '--input_fasta', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o', '--output_fasta', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-c', '--clades', type=str, required=True,
                        help='Comma-separated string of clades to be included.')
    parser.add_argument('-db', '--taxadb', type=str, required=True,
                        help='Path to the taxadb sqlite3 file')
    args = parser.parse_args()

    if not os.path.exists(args.taxadb):
        raise Exception("ERROR: Couldn't find taxadb specified")

    taxid = TaxID(dbtype='sqlite', dbname=args.taxadb)

    # Tolerate "A, B" style input: surrounding whitespace can never be part
    # of a match against a lineage name, and empty items are meaningless.
    clades = [name.strip() for name in args.clades.split(',') if name.strip()]
    clade_counter = {clade: 0 for clade in clades}

    record_count = 0
    print_every = 1000

    # Compiled once; raw string so that \d is a regex class, not a str escape.
    taxid_pattern = re.compile(r'TaxID=(\d+)')

    # tax ID -> name of the first requested clade found in its lineage, or
    # None when the entry must be dropped (no match / unknown ID).  Caching
    # the clade name (rather than a bare bool) lets cached hits be counted.
    clade_cache = {}

    # Applies to the current record: set on each header line, then reused for
    # the sequence lines that follow it.
    keep_entry = False

    with open(args.input_fasta) as fin, \
            open(args.output_fasta, 'wt') as fout:
        for line in fin:
            if line.startswith('>'):
                record_count += 1
                if record_count % print_every == 0:
                    print("{0} records processed ...".format(record_count),
                          flush=True)

                matched_clade = None
                m = taxid_pattern.search(line)
                if m:
                    tax_id = m.group(1)
                    if tax_id in clade_cache:
                        matched_clade = clade_cache[tax_id]
                    else:
                        lineage = taxid.lineage_name(tax_id, reverse=True)
                        if lineage is not None:
                            matched_clade = next(
                                (c for c in clades if c in lineage), None)
                        # Negative results are cached too, so an unknown ID
                        # is only looked up in the database once.
                        clade_cache[tax_id] = matched_clade

                keep_entry = matched_clade is not None
                if keep_entry:
                    # Count every exported entry, not just the first one
                    # seen for each tax ID.
                    clade_counter[matched_clade] += 1

            if keep_entry:
                fout.write(line)

    print("Number of entries exported by clade:")
    for clade, count in clade_counter.items():
        print("\t{0}: {1}".format(clade, count))
#!/usr/bin/env python3
# Reads one taxon ID per line from standard input and prints
# "<taxid>\t<lineage>" for each, where the lineage names are joined by "; ".
# Lineages come from the taxadb sqlite database at ~/.taxadb/taxadb.sqlite.

import sys
from os.path import expanduser

from taxadb.taxid import TaxID

handler = TaxID(dbtype="sqlite",
                dbname=expanduser("~") + "/.taxadb/taxadb.sqlite")

for taxid in sys.stdin:
    taxid = taxid.strip()
    if not taxid:
        # Blank input lines carry no ID to look up.
        continue
    lineage = handler.lineage_name(taxid, reverse=True)
    # lineage_name may return None (e.g. for an ID missing from the
    # database); report that as an empty lineage rather than crashing
    # on len(None).
    lineage_s = "; ".join(lineage) if lineage else ""
    print("{}\t{}".format(taxid, lineage_s))
def main():
    """Filter a UniRef FASTA file, keeping entries from the requested clades.

    Command-line arguments:
        -i / --input_fasta   FASTA file to read.
        -o / --output_fasta  FASTA file to create.
        -c / --clades        Comma-separated clade names to keep.
        -db / --taxadb       Path to the taxadb sqlite3 file.

    Each header line (starting with '>') is searched for ``TaxID=<digits>``.
    The lineage of that ID is looked up through ``TaxID.lineage_name`` and the
    entry (header plus following sequence lines) is written to the output if
    any requested clade appears in the lineage.  Headers without a TaxID, or
    whose lineage lookup returns None, are dropped.  A per-clade count of
    exported entries is printed at the end.

    Raises:
        Exception: if the taxadb file does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Filter a Uniref FASTA file by taxonomy')
    parser.add_argument('-i', '--input_fasta', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o', '--output_fasta', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-c', '--clades', type=str, required=True,
                        help='Comma-separated string of clades to be included.')
    parser.add_argument('-db', '--taxadb', type=str, required=True,
                        help='Path to the taxadb sqlite3 file')
    args = parser.parse_args()

    if not os.path.exists(args.taxadb):
        raise Exception("ERROR: Couldn't find taxadb specified")

    taxid = TaxID(dbtype='sqlite', dbname=args.taxadb)

    # Tolerate "A, B" style input: surrounding whitespace can never be part
    # of a match against a lineage name, and empty items are meaningless.
    clades = [name.strip() for name in args.clades.split(',') if name.strip()]
    clade_counter = {clade: 0 for clade in clades}

    record_count = 0
    print_every = 1000

    # Compiled once; raw string so that \d is a regex class, not a str escape.
    taxid_pattern = re.compile(r'TaxID=(\d+)')

    # tax ID -> name of the first requested clade found in its lineage, or
    # None when the entry must be dropped (no match / unknown ID).  Caching
    # the clade name (rather than a bare bool) lets cached hits be counted.
    clade_cache = {}

    # Applies to the current record: set on each header line, then reused for
    # the sequence lines that follow it.
    keep_entry = False

    with open(args.input_fasta) as fin, \
            open(args.output_fasta, 'wt') as fout:
        for line in fin:
            if line.startswith('>'):
                record_count += 1
                if record_count % print_every == 0:
                    print("{0} records processed ...".format(record_count),
                          flush=True)

                matched_clade = None
                m = taxid_pattern.search(line)
                if m:
                    tax_id = m.group(1)
                    if tax_id in clade_cache:
                        matched_clade = clade_cache[tax_id]
                    else:
                        lineage = taxid.lineage_name(tax_id, reverse=True)
                        if lineage is not None:
                            matched_clade = next(
                                (c for c in clades if c in lineage), None)
                        # Negative results are cached too, so an unknown ID
                        # is only looked up in the database once.
                        clade_cache[tax_id] = matched_clade

                keep_entry = matched_clade is not None
                if keep_entry:
                    # Count every exported entry, not just the first one
                    # seen for each tax ID.
                    clade_counter[matched_clade] += 1

            if keep_entry:
                fout.write(line)

    print("Number of entries exported by clade:")
    for clade, count in clade_counter.items():
        print("\t{0}: {1}".format(clade, count))