import os
import sys
import argparse
import taxon
import re

# Build a protein-ID -> genome-ID lookup from the tab-separated files in
# `ffdir`, after loading the taxonomy tables.

ffdir = 'flatfiles'  # the location of our genbank flatfiles
blastpf = 'phage_genes.nr.blastp'

genome = {}  # hash with protein IDs as key and genome as value
numprots = {}  # hash with genome ID as key and number of proteins as value

taxondir = "/home2/db/taxonomy/current/"
sys.stderr.write("Reading taxonomy\n")
taxa = taxon.read_nodes(directory=taxondir)
names, blastname = taxon.read_names(directory=taxondir)
sys.stderr.write("Read taxonomy\n")

# First read every file in the flatfile directory and record which genome
# each protein ID belongs to. Column 0 is stored as the genome and column 5
# is used as the protein ID key.
#
# Duplicate protein IDs are reported on stderr. The script does not stop:
# the assignment below still runs, so the most recently read genome
# replaces the earlier one.
for f in os.listdir(ffdir):
    with open(os.path.join(ffdir, f), 'r') as fin:
        for l in fin:
            p = l.strip().split("\t")
            if p[5] in genome:
                # `genome` is keyed by protein ID, so the previously seen
                # genome must be looked up with p[5]. Looking it up with the
                # genome ID (p[0]) raised KeyError or named the wrong genome.
                sys.stderr.write(
                    "FATAL AND BUGGER: {} was found in {} and {}\n".format(
                        p[5], genome[p[5]], p[0]))
            genome[p[5]] = p[0]
import os
import sys
import argparse
import taxon

# Command-line script: read a PATRIC metadata table and write it back out
# with taxonomy rank columns appended to the header line.

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Append taxonomy to the patric metadata file. This adds it at column 67",
    )
    parser.add_argument('-f', required=True, help='patric metadata file')
    parser.add_argument('-o', required=True, help='output file')
    parser.add_argument(
        '-t',
        default='/home2/db/taxonomy/current/',
        help='taxonomy directory (default=/home2/db/taxonomy/current/)',
    )
    parser.add_argument('-v', action="store_true", help='verbose output')
    args = parser.parse_args()

    # Load the taxonomy tables from the chosen directory.
    sys.stderr.write("Reading taxonomy\n")
    taxa = taxon.read_nodes(directory=args.t)
    names, blastname = taxon.read_names(directory=args.t)
    divs = taxon.read_divisions(directory=args.t)
    sys.stderr.write("Read taxonomy\n")

    # Ranks whose names become the extra header columns, in output order.
    want = [
        'superkingdom',
        'phylum',
        'class',
        'order',
        'family',
        'genus',
        'species',
    ]

    # The output file is opened (and truncated) before the input is opened.
    with open(args.o, 'w', encoding='utf-8') as out, \
            open(args.f, 'r', encoding='utf-8') as f:
        for l in f:
            p = l.strip().split("\t")

            # Pad short rows with empty fields so every row has at least
            # 69 columns.
            shortfall = 69 - len(p)
            if shortfall > 0:
                p.extend([""] * shortfall)

            # The header row is echoed (stripped) with the rank names added.
            if l.startswith("genome_id"):
                out.write(l.strip() + "\t" + "\t".join(want) + "\n")
import os
import sys
import argparse
import taxon

__author__ = 'Rob Edwards'

# Command-line script: read a tab-separated file and, for each data row,
# collect the names of selected taxonomic ranks for the id found in the
# third column.

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Parse a tsv file and add taxonomy')
    parser.add_argument('-f', help='tab seperated values', required=True)
    args = parser.parse_args()

    # Ranks to report, in output-column order.
    want = ['superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

    # Load the taxonomy tables using the taxon module's default location.
    sys.stderr.write("Reading databases\n")
    taxa = taxon.read_nodes()
    names, blastname = taxon.read_names()
    sys.stderr.write("Done\n")

    with open(args.f, 'r') as f:
        for l in f:
            # Lines starting with '#' are treated as headers: echo them with
            # the rank names appended as extra tab-separated columns.
            if l.startswith("#"):
                print("{}\t".format(l.strip()) + "\t".join(want))
                continue

            p = l.strip().split("\t")
            # One slot per wanted rank; ranks that are never seen stay "".
            m = ["" for w in want]
            # The third column is used as the key into `taxa`.
            i = p[2]
            if i in taxa:
                # Loop while neither this node nor its parent is '1'
                # (presumably the root taxonomy id -- TODO confirm).
                # NOTE(review): the statement that moves `i` on to its parent
                # is not visible in this excerpt; without it the loop would
                # not terminate -- confirm against the full file.
                while taxa[i].parent != '1' and i != '1':
                    if taxa[i].rank in want:
                        # Store this node's name in the slot for its rank.
                        m[want.index(taxa[i].rank)] = names[i].name
__author__ = 'Rob Edwards'

# Command-line script: read a tab-separated file and, for each data row,
# collect the names of selected taxonomic ranks for the id found in the
# third column. The imports this code relies on (argparse, sys, taxon) are
# not visible in this excerpt.

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Parse a tsv file and add taxonomy')
    parser.add_argument('-f', help='tab seperated values', required=True)
    args = parser.parse_args()

    # Ranks to report, in output-column order.
    want = [
        'superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family',
        'genus', 'species'
    ]

    # Load the taxonomy tables using the taxon module's default location.
    sys.stderr.write("Reading databases\n")
    taxa = taxon.read_nodes()
    names, blastname = taxon.read_names()
    sys.stderr.write("Done\n")

    with open(args.f, 'r') as f:
        for l in f:
            # Header lines (leading '#') are echoed with the rank names
            # appended as additional tab-separated columns.
            if l.startswith("#"):
                print("{}\t".format(l.strip()) + "\t".join(want))
                continue

            p = l.strip().split("\t")
            # One empty slot per wanted rank, filled in as ranks are found.
            m = ["" for w in want]
            # The third column is the key looked up in `taxa`.
            i = p[2]
            if i in taxa:
                # Continue while neither the node nor its parent is '1'
                # (presumably the taxonomy root -- TODO confirm).
                # NOTE(review): no statement advancing `i` to its parent is
                # visible in this excerpt; as shown the loop cannot make
                # progress -- check the remainder of the file.
                while taxa[i].parent != '1' and i != '1':
                    if taxa[i].rank in want:
                        # Record the node's name under its rank's column.
                        m[want.index(taxa[i].rank)] = names[i].name
import argparse
import sys
import taxon
import re

# Command-line script: read one or more tab-separated blast output files
# and, for every taxonomy id listed on each hit line, start building a
# result row of [query, taxid, ...].

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Read a blast file and create a tuple of [query / kingdom / phylum / genus / species]")
    # required=True: without it, omitting -f leaves args.f as None and the
    # loop over the files below fails with a TypeError instead of a usage
    # message.
    parser.add_argument('-f', help='blast file(s). Note this must have taxids as column 14. You may specify more than one file', nargs='+', required=True)
    parser.add_argument('-t', help='taxonomy directory (default=/home2/db/taxonomy/current/)', default='/home2/db/taxonomy/current/')
    parser.add_argument('-v', help='verbose output', action="store_true")
    args = parser.parse_args()

    # Ranks of interest for the output tuple.
    want = ['superkingdom', 'phylum', 'genus', 'species']

    # Progress messages go to stderr and only appear with -v.
    if args.v:
        sys.stderr.write("Reading taxonomy\n")
    taxa = taxon.read_nodes(directory=args.t)
    names, blastname = taxon.read_names(directory=args.t)
    if args.v:
        sys.stderr.write("Read taxonomy\n")

    for blastf in args.f:
        if args.v:
            sys.stderr.write("Reading {}\n".format(blastf))

        with open(blastf, 'r') as fin:
            for l in fin:
                p = l.strip().split("\t")
                # The taxonomy field may hold several ids separated by ';',
                # and each id is handled separately.
                # NOTE(review): the help text says "column 14", while index
                # 14 is the fifteenth tab-separated field -- confirm which
                # numbering is intended.
                for tid in p[14].split(";"):
                    level = {}
                    # Each result row starts with the query id (first
                    # column) and the taxonomy id being processed.
                    results = [p[0], tid]