def main(argv=None):
    opts = parse_args(argv)

    # Configure
    if opts.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    taxa_db = NcbiEutils(opts.cache_fp)
    taxa_db.load_cache()

    consensus_thresholds = [t for _, t in CONSENSUS_THRESHOLDS]
    assigner = Assigner(
        opts.min_cover, opts.min_species_id, opts.min_genus_id,
        opts.min_id, consensus_thresholds, opts.max_generic, taxa_db)

    # Read input files
    with open(opts.fasta_file) as f:
        sequences = list(iter_fasta(f))
    with open(opts.blast_file) as f:
        blast_hits = read_blast(f)

    # Open output files
    if not os.path.exists(opts.output_directory):
        os.mkdir(opts.output_directory)
    output_file = open(
        os.path.join(opts.output_directory, "Full_Taxonomy.txt"), 'w')
    standard_taxa_file = open(
        os.path.join(opts.output_directory, "Standard_Taxonomy.txt"), "w")
    log_file = open(os.path.join(opts.output_directory, "brocc.log"), "w")
    log_file.write(
        "Sequence\tWinner_Votes\tVotes_Cast\tGenerics_Pruned\tLevel\t"
        "Classification\n")

    # Do the work
    for name, seq in sequences:
        seq_hits = blast_hits[name]
        # This is where the magic happens
        a = assigner.assign(name, seq, seq_hits)
        output_file.write(a.format_for_full_taxonomy())
        standard_taxa_file.write(a.format_for_standard_taxonomy())
        log_file.write(a.format_for_log())

    # Close output files, write cache
    output_file.close()
    standard_taxa_file.close()
    log_file.close()
    taxa_db.save_cache()

class NcbiEutilsTests(unittest.TestCase):
    def setUp(self):
        self.cache_file = tempfile.NamedTemporaryFile(suffix=".json")
        self.db = NcbiEutils(self.cache_file.name)

    def test_save_load_cache(self):
        lineages = {
            "taxon1": {'class': "a", "genus": "b"},
            "taxon2": {'class': "c", "genus": "d"},
        }
        taxon_ids = {"taxon1": "b", "taxon2": "d"}
        self.db.lineages = lineages
        self.db.taxon_ids = taxon_ids
        self.db._fresh = False
        self.db.save_cache()

        db2 = NcbiEutils(self.cache_file.name)
        db2.load_cache()
        self.assertEqual(db2.lineages, lineages)
        self.assertEqual(db2.taxon_ids, taxon_ids)

    def test_get_taxon_id(self):
        self.assertEqual(self.db.get_taxon_id("312434489"), "531911")
        self.assertEqual(self.db.taxon_ids, {"312434489": "531911"})

    def test_get_lineage(self):
        observed_lineage = self.db.get_lineage("531911")
        expected_lineage = {
            'Lineage': (
                'cellular organisms; Eukaryota; Opisthokonta; Fungi; Dikarya; '
                'Ascomycota; saccharomyceta; Pezizomycotina; leotiomyceta; '
                'sordariomyceta; Sordariomycetes; Xylariomycetidae; '
                'Xylariales; Amphisphaeriaceae; Pestalotiopsis'),
            'class': 'Sordariomycetes',
            'family': 'Amphisphaeriaceae',
            'genus': 'Pestalotiopsis',
            'kingdom': 'Fungi',
            'no rank': 'sordariomyceta',
            'order': 'Xylariales',
            'phylum': 'Ascomycota',
            'species': 'Pestalotiopsis maculiformans',
            'subclass': 'Xylariomycetidae',
            'subkingdom': 'Dikarya',
            'subphylum': 'Pezizomycotina',
            'superkingdom': 'Eukaryota',
        }
        self.assertEqual(observed_lineage, expected_lineage)
        self.assertEqual(self.db.lineages, {'531911': expected_lineage})

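# Illustrative sketch (not part of the module): the same cache round-trip the
# test class above exercises, written as a standalone lookup. The cache path
# and the default file name are placeholders, and it assumes load_cache()
# tolerates a missing cache file, as the unconditional call in main() suggests.
def cached_lineage_lookup(accession, cache_fp="taxa_cache.json"):
    db = NcbiEutils(cache_fp)
    db.load_cache()                       # reuse results saved by earlier runs
    taxon_id = db.get_taxon_id(accession)
    lineage = db.get_lineage(taxon_id)    # stored in db.lineages as a side effect
    db.save_cache()                       # persist new lookups for the next run
    return lineage
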
def test_get_lineage(self):
    db = NcbiEutils()
    observed_lineage = db.get_lineage("531911")
    expected_lineage = [
        ('cellular organisms', 'no rank'),
        ('Eukaryota', 'superkingdom'),
        ('Opisthokonta', 'no rank'),
        ('Fungi', 'kingdom'),
        ('Dikarya', 'subkingdom'),
        ('Ascomycota', 'phylum'),
        ('saccharomyceta', 'no rank'),
        ('Pezizomycotina', 'subphylum'),
        ('leotiomyceta', 'no rank'),
        ('sordariomyceta', 'no rank'),
        ('Sordariomycetes', 'class'),
        ('Xylariomycetidae', 'subclass'),
        ('Xylariales', 'order'),
        ('Sporocadaceae', 'family'),
        ('Pestalotiopsis', 'genus'),
        ('Pestalotiopsis maculiformans', 'species'),
    ]
    self.assertEqual(observed_lineage, expected_lineage)
    self.assertEqual(db.lineages, {'531911': expected_lineage})

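# Hypothetical helper (not part of the module): collapse the ordered
# (name, rank) pairs returned above into a rank-keyed dict like the one the
# older test expected. Repeated 'no rank' entries collapse to the last one
# seen, so the dict form is lossier than the ordered list.
def lineage_as_dict(lineage_tuples):
    return {rank: name for name, rank in lineage_tuples}

# e.g. lineage_as_dict(expected_lineage)["genus"] == "Pestalotiopsis"
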
def test_get_taxon_id(self):
    db = NcbiEutils()
    self.assertEqual(db.get_taxon_id("HQ608011.1"), "531911")
    self.assertEqual(db.taxon_ids, {"HQ608011.1": "531911"})

def setUp(self): self.cache_file = tempfile.NamedTemporaryFile(suffix=".json") self.db = NcbiEutils(self.cache_file.name)
def main(argv=None):
    opts = parse_args(argv)

    # Configure
    if opts.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    if os.path.exists(opts.taxonomy_db):
        taxa_db = NcbiLocal(opts.taxonomy_db)
    else:
        sys.stderr.write(
            "Did not detect a local copy of the NCBI taxonomy.\n"
            "Using NCBI EUtils to get taxonomic info instead.\n\n"
            "The NCBI taxonomy can be downloaded with the script "
            "create_local_taxonomy_db.py\n"
            "This will greatly speed up the assignment process.\n")
        taxa_db = NcbiEutils()

    consensus_thresholds = [t for _, t in CONSENSUS_THRESHOLDS]
    assigner = Assigner(
        opts.min_cover, opts.min_species_id, opts.min_genus_id,
        opts.min_id, consensus_thresholds, opts.min_winning_votes, taxa_db)

    # Read input files
    with open(opts.fasta_file) as f:
        sequences = list(iter_fasta(f))
    with open(opts.blast_file) as f:
        blast_hits = read_blast(f)

    # Open output files
    if not os.path.exists(opts.output_directory):
        os.mkdir(opts.output_directory)
    standard_taxa_file = open(
        os.path.join(opts.output_directory, "Standard_Taxonomy.txt"), "w")
    log_file = open(os.path.join(opts.output_directory, "brocc.log"), "w")
    log_file.write(
        "Sequence\tWinner_Votes\tVotes_Cast\tGenerics_Pruned\tLevel\t"
        "Classification\n")

    # Set up log for voting details
    vote_logger = logging.getLogger("brocc.votes")
    vote_logger.setLevel(logging.DEBUG)
    vote_handler = logging.FileHandler(
        os.path.join(opts.output_directory, "voting_log.txt"))
    vote_handler.setLevel(logging.DEBUG)
    vote_formatter = logging.Formatter('%(message)s')
    vote_handler.setFormatter(vote_formatter)
    vote_logger.addHandler(vote_handler)
    vote_logger.propagate = False

    # Do the work
    for name, seq in sequences:
        seq_hits = blast_hits[name]
        # This is where the magic happens
        a = assigner.assign(name, seq, seq_hits)
        standard_taxa_file.write(a.format_for_standard_taxonomy())
        log_file.write(a.format_for_log())

    # Close output files
    standard_taxa_file.close()
    log_file.close()
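
# Illustrative sketch (not part of the module): NcbiLocal and NcbiEutils are
# handed to Assigner interchangeably above, so a minimal stand-in presumably
# only needs the lookup methods the tests exercise, get_taxon_id() and
# get_lineage(). Whether Assigner calls anything beyond these is an
# assumption; a stub like this is only meant for exercising Assigner without
# network access or a local taxonomy database.
class StubTaxonomyDb(object):
    def __init__(self, taxon_ids, lineages):
        self.taxon_ids = taxon_ids    # accession or GI -> taxon id
        self.lineages = lineages      # taxon id -> lineage

    def get_taxon_id(self, accession):
        return self.taxon_ids[accession]

    def get_lineage(self, taxon_id):
        return self.lineages[taxon_id]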