Exemple #1
0
def main(argv=None):
    """Assign taxonomy to every input sequence and write the result files.

    Options come from ``parse_args(argv)``.  The FASTA file and the BLAST
    file named in the options are read, each sequence is passed to
    ``Assigner.assign`` together with its BLAST hits, and three files are
    written into ``opts.output_directory``: ``Full_Taxonomy.txt``,
    ``Standard_Taxonomy.txt`` and ``brocc.log``.  The taxonomy cache is
    loaded before the run and saved after it completes.

    Args:
        argv: Argument list forwarded unchanged to ``parse_args``.
    """
    opts = parse_args(argv)

    # Configure

    if opts.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    taxa_db = NcbiEutils(opts.cache_fp)
    taxa_db.load_cache()

    # Only the threshold half of each CONSENSUS_THRESHOLDS pair is needed.
    consensus_thresholds = [t for _, t in CONSENSUS_THRESHOLDS]
    assigner = Assigner(
        opts.min_cover, opts.min_species_id, opts.min_genus_id, opts.min_id,
        consensus_thresholds, opts.max_generic, taxa_db)

    # Read input files

    with open(opts.fasta_file) as f:
        sequences = list(iter_fasta(f))

    with open(opts.blast_file) as f:
        blast_hits = read_blast(f)

    # Open output files

    if not os.path.exists(opts.output_directory):
        os.mkdir(opts.output_directory)
    full_taxonomy_fp = os.path.join(
        opts.output_directory, "Full_Taxonomy.txt")
    standard_taxonomy_fp = os.path.join(
        opts.output_directory, "Standard_Taxonomy.txt")
    log_fp = os.path.join(opts.output_directory, "brocc.log")

    # The with-statement guarantees that all three files are closed even
    # if an assignment raises part-way through the run.
    with open(full_taxonomy_fp, 'w') as output_file, \
            open(standard_taxonomy_fp, "w") as standard_taxa_file, \
            open(log_fp, "w") as log_file:
        log_file.write(
            "Sequence\tWinner_Votes\tVotes_Cast\tGenerics_Pruned\tLevel\t"
            "Classification\n")

        # Do the work

        for name, seq in sequences:
            seq_hits = blast_hits[name]
            # This is where the magic happens
            a = assigner.assign(name, seq, seq_hits)

            output_file.write(a.format_for_full_taxonomy())
            standard_taxa_file.write(a.format_for_standard_taxonomy())
            log_file.write(a.format_for_log())

    # Write cache (only reached when the whole run succeeded)

    taxa_db.save_cache()
Exemple #2
0
    def test_save_load_cache(self):
        """Data saved by one NcbiEutils can be loaded by another.

        Both instances are constructed with the same cache file name.
        """
        expected_lineages = {
            "taxon1": {'class': "a", "genus": "b"},
            "taxon2": {'class': "c", "genus": "d"},
        }
        expected_taxon_ids = {"taxon1": "b", "taxon2": "d"}

        # Populate the first instance and write its state to the cache.
        writer = self.db
        writer.lineages = expected_lineages
        writer.taxon_ids = expected_taxon_ids
        writer._fresh = False
        writer.save_cache()

        # A second instance on the same file must read the same data back.
        reader = NcbiEutils(self.cache_file.name)
        reader.load_cache()
        self.assertEqual(reader.lineages, expected_lineages)
        self.assertEqual(reader.taxon_ids, expected_taxon_ids)
Exemple #3
0
class NcbiEutilsTests(unittest.TestCase):
    """Tests for NcbiEutils: cache persistence, taxon-id and lineage lookup."""

    def setUp(self):
        """Create a temporary JSON cache file and an NcbiEutils bound to it."""
        self.cache_file = tempfile.NamedTemporaryFile(suffix=".json")
        # Close the temporary file after every test so the handle is not
        # left open until garbage collection.
        self.addCleanup(self.cache_file.close)
        self.db = NcbiEutils(self.cache_file.name)

    def test_save_load_cache(self):
        """Data saved by one instance is loaded by another on the same file."""
        lineages = {
            "taxon1": {'class': "a", "genus": "b"},
            "taxon2": {'class': "c", "genus": "d"},
            }
        taxon_ids = {"taxon1": "b", "taxon2": "d"}
        self.db.lineages = lineages
        self.db.taxon_ids = taxon_ids
        self.db._fresh = False
        self.db.save_cache()

        db2 = NcbiEutils(self.cache_file.name)
        db2.load_cache()
        self.assertEqual(db2.lineages, lineages)
        self.assertEqual(db2.taxon_ids, taxon_ids)

    def test_get_taxon_id(self):
        """get_taxon_id returns the taxon id and records it in taxon_ids."""
        # NOTE(review): presumably this queries NCBI over the network --
        # confirm before running in an offline environment.
        self.assertEqual(self.db.get_taxon_id("312434489"), "531911")
        self.assertEqual(self.db.taxon_ids, {"312434489": "531911"})

    def test_get_lineage(self):
        """get_lineage returns a rank dict and records it in lineages."""
        # NOTE(review): presumably this queries NCBI over the network --
        # confirm before running in an offline environment.
        observed_lineage = self.db.get_lineage("531911")
        expected_lineage = {
            'Lineage': (
                'cellular organisms; Eukaryota; Opisthokonta; Fungi; Dikarya; '
                'Ascomycota; saccharomyceta; Pezizomycotina; leotiomyceta; '
                'sordariomyceta; Sordariomycetes; Xylariomycetidae; '
                'Xylariales; Amphisphaeriaceae; Pestalotiopsis'),
            'class': 'Sordariomycetes',
            'family': 'Amphisphaeriaceae',
            'genus': 'Pestalotiopsis',
            'kingdom': 'Fungi',
            'no rank': 'sordariomyceta',
            'order': 'Xylariales',
            'phylum': 'Ascomycota',
            'species': 'Pestalotiopsis maculiformans',
            'subclass': 'Xylariomycetidae',
            'subkingdom': 'Dikarya',
            'subphylum': 'Pezizomycotina',
            'superkingdom': 'Eukaryota',
            }
        self.assertEqual(observed_lineage, expected_lineage)
        self.assertEqual(self.db.lineages, {'531911': expected_lineage})
Exemple #4
0
 def test_get_lineage(self):
     """get_lineage returns (name, rank) pairs and records them in lineages."""
     taxon_id = "531911"
     expected_lineage = [
         ('cellular organisms', 'no rank'),
         ('Eukaryota', 'superkingdom'),
         ('Opisthokonta', 'no rank'),
         ('Fungi', 'kingdom'),
         ('Dikarya', 'subkingdom'),
         ('Ascomycota', 'phylum'),
         ('saccharomyceta', 'no rank'),
         ('Pezizomycotina', 'subphylum'),
         ('leotiomyceta', 'no rank'),
         ('sordariomyceta', 'no rank'),
         ('Sordariomycetes', 'class'),
         ('Xylariomycetidae', 'subclass'),
         ('Xylariales', 'order'),
         ('Sporocadaceae', 'family'),
         ('Pestalotiopsis', 'genus'),
         ('Pestalotiopsis maculiformans', 'species'),
     ]

     eutils = NcbiEutils()
     self.assertEqual(eutils.get_lineage(taxon_id), expected_lineage)
     # The looked-up lineage must also be stored under its taxon id.
     self.assertEqual(eutils.lineages, {taxon_id: expected_lineage})
Exemple #5
0
def main(argv=None):
    """Assign taxonomy to every input sequence and write the result files.

    Options come from ``parse_args(argv)``.  The FASTA file and the BLAST
    file named in the options are read, each sequence is passed to
    ``Assigner.assign`` together with its BLAST hits, and three files are
    written into ``opts.output_directory``: ``Full_Taxonomy.txt``,
    ``Standard_Taxonomy.txt`` and ``brocc.log``.  The taxonomy cache is
    loaded before the run and saved after it completes.

    Args:
        argv: Argument list forwarded unchanged to ``parse_args``.
    """
    opts = parse_args(argv)

    # Configure

    if opts.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    taxa_db = NcbiEutils(opts.cache_fp)
    taxa_db.load_cache()

    # Only the threshold half of each CONSENSUS_THRESHOLDS pair is needed.
    consensus_thresholds = [t for _, t in CONSENSUS_THRESHOLDS]
    assigner = Assigner(opts.min_cover, opts.min_species_id, opts.min_genus_id,
                        opts.min_id, consensus_thresholds, opts.max_generic,
                        taxa_db)

    # Read input files

    with open(opts.fasta_file) as f:
        sequences = list(iter_fasta(f))

    with open(opts.blast_file) as f:
        blast_hits = read_blast(f)

    # Open output files

    if not os.path.exists(opts.output_directory):
        os.mkdir(opts.output_directory)
    full_taxonomy_fp = os.path.join(
        opts.output_directory, "Full_Taxonomy.txt")
    standard_taxonomy_fp = os.path.join(
        opts.output_directory, "Standard_Taxonomy.txt")
    log_fp = os.path.join(opts.output_directory, "brocc.log")

    # The with-statement guarantees that all three files are closed even
    # if an assignment raises part-way through the run.
    with open(full_taxonomy_fp, 'w') as output_file, \
            open(standard_taxonomy_fp, "w") as standard_taxa_file, \
            open(log_fp, "w") as log_file:
        log_file.write(
            "Sequence\tWinner_Votes\tVotes_Cast\tGenerics_Pruned\tLevel\t"
            "Classification\n")

        # Do the work

        for name, seq in sequences:
            seq_hits = blast_hits[name]
            # This is where the magic happens
            a = assigner.assign(name, seq, seq_hits)

            output_file.write(a.format_for_full_taxonomy())
            standard_taxa_file.write(a.format_for_standard_taxonomy())
            log_file.write(a.format_for_log())

    # Write cache (only reached when the whole run succeeded)

    taxa_db.save_cache()
Exemple #6
0
 def test_get_taxon_id(self):
     """get_taxon_id returns the taxon id and records it in taxon_ids."""
     accession = "HQ608011.1"
     expected_taxon_id = "531911"

     eutils = NcbiEutils()
     self.assertEqual(eutils.get_taxon_id(accession), expected_taxon_id)
     # The lookup must also be stored under the queried accession.
     self.assertEqual(eutils.taxon_ids, {accession: expected_taxon_id})
Exemple #7
0
 def setUp(self):
     """Create a temporary JSON cache file and an NcbiEutils bound to it."""
     self.cache_file = tempfile.NamedTemporaryFile(suffix=".json")
     # Close the temporary file after every test so the handle is not
     # left open until garbage collection.
     self.addCleanup(self.cache_file.close)
     self.db = NcbiEutils(self.cache_file.name)
Exemple #8
0
def main(argv=None):
    """Assign taxonomy to every input sequence and write the result files.

    Options come from ``parse_args(argv)``.  If ``opts.taxonomy_db``
    exists an ``NcbiLocal`` database is used; otherwise a notice is written
    to stderr and ``NcbiEutils`` is used instead.  The FASTA and BLAST
    files are read, each sequence is passed to ``Assigner.assign`` with its
    BLAST hits, and ``Standard_Taxonomy.txt`` and ``brocc.log`` are written
    into ``opts.output_directory``.  Messages sent to the ``brocc.votes``
    logger during the run are written to ``voting_log.txt`` in the same
    directory.

    Args:
        argv: Argument list forwarded unchanged to ``parse_args``.
    """
    opts = parse_args(argv)

    # Configure

    if opts.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    if os.path.exists(opts.taxonomy_db):
        taxa_db = NcbiLocal(opts.taxonomy_db)
    else:
        sys.stderr.write(
            "Did not detect a local copy of the NCBI taxonomy.\n"
            "Using NCBI EUtils to get taxonomic info instead.\n\n"
            "The NCBI taxonomy can be dowloaded with the script "
            "create_local_taxonomy_db.py\n"
            "This will greatly speed up the assignment process.\n")
        taxa_db = NcbiEutils()

    # Only the threshold half of each CONSENSUS_THRESHOLDS pair is needed.
    consensus_thresholds = [t for _, t in CONSENSUS_THRESHOLDS]
    assigner = Assigner(opts.min_cover, opts.min_species_id, opts.min_genus_id,
                        opts.min_id, consensus_thresholds,
                        opts.min_winning_votes, taxa_db)

    # Read input files

    with open(opts.fasta_file) as f:
        sequences = list(iter_fasta(f))

    with open(opts.blast_file) as f:
        blast_hits = read_blast(f)

    # Open output files

    if not os.path.exists(opts.output_directory):
        os.mkdir(opts.output_directory)
    standard_taxonomy_fp = os.path.join(
        opts.output_directory, "Standard_Taxonomy.txt")
    log_fp = os.path.join(opts.output_directory, "brocc.log")

    # The with-statement guarantees that both files are closed even if an
    # assignment raises part-way through the run.
    with open(standard_taxonomy_fp, "w") as standard_taxa_file, \
            open(log_fp, "w") as log_file:
        log_file.write(
            "Sequence\tWinner_Votes\tVotes_Cast\tGenerics_Pruned\tLevel\t"
            "Classification\n")

        # Set up log for voting details
        vote_logger = logging.getLogger("brocc.votes")
        vote_logger.setLevel(logging.DEBUG)
        vote_handler = logging.FileHandler(
            os.path.join(opts.output_directory, "voting_log.txt"))
        try:
            vote_handler.setLevel(logging.DEBUG)
            vote_formatter = logging.Formatter('%(message)s')
            vote_handler.setFormatter(vote_formatter)
            vote_logger.addHandler(vote_handler)
            # Keep voting details out of the root logger's output.
            vote_logger.propagate = False

            # Do the work

            for name, seq in sequences:
                seq_hits = blast_hits[name]
                # This is where the magic happens
                a = assigner.assign(name, seq, seq_hits)

                standard_taxa_file.write(a.format_for_standard_taxonomy())
                log_file.write(a.format_for_log())
        finally:
            # Detach and close the handler so voting_log.txt is flushed
            # and a second call to main() does not stack handlers and
            # duplicate every voting line.
            vote_logger.removeHandler(vote_handler)
            vote_handler.close()