def _basta_taxonomy(self, args):
    """Download the NCBI taxonomy dump and build the complete taxonomy database.

    Fetches taxdump.tar.gz from the NCBI FTP server into args.output,
    unpacks it, flattens names.dmp/nodes.dmp into "complete_taxa" and
    loads that file into "complete_taxa.db".

    :param args: parsed command-line arguments; args.output is the target directory
    """
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    self.logger.info(
        "\n#### Downloading and processing NCBI taxonomy files\n")
    self.logger.info("\n# [BASTA STATUS] Download taxonomy files")
    dutils.down_and_check("ftp://ftp.ncbi.nih.gov/pub/taxonomy/",
                          "taxdump.tar.gz", args.output)
    # Use the stdlib tarfile module instead of shelling out to an external
    # "tar" binary, which may be missing on some platforms. The archive
    # comes from NCBI's official server; contents are names/nodes dumps.
    import tarfile
    with tarfile.open(os.path.join(args.output, "taxdump.tar.gz")) as tar:
        tar.extractall(args.output)
    self.logger.info(
        "\n# [BASTA STATUS] Creating complete taxonomy file\n")
    tax_creator = ntc.Creator(os.path.join(args.output, "names.dmp"),
                              os.path.join(args.output, "nodes.dmp"))
    tax_creator._write(os.path.join(args.output, "complete_taxa"))
    self.logger.info("\n# [BASTA STATUS] Creating taxonomy database")
    # Key column 0 (taxid), value column 1 (taxonomy string).
    dbutils.create_db(args.output, "complete_taxa.gz", "complete_taxa.db", 0, 1)
    self.logger.info(
        "\n### Done! NCBI taxonomy database created in %s ####" % (args.output))
def _fetch_mapping(args, logger):
    """Dump every accession/taxid pair of the mapping database to args.mapout."""
    logger.info("\n# [STATUS] Initializing mapping database")
    db_file = db.get_db_name(args.directory, args.dbtype)
    db_path = os.path.abspath(os.path.join(args.directory, db_file))
    map_lookup = db._init_db(db_path)
    with open(args.mapout, "w") as out_handle:
        for accession, taxon_id in map_lookup:
            out_handle.write("%s\t%s\n" % (accession, taxon_id))
def run_basta(self, args):
    """Dispatch to the handler for the chosen sub-command.

    Assignment sub-commands (sequence/single/multiple) additionally
    require the complete taxonomy database to already exist; the setup
    sub-commands (download/create_db/taxonomy) do not.

    :param args: parsed command-line arguments with args.subparser_name set
    """
    # Handlers that assign taxonomy and therefore need complete_taxa.db.
    assign_handlers = {
        'sequence': self._basta_sequence,
        'single': self._basta_single,
        'multiple': self._basta_multiple,
    }
    # Handlers that set up databases; no completeness check needed.
    setup_handlers = {
        'download': self._basta_download,
        'create_db': self._basta_create_db,
        'taxonomy': self._basta_taxonomy,
    }
    name = args.subparser_name
    if name in assign_handlers:
        self._check_dir(args)
        if not dbutils._check_complete(args.directory):
            self.logger.error("\n[BASTA ERROR] Couldn't find complete_taxa.db in %s. Did you run initial \'basta download\'?" % (args.directory))
            # Exit non-zero so scripts can detect the failure; the
            # original bare sys.exit() reported success (status 0).
            sys.exit(1)
        assign_handlers[name](args)
    elif name in setup_handlers:
        self._check_dir(args)
        setup_handlers[name](args)
def _basta_create_db(self, args):
    """Build a key/value mapping database from a user-supplied input file."""
    if not os.path.exists(args.directory):
        os.makedirs(args.directory)
    self.logger.info("\n#### Creating database\n")
    dbutils.create_db(args.directory, args.input, args.output,
                      args.key, args.value)
    self.logger.info("\n#### Done. Processed file %s\n" % (args.input))
def _basta_download(self, args):
    """Download an NCBI accession2taxid mapping file and build its database.

    The mapping file and local database name are selected by args.type;
    unknown types fall back to the GenBank nucleotide mapping.

    :param args: parsed arguments; uses args.directory, args.type, args.ftp
    """
    if not os.path.exists(args.directory):
        os.makedirs(args.directory)
    self.logger.info(
        "\n##### Downloading and processing mapping file(s) from NCBI ###\n"
    )
    # args.type -> (remote accession2taxid file, local database name)
    type_files = {
        "prot": ("prot.accession2taxid.gz", "prot_mapping.db"),
        "wgs": ("nucl_wgs.accession2taxid.gz", "wgs_mapping.db"),
        "gss": ("nucl_gss.accession2taxid.gz", "gss_mapping.db"),
        "est": ("nucl_est.accession2taxid.gz", "est_mapping.db"),
        "pdb": ("pdb.accession2taxid.gz", "pdb_mapping.db"),
    }
    map_file, db_file = type_files.get(
        args.type, ("nucl_gb.accession2taxid.gz", "gb_mapping.db"))
    self.logger.info("\n# [BASTA STATUS] Downloading mapping files\n")
    dutils.down_and_check(args.ftp, map_file, args.directory)
    self.logger.info("\n# [BASTA STATUS] Creating mapping database\n")
    # Key column 0 (accession), value column 2 (taxid).
    dbutils.create_db(args.directory, map_file, db_file, 0, 2)
    self.logger.info("\n##### Done. Downloaded and processed file %s\n" %
                     (map_file))
def _get_lookups(self, db_file):
    """Open the taxonomy and mapping databases; return both handles as a tuple."""
    self.logger.info("\n# [BASTA STATUS] Initializing taxonomy database")
    taxonomy = db._init_db(os.path.join(self.directory, "complete_taxa.db"))
    self.logger.info("\n# [BASTA STATUS] Initializing mapping database")
    mapping_path = os.path.abspath(os.path.join(self.directory, db_file))
    mapping = db._init_db(mapping_path)
    return (taxonomy, mapping)
def _basta_sequence(self, args):
    """Assign a taxonomy to every individual sequence in a BLAST result file."""
    self.logger.info("\n#### Assigning taxonomy to each sequence ###\n")
    db_file = dbutils.get_db_name(args.directory, args.type)
    assigner = AssignTaxonomy.Assigner(
        args.evalue, args.alen, args.identity, args.number, args.minimum,
        args.maj_perc, args.directory, args.config_path, args.output,
        args.hit_count)
    if args.verbose:
        assigner.info_file = args.verbose
    assigner._assign_sequence(args.blast, db_file, args.best_hit)
    self.logger.info("\n#### Done. Output written to %s" % (args.output))
def _basta_download(self, args):
    """Download a taxid mapping file (NCBI or UniProt) and build its database.

    For args.type == "uni" the UniProt idmapping file is used, which has a
    different source URL, download helper and taxid column; all other types
    map to an NCBI accession2taxid file, with GenBank nucleotide as the
    fallback for unknown types.

    :param args: parsed arguments; uses args.directory, args.type, args.ftp
    """
    if not os.path.exists(args.directory):
        os.makedirs(args.directory)
    self.logger.info(
        "\n##### Downloading and processing mapping file(s) from NCBI ###\n"
    )
    if args.type == "uni":
        # UniProt feeds the same "prot" database but from a different
        # file: taxid lives in column 12 and the plain download helper
        # is used (no checksum counterpart on the UniProt server).
        map_file = "idmapping_selected.tab.gz"
        db_file = "prot_mapping.db"
        self.logger.info("\n# [BASTA STATUS] Downloading mapping file\n")
        dutils.down(
            "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/",
            map_file, args.directory)
        self.logger.info("\n# [BASTA STATUS] Creating mapping database\n")
        dbutils.create_db(args.directory, map_file, db_file, 0, 12)
        return
    # args.type -> (remote accession2taxid file, local database name)
    type_files = {
        "prot": ("prot.accession2taxid.gz", "prot_mapping.db"),
        "wgs": ("nucl_wgs.accession2taxid.gz", "wgs_mapping.db"),
        "gss": ("nucl_gss.accession2taxid.gz", "gss_mapping.db"),
        "est": ("nucl_est.accession2taxid.gz", "est_mapping.db"),
        "pdb": ("pdb.accession2taxid.gz", "pdb_mapping.db"),
    }
    map_file, db_file = type_files.get(
        args.type, ("nucl_gb.accession2taxid.gz", "gb_mapping.db"))
    self.logger.info("\n# [BASTA STATUS] Downloading mapping files\n")
    dutils.down_and_check(args.ftp, map_file, args.directory)
    self.logger.info("\n# [BASTA STATUS] Creating mapping database\n")
    # Key column 0 (accession), value column 2 (taxid).
    dbutils.create_db(args.directory, map_file, db_file, 0, 2)
    self.logger.info("\n##### Done. Downloaded and processed file %s\n" %
                     (map_file))
def _basta_single(self, args):
    """Derive one last-common-ancestor taxonomy from all sequences combined."""
    self.logger.info("\n#### Assigning one taxonomy based on all sequences ###\n")
    db_file = dbutils.get_db_name(args.directory, args.type)
    assigner = AssignTaxonomy.Assigner(
        args.evalue, args.alen, args.identity, args.number, args.minimum,
        args.maj_perc, args.directory, args.config_path, args.output,
        args.hit_count)
    if args.verbose:
        assigner.info_file = args.verbose
    lca = assigner._assign_single(args.blast, db_file, args.best_hit)
    self.logger.info("\n##### Results #####\n")
    self.logger.info("Last Common Ancestor: %s\n" % (lca))
    self.logger.info("\n###################\n")
def _basta_multiple(self, args):
    """Assign one last-common-ancestor taxonomy per input BLAST file.

    :param args: parsed arguments; args.blast names the input file(s),
        args.output the result file
    """
    self.logger.info("\n#### Assigning one taxonomy for each file ###\n")
    # (A dead `db_file = ""` pre-assignment was removed; the value was
    # always overwritten immediately.)
    db_file = dbutils.get_db_name(args.directory, args.type)
    assigner = AssignTaxonomy.Assigner(
        args.evalue, args.alen, args.identity, args.number, args.minimum,
        args.maj_perc, args.directory, args.config_path, args.output,
        args.hit_count, args.quiet)
    if args.verbose:
        # Start from a fresh info file so results from a previous run
        # are not mixed in.
        if os.path.exists(args.verbose):
            os.remove(args.verbose)
        assigner.info_file = args.verbose
    assigner._assign_multiple(args.blast, db_file, args.best_hit)
    self.logger.info("\n###### Done. Output written to %s" % (args.output))
def _fetch_taxonomies(args, logger):
    """Write a "taxid<TAB>taxonomy" line to args.dbout for every mapped taxid.

    Taxids present in the mapping database but absent from the taxonomy
    database are skipped; each missing taxid is warned about only once.

    :param args: parsed arguments; uses args.directory, args.dbtype, args.dbout
    :param logger: logger for status/warning messages
    """
    logger.info("\n# [STATUS] Initializing taxonomy database")
    db_file = db.get_db_name(args.directory, args.dbtype)
    map_lookup = db._init_db(
        os.path.abspath(os.path.join(args.directory, db_file)))
    tax_lookup = db._init_db(os.path.join(args.directory, "complete_taxa.db"))
    # Set (not dict) of taxids already reported as missing.
    # (An unused `tax_dict = {}` was removed.)
    not_found = set()
    with open(args.dbout, "w") as f:
        for k, v in map_lookup:
            tax_string = tax_lookup.get(v)
            if not tax_string:
                if v not in not_found:
                    # %s instead of %d/int(v): avoids a ValueError crash
                    # on malformed (non-numeric) taxon ids; output is
                    # identical for numeric ids.
                    logger.warning("\n# [WARNING] No taxon found for %s " % (v))
                    not_found.add(v)
                continue
            f.write("%s\t%s\n" % (v, tax_string))
def _basta_multiple(self, args):
    """Assign one last-common-ancestor taxonomy per input BLAST file.

    :param args: parsed arguments; args.blast names the input file(s),
        args.output the result file
    """
    self.logger.info("\n#### Assigning one taxonomy for each file ###\n")
    # (A dead `db_file = ""` pre-assignment was removed; the value was
    # always overwritten immediately.)
    db_file = dbutils.get_db_name(args.directory, args.type)
    assigner = AssignTaxonomy.Assigner(
        args.evalue, args.alen, args.identity, args.number, args.minimum,
        args.lazy, args.tax_method, args.directory, args.config_path,
        args.output)
    if args.verbose:
        assigner.info_file = args.verbose
    assigner._assign_multiple(args.blast, db_file, args.best_hit)
    self.logger.info("\n###### Done. Output written to %s" % (args.output))
def _fetch_taxonomies(seqs, args, logger):
    """Map each sequence id in seqs to its taxonomy string.

    Ids with no mapping entry or no taxonomy entry are skipped with a
    warning.

    :param seqs: iterable of sequence/accession ids to look up
    :param args: parsed arguments; uses args.directory and args.dbtype
    :param logger: logger for status/warning messages
    :return: dict mapping sequence id -> taxonomy string
    """
    tax_dict = {}
    logger.info("\n# [STATUS] Initializing taxonomy database")
    tax_lookup = db._init_db(os.path.join(args.directory, "complete_taxa.db"))
    logger.info("\n# [STATUS] Initializing mapping database")
    db_file = db.get_db_name(args.directory, args.dbtype)
    map_lookup = db._init_db(
        os.path.abspath(os.path.join(args.directory, db_file)))
    for s in seqs:
        taxon_id = map_lookup.get(s)
        if not taxon_id:
            logger.warning("\n# [WARNING] No mapping found for %s " % (s))
            continue
        tax_string = tax_lookup.get(taxon_id)
        if not tax_string:
            # %s instead of %d/int(taxon_id): avoids a ValueError crash
            # on malformed (non-numeric) taxon ids; output is identical
            # for numeric ids.
            logger.warning("\n# [WARNING] No taxon found for %s " % (taxon_id))
            continue
        tax_dict[s] = tax_string
    return tax_dict
def test_check_complete(self):
    """_check_complete should detect an existing complete_taxa.db entry."""
    path = os.path.join(self.pwd, "complete_taxa.db")
    os.mkdir(path)
    try:
        self.assertTrue(db._check_complete(self.pwd))
    finally:
        # Remove the fixture even when the assertion fails so later
        # tests (and reruns) do not trip over a stale directory.
        os.rmdir(path)
def test_get_db_name(self):
    """get_db_name should resolve a type prefix to its <type>_mapping.db file."""
    path = os.path.join(self.pwd, "test_mapping.db")
    os.mkdir(path)
    try:
        self.assertEqual(db.get_db_name(self.pwd, "test"), "test_mapping.db")
    finally:
        # Remove the fixture even when the assertion fails so later
        # tests (and reruns) do not trip over a stale directory.
        os.rmdir(path)
def test_check_file_name(self):
    """_check_file_name appends the .db suffix exactly once."""
    for given, expected in (("test", "test.db"), ("test.db", "test.db")):
        self.assertEqual(db._check_file_name(given), expected)