Esempio n. 1
0
    def _basta_taxonomy(self, args):
        """Download the NCBI taxonomy dump and build the taxonomy database.

        Everything is placed in ``args.output``, which is created first if
        it does not exist yet. The steps are: fetch ``taxdump.tar.gz``,
        unpack it with ``tar``, let ``ntc.Creator`` combine ``names.dmp`` and
        ``nodes.dmp`` into ``complete_taxa``, and finally have
        ``dbutils.create_db`` turn ``complete_taxa.gz`` into
        ``complete_taxa.db``.
        """
        out_dir = args.output
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        log = self.logger.info
        log("\n#### Downloading and processing NCBI taxonomy files\n")

        log("\n# [BASTA STATUS] Download taxonomy files")
        archive_name = "taxdump.tar.gz"
        dutils.down_and_check("ftp://ftp.ncbi.nih.gov/pub/taxonomy/",
                              archive_name, out_dir)
        # Unpack next to the archive. NOTE(review): tar's exit status is
        # not inspected, so a failed extraction only surfaces later.
        archive_path = os.path.join(out_dir, archive_name)
        call(["tar", "-xzvf", archive_path, "-C", out_dir])

        log("\n# [BASTA STATUS] Creating complete taxonomy file\n")
        names_dump = os.path.join(out_dir, "names.dmp")
        nodes_dump = os.path.join(out_dir, "nodes.dmp")
        creator = ntc.Creator(names_dump, nodes_dump)
        creator._write(os.path.join(out_dir, "complete_taxa"))

        log("\n# [BASTA STATUS] Creating taxonomy database")
        # Trailing 0 and 1 are presumably the key/value column indexes
        # (compare the args.key/args.value call elsewhere) - TODO confirm.
        dbutils.create_db(out_dir, "complete_taxa.gz", "complete_taxa.db",
                          0, 1)

        done_msg = "\n### Done! NCBI taxonomy database created in %s ####" % (
            out_dir)
        log(done_msg)
Esempio n. 2
0
def _fetch_mapping(args, logger):
    """Dump the mapping database to ``args.mapout`` as tab-separated text.

    The database file name is resolved from ``args.directory`` and
    ``args.dbtype``; every ``(key, value)`` pair yielded by the opened
    database becomes one ``key<TAB>value`` line in the output file.
    """
    logger.info("\n# [STATUS] Initializing mapping database")
    mapping_name = db.get_db_name(args.directory, args.dbtype)
    mapping_path = os.path.abspath(os.path.join(args.directory, mapping_name))
    mapping = db._init_db(mapping_path)
    with open(args.mapout, "w") as out:
        out.writelines(
            "%s\t%s\n" % (key, value) for key, value in mapping)
Esempio n. 3
0
 def run_basta(self, args):
     """Dispatch the parsed command line to the matching ``_basta_*`` method.

     ``args.subparser_name`` selects the sub-command. For every known
     sub-command ``self._check_dir(args)`` runs first. The assignment
     sub-commands (``sequence``, ``single``, ``multiple``) additionally
     require ``dbutils._check_complete(args.directory)`` to succeed;
     otherwise an error is logged and the process exits with status 1.
     An unknown sub-command is silently ignored.

     Args:
         args: parsed arguments; must provide ``subparser_name`` and, for
             the assignment sub-commands, ``directory``.

     Raises:
         SystemExit: with code 1 when an assignment sub-command is
             requested but the taxonomy database is missing.
     """
     # Sub-commands that read complete_taxa.db and cannot run without it.
     needs_taxonomy = ('sequence', 'single', 'multiple')
     # Sub-commands that only download or build data.
     standalone = ('download', 'create_db', 'taxonomy')

     command = args.subparser_name
     if command not in needs_taxonomy and command not in standalone:
         # Unknown sub-command: do nothing, as before.
         return

     self._check_dir(args)
     if command in needs_taxonomy and not dbutils._check_complete(
             args.directory):
         self.logger.error(
             "\n[BASTA ERROR] Couldn't find complete_taxa.db in %s. "
             "Did you run initial 'basta download'?" % (args.directory))
         # A bare sys.exit() reports success (status 0); signal the failure
         # so shells and pipelines can react to it.
         sys.exit(1)

     # Handlers follow the "_basta_<sub-command>" naming scheme.
     getattr(self, "_basta_" + command)(args)
Esempio n. 4
0
 def _basta_create_db(self, args):
     """Build a database from a user-supplied input file.

     Creates ``args.directory`` when it is missing, then forwards
     ``args.input``, ``args.output``, ``args.key`` and ``args.value`` to
     ``dbutils.create_db`` and logs the processed file.
     """
     target_dir = args.directory
     if not os.path.exists(target_dir):
         os.makedirs(target_dir)

     log = self.logger.info
     log("\n#### Creating database\n")
     dbutils.create_db(
         target_dir, args.input, args.output, args.key, args.value)
     finished = "\n#### Done. Processed file %s\n" % (args.input)
     log(finished)
Esempio n. 5
0
    def _basta_download(self, args):
        """Download one NCBI accession2taxid file and build its database.

        ``args.type`` picks the mapping file and the database name; any
        value not listed below falls back to the GenBank nucleotide
        mapping. The file is fetched from ``args.ftp`` into
        ``args.directory`` (created if missing) and then passed to
        ``dbutils.create_db``.
        """
        # args.type -> (remote mapping file, local database name)
        known_types = {
            "prot": ("prot.accession2taxid.gz", "prot_mapping.db"),
            "wgs": ("nucl_wgs.accession2taxid.gz", "wgs_mapping.db"),
            "gss": ("nucl_gss.accession2taxid.gz", "gss_mapping.db"),
            "est": ("nucl_est.accession2taxid.gz", "est_mapping.db"),
            "pdb": ("pdb.accession2taxid.gz", "pdb_mapping.db"),
        }
        fallback = ("nucl_gb.accession2taxid.gz", "gb_mapping.db")

        target_dir = args.directory
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        log = self.logger.info
        log("\n##### Downloading and processing mapping file(s) from NCBI ###\n")
        map_file, db_file = known_types.get(args.type, fallback)

        log("\n# [BASTA STATUS] Downloading mapping files\n")
        dutils.down_and_check(args.ftp, map_file, target_dir)
        log("\n# [BASTA STATUS] Creating mapping database\n")
        dbutils.create_db(target_dir, map_file, db_file, 0, 2)
        log("\n##### Done. Downloaded and processed file %s\n" % (map_file))
Esempio n. 6
0
 def _get_lookups(self, db_file):
     """Open the taxonomy and mapping databases under ``self.directory``.

     Args:
         db_file: file name of the mapping database inside
             ``self.directory``.

     Returns:
         A ``(tax_lookup, map_lookup)`` tuple of the two objects returned
         by ``db._init_db``.
     """
     announce = self.logger.info

     announce("\n# [BASTA STATUS] Initializing taxonomy database")
     taxonomy_path = os.path.join(self.directory, "complete_taxa.db")
     taxonomy = db._init_db(taxonomy_path)

     announce("\n# [BASTA STATUS] Initializing mapping database")
     mapping_path = os.path.abspath(os.path.join(self.directory, db_file))
     mapping = db._init_db(mapping_path)

     return taxonomy, mapping
Esempio n. 7
0
 def _basta_sequence(self, args):
     """Run the per-sequence taxonomy assignment.

     Resolves the mapping database name from ``args.directory`` and
     ``args.type``, builds an ``AssignTaxonomy.Assigner`` from the options
     carried on ``args`` and calls ``_assign_sequence`` on ``args.blast``.
     When ``args.verbose`` is truthy it is stored as the assigner's
     ``info_file``.
     """
     log = self.logger.info
     log("\n#### Assigning taxonomy to each sequence ###\n")

     mapping_db = dbutils.get_db_name(args.directory, args.type)
     tax_assigner = AssignTaxonomy.Assigner(
         args.evalue,
         args.alen,
         args.identity,
         args.number,
         args.minimum,
         args.maj_perc,
         args.directory,
         args.config_path,
         args.output,
         args.hit_count,
     )
     if args.verbose:
         tax_assigner.info_file = args.verbose

     tax_assigner._assign_sequence(args.blast, mapping_db, args.best_hit)
     log("\n#### Done. Output written to %s" % (args.output))
Esempio n. 8
0
    def _basta_download(self, args):
        """Download a mapping file and build the matching mapping database.

        ``args.type`` selects the source. ``"uni"`` is a hard-coded special
        case: it fetches the UniProt id-mapping table with ``dutils.down``
        and stores it under the protein database name, then returns without
        the final "Done" message. Every other value uses an NCBI
        accession2taxid file from ``args.ftp``; unlisted values fall back
        to the GenBank nucleotide mapping.
        """
        target_dir = args.directory
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        log = self.logger.info
        log("\n##### Downloading and processing mapping file(s) from NCBI ###\n")

        if args.type == "uni":
            # Quick-and-dirty hard-coded way to add UniProt to db type prot.
            uniprot_file = "idmapping_selected.tab.gz"
            log("\n# [BASTA STATUS] Downloading mapping file\n")
            dutils.down(
                "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/",
                uniprot_file, target_dir)
            log("\n# [BASTA STATUS] Creating mapping database\n")
            dbutils.create_db(target_dir, uniprot_file, "prot_mapping.db",
                              0, 12)
            return

        # args.type -> (remote mapping file, local database name)
        ncbi_types = {
            "prot": ("prot.accession2taxid.gz", "prot_mapping.db"),
            "wgs": ("nucl_wgs.accession2taxid.gz", "wgs_mapping.db"),
            "gss": ("nucl_gss.accession2taxid.gz", "gss_mapping.db"),
            "est": ("nucl_est.accession2taxid.gz", "est_mapping.db"),
            "pdb": ("pdb.accession2taxid.gz", "pdb_mapping.db"),
        }
        fallback = ("nucl_gb.accession2taxid.gz", "gb_mapping.db")
        map_file, db_file = ncbi_types.get(args.type, fallback)

        log("\n# [BASTA STATUS] Downloading mapping files\n")
        dutils.down_and_check(args.ftp, map_file, target_dir)
        log("\n# [BASTA STATUS] Creating mapping database\n")
        dbutils.create_db(target_dir, map_file, db_file, 0, 2)
        log("\n##### Done. Downloaded and processed file %s\n" % (map_file))
Esempio n. 9
0
 def _basta_single(self, args):
     """Assign a single taxonomy from all hits and log the result.

     Resolves the mapping database name, builds an
     ``AssignTaxonomy.Assigner`` from the options carried on ``args`` and
     logs the value returned by ``_assign_single`` as the last common
     ancestor. When ``args.verbose`` is truthy it is stored as the
     assigner's ``info_file``.
     """
     log = self.logger.info
     log("\n#### Assigning one taxonomy based on all sequences ###\n")

     mapping_db = dbutils.get_db_name(args.directory, args.type)
     tax_assigner = AssignTaxonomy.Assigner(
         args.evalue,
         args.alen,
         args.identity,
         args.number,
         args.minimum,
         args.maj_perc,
         args.directory,
         args.config_path,
         args.output,
         args.hit_count,
     )
     if args.verbose:
         tax_assigner.info_file = args.verbose

     ancestor = tax_assigner._assign_single(
         args.blast, mapping_db, args.best_hit)
     log("\n##### Results #####\n")
     log("Last Common Ancestor: %s\n" % (ancestor))
     log("\n###################\n")
Esempio n. 10
0
 def _basta_multiple(self, args):
     """Assign one taxonomy per input file.

     Resolves the mapping database name, builds an
     ``AssignTaxonomy.Assigner`` from the options carried on ``args``
     (including ``args.quiet``) and calls ``_assign_multiple`` on
     ``args.blast``. When ``args.verbose`` is truthy, an existing file at
     that path is deleted first and the path becomes the assigner's
     ``info_file``.
     """
     log = self.logger.info
     log("\n####  Assigning one taxonomy for each file ###\n")

     mapping_db = dbutils.get_db_name(args.directory, args.type)
     tax_assigner = AssignTaxonomy.Assigner(
         args.evalue,
         args.alen,
         args.identity,
         args.number,
         args.minimum,
         args.maj_perc,
         args.directory,
         args.config_path,
         args.output,
         args.hit_count,
         args.quiet,
     )

     info_path = args.verbose
     if info_path:
         # Remove a stale info file left over from an earlier run.
         if os.path.exists(info_path):
             os.remove(info_path)
         tax_assigner.info_file = info_path

     tax_assigner._assign_multiple(args.blast, mapping_db, args.best_hit)
     log("\n###### Done. Output written to %s" % (args.output))
Esempio n. 11
0
def _fetch_taxonomies(args, logger):
    """Write the taxonomy string of every mapped taxon to ``args.dbout``.

    Walks all ``(key, value)`` pairs of the mapping database, looks each
    value up in ``complete_taxa.db`` and writes ``value<TAB>taxonomy`` per
    hit. A value with no taxonomy entry is reported once through
    ``logger.warning`` and skipped on every occurrence.
    """
    logger.info("\n# [STATUS] Initializing taxonomy database")

    mapping_name = db.get_db_name(args.directory, args.dbtype)
    mapping = db._init_db(
        os.path.abspath(os.path.join(args.directory, mapping_name)))
    taxonomy = db._init_db(os.path.join(args.directory, "complete_taxa.db"))

    # Taxon ids already warned about, so each is reported only once.
    reported = set()

    with open(args.dbout, "w") as out:
        for _key, taxon_id in mapping:
            lineage = taxonomy.get(taxon_id)
            if lineage:
                out.write("%s\t%s\n" % (taxon_id, lineage))
            elif taxon_id not in reported:
                logger.warning(
                    "\n# [WARNING] No taxon found for %d " % (int(taxon_id)))
                reported.add(taxon_id)
Esempio n. 12
0
 def _basta_multiple(self, args):
     """Assign one taxonomy per input file.

     Resolves the mapping database name, builds an
     ``AssignTaxonomy.Assigner`` from the options carried on ``args`` and
     calls ``_assign_multiple`` on ``args.blast``. When ``args.verbose`` is
     truthy it is stored as the assigner's ``info_file``.
     """
     log = self.logger.info
     log("\n####  Assigning one taxonomy for each file ###\n")

     mapping_db = dbutils.get_db_name(args.directory, args.type)
     tax_assigner = AssignTaxonomy.Assigner(
         args.evalue,
         args.alen,
         args.identity,
         args.number,
         args.minimum,
         args.lazy,
         args.tax_method,
         args.directory,
         args.config_path,
         args.output,
     )
     if args.verbose:
         tax_assigner.info_file = args.verbose

     tax_assigner._assign_multiple(args.blast, mapping_db, args.best_hit)
     log("\n###### Done. Output written to %s" % (args.output))
Esempio n. 13
0
def _fetch_taxonomies(seqs, args, logger):
    """Resolve each sequence id in ``seqs`` to its taxonomy string.

    Each id is looked up in the mapping database to get a taxon id, which
    is then looked up in ``complete_taxa.db``. Ids that fail either lookup
    are reported through ``logger.warning`` and left out of the result.

    Returns:
        dict mapping sequence id to taxonomy string for every id that
        resolved through both lookups.
    """
    logger.info("\n# [STATUS] Initializing taxonomy database")
    taxonomy = db._init_db(os.path.join(args.directory, "complete_taxa.db"))

    logger.info("\n# [STATUS] Initializing mapping database")
    mapping_name = db.get_db_name(args.directory, args.dbtype)
    mapping_path = os.path.abspath(os.path.join(args.directory, mapping_name))
    mapping = db._init_db(mapping_path)

    resolved = {}
    for seq_id in seqs:
        taxon_id = mapping.get(seq_id)
        if taxon_id:
            lineage = taxonomy.get(taxon_id)
            if lineage:
                resolved[seq_id] = lineage
            else:
                logger.warning("\n# [WARNING] No taxon found for %d " %
                               (int(taxon_id)))
        else:
            logger.warning("\n# [WARNING] No mapping found for %s " % (seq_id))
    return resolved
Esempio n. 14
0
 def test_check_complete(self):
     """``db._check_complete`` is truthy once ``complete_taxa.db`` exists.

     A directory named ``complete_taxa.db`` is created inside ``self.pwd``
     as a stand-in for the real database.
     """
     placeholder = os.path.join(self.pwd, "complete_taxa.db")
     os.mkdir(placeholder)
     try:
         self.assertTrue(db._check_complete(self.pwd))
     finally:
         # Clean up even when the assertion fails; a leftover directory
         # would make os.mkdir raise on the next run.
         os.rmdir(placeholder)
Esempio n. 15
0
 def test_get_db_name(self):
     """``db.get_db_name`` maps type ``"test"`` to ``"test_mapping.db"``.

     A directory named ``test_mapping.db`` is created inside ``self.pwd``
     for the duration of the check.
     """
     placeholder = os.path.join(self.pwd, "test_mapping.db")
     os.mkdir(placeholder)
     try:
         self.assertEqual(
             db.get_db_name(self.pwd, "test"), "test_mapping.db")
     finally:
         # Clean up even when the assertion fails; a leftover directory
         # would make os.mkdir raise on the next run.
         os.rmdir(placeholder)
Esempio n. 16
0
 def test_check_file_name(self):
     """``db._check_file_name`` yields ``test.db`` with or without suffix."""
     expected = "test.db"
     # First a bare name, then a name that already carries ".db".
     for given in ("test", "test.db"):
         self.assertEqual(db._check_file_name(given), expected)