def process_ncbi(self):
    """
    Make sure processed NCBI sequences are available and index them per organism.

    If the configured ``ncbi_processed_dir`` is empty, the user is asked whether
    the data should be downloaded (``self.yes`` answers "y" without asking).
    On "y" the archive is fetched with ``wget`` into ``<project_dir>/tmp``,
    unpacked with ``tar``, converted by ``process_ncbi.Process_NCBI`` into
    ``ncbi_processed_dir`` and the temporary directory is removed again.
    Any failure on that path (or a negative answer) terminates the program
    via ``sys.exit(1)``.

    Afterwards every ``*.*`` file in ``ncbi_processed_dir`` is inspected:
    the organism name is the part of the file name before the first dot.
    Organisms listed in ``self.config.genomes_exclude`` are appended to
    ``self.genomes_excluded``; all others are added to ``self.organisms``
    and their files are collected in ``self.organism_file_map``
    (organism -> list of file paths).
    """
    my_log = logging.getLogger('train:process_ncbi')
    settings = self.config.settings
    ncbi_dir = settings["ncbi_processed_dir"]

    if utils.dir_is_empty(ncbi_dir):
        my_log.warning("The NCBI_PROCESSED_DIR is empty, it is possible to download data from NCBI.")
        my_log.info("I can automatically download Bacterial & Archael data "
                    "(for more possibilities see INSTALL.txt).")
        if self.yes:
            answ = "y"
        else:
            my_log.info("Download sequence data from NCBI? [Y/N] (default=Y, timeout 2 minutes)")
            answ = utils.get_answer_timeout()

        if answ != "y":
            # the two sentences used to be glued together without a separating space
            error_message = "There is no training data available, provide some and run the program again. " \
                            "Read INSTALL.txt for details on how this can be done."
            my_log.critical(error_message)
            sys.exit(1)

        my_log.info("Download may take some time ...")
        tmp_dir = os.path.join(settings["project_dir"], "tmp")
        # a previous run that aborted below leaves tmp_dir behind; reuse it instead of crashing
        if not os.path.isdir(tmp_dir):
            os.mkdir(tmp_dir)
        data_archive = os.path.join(tmp_dir, 'all.gbk.tar.gz')

        # NOTE: the paths are interpolated into the shell commands unquoted, so a
        # project directory containing spaces or shell metacharacters breaks these calls.
        success = os.system("wget -O {} ftp://ftp.ncbi.nih.gov/genomes/Bacteria/all.gbk.tar.gz".format(data_archive))
        if success != 0:
            my_log.critical("Error in downloading sequence data from NCBI.")
            sys.exit(1)

        # unpack data
        unpack_cmd = "tar xfz {a} -C {tmp}".format(a=data_archive, tmp=tmp_dir)
        success = os.system(unpack_cmd)
        if success != 0:
            my_log.critical("Error in unpacking the downloaded sequence data.")
            sys.exit(1)

        # process the data and create the fasta files in ncbi_dir
        process_object = process_ncbi.Process_NCBI(tmp_dir, ncbi_dir)
        success = process_object.run()
        if not success:
            sys.exit(1)

        # clean the downloaded NCBI data
        shutil.rmtree(tmp_dir)

    # get all the organism names from the files in the ncbi_dir
    # this can be used for generating generic clades
    extensions = settings["extensions"]
    # "no extension filter" is configured either as [] or as [""];
    # only filter when at least one real (non-empty) extension is given
    filter_by_extension = any(len(e) > 0 for e in extensions)

    files = glob.glob("{dir}{sep}*.*".format(dir=ncbi_dir, sep=os.path.sep))
    for f in files:
        file_name = os.path.basename(f)
        if "." not in file_name:
            my_log.debug("Invalid file: {}..skipping".format(f))
            continue

        ext = file_name.split(".")[-1]
        if filter_by_extension and ext not in extensions:
            continue

        organism = file_name.split(".")[0]
        # exclude this genome if asked to
        if organism in self.config.genomes_exclude:
            self.genomes_excluded.append(organism)
            continue

        self.organisms.add(organism)
        # one organism may be spread over several files; collect all of them
        self.organism_file_map.setdefault(organism, []).append(f)

    if self.genomes_excluded:
        my_log.info("excluded {} sequences.".format(str(len(self.genomes_excluded))))
def main_processing(self):
    """
    Run all steps of the training pipeline.

    Unless ``only_models`` is set in the configuration, these steps run in order:
        - processing ncbi sequences
        - tree processing (copying newick tree or building tree from a clade list)
        - mapping genomes on this tree
        - generating fragments from the sequences
        - dealing with sample specific data
        - generating kmer features
    In every case the models are built afterwards.

    A non-empty project directory is removed and recreated after confirmation
    (``self.yes`` confirms without asking); otherwise the program exits via
    ``sys.exit(1)``. When ``self.stat`` is set, the step number and the
    variables produced by a step are written to the backup directory after
    each step. With ``clean_up_train`` set, the ``train_data`` and
    ``sampled_fasta`` directories are deleted at the end.
    """
    settings = self.config.settings
    project_dir = settings["project_dir"]

    def checkpoint(status, attr_names=()):
        # Record the finished step and the listed attributes of self in the backup.
        # Attributes are read lazily so nothing is touched when backups are disabled.
        if self.stat is None:
            return
        self.stat.change_variable(status, "status")
        for attr_name in attr_names:
            self.stat.change_variable(getattr(self, attr_name), attr_name)
        self.stat.write_backup(self.backupdir)

    if not settings["only_models"]:
        # if you do not want to build only models, check if the project directory is empty
        if os.listdir(project_dir):
            self.log.warning("The project directory is not empty, this can result in unpredictable behavior.")
            if self.yes:
                answer = "y"
            else:
                answer = utils.get_answer_timeout("Remove? [Y/N]")
            if answer == "y":
                self.log.info("Deleting old project directory..")
                shutil.rmtree(project_dir)
                os.mkdir(project_dir)
            else:
                self.log.critical("Please provide an empty project directory. Quiting...")
                sys.exit(1)

        if self.backup:
            self.stat = Status(logged=True)
            self.backupdir = os.path.join(project_dir, "backup")
            os.mkdir(self.backupdir)

        self.log.info("creating folder structure...")
        self.create_folderstructure()

        self.log.info("checking database")
        self.check_db()

        if len(self.config.genomes_exclude) != 0:
            # write organisms that will not be considered into a file
            excluded_file = os.path.join(project_dir, "excluded.txt")
            utils.list_to_file(self.config.genomes_exclude, excluded_file)
            if self.stat is not None:
                self.stat.add_written_file(excluded_file)
                self.stat.write_backup(self.backupdir)

        self.log.info("Processing NCBI data...")
        self.process_ncbi()
        checkpoint(1, ("organism_file_map", "genomes_excluded", "organisms"))

        self.log.info("tree processing...")
        self.tree_process()
        checkpoint(2, ("nodes", "tree_file"))

        self.log.info("mapping genomes on the tree...")
        self.map_genomes_on_tree()
        # The variable names used to be stored as values under "status", which
        # clobbered the step number and never saved the variables themselves.
        # NOTE(review): assumes self carries attributes with exactly these names,
        # like the variables of the other checkpoints -- confirm against __init__.
        checkpoint(3, ("n_frags_per_node", "tree_organism_map",
                       "organisms_invalid", "organism_tree_map"))

        self.log.info("generating sequence fragments...")
        self.generate_seq()
        checkpoint(4)

        self.log.info("sample specific stuff...")
        self.sample_specific()
        checkpoint(5)

        self.log.info("generating kmer features...")
        self.generate_kmer_features_concat()
        checkpoint(6)
    else:
        self.log.info("reading tree string")
        self.config.settings["tree_file"] = os.path.join(project_dir, "tree.newick")

    self.log.info("building models")
    self.build_models()

    if self.config.settings["clean_up_train"]:
        self.log.info("Cleaning..")
        shutil.rmtree(os.path.join(project_dir, "train_data"))
        shutil.rmtree(os.path.join(project_dir, "sampled_fasta"))

    self.sqlite_taxonomy.close()
    self.log.info("Processing finished ...models are ready in {}".format(os.path.join(project_dir, "models")))