Esempio n. 1
0
    def process_ncbi(self):
        """
            Make sure processed NCBI sequences are available and index them.

            If the directory settings["ncbi_processed_dir"] is empty, the
            bacterial/archaeal GenBank archive is downloaded from NCBI
            (after asking the user, unless self.yes is set), unpacked into
            <project_dir>/tmp, converted to fasta files by
            process_ncbi.Process_NCBI and the temporary directory is removed.

            Afterwards every file in the processed directory is assigned to an
            organism (file name up to the first dot). Organisms listed in
            self.config.genomes_exclude are collected in self.genomes_excluded,
            all others are added to self.organisms and self.organism_file_map
            (organism -> list of file paths).

            Exits the program with status 1 if the user declines the download
            or if downloading, unpacking or processing fails.
        """
        my_log = logging.getLogger('train:process_ncbi')
        ncbi_dir = self.config.settings["ncbi_processed_dir"]

        if utils.dir_is_empty(ncbi_dir):
            my_log.warning("The NCBI_PROCESSED_DIR is empty, it is possible to download data from NCBI.")
            my_log.info("I can automatically download Bacterial & Archael data "
                        "(for more possibilities see INSTALL.txt).")

            if self.yes:
                answ = "y"
            else:
                my_log.info("Download sequence data from NCBI? [Y/N] (default=Y, timeout 2 minutes)")
                answ = utils.get_answer_timeout()

            if answ != "y":
                # the trailing space keeps the two sentences apart after concatenation
                error_message = "There is no training data available, provide some and run the program again. " \
                                "Read INSTALL.txt for details on how this can be done."
                my_log.critical(error_message)
                sys.exit(1)

            my_log.info("Download may take some time ...")
            tmp_dir = os.path.join(self.config.settings["project_dir"], "tmp")
            os.mkdir(tmp_dir)
            data_archive = os.path.join(tmp_dir, 'all.gbk.tar.gz')

            # download the archive
            success = os.system("wget -O {} ftp://ftp.ncbi.nih.gov/genomes/Bacteria/all.gbk.tar.gz".format(data_archive))
            if success != 0:
                my_log.critical("Error in downloading sequence data from NCBI.")
                sys.exit(1)

            # unpack data
            unpack_cmd = "tar xfz {a} -C {tmp}".format(a=data_archive, tmp=tmp_dir)
            success = os.system(unpack_cmd)
            if success != 0:
                my_log.critical("Error in unpacking the downloaded sequence data.")
                sys.exit(1)

            # process the data and create the fasta files in ncbi_dir
            process_object = process_ncbi.Process_NCBI(tmp_dir, ncbi_dir)
            success = process_object.run()
            if not success:
                sys.exit(1)

            # clean the downloaded NCBI data
            shutil.rmtree(tmp_dir)

        # get all the organism names from the files in the ncbi_dir
        # this can be used for generating generic clades
        extensions = self.config.settings["extensions"]
        # extensions being either [] or [""] means "do not filter by extension";
        # the filter is only active if at least one non-empty extension is given
        filter_by_extension = any(extensions)

        n_sequences = 0
        files = glob.glob("{dir}{sep}*.*".format(dir=ncbi_dir, sep=os.path.sep))
        for f in files:
            file_name = f.split(os.path.sep)[-1]
            if "." not in file_name:
                my_log.debug("Invalid file: {}..skipping".format(f))
                continue

            ext = file_name.split(".")[-1]
            if filter_by_extension and ext not in extensions:
                continue

            organism = file_name.split(".")[0]

            # exclude this genome if asked to
            if organism in self.config.genomes_exclude:
                self.genomes_excluded.append(organism)
                continue

            n_sequences += 1
            self.organisms.add(organism)
            # one organism may be spread over several files: collect them all
            self.organism_file_map.setdefault(organism, []).append(f)

        if len(self.genomes_excluded) != 0:
            my_log.info("excluded {} sequences.".format(str(len(self.genomes_excluded))))
Esempio n. 2
0
    def main_processing(self):
        """
            Run all steps of the training pipeline:
                - processing ncbi sequences
                - tree processing (copying newick tree or building tree from a clade list)
                - mapping genomes on this tree
                - generating fragments from the sequences
                - dealing with sample specific data
                - generating kmer features
                - building models

            If settings["only_models"] is set, all steps before the model
            building are skipped and the tree is expected at
            <project_dir>/tree.newick.

            If self.backup is set, a Status object is created and written to
            <project_dir>/backup after every finished step.

            Exits the program with status 1 if the project directory is not
            empty and the user does not agree to remove it.
        """
        project_dir = self.config.settings["project_dir"]

        if not self.config.settings["only_models"]:
            # if you do not want to build only models, check if the project directory is empty
            if len(os.listdir(project_dir)) != 0:
                self.log.warning("The project directory is not empty, this can result in unpredictable behavior.")
                if self.yes:
                    answer = "y"
                else:
                    answer = utils.get_answer_timeout("Remove? [Y/N]")
                if answer == "y":
                    self.log.info("Deleting old project directory..")
                    shutil.rmtree(project_dir)
                    os.mkdir(project_dir)
                else:
                    self.log.critical("Please provide an empty project directory. Quiting...")
                    sys.exit(1)

            if self.backup:
                self.stat = Status(logged=True)
                self.backupdir = os.path.join(project_dir, "backup")
                os.mkdir(self.backupdir)

            self.log.info("creating folder structure...")
            self.create_folderstructure()

            self.log.info("checking database")
            self.check_db()

            if len(self.config.genomes_exclude) != 0:
                # write organisms that will not be considered into a file
                excluded_file = os.path.join(project_dir, "excluded.txt")
                utils.list_to_file(self.config.genomes_exclude, excluded_file)

                if self.stat is not None:
                    self.stat.add_written_file(excluded_file)
                    self.stat.write_backup(self.backupdir)

            self.log.info("Processing NCBI data...")
            self.process_ncbi()
            if self.stat is not None:
                self.stat.change_variable(1, "status")
                self.stat.change_variable(self.organism_file_map, "organism_file_map")
                self.stat.change_variable(self.genomes_excluded, "genomes_excluded")
                self.stat.change_variable(self.organisms, "organisms")
                self.stat.write_backup(self.backupdir)

            self.log.info("tree processing...")
            self.tree_process()
            if self.stat is not None:
                self.stat.change_variable(2, "status")
                self.stat.change_variable(self.nodes, "nodes")
                self.stat.change_variable(self.tree_file, "tree_file")
                self.stat.write_backup(self.backupdir)

            self.log.info("mapping genomes on the tree...")
            self.map_genomes_on_tree()
            if self.stat is not None:
                self.stat.change_variable(3, "status")
                # store every mapping result under its own name, same
                # (value, name) order as in the other steps; writing the names
                # into "status" would overwrite the step number set above
                # NOTE(review): assumes map_genomes_on_tree() sets these four
                # attributes on self - confirm
                self.stat.change_variable(self.n_frags_per_node, "n_frags_per_node")
                self.stat.change_variable(self.tree_organism_map, "tree_organism_map")
                self.stat.change_variable(self.organisms_invalid, "organisms_invalid")
                self.stat.change_variable(self.organism_tree_map, "organism_tree_map")
                self.stat.write_backup(self.backupdir)

            self.log.info("generating sequence fragments...")
            self.generate_seq()
            if self.stat is not None:
                self.stat.change_variable(4, "status")
                self.stat.write_backup(self.backupdir)

            self.log.info("sample specific stuff...")
            self.sample_specific()
            if self.stat is not None:
                self.stat.change_variable(5, "status")
                self.stat.write_backup(self.backupdir)

            self.log.info("generating kmer features...")
            self.generate_kmer_features_concat()
            if self.stat is not None:
                self.stat.change_variable(6, "status")
                self.stat.write_backup(self.backupdir)

        else:
            # only the models are built: point the configuration at the tree
            # file inside the project directory
            self.log.info("reading tree string")
            self.config.settings["tree_file"] = os.path.join(project_dir, "tree.newick")

        self.log.info("building models")
        self.build_models()

        if self.config.settings["clean_up_train"]:
            # remove the intermediate training data, it is not needed once the models exist
            self.log.info("Cleaning..")
            shutil.rmtree(os.path.join(project_dir, "train_data"))
            shutil.rmtree(os.path.join(project_dir, "sampled_fasta"))

        self.sqlite_taxonomy.close()
        self.log.info("Processing finished ...models are ready in {}".format(os.path.join(project_dir, "models")))