# Example 1
    def calculateAlignedMarkerSets(self, db_genome_ids, marker_ids):
        '''
        Run hmmalign for PFAM and TIGRFAM markers not already aligned
        in the database.

        :param db_genome_ids: list of genome ids used for the tree step
        :param marker_ids: list of marker ids used for the tree building step
        :return: True on completion.
        '''
        self.logger.info('Aligning marker genes not already in the database.')

        # Rebuild the on-disk path to each genome's gene file from the DB.
        genome_dirs_query = (
            "SELECT g.id, g.genes_file_location,gs.external_id_prefix "
            "FROM genomes g "
            "LEFT JOIN genome_sources gs ON gs.id = g.genome_source_id "
            "WHERE g.id in %s")
        self.cur.execute(genome_dirs_query, (tuple(db_genome_ids),))
        genome_dirs = {genome_id: fastaPathGenerator(path, prefix)
                       for genome_id, path, prefix in self.cur.fetchall()}

        # Fan the genomes out over worker processes; results come back
        # through a Manager queue consumed by the workers' side effects.
        manager = multiprocessing.Manager()
        out_q = manager.Queue()
        procs = []
        for chunk in splitchunks(genome_dirs, self.threads):
            p = multiprocessing.Process(target=self._hmmWorker,
                                        args=(chunk, marker_ids, out_q))
            procs.append(p)
            p.start()

        # join() alone waits for all workers to finish.  The previous
        # busy-wait on out_q.empty() was redundant and would hang forever
        # if no worker ever put a result on the queue.
        for p in procs:
            p.join()

        return True
    def calculateAlignedMarkerSets(self, db_genome_ids, marker_ids):
        '''
        Run hmmalign for PFAM and TIGRFAM markers missing from the database.

        :param db_genome_ids: list of genome ids used for the tree step
        :param marker_ids: list of marker ids used for the tree building step
        :return: True on completion.
        '''
        self.logger.info('Aligning marker genes not already in the database.')

        # Reconstruct the gene-file path for every requested genome.
        genome_dirs_query = (
            "SELECT g.id, g.genes_file_location,gs.external_id_prefix "
            "FROM genomes g "
            "LEFT JOIN genome_sources gs ON gs.id = g.genome_source_id "
            "WHERE g.id in %s")
        self.cur.execute(genome_dirs_query, (tuple(db_genome_ids),))
        raw_results = self.cur.fetchall()
        genome_dirs = {a: fastaPathGenerator(b, c) for a, b, c in raw_results}

        # One worker process per chunk of genomes.
        manager = multiprocessing.Manager()
        out_q = manager.Queue()
        procs = []
        for item in splitchunks(genome_dirs, self.threads):
            p = multiprocessing.Process(
                target=self._hmmWorker,
                args=(item, marker_ids, out_q))
            procs.append(p)
            p.start()

        # Wait for all worker processes to finish.  join() is sufficient;
        # the former `while out_q.empty(): sleep(1)` loop added nothing and
        # could spin forever if the workers produced no output.
        for p in procs:
            p.join()

        return True
# Example 3
    def addGenomes(self, checkm_file, batchfile, study_file):
        """Add new genomes to DB.

        Parameters
        ----------
        checkm_file : str
            Name of file containing CheckM results.
        batchfile : str
            Name of file describing genomes to add.
        study_file : str
            Name of file describing study from which genomes were recovered

        Returns
        -------
        list
            List of database genome identifiers of added genomes.
        """
        try:
            self.tmp_output_dir = tempfile.mkdtemp()

            self.logger.info("Parsing Study file.")
            study_id = self._processStudy(study_file)

            self.logger.info("Reading CheckM file.")
            checkm_results_dict = self._processCheckM(checkm_file)

            genomic_files = self._addGenomeBatch(batchfile, self.tmp_output_dir)

            self.logger.info("Running Prodigal to identify genes.")
            prodigal = Prodigal(self.threads)
            file_paths = prodigal.run(genomic_files)

            self.logger.info("Calculating and storing metadata for each genome.")

            # Dedicated process that reports progress as workers check in.
            progress_queue = multiprocessing.Queue()
            progress_proc = multiprocessing.Process(
                target=self._progress,
                args=(len(genomic_files), progress_queue))
            progress_proc.start()

            # Manager queue so worker processes can hand results back.
            out_q = multiprocessing.Manager().Queue()

            procs = []
            for item in splitchunks(genomic_files, 1):
                p = multiprocessing.Process(
                    target=self._addGenomesWorker,
                    args=(item, file_paths, checkm_results_dict, study_id,
                          out_q, progress_queue),
                )
                procs.append(p)
                p.start()

            # join() waits for every worker; the former busy-wait on
            # out_q.empty() ("Pierre: why is this needed?") was redundant
            # and could hang if no worker produced output.
            for p in procs:
                p.join()

            self.logger.info("Waiting for progress process.")
            progress_queue.put(None)  # sentinel: tell _progress to stop
            progress_proc.join()

            # Annotate genes against TIGRfam and Pfam databases.
            self.logger.info("Identifying TIGRfam protein families.")
            gene_files = [file_paths[db_genome_id]["aa_gene_path"]
                          for db_genome_id in genomic_files]
            tigr_search = TigrfamSearch(self.cur, self.currentUser, self.threads)
            tigr_search.run(gene_files)

            self.logger.info("Identifying Pfam protein families.")
            pfam_search = PfamSearch(self.cur, self.currentUser, self.threads)
            pfam_search.run(gene_files)
        except:
            # Bare except is deliberate: remove the temp dir on ANY
            # interruption (including KeyboardInterrupt), then re-raise.
            if os.path.exists(self.tmp_output_dir):
                shutil.rmtree(self.tmp_output_dir)
            raise

        # Return a concrete list: the docstring promises a list, and in
        # Python 3 dict.keys() is a lazy view.
        return list(genomic_files.keys())
# Example 4
    def addGenomes(self, checkm_file, batchfile, study_file):
        """Add new genomes to DB.

        Parameters
        ----------
        checkm_file : str
            Name of file containing CheckM results.
        batchfile : str
            Name of file describing genomes to add.
        study_file : str
            Name of file describing study from which genomes were recovered

        Returns
        -------
        list
            List of database genome identifiers of added genomes.
        """
        try:
            self.tmp_output_dir = tempfile.mkdtemp()

            self.logger.info("Parsing Study file.")
            study_id = self._processStudy(study_file)

            self.logger.info("Reading CheckM file.")
            checkm_results_dict = self._processCheckM(checkm_file)

            genomic_files = self._addGenomeBatch(batchfile,
                                                 self.tmp_output_dir)

            self.logger.info("Running Prodigal to identify genes.")
            prodigal = Prodigal(self.threads)
            file_paths = prodigal.run(genomic_files)

            self.logger.info(
                "Calculating and storing metadata for each genome.")

            # Progress reporter runs in its own process and drains
            # progress_queue until it receives the None sentinel.
            progress_queue = multiprocessing.Queue()
            progress_proc = multiprocessing.Process(target=self._progress,
                                                    args=(len(genomic_files),
                                                          progress_queue))
            progress_proc.start()

            # Queue from a Manager so results survive across processes.
            out_q = multiprocessing.Manager().Queue()

            procs = []
            for item in splitchunks(genomic_files, 1):
                p = multiprocessing.Process(
                    target=self._addGenomesWorker,
                    args=(item, file_paths, checkm_results_dict, study_id,
                          out_q, progress_queue))
                procs.append(p)
                p.start()

            # Joining the workers is the complete wait; the previous
            # `while out_q.empty(): sleep(1)` loop was redundant and could
            # spin forever when no worker ever enqueued a result.
            for p in procs:
                p.join()

            self.logger.info("Waiting for progress process.")
            progress_queue.put(None)  # sentinel: stop the progress process
            progress_proc.join()

            # Annotate genes against TIGRfam and Pfam databases.
            self.logger.info("Identifying TIGRfam protein families.")
            gene_files = [
                file_paths[db_genome_id]['aa_gene_path']
                for db_genome_id in genomic_files
            ]
            tigr_search = TigrfamSearch(self.cur, self.currentUser,
                                        self.threads)
            tigr_search.run(gene_files)

            self.logger.info("Identifying Pfam protein families.")
            pfam_search = PfamSearch(self.cur, self.currentUser, self.threads)
            pfam_search.run(gene_files)
        except:
            # Bare except is intentional: clean up the temp dir on any
            # failure or interruption, then propagate the exception.
            if os.path.exists(self.tmp_output_dir):
                shutil.rmtree(self.tmp_output_dir)
            raise

        # Materialize the keys: dict views are lazy in Python 3 and the
        # docstring promises a list.
        return list(genomic_files.keys())