Exemple #1
0
    def identify_marker_genes(self, ingroup_file, ubiquity_threshold,
                              single_copy_threshold, redundancy,
                              valid_marker_genes, output_msa_dir,
                              output_model_dir):
        """Select ubiquitous, single-copy marker genes for the ingroup.

        Candidate markers are picked from the gene count table of the
        ingroup genomes, optionally restricted to a caller-supplied set,
        stripped of HMMs flagged as redundant, and finally the HMMs are
        fetched and the marker gene sequences aligned.

        Parameters
        ----------
        ingroup_file : str
            File specifying unique ids of ingroup genomes.
        ubiquity_threshold : float
            Threshold for defining ubiquity marker genes.
        single_copy_threshold : float
            Threshold for defining a single-copy marker gene.
        redundancy : float
            Threshold for declaring HMMs redundant; it is multiplied by
            the number of ingroup genomes before being applied.
        valid_marker_genes : iterable
            Restrict marker set to genes within this set. A falsy value
            (e.g. None or an empty collection) disables the restriction.
        output_msa_dir : str
            Directory to store multiple sequence alignment of marker genes.
        output_model_dir : str
            Directory to store HMMs of marker genes. The paths of the gene
            statistics and redundant marker tables handed to the helper
            methods are built relative to its parent directory.

        Returns
        -------
        tuple
            (number of ingroup genomes, number of NCBI genomes,
            number of user genomes, set of ingroup genome ids,
            marker gene statistics as returned by _marker_genes,
            final set of marker genes).
        """

        # per-genome data directories
        genome_dirs = read_genome_dir_file(self.genome_dir_file)

        # the ingroup is the union of the NCBI and user genomes
        ncbi_genome_ids, user_genome_ids = read_genome_id_file(ingroup_file)
        genome_ids = ncbi_genome_ids.union(user_genome_ids)
        for message, ids in (('Ingroup genomes: %d', genome_ids),
                             ('NCBI genomes: %d', ncbi_genome_ids),
                             ('User genomes: %d', user_genome_ids)):
            self.logger.info(message % len(ids))

        # candidate markers based on ubiquity and copy number
        self.logger.info('Identifying marker genes.')
        stats_path = os.path.join(output_model_dir, '..',
                                  'gene_stats.all.tsv')
        gene_count_table = self._gene_count_table(genome_ids, genome_dirs)
        marker_gene_stats = self._marker_genes(genome_ids,
                                               gene_count_table,
                                               ubiquity_threshold,
                                               single_copy_threshold,
                                               stats_path)

        candidates = set(marker_gene_stats.keys())
        if valid_marker_genes:
            num_candidates = len(candidates)
            self.logger.info(
                'Restricting %d identified markers to specified set of valid markers.'
                % num_candidates)
            candidates = candidates.intersection(valid_marker_genes)
        self.logger.info(
            'Identified ubiquitous, single-copy marker genes: %d' %
            len(candidates))

        # discard HMMs flagged as redundant; the threshold is scaled by
        # the number of ingroup genomes
        redundant_path = os.path.join(output_model_dir, '..',
                                      'redundant_markers.tsv')
        scaled_redundancy = redundancy * len(genome_ids)
        redundant_hmms = self._identify_redundant_hmms(candidates,
                                                       gene_count_table,
                                                       scaled_redundancy,
                                                       redundant_path)
        marker_genes = candidates - redundant_hmms
        self.logger.info(
            'Marker genes identified as redundant between TIGRFAM and Pfam: %d'
            % len(redundant_hmms))
        self.logger.info('Remaining ubiquitous, single-copy marker genes: %d' %
                         len(marker_genes))

        # fetch the HMM of every remaining marker
        self.logger.info('Fetching HMM for each marker genes.')
        self._fetch_marker_models(marker_genes, output_model_dir)

        # align the marker gene sequences
        aligner = AlignMarkers(self.cpus)
        aligner.run(genome_ids, genome_dirs, marker_genes, False,
                    output_msa_dir, output_model_dir)

        num_genomes = len(genome_ids)
        num_ncbi = len(ncbi_genome_ids)
        num_user = len(user_genome_ids)
        return (num_genomes, num_ncbi, num_user,
                genome_ids, marker_gene_stats, marker_genes)
Exemple #2
0
    def run(self, genome_id_file, marker_id_file, model, output_dir):
        """Infer phylogenetic tree.

        Fetches the HMMs of the requested marker genes, aligns the marker
        genes of the requested genomes, concatenates the alignments,
        runs FastTree on the concatenated alignment, and writes a short
        summary report to 'infer_workflow.log' in the output directory.

        Parameters
        ----------
        genome_id_file : str
            File specifying unique ids of genomes to include in tree.
        marker_id_file : str
            File specifying unique ids of marker genes to use for inference.
        model : str ['wag' or 'jtt']
            Model of evolution to use.
        output_dir : str
            Directory to store results.
        """

        time_keeper = TimeKeeper()

        output_alignment_dir = os.path.join(output_dir, 'alignments')
        make_sure_path_exists(output_alignment_dir)

        output_model_dir = os.path.join(output_dir, 'hmm_models')
        make_sure_path_exists(output_model_dir)

        # read directory for each genome
        genome_dirs = read_genome_dir_file(self.genome_dir_file)

        # read genomes to include in the tree
        ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
        genome_ids = ncbi_genome_ids.union(user_genome_ids)
        self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
        self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
        self.logger.info('User genomes: %d' % len(user_genome_ids))

        # get marker genes
        self.logger.info('Reading marker genes.')
        marker_genes = read_marker_id_file(marker_id_file)
        self.logger.info('Read %d marker genes.' % len(marker_genes))

        # gather all single-copy HMMs into a single model file
        hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
        hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
        self.logger.info('Generating marker gene HMM model files.')
        self._fetch_marker_models(marker_genes, hmm_model_out, hmm_info_out,
                                  output_model_dir)

        # align gene sequences
        align_markers = AlignMarkers(self.cpus)
        align_markers.run(genome_ids, genome_dirs, marker_genes, True,
                          output_alignment_dir, output_model_dir)

        # create concatenated alignment file
        self.logger.info('Concatenating alignments.')
        concatenated_alignment_file = os.path.join(
            output_dir, 'concatenated_alignment.faa')
        marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
        create_concatenated_alignment(genome_ids, marker_genes,
                                      output_alignment_dir,
                                      concatenated_alignment_file, marker_file)

        # create concatenated genome tree
        self.logger.info('Inferring concatenated genome tree.')
        concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
        concatenated_tree_log = os.path.join(output_dir,
                                             'concatenated.tree.log')
        log_file = os.path.join(output_dir, 'fasttree.log')
        fast_tree = FastTree(multithreaded=True)
        fast_tree.run(concatenated_alignment_file, 'prot', model,
                      concatenated_tree, concatenated_tree_log, log_file)

        # generate summary report; the context manager guarantees the file
        # is closed even if one of the writes raises
        report_out = os.path.join(output_dir, 'infer_workflow.log')
        with open(report_out, 'w') as fout:
            fout.write('[infer]\n')
            fout.write('Genome Id file: %s\n' % genome_id_file)
            fout.write('Marker Id file: %s\n' % marker_id_file)
            fout.write('Model of evolution: %s\n' % model)
            fout.write(time_keeper.get_time_stamp())
    def run(self, genome_id_file,
                    marker_id_file,
                    model,
                    output_dir):
        """Infer phylogenetic tree.

        Fetches the HMMs of the requested marker genes, aligns the marker
        genes of the requested genomes, concatenates the alignments,
        runs FastTree on the concatenated alignment, and writes a short
        summary report to 'infer_workflow.log' in the output directory.

        Parameters
        ----------
        genome_id_file : str
            File specifying unique ids of genomes to include in tree.
        marker_id_file : str
            File specifying unique ids of marker genes to use for inference.
        model : str ['wag' or 'jtt']
            Model of evolution to use.
        output_dir : str
            Directory to store results.
        """

        time_keeper = TimeKeeper()

        output_alignment_dir = os.path.join(output_dir, 'alignments')
        make_sure_path_exists(output_alignment_dir)

        output_model_dir = os.path.join(output_dir, 'hmm_models')
        make_sure_path_exists(output_model_dir)

        # read directory for each genome
        genome_dirs = read_genome_dir_file(self.genome_dir_file)

        # read genomes to include in the tree
        ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
        genome_ids = ncbi_genome_ids.union(user_genome_ids)
        self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
        self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
        self.logger.info('User genomes: %d' % len(user_genome_ids))

        # get marker genes
        self.logger.info('Reading marker genes.')
        marker_genes = read_marker_id_file(marker_id_file)
        self.logger.info('Read %d marker genes.' % len(marker_genes))

        # gather all single-copy HMMs into a single model file
        hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
        hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
        self.logger.info('Generating marker gene HMM model files.')
        self._fetch_marker_models(marker_genes, hmm_model_out, hmm_info_out, output_model_dir)

        # align gene sequences
        align_markers = AlignMarkers(self.cpus)
        align_markers.run(genome_ids, genome_dirs, marker_genes, True, output_alignment_dir, output_model_dir)

        # create concatenated alignment file
        self.logger.info('Concatenating alignments.')
        concatenated_alignment_file = os.path.join(output_dir, 'concatenated_alignment.faa')
        marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
        create_concatenated_alignment(genome_ids, marker_genes, output_alignment_dir, concatenated_alignment_file, marker_file)

        # create concatenated genome tree
        self.logger.info('Inferring concatenated genome tree.')
        concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
        concatenated_tree_log = os.path.join(output_dir, 'concatenated.tree.log')
        log_file = os.path.join(output_dir, 'fasttree.log')
        fast_tree = FastTree(multithreaded=True)
        fast_tree.run(concatenated_alignment_file, 'prot', model, concatenated_tree, concatenated_tree_log, log_file)

        # generate summary report; using a context manager so the file
        # handle is released even when a write fails part-way through
        report_out = os.path.join(output_dir, 'infer_workflow.log')
        with open(report_out, 'w') as fout:
            fout.write('[infer]\n')
            fout.write('Genome Id file: %s\n' % genome_id_file)
            fout.write('Marker Id file: %s\n' % marker_id_file)
            fout.write('Model of evolution: %s\n' % model)
            fout.write(time_keeper.get_time_stamp())