コード例 #1
0
ファイル: main.py プロジェクト: Python3pkg/CompareM
    def aai(self, options):
        """Run the AAI command and log where its results were written.

        Checks that ``options.sorted_hit_table`` exists, passes
        ``options.output_dir`` to ``make_sure_path_exists`` (presumably
        creating it when missing), runs an ``AAICalculator`` and logs the
        paths it returns.

        Parameters
        ----------
        options : object
            Parsed command-line options. Attributes read here:
            ``sorted_hit_table``, ``output_dir``, ``cpus``,
            ``query_gene_file``, ``evalue``, ``per_identity``,
            ``per_aln_len`` and ``keep_rbhs``.
        """
        check_file_exists(options.sorted_hit_table)
        make_sure_path_exists(options.output_dir)

        calculator = AAICalculator(options.cpus)

        # The second positional argument is deliberately None here
        # (presumably "no separate target gene file" -- confirm against
        # AAICalculator.run).
        run_args = (
            options.query_gene_file,
            None,
            options.sorted_hit_table,
            options.evalue,
            options.per_identity,
            options.per_aln_len,
            options.keep_rbhs,
            options.output_dir,
        )
        aai_path, rbh_path = calculator.run(*run_args)

        # The reciprocal-best-hit file is only reported when run() handed
        # back a truthy value for it.
        if rbh_path:
            rbh_message = ('Identified reciprocal best hits written to: %s'
                           % rbh_path)
            self.logger.info(rbh_message)

        aai_message = 'AAI between genomes written to: %s' % aai_path
        self.logger.info(aai_message)
コード例 #2
0
ファイル: main.py プロジェクト: dparks1134/CompareM
    def aai(self, options):
        """AAI command.

        Validates the inputs named in ``options``, delegates the
        calculation to ``AAICalculator.run`` and reports the output file
        locations through ``self.logger``.

        Parameters
        ----------
        options : object
            Parsed command-line options; this method reads
            ``sorted_hit_table``, ``output_dir``, ``cpus``,
            ``query_gene_file``, ``evalue``, ``per_identity``,
            ``per_aln_len`` and ``keep_rbhs``.
        """
        opts = options

        # Fail early on a missing hit table before touching the output
        # location.
        check_file_exists(opts.sorted_hit_table)
        make_sure_path_exists(opts.output_dir)

        # None is passed as the second positional argument (presumably the
        # target gene file slot -- verify against AAICalculator.run).
        outputs = AAICalculator(opts.cpus).run(
            opts.query_gene_file,
            None,
            opts.sorted_hit_table,
            opts.evalue,
            opts.per_identity,
            opts.per_aln_len,
            opts.keep_rbhs,
            opts.output_dir)
        aai_file, rbh_file = outputs

        log = self.logger
        if rbh_file:
            # Only mention the RBH file when one was actually returned.
            log.info('Identified reciprocal best hits written to: %s' % rbh_file)

        log.info('AAI between genomes written to: %s' % aai_file)
コード例 #3
0
ファイル: main.py プロジェクト: ctSkennerton/CompareM
    def aai(self, options):
        """AAI command.

        Logs a banner, collects genome identifiers from the ``.faa`` files
        in the ``genes`` sub-directory of ``options.rblast_dir``, runs an
        ``AAICalculator`` over them and logs where the shared-gene output
        was written. Exits via ``sys.exit()`` after a warning when no
        ``.faa`` files are found.

        Parameters
        ----------
        options : object
            Parsed command-line options; this method reads ``rblast_dir``,
            ``output_dir``, ``cpus``, ``per_identity``, ``per_aln_len`` and
            ``write_shared_genes``.
        """
        log = self.logger

        # Banner announcing the command.
        for banner_line in (
                '',
                '*******************************************************************************',
                ' [CompareM - aai] Calculating the AAI between homologs in genome pairs.',
                '*******************************************************************************',
                ''):
            log.info(banner_line)

        check_dir_exists(options.rblast_dir)
        make_sure_path_exists(options.output_dir)

        # Every '.faa' file in <rblast_dir>/genes contributes one genome id
        # (its file name with the extension removed).
        protein_dir = os.path.join(options.rblast_dir, 'genes')
        genome_ids = [remove_extension(file_name, '.faa')
                      for file_name in os.listdir(protein_dir)
                      if file_name.endswith('.faa')]

        if len(genome_ids) == 0:
            log.warning('  [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            sys.exit()

        aai_calculator = AAICalculator(options.cpus)
        aai_calculator.run(genome_ids,
                           protein_dir,
                           options.rblast_dir,
                           options.per_identity,
                           options.per_aln_len,
                           options.write_shared_genes,
                           options.output_dir)

        # The calculator exposes the name of its shared-genes location via
        # its 'shared_genes' attribute; report it relative to the output dir.
        shared_genes_dir = os.path.join(options.output_dir,
                                        aai_calculator.shared_genes)
        log.info('')
        log.info('  Identified homologs between genome pairs written to: %s' % shared_genes_dir)

        self.time_keeper.print_time_stamp()
コード例 #4
0
ファイル: classify.py プロジェクト: gsc0107/CompareM
    def run(self, query_gene_file, target_gene_file, sorted_hit_table,
            evalue_threshold, per_iden_threshold, per_aln_len_threshold,
            num_top_targets, taxonomy_file, keep_rbhs, output_dir):
        """Classify genomes based on AAI to reference genomes.

        Runs the AAI calculation into ``<output_dir>/aai``, reads the
        resulting ``aai_summary.tsv`` and writes the top scoring targets of
        every query genome to ``<output_dir>/classify.tsv``.

        Parameters
        ----------
        query_gene_file : str
            File with all query genes in FASTA format.
        target_gene_file : str
            File with all target genes in FASTA format.
        sorted_hit_table : str
            Sorted table indicating genes with sequence similarity.
        evalue_threshold : float
            Evalue threshold used to define a homologous gene.
        per_iden_threshold : float
            Percent identity threshold used to define a homologous gene.
        per_aln_len_threshold : float
            Alignment length threshold used to define a homologous gene.
        num_top_targets : int
            Number of top scoring target genomes to report per query genome.
        taxonomy_file : str
            File indicating taxonomic identification of all target genomes
            (one ``genome_id<TAB>taxonomy`` pair per line). May be empty or
            None, in which case no taxonomy is reported.
        keep_rbhs : boolean
            Flag indicating if RBH should be written to file.
        output_dir : str
            Directory to store AAI results.

        Returns
        -------
        str
            Path of the written classification table.
        """

        # read taxonomic identification of each genome
        taxonomy = {}
        if taxonomy_file:
            # context manager so the handle is closed even if a line is
            # malformed and the unpacking below raises
            with open(taxonomy_file) as tax_in:
                for line in tax_in:
                    if not line.strip():
                        # tolerate blank lines (e.g. a trailing newline)
                        continue
                    genome_id, taxa_str = line.rstrip().split('\t')
                    taxonomy[genome_id] = taxa_str

        # calculate AAI between query and target genomes; the returned file
        # names are not needed since the summary is read back from
        # aai_output_dir below
        aai_output_dir = os.path.join(output_dir, 'aai')
        make_sure_path_exists(aai_output_dir)
        aai_calculator = AAICalculator(self.cpus)
        aai_calculator.run(
            query_gene_file, target_gene_file, sorted_hit_table,
            evalue_threshold, per_iden_threshold, per_aln_len_threshold,
            keep_rbhs, aai_output_dir)

        # determine matches to each query genomes
        aai_results_file = os.path.join(aai_output_dir, 'aai_summary.tsv')
        hits = defaultdict(list)
        with open(aai_results_file) as f:
            f.readline()  # skip header line

            for line in f:
                line_split = line.rstrip().split('\t')
                # columns used: 0 = query id, 2 = target id, 5 = AAI, 7 = OF
                query_id = line_split[0]
                target_id = line_split[2]
                aai = float(line_split[5])
                of = float(line_split[7])

                hits[query_id].append([target_id, aai, of])

        # report top matches
        results_file = os.path.join(output_dir, 'classify.tsv')
        with open(results_file, 'w') as fout:
            fout.write('Query Id\tTarget Id\tAAI\tOF\tScore')
            if taxonomy:
                fout.write('\tTarget Taxonomy')
            fout.write('\n')

            # Slicing replaces the xrange() index loop (xrange does not
            # exist on Python 3). Clamping at zero keeps a negative count
            # reporting nothing, as the index loop did.
            num_to_report = max(num_top_targets, 0)
            for query_id, cur_hits in hits.items():
                # best AAI first
                cur_hits.sort(key=lambda x: x[1], reverse=True)
                for target_id, aai, of in cur_hits[:num_to_report]:
                    fout.write('%s\t%s\t%.2f\t%.2f'
                               % (query_id, target_id, aai, of))

                    # score is the sum of AAI and OF
                    fout.write('\t%.2f' % (aai + of))

                    # NOTE(review): targets missing from the taxonomy file
                    # get no taxonomy field, so their rows are one column
                    # shorter than the header.
                    if target_id in taxonomy:
                        fout.write('\t%s' % taxonomy[target_id])

                    fout.write('\n')

        return results_file
コード例 #5
0
ファイル: classify.py プロジェクト: dparks1134/CompareM
    def run(self, query_gene_file,
            target_gene_file,
            sorted_hit_table,
            evalue_threshold,
            per_iden_threshold,
            per_aln_len_threshold,
            num_top_targets,
            taxonomy_file,
            keep_rbhs,
            output_dir):
        """Classify genomes based on AAI to reference genomes.

        The AAI calculation is written to the ``aai`` sub-directory of
        ``output_dir``; its ``aai_summary.tsv`` is then parsed and the best
        targets of each query genome are written to ``classify.tsv`` in
        ``output_dir``.

        Parameters
        ----------
        query_gene_file : str
            File with all query genes in FASTA format.
        target_gene_file : str
            File with all target genes in FASTA format.
        sorted_hit_table : str
            Sorted table indicating genes with sequence similarity.
        evalue_threshold : float
            Evalue threshold used to define a homologous gene.
        per_iden_threshold : float
            Percent identity threshold used to define a homologous gene.
        per_aln_len_threshold : float
            Alignment length threshold used to define a homologous gene.
        num_top_targets : int
            Number of top scoring target genomes to report per query genome.
        taxonomy_file : str
            File indicating taxonomic identification of all target genomes,
            as tab-separated ``genome_id`` / ``taxonomy`` pairs. A falsy
            value disables taxonomy reporting.
        keep_rbhs : boolean
            Flag indicating if RBH should be written to file.
        output_dir : str
            Directory to store AAI results.

        Returns
        -------
        str
            Path of the classification table that was written.
        """

        # read taxonomic identification of each genome
        taxonomy = {}
        if taxonomy_file:
            # 'with' guarantees the file is closed, including when a
            # malformed line makes the unpacking raise
            with open(taxonomy_file) as taxonomy_in:
                for line in taxonomy_in:
                    if not line.strip():
                        continue  # ignore blank lines
                    genome_id, taxa_str = line.rstrip().split('\t')
                    taxonomy[genome_id] = taxa_str

        # calculate AAI between query and target genomes
        aai_output_dir = os.path.join(output_dir, 'aai')
        make_sure_path_exists(aai_output_dir)
        aai_calculator = AAICalculator(self.cpus)
        # The file names returned by run() are not used; the summary table
        # is located by its name inside aai_output_dir instead.
        aai_calculator.run(query_gene_file,
                           target_gene_file,
                           sorted_hit_table,
                           evalue_threshold,
                           per_iden_threshold,
                           per_aln_len_threshold,
                           keep_rbhs,
                           aai_output_dir)

        # determine matches to each query genomes
        aai_results_file = os.path.join(aai_output_dir, 'aai_summary.tsv')
        hits = defaultdict(list)
        with open(aai_results_file) as f:
            f.readline()  # discard the header line

            for line in f:
                line_split = line.rstrip().split('\t')
                # fields read: 0 = query id, 2 = target id, 5 = AAI, 7 = OF
                query_id = line_split[0]
                target_id = line_split[2]
                aai = float(line_split[5])
                of = float(line_split[7])

                hits[query_id].append([target_id, aai, of])

        # report top matches
        results_file = os.path.join(output_dir, 'classify.tsv')
        with open(results_file, 'w') as fout:
            fout.write('Query Id\tTarget Id\tAAI\tOF\tScore')
            if taxonomy:
                fout.write('\tTarget Taxonomy')
            fout.write('\n')

            # A slice takes the place of the xrange() index loop, which
            # fails with NameError on Python 3. A negative request is
            # clamped to zero so it still reports no rows.
            report_count = max(num_top_targets, 0)
            for query_id, cur_hits in hits.items():
                # order hits from highest to lowest AAI
                cur_hits.sort(key=lambda x: x[1], reverse=True)
                for target_id, aai, of in cur_hits[:report_count]:
                    fout.write('%s\t%s\t%.2f\t%.2f'
                               % (query_id, target_id, aai, of))

                    # score column: AAI plus OF
                    fout.write('\t%.2f' % (aai + of))

                    # NOTE(review): when a target has no taxonomy entry the
                    # row ends up one field shorter than the header.
                    if target_id in taxonomy:
                        fout.write('\t%s' % taxonomy[target_id])

                    fout.write('\n')

        return results_file