Example #1
    def run(self, aa_gene_files, evalue, per_identity, output_dir):
        """Apply reciprocal blast to all pairs of genomes in parallel.

        Parameters
        ----------
        aa_gene_files : list of str
            Amino acid fasta files to process via reciprocal blast.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        output_dir : str
            Directory to store blast results.
        """

        # concatenate all gene files and create a single diamond database
        self.logger.info('  Creating diamond database (be patient!).')
        gene_file = os.path.join(output_dir, 'all_genes.faa')
        concatenate_files(aa_gene_files, gene_file)
        diamond_db = os.path.join(output_dir, 'all_genes')

        diamond = Diamond(self.cpus)
        diamond.make_database(gene_file, diamond_db)

        # blast all genes against the database
        self.logger.info('')
        self.logger.info('  Identifying hits between all pairs of genomes (be patient!).')
        hits_daa_file = os.path.join(output_dir, 'all_hits')
        diamond.blastp(gene_file, diamond_db, evalue, per_identity, len(aa_gene_files) * 10, hits_daa_file)

        # create flat hits table
        self.logger.info('  Creating table with hits.')
        hits_table_file = os.path.join(output_dir, 'all_hits.tsv')
        diamond.view(hits_daa_file + '.daa', hits_table_file)
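A minimal driver sketch for this method. Example #2 below suggests it belongs to a ReciprocalDiamond class; the constructor argument, file pattern, and thresholds here are illustrative assumptions.

    import glob

    # Hypothetical usage; ReciprocalDiamond and all paths/values are assumptions.
    aa_gene_files = sorted(glob.glob('genes/*.faa'))   # one protein FASTA file per genome
    rdiamond = ReciprocalDiamond(8)                     # 8 CPUs
    rdiamond.run(aa_gene_files,
                 evalue=1e-5,                           # report hits with e-value <= 1e-5
                 per_identity=30.0,                     # and percent identity >= 30%
                 output_dir='rblast_output')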
Example #2
    def rblast(self, options):
        """Reciprocal blast command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - rblast] Performing reciprocal blast between genomes.')
        self.logger.info('*******************************************************************************')

        check_dir_exists(options.protein_dir)
        make_sure_path_exists(options.output_dir)

        aa_gene_files = []
        for f in os.listdir(options.protein_dir):
            if f.endswith(options.protein_ext):
                aa_gene_files.append(os.path.join(options.protein_dir, f))

        if not aa_gene_files:
            self.logger.warning('  [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            sys.exit()

        # modify gene ids to include genome ids in order to ensure
        # all gene identifiers are unique across the set of genomes,
        # also removes the trailing asterisk used to identify the stop
        # codon
        self.logger.info('')
        self.logger.info('  Appending genome identifiers to all gene identifiers.')
        gene_out_dir = os.path.join(options.output_dir, 'genes')
        make_sure_path_exists(gene_out_dir)
        modified_aa_gene_files = []
        for gf in aa_gene_files:
            genome_id = remove_extension(gf)

            aa_file = os.path.join(gene_out_dir, genome_id + '.faa')
            fout = open(aa_file, 'w')
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
                if seq[-1] == '*':
                    seq = seq[0:-1]
                fout.write(seq + '\n')
            fout.close()

            modified_aa_gene_files.append(aa_file)

        # perform the reciprocal blast with blastp or diamond
        self.logger.info('')
        if options.blastp:
            rblast = ReciprocalBlast(options.cpus)
            rblast.run(modified_aa_gene_files, options.evalue, options.output_dir)

            # concatenate all blast tables to mimic output of diamond, all hits
            # for a given genome MUST be in consecutive order to fully mimic
            # the expected results from diamond
            self.logger.info('')
            self.logger.info('  Creating single file with all blast hits (be patient!).')
            blast_files = sorted([f for f in os.listdir(options.output_dir) if f.endswith('.blastp.tsv')])
            hit_tables = [os.path.join(options.output_dir, f) for f in blast_files]
            concatenate_files(hit_tables, os.path.join(options.output_dir, 'all_hits.tsv'))
        else:
            rdiamond = ReciprocalDiamond(options.cpus)
            rdiamond.run(modified_aa_gene_files, options.evalue, options.per_identity, options.output_dir)

        self.logger.info('')
        self.logger.info('  Reciprocal blast hits written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()
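A hedged invocation sketch for this command. The enclosing class is not shown in the excerpt, so OptionsParser is a placeholder name, and every path and threshold below is an illustrative assumption.

    from argparse import Namespace

    options = Namespace(protein_dir='proteins',         # one protein FASTA file per genome
                        protein_ext='.faa',
                        output_dir='rblast_output',
                        evalue=1e-5,
                        per_identity=30.0,
                        cpus=8,
                        blastp=False)                    # False -> DIAMOND path (see Example #1)
    OptionsParser().rblast(options)                      # placeholder for the enclosing class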
Example #3
    def run(self, query_gene_file, target_gene_file, sorted_hit_table,
            evalue_threshold, per_iden_threshold, per_aln_len_threshold,
            keep_rbhs, output_dir):
        """Calculate amino acid identity (AAI) between pairs of genomes.

        Parameters
        ----------
        query_gene_file : str
            File with all query genes in FASTA format.
        target_gene_file : str or None
            File with all target genes in FASTA format, or None if performing a reciprocal AAI calculation.
        sorted_hit_table : str
            Sorted table indicating genes with sequence similarity.
        evalue_threshold : float
            E-value threshold used to define a homologous gene.
        per_iden_threshold : float
            Percent identity threshold used to define a homologous gene.
        per_aln_len_threshold : float
            Alignment length threshold used to define a homologous gene.
        keep_rbhs : boolean
            Flag indicating if RBH should be written to file.
        output_dir : str
            Directory to store AAI results.
        """

        self.sorted_hit_table = sorted_hit_table
        self.evalue_threshold = evalue_threshold
        self.per_identity_threshold = per_iden_threshold
        self.per_aln_len_threshold = per_aln_len_threshold
        self.keep_rbhs = keep_rbhs
        self.output_dir = output_dir

        # calculate length of genes and number of genes in each genome
        self.logger.info('Calculating length of genes.')
        self.gene_lengths = {}
        self.query_gene_count = defaultdict(int)
        query_genomes = set()
        for seq_id, seq in seq_io.read_fasta_seq(query_gene_file):
            if seq[-1] == '*':
                self.gene_lengths[seq_id] = len(seq) - 1
            else:
                self.gene_lengths[seq_id] = len(seq)

            genome_id = seq_id[0:seq_id.find('~')]
            self.query_gene_count[genome_id] += 1
            query_genomes.add(genome_id)

        self.target_gene_count = defaultdict(int)
        target_genomes = set()
        if target_gene_file:
            for seq_id, seq in seq_io.read_fasta_seq(target_gene_file):
                if seq[-1] == '*':
                    self.gene_lengths[seq_id] = len(seq) - 1
                else:
                    self.gene_lengths[seq_id] = len(seq)

                genome_id = seq_id[0:seq_id.find('~')]
                self.target_gene_count[genome_id] += 1
                target_genomes.add(genome_id)
        else:
            self.target_gene_count = self.query_gene_count

        # get byte offset of hits from each genome
        self.logger.info('Indexing sorted hit table.')
        self.offset_table = self._genome_offsets(self.sorted_hit_table)

        # calculate AAI between each pair of genomes in parallel
        if target_genomes:
            # compare query genomes to target genomes
            self.num_pairs = len(query_genomes) * len(target_genomes)
            self.logger.info(
                'Calculating AAI between %d query and %d target genomes:' %
                (len(query_genomes), len(target_genomes)))
        else:
            # compute pairwise values between all pairs of query genomes
            ng = len(query_genomes)
            self.num_pairs = (ng * ng - ng) // 2
            self.logger.info(
                'Calculating AAI between all %d pairs of genomes:' %
                self.num_pairs)

        if self.num_pairs == 0:
            self.logger.warning('No genome pairs identified.')
            return

        genome_id_lists = []
        query_genomes = list(query_genomes)
        target_genomes = list(target_genomes)
        for i in range(0, len(query_genomes)):
            genome_idI = query_genomes[i]

            if target_genomes:
                genome_id_list = target_genomes
            else:
                genome_id_list = []
                for j in range(i + 1, len(query_genomes)):
                    genome_idJ = query_genomes[j]
                    genome_id_list.append(genome_idJ)

            genome_id_lists.append((genome_idI, genome_id_list))

        self.processed_paired = 0
        parallel = Parallel(self.cpus)

        progress_func = self._progress
        if self.logger.is_silent:
            progress_func = None
        consumer_data = parallel.run(self._producer, self._consumer,
                                     genome_id_lists, progress_func)

        # write results for each genome pair
        self.logger.info('Summarizing AAI results.')
        aai_summay_file = os.path.join(output_dir, 'aai_summary.tsv')
        fout = open(aai_summay_file, 'w')
        fout.write(
            'Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n'
        )

        for data in consumer_data:
            fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)

        fout.close()

        # concatenate RBH files
        rbh_output_file = None
        if self.keep_rbhs:
            self.logger.info('Concatenating RBH files.')
            rbh_files = []
            for genome_id in query_genomes:
                rbh_files.append(
                    os.path.join(self.output_dir, genome_id + '.rbh.tsv'))

            rbh_output_file = os.path.join(self.output_dir, 'rbh.tsv')
            concatenate_files(rbh_files, rbh_output_file, common_header=True)

            for f in rbh_files:
                os.remove(f)

        return aai_summay_file, rbh_output_file
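A call sketch for this method, assuming it sits on CompareM's AAI calculator (AAICalculator is a placeholder name) and that the sorted hit table comes from the reciprocal search in Examples #1-2; only the argument order is taken from the signature above.

    aai_calc = AAICalculator(8)              # placeholder class name; 8 CPUs
    result = aai_calc.run('all_genes.faa',   # query_gene_file (assumed path)
                          None,              # target_gene_file: None -> all-vs-all comparison
                          'hits_sorted.tsv', # sorted_hit_table (assumed path)
                          1e-5,              # evalue_threshold
                          30.0,              # per_iden_threshold (percent identity)
                          70.0,              # per_aln_len_threshold (percent alignment length)
                          True,              # keep_rbhs: also write per-pair RBHs to rbh.tsv
                          'aai_output')
    if result:                               # run() returns None when no genome pairs are found
        aai_summary_file, rbh_file = result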
Example #4
    def run(self, query_proteins, db_file, custom_db_file, taxonomy_file,
            custom_taxonomy_file, evalue, per_identity, per_aln_len,
            max_matches, homology_search, min_per_taxa, consensus, min_per_bp,
            use_trimAl, restrict_taxon, msa_program, tree_program, prot_model,
            skip_rooting, output_dir):
        """Infer a gene tree for homologs genes identified by blast.

        Workflow for inferring a gene tree from sequences identified as being
        homologous to a set of query proteins. Homologs are identified using BLASTP
        and a set of user-defined parameters.

        Parameters
        ----------
        query_proteins : str
            Fasta file containing query proteins.
        db_file : str
            BLAST database of reference proteins.
        custom_db_file : str
            Custom database of proteins.
        taxonomy_file : str
            Taxonomic assignment of each reference genome.
        custom_taxonomy_file : str
            Taxonomic assignment of genomes in custom database.
        evalue : float
            E-value threshold used to define a homolog.
        per_identity : float
            Percent identity threshold used to define a homolog.
        per_aln_len : float
            Alignment length threshold used to define a homolog.
        max_matches : int
            Maximum matches per query protein.
        homology_search : str
            Type of homology search to perform.
        min_per_taxa : float
            Minimum percentage of taxa required to retain a column.
        consensus : float
            Minimum percentage of the same amino acid required to retain column.
        min_per_bp : float
            Minimum percentage of base pairs required to keep trimmed sequence.
        use_trimAl : boolean
            Filter columns using trimAl.
        restrict_taxon : str
            Restrict alignment to specific taxonomic group (e.g., k__Archaea).
        msa_program : str
            Program to use for multiple sequence alignment ['mafft', 'muscle'].
        tree_program : str
            Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
            Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        skip_rooting : boolean
            Skip midpoint rooting if True.
        output_dir : str
            Directory to store results.
        """

        # validate query sequence names for use with GeneTreeTk
        validate_seq_ids(query_proteins)

        # read taxonomy file
        self.logger.info('Reading taxonomy file.')
        taxonomy = Taxonomy().read(taxonomy_file)

        if custom_taxonomy_file:
            custom_taxonomy = Taxonomy().read(custom_taxonomy_file)
            taxonomy.update(custom_taxonomy)

        # report distribution of query genes
        mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(
            query_proteins)
        self.logger.info(
            'Query gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
            % (min_len, mean_len, max_len, p10, p50, p90))

        # identify homologs using BLASTP
        self.logger.info('Identifying homologs using %s.' % homology_search)
        blast = Blast(self.cpus)
        blast_output = os.path.join(output_dir, 'reference_hits.tsv')
        if homology_search == 'diamond':
            diamond = Diamond(self.cpus)
            diamond.blastp(query_proteins,
                           db_file,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_matches,
                           blast_output,
                           output_fmt='custom')
        else:
            blast.blastp(query_proteins,
                         db_file,
                         blast_output,
                         evalue,
                         max_matches,
                         output_fmt='custom',
                         task=homology_search)
        homologs = blast.identify_homologs(blast_output, evalue, per_identity,
                                           per_aln_len)
        self.logger.info('Identified %d homologs in reference database.' %
                         len(homologs))

        custom_homologs = None
        if custom_db_file:
            custom_blast_output = os.path.join(output_dir, 'custom_hits.tsv')
            if homology_search == 'diamond':
                diamond = Diamond(self.cpus)
                diamond.blastp(query_proteins,
                               custom_db_file,
                               evalue,
                               per_identity,
                               per_aln_len,
                               max_matches,
                               custom_blast_output,
                               output_fmt='custom')
            else:
                blast.blastp(query_proteins,
                             custom_db_file,
                             custom_blast_output,
                             evalue,
                             max_matches,
                             output_fmt='custom',
                             task=homology_search)
            custom_homologs = blast.identify_homologs(custom_blast_output,
                                                      evalue, per_identity,
                                                      per_aln_len)
            self.logger.info('Identified %d homologs in custom database.' %
                             len(custom_homologs))

        # restrict homologs to specific taxonomic group
        if restrict_taxon:
            self.logger.info('Restricting homologs to %s.' % restrict_taxon)
            restricted_homologs = {}
            for query_id, hit in homologs.items():
                genome_id = hit.subject_id.split('~')[0]
                if restrict_taxon in taxonomy[genome_id]:
                    restricted_homologs[query_id] = hit

            self.logger.info(
                '%d of %d homologs in reference database are from the specified group.'
                % (len(restricted_homologs), len(homologs)))
            homologs = restricted_homologs

        if len(homologs) == 0:
            self.logger.error(
                'Too few homologs were identified. Gene tree cannot be inferred.'
            )
            sys.exit()

        # extract homologs
        self.logger.info(
            'Extracting homologs and determining local gene context.')
        db_homologs_tmp = os.path.join(output_dir, 'homologs_db.tmp')
        gene_precontext, gene_postcontext = self.extract_homologs_and_context(
            homologs.keys(), db_file, db_homologs_tmp)

        # report gene length distribution of homologs
        mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(
            db_homologs_tmp)
        self.logger.info(
            'Homolog gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
            % (min_len, mean_len, max_len, p10, p50, p90))

        # concatenate homologs with initial query genes
        homolog_ouput_tmp = os.path.join(output_dir, 'homologs.faa.tmp')
        if custom_homologs:
            custom_db_homologs_tmp = os.path.join(output_dir,
                                                  'custom_homologs_db.tmp')
            custom_gene_precontext, custom_gene_postcontext = self.extract_homologs_and_context(
                custom_homologs.keys(), custom_db_file, custom_db_homologs_tmp)
            gene_precontext.update(custom_gene_precontext)
            gene_postcontext.update(custom_gene_postcontext)
            homologs.update(custom_homologs)
            concatenate_files(
                [query_proteins, db_homologs_tmp, custom_db_homologs_tmp],
                homolog_ouput_tmp)
            os.remove(custom_db_homologs_tmp)
        else:
            concatenate_files([query_proteins, db_homologs_tmp],
                              homolog_ouput_tmp)

        os.remove(db_homologs_tmp)

        # remove stop codons
        homolog_ouput = os.path.join(output_dir, 'homologs.faa')
        self._remove_stop_codons(homolog_ouput_tmp, homolog_ouput)
        os.remove(homolog_ouput_tmp)

        # infer multiple sequence alignment
        msa = MsaWorkflow(self.cpus)
        trimmed_msa_output = msa.run(homolog_ouput, min_per_taxa, consensus,
                                     min_per_bp, use_trimAl, msa_program,
                                     output_dir)

        # infer tree
        tw = TreeWorkflow(self.cpus)
        tree_output = tw.run(trimmed_msa_output, tree_program, prot_model,
                             skip_rooting, output_dir)

        # create tax2tree consensus map and decorate tree
        self.logger.info('Decorating internal tree nodes with tax2tree.')
        output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        fout = open(output_taxonomy_file, 'w')
        for homolog_id in homologs.keys():
            genome_id = homolog_id.split('~')[0]
            t = taxonomy.get(genome_id, None)
            if t:
                fout.write(homolog_id + '\t' + ';'.join(t) + '\n')
        fout.close()

        t2t_tree = os.path.join(output_dir, 'homologs.tax2tree.tree')
        cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                                  tree_output, t2t_tree)
        os.system(cmd)

        # create tree with leaf nodes given as genome accessions
        tree = dendropy.Tree.get_from_path(t2t_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for leaf in tree.leaf_node_iter():
            leaf.taxon.label = leaf.taxon.label.split('~')[0]

        genome_tree = os.path.join(output_dir,
                                   'homologs.tax2tree.genome_accessions.tree')
        tree.write_to_path(genome_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        # setup metadata for ARB file
        src_dir = os.path.dirname(os.path.realpath(__file__))
        version_file = open(os.path.join(src_dir, 'VERSION'))

        metadata = {}
        metadata['genetreetk_version'] = version_file.read().strip()
        metadata['genetreetk_query_proteins'] = query_proteins
        metadata['genetreetk_db_file'] = db_file
        metadata['genetreetk_taxonomy_file'] = taxonomy_file
        metadata['genetreetk_blast_evalue'] = str(evalue)
        metadata['genetreetk_blast_per_identity'] = str(per_identity)
        metadata['genetreetk_blast_per_aln_len'] = str(per_aln_len)
        metadata['genetreetk_blast_max_matches'] = str(max_matches)
        metadata['genetreetk_homology_search'] = homology_search

        metadata['genetreetk_msa_min_per_taxa'] = str(min_per_taxa)
        metadata['genetreetk_msa_consensus'] = str(consensus)
        metadata['genetreetk_msa_min_per_bp'] = str(min_per_bp)
        metadata['genetreetk_msa_program'] = msa_program

        metadata['genetreetk_tree_program'] = tree_program
        metadata['genetreetk_tree_prot_model'] = prot_model

        # create ARB metadata file
        self.logger.info('Creating ARB metadata file.')
        arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
        self.create_arb_metadata(homologs, trimmed_msa_output, taxonomy,
                                 metadata, gene_precontext, gene_postcontext,
                                 arb_metadata_file)
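A call sketch with illustrative values. The enclosing class is not shown in the excerpt (TreeRunner is a placeholder), and every path, threshold, and program choice below is an assumption; only the keyword names follow the signature above.

    runner = TreeRunner(8)  # placeholder class exposing the run() above; 8 CPUs
    runner.run(query_proteins='query.faa',
               db_file='reference_proteins.faa',       # BLAST/DIAMOND database of reference proteins
               custom_db_file=None,
               taxonomy_file='reference_taxonomy.tsv',
               custom_taxonomy_file=None,
               evalue=1e-5,
               per_identity=30.0,
               per_aln_len=70.0,
               max_matches=100,
               homology_search='blastp-fast',          # or 'diamond' to use DIAMOND
               min_per_taxa=50.0,
               consensus=25.0,
               min_per_bp=50.0,
               use_trimAl=False,
               restrict_taxon=None,                    # e.g., 'k__Archaea'
               msa_program='mafft',
               tree_program='fasttree',
               prot_model='LG',
               skip_rooting=False,
               output_dir='gene_tree_output')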
Example #5
    def run(self, query_gene_file, target_gene_file, sorted_hit_table,
            evalue_threshold, per_iden_threshold, per_aln_len_threshold,
            keep_rbhs, output_dir):
        """Calculate amino acid identity (AAI) between pairs of genomes.

        Parameters
        ----------
        query_gene_file : str
            File with all query genes in FASTA format.
        target_gene_file : str or None
            File with all target genes in FASTA format, or None if performing a reciprocal AAI calculation.
        sorted_hit_table : str
            Sorted table indicating genes with sequence similarity.
        evalue_threshold : float
            E-value threshold used to define a homologous gene.
        per_iden_threshold : float
            Percent identity threshold used to define a homologous gene.
        per_aln_len_threshold : float
            Alignment length threshold used to define a homologous gene.
        keep_rbhs : boolean
            Flag indicating if RBH should be written to file.
        output_dir : str
            Directory to store AAI results.
        """

        self.sorted_hit_table = sorted_hit_table
        self.evalue_threshold = evalue_threshold
        self.per_identity_threshold = per_iden_threshold
        self.per_aln_len_threshold = per_aln_len_threshold
        self.keep_rbhs = keep_rbhs
        self.output_dir = output_dir

        # calculate length of genes and number of genes in each genome
        self.logger.info('Calculating length of genes.')
        self.gene_lengths = {}
        self.query_gene_count = defaultdict(int)
        query_genomes = set()
        for seq_id, seq in seq_io.read_fasta_seq(query_gene_file):
            if seq[-1] == '*':
                self.gene_lengths[seq_id] = len(seq) - 1
            else:
                self.gene_lengths[seq_id] = len(seq)
                
            genome_id = seq_id[0:seq_id.find('~')]
            self.query_gene_count[genome_id] += 1
            query_genomes.add(genome_id)
            
        self.target_gene_count = defaultdict(int)
        target_genomes = set()
        if target_gene_file:
            for seq_id, seq in seq_io.read_fasta_seq(target_gene_file):
                if seq[-1] == '*':
                    self.gene_lengths[seq_id] = len(seq) - 1
                else:
                    self.gene_lengths[seq_id] = len(seq)
                    
                genome_id = seq_id[0:seq_id.find('~')]
                self.target_gene_count[genome_id] += 1
                target_genomes.add(genome_id)
        else:
            self.target_gene_count = self.query_gene_count

        # get byte offset of hits from each genome
        self.logger.info('Indexing sorted hit table.')
        self.offset_table = self._genome_offsets(self.sorted_hit_table)

        # calculate AAI between each pair of genomes in parallel
        if target_genomes:
            # compare query genomes to target genomes
            self.num_pairs = len(query_genomes) * len(target_genomes)
            self.logger.info('Calculating AAI between %d query and %d target genomes:' % (len(query_genomes), len(target_genomes)))
        else:
            # compute pairwise values between all pairs of query genomes
            ng = len(query_genomes)
            self.num_pairs = (ng * ng - ng) // 2
            self.logger.info('Calculating AAI between all %d pairs of genomes:' % self.num_pairs)
            
        if self.num_pairs == 0:
            self.logger.warning('No genome pairs identified.')
            return

        genome_id_lists = []
        query_genomes = list(query_genomes)
        target_genomes = list(target_genomes)
        for i in range(0, len(query_genomes)):
            genome_idI = query_genomes[i]
            
            if target_genomes:
                genome_id_list = target_genomes
            else:
                genome_id_list = []
                for j in range(i + 1, len(query_genomes)):
                    genome_idJ = query_genomes[j]
                    genome_id_list.append(genome_idJ)

            genome_id_lists.append((genome_idI, genome_id_list))

        self.processed_paired = 0
        parallel = Parallel(self.cpus)
        
        progress_func = self._progress
        if self.logger.is_silent:
            progress_func = None
        consumer_data = parallel.run(self._producer, self._consumer, genome_id_lists, progress_func)

        # write results for each genome pair
        self.logger.info('Summarizing AAI results.')
        aai_summay_file = os.path.join(output_dir, 'aai_summary.tsv')
        fout = open(aai_summay_file, 'w')
        fout.write('#Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n')

        for data in consumer_data:
            fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)

        fout.close()

        # concatenate RBH files
        rbh_output_file = None
        if self.keep_rbhs:
            self.logger.info('Concatenating RBH files.')
            rbh_files = []
            for genome_id in query_genomes:
                rbh_files.append(os.path.join(self.output_dir, genome_id + '.rbh.tsv'))
                
            rbh_output_file = os.path.join(self.output_dir, 'rbh.tsv')
            concatenate_files(rbh_files, rbh_output_file, common_header=True)
            
            for f in rbh_files:
                os.remove(f)
                
        return aai_summay_file, rbh_output_file
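The aai_summary.tsv written above is a plain tab-separated table, so it can be inspected directly; a small sketch, assuming pandas is installed and that output_dir was 'aai_output':

    import pandas as pd

    df = pd.read_csv('aai_output/aai_summary.tsv', sep='\t')
    # In this version the first column is written as '#Genome A', so keep the '#'.
    cols = ['#Genome A', 'Genome B', 'Mean AAI', 'Orthologous fraction (OF)']
    print(df[cols].sort_values('Mean AAI', ascending=False).head())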