def run(self, aa_gene_files, evalue, per_identity, output_dir):
        """Apply reciprocal blast to all pairs of genomes in parallel.

        Parameters
        ----------
        aa_gene_files : list of str
            Amino acid fasta files to process via reciprocal blast.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        output_dir : str
            Directory to store blast results.
        """

        # concatenate all gene files and create a single diamond database
        self.logger.info('  Creating diamond database (be patient!).')
        gene_file = os.path.join(output_dir, 'all_genes.faa')
        concatenate_files(aa_gene_files, gene_file)
        diamond_db = os.path.join(output_dir, 'all_genes')

        diamond = Diamond(self.cpus)
        diamond.make_database(gene_file, diamond_db)

        # blast all genes against the database
        self.logger.info('')
        self.logger.info('  Identifying hits between all pairs of genomes (be patient!).')
        hits_daa_file = os.path.join(output_dir, 'all_hits')
        diamond.blastp(gene_file, diamond_db, evalue, per_identity, len(aa_gene_files) * 10, hits_daa_file)

        # create flat hits table
        self.logger.info('  Creating table with hits.')
        hits_table_file = os.path.join(output_dir, 'all_hits.tsv')
        diamond.view(hits_daa_file + '.daa', hits_table_file)
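
    # A minimal parsing sketch (not part of the original code) for the flat hits
    # table written above. It assumes DIAMOND's default tabular layout (BLAST
    # outfmt 6: query, subject, % identity, alignment length, mismatches, gap
    # opens, query start/end, subject start/end, e-value, bit score); the Hit
    # tuple and read_hits() are hypothetical names used only for illustration.
    def read_hits(hits_table_file):
        """Yield one hit record per line of a DIAMOND tabular hits file."""
        from collections import namedtuple

        Hit = namedtuple('Hit', ['query_id', 'subject_id', 'perc_identity',
                                 'aln_length', 'evalue', 'bitscore'])

        for line in open(hits_table_file):
            fields = line.rstrip('\n').split('\t')
            yield Hit(query_id=fields[0],
                      subject_id=fields[1],
                      perc_identity=float(fields[2]),
                      aln_length=int(fields[3]),
                      evalue=float(fields[10]),
                      bitscore=float(fields[11]))
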
    def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity, per_aln_len):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        scaffold_gene_file : str
            Fasta file of genes on scaffolds in amino acid space.
        stat_file : str
            File with statistics for individual scaffolds.
        ref_genome_gene_files : list of str
            Fasta files of called genes on reference genomes of interest.
        db_file : str
            Database of competing reference genes.
        evalue : float
            E-value threshold of valid hits.
        per_identity : float
            Percent identity threshold of valid hits [0,100].
        per_aln_len : float
            Percent query coverage of valid hits [0, 100].
        """

        # read statistics file
        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # perform homology searches
        self.logger.info('Creating DIAMOND database for reference genomes.')
        ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
        concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

        diamond = Diamond(self.cpus)
        ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
        diamond.create_db(ref_gene_file, ref_diamond_db)

        self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, False, hits_ref_genomes)

        self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
        hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, False, hits_comp_ref_genomes)

        # get list of genes with a top hit to the reference genomes of interest
        hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

        # get number of genes on each scaffold
        num_genes_on_scaffold = defaultdict(int)
        for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
            scaffold_id = seq_id[0:seq_id.rfind('_')]
            num_genes_on_scaffold[scaffold_id] += 1

        # get hits to each scaffold
        hits_to_scaffold = defaultdict(list)
        for query_id, hit in hits_to_ref.items():
            gene_id = query_id[0:query_id.rfind('~')]
            scaffold_id = gene_id[0:gene_id.rfind('_')]
            hits_to_scaffold[scaffold_id].append(hit)

        # report summary stats for each scaffold
        reference_out = os.path.join(self.output_dir, 'references.tsv')
        fout = open(reference_out, 'w')
        fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
        fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.items():
            aln_len = []
            perc_iden = []
            evalue = []
            bitscore = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_len.append(hit.aln_length)
                perc_iden.append(hit.perc_identity)
                evalue.append(hit.evalue)
                bitscore.append(hit.bitscore)

                subject_bin_id, subject_gene_id = hit.subject_id.split('~')
                subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1
               

            sorted_subject_bin_ids = sorted(subject_bin_ids.items(), 
                                                key=operator.itemgetter(1),
                                                reverse=True)
            subject_bin_id_str = []
            for bin_id, num_hits in sorted_subject_bin_ids:
                subject_bin_id_str.append(bin_id + ':' + str(num_hits))
            subject_bin_id_str = ','.join(subject_bin_id_str)

            sorted_subject_scaffold_ids = sorted(subject_scaffold_ids.items(), 
                                                    key=operator.itemgetter(1),
                                                    reverse=True)
            subject_scaffold_id_str = []
            for subject_id, num_hits in sorted_subject_scaffold_ids:
                subject_scaffold_id_str.append(subject_id + ':' + str(num_hits))
            subject_scaffold_id_str = ','.join(subject_scaffold_id_str)

            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                                                                        scaffold_id,
                                                                        subject_bin_id_str,
                                                                        subject_scaffold_id_str,
                                                                        scaffold_stats.print_stats(scaffold_id),
                                                                        mean(scaffold_stats.coverage(scaffold_id)),
                                                                        num_genes_on_scaffold[scaffold_id],
                                                                        len(hits),
                                                                        len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                                                                        mean(aln_len),
                                                                        mean(perc_iden),
                                                                        mean(evalue),
                                                                        mean(bitscore)))

        fout.close()

        return reference_out
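
    # A minimal sketch (an assumption, not the original implementation) of the
    # _top_hits_to_reference() helper used above: a gene is kept only if its
    # best hit (by bit score) to the reference genomes of interest is at least
    # as good as its best hit to the competing reference genes. read_hits() is
    # a hypothetical parser for the DIAMOND tabular hit files.
    def _top_hits_to_reference(self, hits_ref_genomes, hits_comp_ref_genomes):
        """Identify genes whose top hit is to a reference genome of interest."""
        best_ref_hit = {}
        for hit in read_hits(hits_ref_genomes):
            cur = best_ref_hit.get(hit.query_id)
            if cur is None or hit.bitscore > cur.bitscore:
                best_ref_hit[hit.query_id] = hit

        best_comp_bitscore = {}
        for hit in read_hits(hits_comp_ref_genomes):
            best_comp_bitscore[hit.query_id] = max(
                best_comp_bitscore.get(hit.query_id, 0), hit.bitscore)

        hits_to_ref = {}
        for query_id, hit in best_ref_hit.items():
            if hit.bitscore >= best_comp_bitscore.get(query_id, 0):
                hits_to_ref[query_id] = hit

        return hits_to_ref
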
    def run(self, genome_files, db_file, taxonomy_file, evalue, per_identity, window_size, step_size):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        genome_files : list of str
            Fasta files of genomes to process.
        db_file : str
            Database of reference genes.
        taxonomy_file : str
            File containing GreenGenes taxonomy strings for reference genomes.
        evalue : float
            E-value threshold used by blast.
        per_identity : float
            Percent identity threshold used by blast.
        window_size : int
            Size of each fragment.
        step_size : int
            Number of bases to move after each window.
        """

        # parse taxonomy file
        self.logger.info('  Reading taxonomic assignment of reference genomes.')
        taxonomy = Taxonomy().read(taxonomy_file)

        # fragment each genome into fixed sizes windows
        self.logger.info('')
        self.logger.info('  Fragmenting sequences in each bin:')
        diamond_output_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(diamond_output_dir)

        fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
        fragment_out = open(fragment_file, 'w')
        contig_id_to_genome_id = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            self.profiles[genome_id] = Profile(genome_id, taxonomy)
            self._fragment_genomes(genome_file,
                                  window_size,
                                  step_size,
                                  self.profiles[genome_id],
                                  fragment_out)

            for seq_id, _seq in seq_io.read_seq(genome_file):
                contig_id_to_genome_id[seq_id] = genome_id

        # run diamond
        self.logger.info('')
        self.logger.info('  Running diamond blastx with %d processes (be patient!)' % self.cpus)

        diamond = Diamond(self.cpus)
        diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
        diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)

        diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
        diamond.view(diamond_daa_out + '.daa', diamond_table_out)

        self.logger.info('')
        self.logger.info('  Creating taxonomic profile for each genome.')
        self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)

        self.logger.info('')
        self.logger.info('  Writing taxonomic profile for each genome.')

        report_dir = os.path.join(self.output_dir, 'bin_reports')
        make_sure_path_exists(report_dir)

        for genome_id, profile in self.profiles.items():
            seq_summary_out = os.path.join(report_dir, genome_id + '.sequences.tsv')
            profile.write_seq_summary(seq_summary_out)

            genome_profile_out = os.path.join(report_dir, genome_id + '.profile.tsv')
            profile.write_genome_profile(genome_profile_out)

        genome_summary_out = os.path.join(self.output_dir, 'genome_summary.tsv')
        self._write_genome_summary(genome_summary_out)

        # create Krona plot
        krona_profiles = defaultdict(lambda: defaultdict(int))
        for genome_id, profile in self.profiles.items():
            seq_assignments = profile.classify_seqs(taxonomy)

            for seq_id, classification in seq_assignments.items():
                taxa = []
                for r in range(0, len(profile.rank_labels)):
                    taxa.append(classification[r][0])

                krona_profiles[genome_id][';'.join(taxa)] += profile.seq_len[seq_id]

        krona = Krona()
        krona_output_file = os.path.join(self.output_dir, 'taxonomic_profiles.krona.html')
        krona.create(krona_profiles, krona_output_file)
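
    # A minimal sketch (assumed, not the original code) of the sliding-window
    # fragmentation performed by _fragment_genomes() above: each sequence is cut
    # into window_size fragments, advancing step_size bases between windows, and
    # each fragment ID retains the source sequence ID so hits can be traced back
    # to contigs. How the Profile object is updated is omitted here.
    def _fragment_genomes(self, genome_file, window_size, step_size, profile, fout):
        """Write fixed-size fragments of each sequence to an open FASTA handle."""
        for seq_id, seq in seq_io.read_seq(genome_file):
            last_start = max(len(seq) - window_size, 0)
            for frag_index, start in enumerate(range(0, last_start + 1, step_size)):
                fragment = seq[start:start + window_size]
                fout.write('>%s~%d\n' % (seq_id, frag_index))
                fout.write(fragment + '\n')
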
    def _run_reciprocal_diamond(self, query_gene_file, target_gene_file,
                                evalue, per_identity, per_aln_len, max_hits,
                                sensitive, high_mem, tmp_dir, output_dir):
        """Perform similarity search of query genes against target genes, and reciprocal hits.

        Parameters
        ----------
        query_gene_file : str
            File with all query proteins.
        target_gene_file : str
            File with all target proteins.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequence.
        sensitive : boolean
            Run DIAMOND in sensitive mode.
        high_mem : boolean
            Run DIAMOND with a single index chunk and larger block size (uses more memory).
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """

        self.logger.info(
            'Creating DIAMOND database of query proteins (be patient!).')
        diamond = Diamond(self.cpus)
        query_diamond_db = os.path.join(output_dir, 'query_genes')
        diamond.create_db(query_gene_file, query_diamond_db)

        self.logger.info(
            'Creating DIAMOND database of target proteins (be patient!).')
        target_diamond_db = os.path.join(output_dir, 'target_genes')
        diamond.create_db(target_gene_file, target_diamond_db)

        # blast query genes against target proteins
        self.logger.info(
            'Performing similarity search between query and target proteins (be patient!).'
        )

        if tmp_dir:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', delete=False)
        tmp_query_hits_table.close()

        query_hits_daa_file = os.path.join(output_dir, 'query_hits')

        if high_mem:
            diamond.blastp(query_gene_file,
                           target_diamond_db,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_hits,
                           sensitive,
                           tmp_query_hits_table.name,
                           'standard',
                           tmp_dir,
                           chunk_size=1,
                           block_size=8)
        else:
            diamond.blastp(query_gene_file, target_diamond_db, evalue,
                           per_identity, per_aln_len, max_hits, sensitive,
                           tmp_query_hits_table.name, 'standard', tmp_dir)

        # get target genes hit by one or more query proteins
        self.logger.info(
            'Creating file of target proteins with similarity to query proteins.'
        )
        target_hit = set()
        for line in open(tmp_query_hits_table.name):
            line_split = line.split('\t')
            target_hit.add(line_split[1])

        target_genes_hits = os.path.join(output_dir, 'target_genes_hit.faa')
        fout = open(target_genes_hits, 'w')
        for seq_id, seq in seq_io.read_seq(target_gene_file):
            if seq_id in target_hit:
                fout.write('>' + seq_id + '\n')
                fout.write(seq + '\n')
        fout.close()

        self.logger.info(
            'Identified %d target proteins to be used in reciprocal search.' %
            len(target_hit))

        # perform reciprocal blast
        self.logger.info(
            'Performing reciprocal similarity search between target and query proteins (be patient!).'
        )

        if tmp_dir:
            tmp_target_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_target_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', delete=False)
        tmp_target_hits_table.close()

        if high_mem:
            diamond.blastp(target_genes_hits,
                           query_diamond_db,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_hits,
                           sensitive,
                           tmp_target_hits_table.name,
                           'standard',
                           tmp_dir,
                           chunk_size=1,
                           block_size=8)
        else:
            diamond.blastp(target_genes_hits, query_diamond_db, evalue,
                           per_identity, per_aln_len, max_hits, sensitive,
                           tmp_target_hits_table.name, 'standard', tmp_dir)

        # combine hit tables and sort
        os.system('cat %s >> %s' %
                  (tmp_target_hits_table.name, tmp_query_hits_table.name))
        os.remove(tmp_target_hits_table.name)
        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_query_hits_table.name, hits_table_file)
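
    # A minimal sketch (an assumption about its behaviour, not the original
    # code) of the _sort_hit_table() helper used above: hits are ordered by
    # query ID and then by descending bit score so that all hits for a query
    # are adjacent in the output. The 12-column DIAMOND tabular layout is
    # assumed, with the bit score in the last column; cleanup of the temporary
    # input file is omitted.
    def _sort_hit_table(self, hits_table_file, sorted_table_file):
        """Sort a DIAMOND tabular hit table by query ID and descending bit score."""
        rows = []
        for line in open(hits_table_file):
            rows.append(line.rstrip('\n').split('\t'))

        rows.sort(key=lambda r: (r[0], -float(r[11])))

        with open(sorted_table_file, 'w') as fout:
            for fields in rows:
                fout.write('\t'.join(fields) + '\n')
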
    def _run_self_diamond(self, query_gene_file, evalue, per_identity,
                          per_aln_len, max_hits, sensitive, high_mem, tmp_dir,
                          output_dir):
        """Perform similarity search of query genes against themselves.

        Parameters
        ----------
        query_gene_file : str
            File with all query sequences.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequence.
        sensitive : boolean
            Run DIAMOND in sensitive mode.
        high_mem : boolean
            Run DIAMOND with a single index chunk and larger block size (uses more memory).
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """

        self.logger.info('Creating DIAMOND database (be patient!).')

        diamond_db = os.path.join(output_dir, 'query_genes')
        diamond = Diamond(self.cpus)
        diamond.create_db(query_gene_file, diamond_db)

        # create flat hits table
        if tmp_dir:
            tmp_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', delete=False)
        tmp_hits_table.close()

        # blast all genes against the database
        self.logger.info(
            'Performing similarity search of genes against themselves (be patient!).'
        )

        if high_mem:
            diamond.blastp(query_gene_file,
                           diamond_db,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_hits,
                           sensitive,
                           tmp_hits_table.name,
                           'standard',
                           tmp_dir,
                           chunk_size=1,
                           block_size=8)
        else:
            diamond.blastp(query_gene_file, diamond_db, evalue, per_identity,
                           per_aln_len, max_hits, sensitive,
                           tmp_hits_table.name, 'standard', tmp_dir)

        # sort hit table
        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_hits_table.name, hits_table_file)
    def _run_reciprocal_diamond(self, query_gene_file,
                                        target_gene_file,
                                        evalue, 
                                        per_identity, 
                                        per_aln_len,
                                        max_hits,
                                        sensitive,
                                        high_mem,
                                        tmp_dir,
                                        output_dir):
        """Perform similarity search of query genes against target genes, and reciprocal hits.

        Parameters
        ----------
        query_gene_file : str
            File with all query proteins.
        target_gene_file : str
            File with all target proteins.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequence.
        sensitive : boolean
            Run DIAMOND in sensitive mode.
        high_mem : boolean
            Run DIAMOND with a single index chunk and larger block size (uses more memory).
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """
        
        self.logger.info('Creating DIAMOND database of query proteins (be patient!).')
        diamond = Diamond(self.cpus)
        query_diamond_db = os.path.join(output_dir, 'query_genes')
        diamond.make_database(query_gene_file, query_diamond_db)
        
        self.logger.info('Creating DIAMOND database of target proteins (be patient!).')
        target_diamond_db = os.path.join(output_dir, 'target_genes')
        diamond.make_database(target_gene_file, target_diamond_db)

        # blast query genes against target proteins
        self.logger.info('Performing similarity search between query and target proteins (be patient!).')
        
        if tmp_dir:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
        tmp_query_hits_table.close()
        
        query_hits_daa_file = os.path.join(output_dir, 'query_hits')
        
        if high_mem:
            diamond.blastp(query_gene_file, 
                            target_diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_query_hits_table.name, 
                            'standard', 
                            tmp_dir, 
                            chunk_size=1, 
                            block_size=8)
        else:
            diamond.blastp(query_gene_file, 
                            target_diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_query_hits_table.name, 
                            'standard', 
                            tmp_dir)
                
        # get target genes hit by one or more query proteins
        self.logger.info('Creating file of target proteins with similarity to query proteins.')
        target_hit = set()
        for line in open(tmp_query_hits_table.name):
            line_split = line.split('\t')
            target_hit.add(line_split[1])

        target_genes_hits = os.path.join(output_dir, 'target_genes_hit.faa')
        fout = open(target_genes_hits, 'w')
        for seq_id, seq in seq_io.read_seq(target_gene_file):
            if seq_id in target_hit:
                fout.write('>' + seq_id + '\n')
                fout.write(seq + '\n')
        fout.close()
        
        self.logger.info('Identified %d target proteins to be used in reciprocal search.' % len(target_hit))
        
        # perform reciprocal blast
        self.logger.info('Performing reciprocal similarity search between target and query proteins (be patient!).')

        if tmp_dir:
            tmp_target_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_target_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
        tmp_target_hits_table.close()
        
        if high_mem:
            diamond.blastp(target_genes_hits, 
                            query_diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_target_hits_table.name, 
                            'standard', 
                            tmp_dir, 
                            chunk_size=1, 
                            block_size=8)
        else:
            diamond.blastp(target_genes_hits, 
                            query_diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_target_hits_table.name, 
                            'standard', 
                            tmp_dir)
                
        # combine hit tables and sort
        os.system('cat %s >> %s' % (tmp_target_hits_table.name, tmp_query_hits_table.name))
        os.remove(tmp_target_hits_table.name)
        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_query_hits_table.name, hits_table_file)
    def _run_self_diamond(self, query_gene_file, 
                                evalue, 
                                per_identity, 
                                per_aln_len,
                                max_hits,
                                sensitive,
                                high_mem,
                                tmp_dir,
                                output_dir):
        """Perform similarity search of query genes against themselves.

        Parameters
        ----------
        query_gene_file : str
            File with all query sequences.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequence.
        sensitive : boolean
            Run DIAMOND in sensitive mode.
        high_mem : boolean
            Run DIAMOND with a single index chunk and larger block size (uses more memory).
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """
        
        self.logger.info('Creating DIAMOND database (be patient!).')
        
        diamond_db = os.path.join(output_dir, 'query_genes')
        diamond = Diamond(self.cpus)
        diamond.make_database(query_gene_file, diamond_db)
            
        # create flat hits table
        if tmp_dir:
            tmp_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
        tmp_hits_table.close()

        # blast all genes against the database
        self.logger.info('Performing similarity search of genes against themselves (be patient!).')

        if high_mem:
            diamond.blastp(query_gene_file, 
                            diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_hits_table.name, 
                            'standard', 
                            tmp_dir, 
                            chunk_size=1, 
                            block_size=8)
        else:
            diamond.blastp(query_gene_file, 
                            diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_hits_table.name, 
                            'standard', 
                            tmp_dir)

        # sort hit table
        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_hits_table.name, hits_table_file)
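
    # A minimal downstream sketch (not from the original module) showing why the
    # combined hit table is sorted by query: with all hits for a query adjacent,
    # the best hit per gene can be picked in one pass, and reciprocal best hits
    # between genomes follow directly. Gene IDs of the form '<genome_id>~<gene_id>'
    # and the 12-column DIAMOND tabular layout are assumptions.
    def reciprocal_best_hits(sorted_hits_table):
        """Return pairs of genes from different genomes that are each other's best hit."""
        best_hit = {}
        for line in open(sorted_hits_table):
            fields = line.rstrip('\n').split('\t')
            query_id, subject_id, bitscore = fields[0], fields[1], float(fields[11])
            if query_id.split('~')[0] == subject_id.split('~')[0]:
                continue  # ignore hits within the same genome

            if query_id not in best_hit or bitscore > best_hit[query_id][1]:
                best_hit[query_id] = (subject_id, bitscore)

        rbh = []
        for query_id, (subject_id, _score) in best_hit.items():
            # report each reciprocal pair once
            if best_hit.get(subject_id, (None, 0))[0] == query_id and query_id < subject_id:
                rbh.append((query_id, subject_id))

        return rbh
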
    def run(self, query_proteins, db_file, custom_db_file, taxonomy_file,
            custom_taxonomy_file, evalue, per_identity, per_aln_len,
            max_matches, homology_search, min_per_taxa, consensus, min_per_bp,
            use_trimAl, restrict_taxon, msa_program, tree_program, prot_model,
            skip_rooting, output_dir):
        """Infer a gene tree for homologs genes identified by blast.

        Workflow for inferring a gene tree from sequences identified as being
        homologs to a set of query proteins. Homologs are identified using BLASTP
        and a set of user-defined parameters.

        Parameters
        ----------
        query_proteins : str
            Fasta file containing query proteins.
        db_file : str
            BLAST database of reference proteins.
        custom_db_file : str
            Custom database of proteins.
        taxonomy_file : str
            Taxonomic assignment of each reference genome.
        custom_taxonomy_file : str
            Taxonomic assignment of genomes in custom database.
        evalue : float
            E-value threshold used to define homolog.
        per_identity : float
            Percent identity threshold used to define a homolog.
        per_aln_len : float
            Alignment length threshold used to define a homolog.
        max_matches : int
            Maximum matches per query protein.
        homology_search : str
            Type of homology search to perform.
        min_per_taxa : float
            Minimum percentage of taxa required to retain a column.
        consensus : float
            Minimum percentage of the same amino acid required to retain a column.
        min_per_bp : float
            Minimum percentage of base pairs required to keep trimmed sequence.
        use_trimAl : boolean
            Filter columns using trimAl.
        restrict_taxon : str
            Restrict alignment to specific taxonomic group (e.g., k__Archaea).
        msa_program : str
            Program to use for multiple sequence alignment ['mafft', 'muscle'].
        tree_program : str
            Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
            Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        skip_rooting : boolean
            Skip midpoint rooting if True.
        output_dir : str
            Directory to store results.
        """

        # validate query sequence names for use with GeneTreeTk
        validate_seq_ids(query_proteins)

        # read taxonomy file
        self.logger.info('Reading taxonomy file.')
        taxonomy = Taxonomy().read(taxonomy_file)

        if custom_taxonomy_file:
            custom_taxonomy = Taxonomy().read(custom_taxonomy_file)
            taxonomy.update(custom_taxonomy)

        # report distribution of query genes
        mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(
            query_proteins)
        self.logger.info(
            'Query gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
            % (min_len, mean_len, max_len, p10, p50, p90))

        # identify homologs using BLASTP
        self.logger.info('Identifying homologs using %s.' % homology_search)
        blast = Blast(self.cpus)
        blast_output = os.path.join(output_dir, 'reference_hits.tsv')
        if homology_search == 'diamond':
            diamond = Diamond(self.cpus)
            diamond.blastp(query_proteins,
                           db_file,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_matches,
                           blast_output,
                           output_fmt='custom')
        else:
            blast.blastp(query_proteins,
                         db_file,
                         blast_output,
                         evalue,
                         max_matches,
                         output_fmt='custom',
                         task=homology_search)
        homologs = blast.identify_homologs(blast_output, evalue, per_identity,
                                           per_aln_len)
        self.logger.info('Identified %d homologs in reference database.' %
                         len(homologs))

        custom_homologs = None
        if custom_db_file:
            custom_blast_output = os.path.join(output_dir, 'custom_hits.tsv')
            if homology_search == 'diamond':
                diamond = Diamond(self.cpus)
                diamond.blastp(query_proteins,
                               custom_db_file,
                               evalue,
                               per_identity,
                               per_aln_len,
                               max_matches,
                               custom_blast_output,
                               output_fmt='custom')
            else:
                blast.blastp(query_proteins,
                             custom_db_file,
                             custom_blast_output,
                             evalue,
                             max_matches,
                             output_fmt='custom',
                             task=homology_search)
            custom_homologs = blast.identify_homologs(custom_blast_output,
                                                      evalue, per_identity,
                                                      per_aln_len)
            self.logger.info('Identified %d homologs in custom database.' %
                             len(custom_homologs))

        # restrict homologs to specific taxonomic group
        if restrict_taxon:
            self.logger.info('Restricting homologs to %s.' % restrict_taxon)
            restricted_homologs = {}
            for query_id, hit in homologs.items():
                genome_id = hit.subject_id.split('~')[0]
                if restrict_taxon in taxonomy[genome_id]:
                    restricted_homologs[query_id] = hit

            self.logger.info(
                '%d of %d homologs in reference database are from the specified group.'
                % (len(restricted_homologs), len(homologs)))
            homologs = restricted_homologs

        if len(homologs) == 0:
            self.logger.error(
                'Too few homologs were identified. Gene tree cannot be inferred.'
            )
            sys.exit()

        # extract homologs
        self.logger.info(
            'Extracting homologs and determining local gene context.')
        db_homologs_tmp = os.path.join(output_dir, 'homologs_db.tmp')
        gene_precontext, gene_postcontext = self.extract_homologs_and_context(
            homologs.keys(), db_file, db_homologs_tmp)

        # report gene length distribution of homologs
        mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(
            db_homologs_tmp)
        self.logger.info(
            'Homolog gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
            % (min_len, mean_len, max_len, p10, p50, p90))

        # concatenate homologs with initial query genes
        homolog_output_tmp = os.path.join(output_dir, 'homologs.faa.tmp')
        if custom_homologs:
            custom_db_homologs_tmp = os.path.join(output_dir,
                                                  'custom_homologs_db.tmp')
            custom_gene_precontext, custom_gene_postcontext = self.extract_homologs_and_context(
                custom_homologs.keys(), custom_db_file, custom_db_homologs_tmp)
            gene_precontext.update(custom_gene_precontext)
            gene_postcontext.update(custom_gene_postcontext)
            homologs.update(custom_homologs)
            concatenate_files(
                [query_proteins, db_homologs_tmp, custom_db_homologs_tmp],
                homolog_output_tmp)
            os.remove(custom_db_homologs_tmp)
        else:
            concatenate_files([query_proteins, db_homologs_tmp],
                              homolog_output_tmp)

        os.remove(db_homologs_tmp)

        # remove stop codons
        homolog_output = os.path.join(output_dir, 'homologs.faa')
        self._remove_stop_codons(homolog_output_tmp, homolog_output)
        os.remove(homolog_output_tmp)

        # infer multiple sequence alignment
        msa = MsaWorkflow(self.cpus)
        trimmed_msa_output = msa.run(homolog_output, min_per_taxa, consensus,
                                     min_per_bp, use_trimAl, msa_program,
                                     output_dir)

        # infer tree
        tw = TreeWorkflow(self.cpus)
        tree_output = tw.run(trimmed_msa_output, tree_program, prot_model,
                             skip_rooting, output_dir)

        # create tax2tree consensus map and decorate tree
        self.logger.info('Decorating internal tree nodes with tax2tree.')
        output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        fout = open(output_taxonomy_file, 'w')
        for homolog_id in homologs.keys():
            genome_id = homolog_id.split('~')[0]
            t = taxonomy.get(genome_id, None)
            if t:
                fout.write(homolog_id + '\t' + ';'.join(t) + '\n')
        fout.close()

        t2t_tree = os.path.join(output_dir, 'homologs.tax2tree.tree')
        cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                                  tree_output, t2t_tree)
        os.system(cmd)

        # create tree with leaf nodes given as genome accessions
        tree = dendropy.Tree.get_from_path(t2t_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for leaf in tree.leaf_node_iter():
            leaf.taxon.label = leaf.taxon.label.split('~')[0]

        genome_tree = os.path.join(output_dir,
                                   'homologs.tax2tree.genome_accessions.tree')
        tree.write_to_path(genome_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        # setup metadata for ARB file
        src_dir = os.path.dirname(os.path.realpath(__file__))
        version_file = open(os.path.join(src_dir, 'VERSION'))

        metadata = {}
        metadata['genetreetk_version'] = version_file.read().strip()
        metadata['genetreetk_query_proteins'] = query_proteins
        metadata['genetreetk_db_file'] = db_file
        metadata['genetreetk_taxonomy_file'] = taxonomy_file
        metadata['genetreetk_blast_evalue'] = str(evalue)
        metadata['genetreetk_blast_per_identity'] = str(per_identity)
        metadata['genetreetk_blast_per_aln_len'] = str(per_aln_len)
        metadata['genetreetk_blast_max_matches'] = str(max_matches)
        metadata['genetreetk_homology_search'] = homology_search

        metadata['genetreetk_msa_min_per_taxa'] = str(min_per_taxa)
        metadata['genetreetk_msa_consensus'] = str(consensus)
        metadata['genetreetk_msa_min_per_bp'] = str(min_per_bp)
        metadata['genetreetk_msa_program'] = msa_program

        metadata['genetreetk_tree_program'] = tree_program
        metadata['genetreetk_tree_prot_model'] = prot_model

        # create ARB metadata file
        self.logger.info('Creating ARB metadata file.')
        arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
        self.create_arb_metadata(homologs, trimmed_msa_output, taxonomy,
                                 metadata, gene_precontext, gene_postcontext,
                                 arb_metadata_file)
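
    # A minimal sketch (assumed, not the original implementation) of the
    # _gene_distribution() helper used above: it reads the protein FASTA file
    # and reports length statistics in the order consumed by the caller
    # (mean, max, min, p10, p50, p90). numpy is assumed to be available, and
    # seq_io.read_seq() is the same FASTA reader used elsewhere in these examples.
    def _gene_distribution(self, seq_file):
        """Calculate length distribution of sequences in a FASTA file."""
        import numpy as np

        gene_lens = [len(seq) for _seq_id, seq in seq_io.read_seq(seq_file)]
        p10, p50, p90 = np.percentile(gene_lens, [10, 50, 90])

        return np.mean(gene_lens), max(gene_lens), min(gene_lens), p10, p50, p90
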
    def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity):
        """Identify scaffolds with homology to specific reference genomes.

        Parameters
        ----------
        scaffold_gene_file : str
            Fasta file of genes on scaffolds in amino acid space.
        stat_file : str
            File with statistics for individual scaffolds.
        ref_genome_gene_files : list of str
            Fasta files of called genes on reference genomes of interest.
        db_file : str
            Database of competing reference genes.
        evalue : float
            E-value threshold used by blast.
        per_identity : float
            Percent identity threshold used by blast.
        """

        # read statistics file
        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # perform homology searches
        self.logger.info('')
        self.logger.info('  Creating diamond database for reference genomes.')
        ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
        concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

        diamond = Diamond(self.cpus)
        ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
        diamond.make_database(ref_gene_file, ref_diamond_db)

        self.logger.info('  Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes_daa = os.path.join(self.diamond_dir, 'ref_hits')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, 1, hits_ref_genomes_daa)

        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.view(hits_ref_genomes_daa + '.daa', hits_ref_genomes)

        self.logger.info('  Identifying homologs within competing reference genomes (be patient!).')
        hits_comp_ref_genomes_daa = os.path.join(self.diamond_dir, 'competing_ref_hits')
        diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, 1, hits_comp_ref_genomes_daa)

        hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
        diamond.view(hits_comp_ref_genomes_daa + '.daa', hits_comp_ref_genomes)

        # get list of genes with a top hit to the reference genomes of interest
        hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

        # get number of genes on each scaffold
        num_genes_on_scaffold = defaultdict(int)
        for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
            scaffold_id = seq_id[0:seq_id.rfind('_')]
            num_genes_on_scaffold[scaffold_id] += 1

        # get hits to each scaffold
        hits_to_scaffold = defaultdict(list)
        for query_id, hit in hits_to_ref.items():
            gene_id = query_id[0:query_id.rfind('~')]
            scaffold_id = gene_id[0:gene_id.rfind('_')]
            hits_to_scaffold[scaffold_id].append(hit)

        # report summary stats for each scaffold
        reference_out = os.path.join(self.output_dir, 'references.tsv')
        fout = open(reference_out, 'w')
        fout.write('Scaffold id\tSubject scaffold ids\tSubject genome ids')
        fout.write('\tGenome id\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.items():
            aln_len = []
            perc_iden = []
            evalue = []
            bitscore = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_len.append(hit.aln_length)
                perc_iden.append(hit.perc_identity)
                evalue.append(hit.evalue)
                bitscore.append(hit.bitscore)

                subject_bin_id, subject_gene_id = hit.subject_id.split('~')
                subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1

            subject_scaffold_id_str = []
            for subject_id, num_hits in subject_scaffold_ids.items():
                subject_scaffold_id_str.append(subject_id + ':' + str(num_hits))
            subject_scaffold_id_str = ','.join(subject_scaffold_id_str)

            subject_bin_id_str = []
            for bin_id, num_hits in subject_bin_ids.items():
                subject_bin_id_str.append(bin_id + ':' + str(num_hits))
            subject_bin_id_str = ','.join(subject_bin_id_str)

            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                                                                        scaffold_id,
                                                                        subject_scaffold_id_str,
                                                                        subject_bin_id_str,
                                                                        scaffold_stats.print_stats(scaffold_id),
                                                                        mean(scaffold_stats.coverage(scaffold_id)),
                                                                        num_genes_on_scaffold[scaffold_id],
                                                                        len(hits),
                                                                        len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                                                                        mean(aln_len),
                                                                        mean(perc_iden),
                                                                        mean(evalue),
                                                                        mean(bitscore)))

        fout.close()

        return reference_out
    def run(self, genome_files, db_file, taxonomy_file, evalue, per_identity,
            window_size, step_size):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        genome_files : list of str
            Fasta files of genomes to process.
        db_file : str
            Database of reference genes.
        taxonomy_file : str
            File containing GreenGenes taxonomy strings for reference genomes.
        evalue : float
            E-value threshold used by blast.
        per_identity : float
            Percent identity threshold used by blast.
        window_size : int
            Size of each fragment.
        step_size : int
            Number of bases to move after each window.
        """

        # parse taxonomy file
        self.logger.info(
            '  Reading taxonomic assignment of reference genomes.')
        taxonomy = Taxonomy().read(taxonomy_file)

        # fragment each genome into fixed sizes windows
        self.logger.info('')
        self.logger.info('  Fragmenting sequences in each bin:')
        diamond_output_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(diamond_output_dir)

        fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
        fragment_out = open(fragment_file, 'w')
        contig_id_to_genome_id = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            self.profiles[genome_id] = Profile(genome_id, taxonomy)
            self._fragment_genomes(genome_file, window_size, step_size,
                                   self.profiles[genome_id], fragment_out)

            for seq_id, _seq in seq_io.read_seq(genome_file):
                contig_id_to_genome_id[seq_id] = genome_id

        # run diamond
        self.logger.info('')
        self.logger.info(
            '  Running diamond blastx with %d processes (be patient!)' %
            self.cpus)

        diamond = Diamond(self.cpus)
        diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
        diamond.blastx(fragment_file, db_file, evalue, per_identity, 1,
                       diamond_daa_out)

        diamond_table_out = os.path.join(diamond_output_dir,
                                         'diamond_hits.tsv')
        diamond.view(diamond_daa_out + '.daa', diamond_table_out)

        self.logger.info('')
        self.logger.info('  Creating taxonomic profile for each genome.')
        self._taxonomic_profiles(diamond_table_out, taxonomy,
                                 contig_id_to_genome_id)

        self.logger.info('')
        self.logger.info('  Writing taxonomic profile for each genome.')

        report_dir = os.path.join(self.output_dir, 'bin_reports')
        make_sure_path_exists(report_dir)

        for genome_id, profile in self.profiles.items():
            seq_summary_out = os.path.join(report_dir,
                                           genome_id + '.sequences.tsv')
            profile.write_seq_summary(seq_summary_out)

            genome_profile_out = os.path.join(report_dir,
                                              genome_id + '.profile.tsv')
            profile.write_genome_profile(genome_profile_out)

        genome_summary_out = os.path.join(self.output_dir,
                                          'genome_summary.tsv')
        self._write_genome_summary(genome_summary_out)

        # create Krona plot
        krona_profiles = defaultdict(lambda: defaultdict(int))
        for genome_id, profile in self.profiles.items():
            seq_assignments = profile.classify_seqs(taxonomy)

            for seq_id, classification in seq_assignments.items():
                taxa = []
                for r in range(0, len(profile.rank_labels)):
                    taxa.append(classification[r][0])

                krona_profiles[genome_id][';'.join(
                    taxa)] += profile.seq_len[seq_id]

        krona = Krona()
        krona_output_file = os.path.join(self.output_dir,
                                         'taxonomic_profiles.krona.html')
        krona.create(krona_profiles, krona_output_file)
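
    # A minimal sketch (assumed, not the original implementation) of the
    # _taxonomic_profiles() step used above: each DIAMOND hit is mapped from
    # fragment to source contig to genome, and the taxonomy of the reference
    # gene it hit is recorded in that genome's profile. Fragment IDs of the
    # form '<contig_id>~<index>', subject IDs keyed by reference genome ID in
    # the taxonomy dictionary, and the Profile.add_hit() method are all
    # hypothetical details used only for illustration.
    def _taxonomic_profiles(self, diamond_table, taxonomy, contig_id_to_genome_id):
        """Record reference gene hits in the profile of each genome."""
        for line in open(diamond_table):
            fields = line.rstrip('\n').split('\t')
            fragment_id, subject_id = fields[0], fields[1]

            contig_id = fragment_id[0:fragment_id.rfind('~')]
            genome_id = contig_id_to_genome_id[contig_id]

            ref_genome_id = subject_id.split('~')[0]
            self.profiles[genome_id].add_hit(fragment_id,
                                             taxonomy.get(ref_genome_id),
                                             float(fields[11]))
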