Beispiel #1
0
    def _process_peptide_files(self, peptide_paths):
        """Merge per-split prodigal peptide FASTA files into one set of gene calls.

        Every path in `peptide_paths` must exist. Empty files are skipped (and
        left on disk); every non-empty file is parsed and then deleted. If at
        least one sequence was collected, all sequences are written to
        `self.collated_output_file_paths['peptide_path']`, using the running
        integer hit id as the FASTA header.

        Args:
            peptide_paths: iterable of paths to the peptide FASTA files to merge.

        Returns:
            A `(gene_calls_dict, amino_acid_sequences_dict)` tuple. Both dicts
            are keyed by the same 0-based integer hit id, which keeps counting
            across files. The first maps to `self.parser(<FASTA header>)`, the
            second to the sequence with every '*' removed.

        Raises:
            ConfigError: if any of the files in `peptide_paths` does not exist.
        """
        # Running counter; doubles as the dict key and as the renamed fasta header.
        hit_id = 0

        # Set up data storage.
        # TODO: These are keyed with ints, so idk why they aren't lists.  Maybe used as a dict later in the code?
        gene_calls_dict = {}  # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
        amino_acid_sequences_dict = {}

        for peptide_path in peptide_paths:
            if not os.path.exists(peptide_path):
                if self.logger:
                    self.logger.progress.end()

                raise ConfigError(
                    "Something went wrong with prodigal, and it failed to generate the "
                    "expected output ('%s') :/ Fortunately, this log file should tell you what "
                    "might be the problem: '%s'. Please do not forget to include this "
                    "file if you were to ask for help." %
                    (peptide_path, self.log_file_path))

            # Some splits may not actually have gene calls.  Skip them.
            if filesnpaths.is_file_empty(peptide_path):
                continue

            # If we get here, the fasta file will not be empty.
            fasta = fastalib.SequenceSource(peptide_path)

            # Release the file handle even if the header parser raises.
            try:
                while next(fasta):
                    gene_calls_dict[hit_id] = self.parser(fasta.id)
                    amino_acid_sequences_dict[hit_id] = fasta.seq.replace('*', '')
                    hit_id += 1
            finally:
                fasta.close()

            # TODO: i think this is removed elsewhere
            # Remove the split peptide file.
            os.remove(peptide_path)

        # If no genes were predicted across all output files, warn the user.
        if not amino_acid_sequences_dict:
            if self.logger:
                self.logger.run.info(
                    'Result',
                    f'Prodigal ({self.installed_version}) has identified no genes :/',
                    nl_after=1,
                    mc="red")
        else:  # Write out the final gene file
            # Internal invariant: the collated peptide path must have been registered.
            assert 'peptide_path' in self.collated_output_file_paths
            with open(self.collated_output_file_paths['peptide_path'],
                      'w') as f:
                # Distinct loop names so the running `hit_id` counter is not shadowed.
                for gene_id, sequence in amino_acid_sequences_dict.items():
                    f.write(f">{gene_id}\n{sequence}\n")

        return gene_calls_dict, amino_acid_sequences_dict
Beispiel #2
0
    def process(self, fasta_file_path, output_dir):
        """Take the fasta file, run prodigal on it, and make sense of the output

        Prodigal is always run with `-p meta`. Its gene output goes to
        `<output_dir>/contigs.genes`, its amino acid output to
        `<output_dir>/contigs.amino_acid_sequences`, and
        `<output_dir>/00_log.txt` is handed to `utils.run_command` as the log
        file. The two output paths are also stored on `self.genes_in_contigs`
        and `self.amino_acid_sequences_in_contigs`.

        Args:
            fasta_file_path: path of the FASTA file passed to prodigal via `-i`.
            output_dir: directory in which the output and log files are placed.

        Returns:
            A `(gene_calls_dict, amino_acid_sequences_dict)` tuple. Both dicts
            are keyed by a 0-based integer in the order the sequences appear in
            the amino acid output. The first maps to
            `self.parser(<FASTA header>)`, the second to the sequence with every
            '*' removed. Both are empty when prodigal reported no genes.

        Raises:
            ConfigError: if the amino acid sequences file does not exist after
                prodigal has been run.
        """
        gene_calls_dict = {} # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
        amino_acid_sequences_dict = {}

        self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
        self.amino_acid_sequences_in_contigs = os.path.join(output_dir, 'contigs.amino_acid_sequences')

        log_file_path = os.path.join(output_dir, '00_log.txt')

        self.run.warning('', header='Finding ORFs in contigs', lc='green')
        self.run.info('Genes', self.genes_in_contigs)
        self.run.info('Amino acid sequences', self.amino_acid_sequences_in_contigs)
        self.run.info('Log file', log_file_path)

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')

        cmd_line = ['prodigal', '-i', fasta_file_path, '-o', self.genes_in_contigs, '-a', self.amino_acid_sequences_in_contigs, '-p', 'meta']
        utils.run_command(cmd_line, log_file_path)

        # A missing output file means prodigal itself failed; point the user at the log.
        # NOTE(review): the backslash continuations below embed the indentation in the
        # message; presumably ConfigError normalizes whitespace -- confirm.
        if not os.path.exists(self.amino_acid_sequences_in_contigs):
            self.progress.end()
            raise ConfigError("Something went wrong with prodigal, and it failed to generate the\
                               expected output :/ Fortunately, this log file should tell you what\
                               might be the problem: '%s'. Please do not forget to include this\
                               file if you were to ask for help." % log_file_path)

        # An existing but empty output file is a valid result: no genes were found.
        if filesnpaths.is_file_empty(self.amino_acid_sequences_in_contigs):
            self.progress.end()
            self.run.info('Result', 'Prodigal (%s) has identified no genes :/' % (self.installed_version,), nl_after=1, mc="red")
            return gene_calls_dict, amino_acid_sequences_dict

        self.progress.update('Processing gene calls ...')

        fasta = fastalib.SequenceSource(self.amino_acid_sequences_in_contigs)

        # Release the file handle even if the header parser raises.
        try:
            hit_id = 0
            while next(fasta):
                gene_calls_dict[hit_id] = self.parser(fasta.id)
                amino_acid_sequences_dict[hit_id] = fasta.seq.replace('*', '')
                hit_id += 1
        finally:
            fasta.close()

        self.progress.end()

        self.run.info('Result', 'Prodigal (%s) has identified %d genes.' % (self.installed_version, len(gene_calls_dict)), nl_after=1)

        return gene_calls_dict, amino_acid_sequences_dict
Beispiel #3
0
    def process(self, fasta_file_path, output_dir):
        """Take the fasta file, run prodigal on it, and make sense of the output

        Prodigal's gene output goes to `<output_dir>/contigs.genes`, its amino
        acid output to `<output_dir>/contigs.amino_acid_sequences`, and
        `<output_dir>/00_log.txt` is handed to `utils.run_command` as the log
        file. The two output paths are also stored on `self.genes_in_contigs`
        and `self.amino_acid_sequences_in_contigs`.

        When `self.prodigal_translation_table` is set it is passed with `-g`
        and `-p meta` is NOT used; otherwise prodigal runs with `-p meta`.

        Args:
            fasta_file_path: path of the FASTA file passed to prodigal via `-i`.
            output_dir: directory in which the output and log files are placed.

        Returns:
            A `(gene_calls_dict, amino_acid_sequences_dict)` tuple. Both dicts
            are keyed by a 0-based integer in the order the sequences appear in
            the amino acid output. The first maps to
            `self.parser(<FASTA header>)`, the second to the sequence with every
            '*' removed. Both are empty when prodigal reported no genes.

        Raises:
            ConfigError: if the amino acid sequences file does not exist after
                prodigal has been run.
        """
        gene_calls_dict = {}  # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
        amino_acid_sequences_dict = {}

        self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
        self.amino_acid_sequences_in_contigs = os.path.join(
            output_dir, 'contigs.amino_acid_sequences')

        log_file_path = os.path.join(output_dir, '00_log.txt')

        self.run.warning('', header='Finding ORFs in contigs', lc='green')
        self.run.info('Genes', self.genes_in_contigs)
        self.run.info('Amino acid sequences',
                      self.amino_acid_sequences_in_contigs)
        self.run.info('Log file', log_file_path)

        cmd_line = [
            'prodigal', '-i', fasta_file_path, '-o', self.genes_in_contigs,
            '-a', self.amino_acid_sequences_in_contigs
        ]

        if self.prodigal_translation_table:
            # Command-line arguments must be strings; the table may have been given as an int.
            cmd_line.extend(['-g', str(self.prodigal_translation_table)])
            self.run.warning(
                "Prodigal translation table is set to '%s' (whatever you did has worked so far, but\
                              keep an eye for errors from prodigal in case it doesn't like your translation table\
                              parameter). This means we will not use prodigal in the metagenomics mode, due to this\
                              issue: https://github.com/hyattpd/Prodigal/issues/19. If that issue is closed, and you\
                              are reading this message, then please contact an anvi'o developer."
                % str(self.prodigal_translation_table))
        else:
            cmd_line.extend(['-p', 'meta'])

        self.run.warning(
            "Anvi'o will use 'prodigal' by Hyatt et al (doi:10.1186/1471-2105-11-119) to identify open\
                          reading frames in your data. When you publish your findings, please do not forget to properly\
                          credit their work.",
            lc='green',
            header="CITATION")

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')

        utils.run_command(cmd_line, log_file_path)

        # A missing output file means prodigal itself failed; point the user at the log.
        # NOTE(review): the backslash continuations below embed the indentation in the
        # message; presumably ConfigError normalizes whitespace -- confirm.
        if not os.path.exists(self.amino_acid_sequences_in_contigs):
            self.progress.end()
            raise ConfigError(
                "Something went wrong with prodigal, and it failed to generate the\
                               expected output :/ Fortunately, this log file should tell you what\
                               might be the problem: '%s'. Please do not forget to include this\
                               file if you were to ask for help." %
                log_file_path)

        # An existing but empty output file is a valid result: no genes were found.
        if filesnpaths.is_file_empty(self.amino_acid_sequences_in_contigs):
            self.progress.end()
            self.run.info('Result',
                          'Prodigal (%s) has identified no genes :/' %
                          (self.installed_version,),
                          nl_after=1,
                          mc="red")
            return gene_calls_dict, amino_acid_sequences_dict

        self.progress.update('Processing gene calls ...')

        fasta = fastalib.SequenceSource(self.amino_acid_sequences_in_contigs)

        # Release the file handle even if the header parser raises.
        try:
            hit_id = 0
            while next(fasta):
                gene_calls_dict[hit_id] = self.parser(fasta.id)
                amino_acid_sequences_dict[hit_id] = fasta.seq.replace('*', '')
                hit_id += 1
        finally:
            fasta.close()

        self.progress.end()

        self.run.info('Result',
                      'Prodigal (%s) has identified %d genes.' %
                      (self.installed_version, len(gene_calls_dict)),
                      nl_after=1)

        return gene_calls_dict, amino_acid_sequences_dict
Beispiel #4
0
    def process(self, fasta_file_path, output_dir):
        """Take the fasta file, run prodigal on it, and make sense of the output

        Prodigal is always run with `-p meta`. Its gene output goes to
        `<output_dir>/contigs.genes`, its amino acid output to
        `<output_dir>/contigs.amino_acid_sequences`, and
        `<output_dir>/00_log.txt` is handed to `utils.run_command` as the log
        file. The two output paths are also stored on `self.genes_in_contigs`
        and `self.amino_acid_sequences_in_contigs`.

        Args:
            fasta_file_path: path of the FASTA file passed to prodigal via `-i`.
            output_dir: directory in which the output and log files are placed.

        Returns:
            A `(gene_calls_dict, amino_acid_sequences_dict)` tuple. Both dicts
            are keyed by a 0-based integer in the order the sequences appear in
            the amino acid output. The first maps to
            `self.parser(<FASTA header>)`, the second to the sequence with every
            '*' removed. Both are empty when prodigal reported no genes.

        Raises:
            ConfigError: if the amino acid sequences file does not exist after
                prodigal has been run.
        """
        gene_calls_dict = {}  # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
        amino_acid_sequences_dict = {}

        self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
        self.amino_acid_sequences_in_contigs = os.path.join(
            output_dir, 'contigs.amino_acid_sequences')

        log_file_path = os.path.join(output_dir, '00_log.txt')

        self.run.warning('', header='Finding ORFs in contigs', lc='green')
        self.run.info('Genes', self.genes_in_contigs)
        self.run.info('Amino acid sequences',
                      self.amino_acid_sequences_in_contigs)
        self.run.info('Log file', log_file_path)

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')

        cmd_line = [
            'prodigal', '-i', fasta_file_path, '-o', self.genes_in_contigs,
            '-a', self.amino_acid_sequences_in_contigs, '-p', 'meta'
        ]
        utils.run_command(cmd_line, log_file_path)

        # A missing output file means prodigal itself failed; point the user at the log.
        # NOTE(review): the backslash continuations below embed the indentation in the
        # message; presumably ConfigError normalizes whitespace -- confirm.
        if not os.path.exists(self.amino_acid_sequences_in_contigs):
            self.progress.end()
            raise ConfigError(
                "Something went wrong with prodigal, and it failed to generate the\
                               expected output :/ Fortunately, this log file should tell you what\
                               might be the problem: '%s'. Please do not forget to include this\
                               file if you were to ask for help." %
                log_file_path)

        # An existing but empty output file is a valid result: no genes were found.
        if filesnpaths.is_file_empty(self.amino_acid_sequences_in_contigs):
            self.progress.end()
            self.run.info('Result',
                          'Prodigal (%s) has identified no genes :/' %
                          (self.installed_version,),
                          nl_after=1,
                          mc="red")
            return gene_calls_dict, amino_acid_sequences_dict

        self.progress.update('Processing gene calls ...')

        fasta = fastalib.SequenceSource(self.amino_acid_sequences_in_contigs)

        # Release the file handle even if the header parser raises.
        try:
            hit_id = 0
            while next(fasta):
                gene_calls_dict[hit_id] = self.parser(fasta.id)
                amino_acid_sequences_dict[hit_id] = fasta.seq.replace('*', '')
                hit_id += 1
        finally:
            fasta.close()

        self.progress.end()

        self.run.info('Result',
                      'Prodigal (%s) has identified %d genes.' %
                      (self.installed_version, len(gene_calls_dict)),
                      nl_after=1)

        return gene_calls_dict, amino_acid_sequences_dict
Beispiel #5
0
    def process(self, fasta_file_path, output_dir):
        """Take the fasta file, run prodigal on it, and make sense of the output

        Prodigal's gene output goes to `<output_dir>/contigs.genes`, its amino
        acid output to `<output_dir>/contigs.amino_acid_sequences`, and
        `<output_dir>/00_log.txt` is handed to `utils.run_command` as the log
        file. The two output paths are also stored on `self.genes_in_contigs`
        and `self.amino_acid_sequences_in_contigs`.

        When `self.prodigal_translation_table` is set it is passed with `-g`
        and `-p meta` is NOT used; otherwise prodigal runs with `-p meta`.

        Args:
            fasta_file_path: path of the FASTA file passed to prodigal via `-i`.
            output_dir: directory in which the output and log files are placed.

        Returns:
            A `(gene_calls_dict, amino_acid_sequences_dict)` tuple. Both dicts
            are keyed by a 0-based integer in the order the sequences appear in
            the amino acid output. The first maps to
            `self.parser(<FASTA header>)`, the second to the sequence with every
            '*' removed. Both are empty when prodigal reported no genes.

        Raises:
            ConfigError: if the amino acid sequences file does not exist after
                prodigal has been run.
        """
        gene_calls_dict = {} # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
        amino_acid_sequences_dict = {}

        self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
        self.amino_acid_sequences_in_contigs = os.path.join(output_dir, 'contigs.amino_acid_sequences')

        log_file_path = os.path.join(output_dir, '00_log.txt')

        self.run.warning('', header='Finding ORFs in contigs', lc='green')
        self.run.info('Genes', self.genes_in_contigs)
        self.run.info('Amino acid sequences', self.amino_acid_sequences_in_contigs)
        self.run.info('Log file', log_file_path)

        cmd_line = ['prodigal', '-i', fasta_file_path, '-o', self.genes_in_contigs, '-a', self.amino_acid_sequences_in_contigs]

        if self.prodigal_translation_table:
            # Command-line arguments must be strings; the table may have been given as an int.
            cmd_line.extend(['-g', str(self.prodigal_translation_table)])
            self.run.warning("Prodigal translation table is set to '%s' (whatever you did has worked so far, but\
                              keep an eye for errors from prodigal in case it doesn't like your translation table\
                              parameter). This means we will not use prodigal in the metagenomics mode, due to this\
                              issue: https://github.com/hyattpd/Prodigal/issues/19. If that issue is closed, and you\
                              are reading this message, then please contact an anvi'o developer." % str(self.prodigal_translation_table))
        else:
            cmd_line.extend(['-p', 'meta'])

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')

        utils.run_command(cmd_line, log_file_path)

        # A missing output file means prodigal itself failed; point the user at the log.
        # NOTE(review): the backslash continuations below embed the indentation in the
        # message; presumably ConfigError normalizes whitespace -- confirm.
        if not os.path.exists(self.amino_acid_sequences_in_contigs):
            self.progress.end()
            raise ConfigError("Something went wrong with prodigal, and it failed to generate the\
                               expected output :/ Fortunately, this log file should tell you what\
                               might be the problem: '%s'. Please do not forget to include this\
                               file if you were to ask for help." % log_file_path)

        # An existing but empty output file is a valid result: no genes were found.
        if filesnpaths.is_file_empty(self.amino_acid_sequences_in_contigs):
            self.progress.end()
            self.run.info('Result', 'Prodigal (%s) has identified no genes :/' % (self.installed_version,), nl_after=1, mc="red")
            return gene_calls_dict, amino_acid_sequences_dict

        self.progress.update('Processing gene calls ...')

        fasta = fastalib.SequenceSource(self.amino_acid_sequences_in_contigs)

        # Release the file handle even if the header parser raises.
        try:
            hit_id = 0
            while next(fasta):
                gene_calls_dict[hit_id] = self.parser(fasta.id)
                amino_acid_sequences_dict[hit_id] = fasta.seq.replace('*', '')
                hit_id += 1
        finally:
            fasta.close()

        self.progress.end()

        self.run.info('Result', 'Prodigal (%s) has identified %d genes.' % (self.installed_version, len(gene_calls_dict)), nl_after=1)

        return gene_calls_dict, amino_acid_sequences_dict