Exemple #1
0
    def split_input_file(self):
        """Split the input FASTA file into part files and return their paths.

        When `self.input_is_fasta` is true, entries read from
        `self.input_file_path` are written to files named `part-XXXXXXXX.fa`
        under `self.tmp_dir`. A new part file is started every time
        `(fasta.pos - 1)` is a multiple of `self.num_entries_per_file`.
        For any other kind of input nothing is read or written.

        Returns:
            list: paths of the part files created, in creation order (empty
            when the input is not FASTA or holds no entries).
        """
        parts = []

        if not self.input_is_fasta:
            return parts

        next_part = 1
        part_obj = None
        fasta = u.SequenceSource(self.input_file_path)

        try:
            while fasta.next():
                # presumably `fasta.pos` counts the entries read so far (1-based),
                # so this is true on the first entry of every chunk -- TODO confirm
                if (fasta.pos - 1) % self.num_entries_per_file == 0:
                    self.progress.update('Creating part: ~ %s' %
                                         (pp(next_part)))

                    if part_obj:
                        part_obj.close()

                    file_path = os.path.join(self.tmp_dir,
                                             'part-%08d.fa' % next_part)
                    parts.append(file_path)
                    next_part += 1
                    part_obj = open(file_path, 'w')

                part_obj.write('>%s\n' % fasta.id)
                part_obj.write('%s\n' % fasta.seq)
        finally:
            # release both handles even when reading or writing fails midway
            if part_obj:
                part_obj.close()
            fasta.close()

        return parts
Exemple #2
0
    def process(self, fasta_file_path, output_dir):
        """Run prodigal on `fasta_file_path` and collect the genes it reports.

        The gene calls file, the amino acid sequences file and a log file are
        all placed in `output_dir`; the first two paths are also stored on
        `self`.

        Returns a tuple of two dicts sharing the same running integer keys:
        the parsed gene calls, and the amino acid sequences with every '*'
        removed.

        Raises ConfigError when prodigal did not produce the amino acid
        sequences file.
        """
        # each gene call entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
        gene_calls_dict, amino_acid_sequences_dict = {}, {}

        genes_path = os.path.join(output_dir, 'contigs.genes')
        amino_acids_path = os.path.join(output_dir,
                                        'contigs.amino_acid_sequences')
        log_file_path = os.path.join(output_dir, '00_log.txt')

        self.genes_in_contigs = genes_path
        self.amino_acid_sequences_in_contigs = amino_acids_path

        self.run.warning('', header='Finding ORFs in contigs', lc='green')
        for label, path in (('Genes', genes_path),
                            ('Amino acid sequences', amino_acids_path),
                            ('Log file', log_file_path)):
            self.run.info(label, path)

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')

        utils.run_command(
            ['prodigal', '-i', fasta_file_path, '-o', genes_path,
             '-a', amino_acids_path, '-p', 'meta'],
            log_file_path)

        # without the amino acid output there is nothing to parse
        if not os.path.exists(amino_acids_path):
            self.progress.end()
            raise ConfigError(
                "Something went wrong with prodigal, and it failed to generate the\
                               expected output :/ Fortunately, this log file should tell you what\
                               might be the problem: '%s'. Please do not forget to include this\
                               file if you were to ask for help." %
                log_file_path)

        self.progress.update('Processing gene calls ...')

        source = fastalib.SequenceSource(amino_acids_path)
        while next(source):
            # ids are consecutive from zero, so the next id is the current count
            hit_id = len(gene_calls_dict)
            gene_calls_dict[hit_id] = self.parser(source.id)
            amino_acid_sequences_dict[hit_id] = source.seq.replace('*', '')
        source.close()

        self.progress.end()

        summary = 'Prodigal (%s) has identified %d genes.' % (
            self.installed_version, len(gene_calls_dict))
        self.run.info('Result', summary, nl_after=1)

        return gene_calls_dict, amino_acid_sequences_dict
Exemple #3
0
def get_all_ids_from_fasta(input_file):
    """Return the ids of all entries in a FASTA file, in reading order.

    Args:
        input_file: path handed to `u.SequenceSource`.

    Returns:
        list: the `id` of every entry read from the file.
    """
    fasta = u.SequenceSource(input_file)
    ids = []

    try:
        while fasta.next():
            ids.append(fasta.id)
    finally:
        # always release the source, even if reading fails midway
        fasta.close()

    return ids
Exemple #4
0
def is_file_fasta_formatted(file_path):
    """Check that `file_path` can be opened as a FASTA file.

    `is_file_exists` is called on the path first; the file is then opened
    with `u.SequenceSource` purely as a validation step.

    Args:
        file_path: path of the file to validate.

    Raises:
        FilesNPathsError: if `u.SequenceSource` rejects the file with a
            `u.FastaLibError`.
    """
    is_file_exists(file_path)

    try:
        f = u.SequenceSource(file_path)
    except u.FastaLibError as e:
        raise FilesNPathsError(
            "Someone is not happy with your FASTA file '%s' (this is "
            "what the lib says: '%s')." % (file_path, e))

    # the source was opened only to validate the file; release it right away
    f.close()
Exemple #5
0
def get_read_lengths_from_fasta(input_file):
    """Map the id of every entry in a FASTA file to its sequence length.

    Args:
        input_file: path handed to `u.SequenceSource`.

    Returns:
        dict: entry id -> `len()` of that entry's sequence.
    """
    source = u.SequenceSource(input_file)

    lengths_by_id = {}
    while source.next():
        entry_id = source.id
        lengths_by_id[entry_id] = len(source.seq)
    source.close()

    return lengths_by_id
Exemple #6
0
    def _process_peptide_files(self, peptide_paths):
        """Checks that `peptide_paths` files actually exist, then combines them."""
        # For renaming fasta headers
        hit_id = 0

        # Set up data storage.
        # Todo: These are keyed with ints, so idk why they aren't lists.  Maybe used as a dict later in the code?
        gene_calls_dict = {
        }  # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
        amino_acid_sequences_dict = {}

        for peptide_path in peptide_paths:
            if not os.path.exists(peptide_path):
                if self.logger:
                    self.logger.progress.end()

                raise ConfigError(
                    "Something went wrong with prodigal, and it failed to generate the "
                    "expected output ('%s') :/ Fortunately, this log file should tell you what "
                    "might be the problem: '%s'. Please do not forget to include this "
                    "file if you were to ask for help." %
                    (peptide_path, self.log_file_path))

            # Some splits may not actually have gene calls.  Skip them.
            if filesnpaths.is_file_empty(peptide_path):
                continue

            # If we get here, the fasta file will not be empty.
            fasta = fastalib.SequenceSource(peptide_path)

            while next(fasta):
                gene_calls_dict[hit_id] = self.parser(fasta.id)
                amino_acid_sequences_dict[hit_id] = fasta.seq.replace('*', '')
                hit_id += 1

            fasta.close()

            # todo i think this is removed elsewhere
            # Remove the split peptide file.
            os.remove(peptide_path)

        # If no genes were predicted across all output files, warn the user.
        if len(amino_acid_sequences_dict) == 0:
            if self.logger:
                self.logger.run.info(
                    'Result',
                    f'Prodigal ({self.installed_version}) has identified no genes :/',
                    nl_after=1,
                    mc="red")
        else:  # Write out the final gene file
            assert 'peptide_path' in self.collated_output_file_paths
            with open(self.collated_output_file_paths['peptide_path'],
                      'w') as f:
                for hit_id, sequence in amino_acid_sequences_dict.items():
                    f.write(f">{hit_id}\n{sequence}\n")

        return gene_calls_dict, amino_acid_sequences_dict
Exemple #7
0
    def get_corresponding_gene_call_from_target_fasta_path(self):
        """Return the id of the last entry in the FASTA at `self.target_fasta_path`.

        The corresponding gene call is assumed to be the defline of that file.
        """
        source = u.SequenceSource(self.target_fasta_path, lazy_init=False)

        # walk through every entry; the id of the final one is what remains
        while next(source):
            defline = source.id

        source.close()
        return defline
Exemple #8
0
def get_GC_content_for_FASTA_entries(file_path):
    """Compute the GC content of every entry in a FASTA file.

    The path is first passed to `filesnpaths.is_file_exists` and
    `filesnpaths.is_file_fasta_formatted`.

    Args:
        file_path: path of the FASTA file to read.

    Returns:
        dict: entry id -> value returned by `get_GC_content_for_sequence`
        for that entry's sequence.
    """
    filesnpaths.is_file_exists(file_path)
    filesnpaths.is_file_fasta_formatted(file_path)

    GC_content_dict = {}

    fasta = u.SequenceSource(file_path)
    try:
        while fasta.next():
            GC_content_dict[fasta.id] = get_GC_content_for_sequence(fasta.seq)
    finally:
        # always release the source, even if reading fails midway
        fasta.close()

    return GC_content_dict
Exemple #9
0
def get_FASTA_file_as_dictionary(file_path):
    """Read a whole FASTA file into a dictionary.

    The path is first passed to `filesnpaths.is_file_exists` and
    `filesnpaths.is_file_fasta_formatted`.

    Args:
        file_path: path of the FASTA file to read.

    Returns:
        dict: entry id -> sequence. If an id occurs more than once, the
        last occurrence wins.
    """
    filesnpaths.is_file_exists(file_path)
    filesnpaths.is_file_fasta_formatted(file_path)

    d = {}

    fasta = u.SequenceSource(file_path)
    try:
        while fasta.next():
            d[fasta.id] = fasta.seq
    finally:
        # always release the source, even if reading fails midway
        fasta.close()

    return d
    def generate(self):
        """Simulate short reads from every input FASTA file into one output FASTA.

        For each file in `self.fasta_files`, a sequence of length L
        contributes `int(L / short_read_length * coverage)` reads. Each read
        is a `short_read_length`-long window starting at a random position,
        passed through `simulate_errors` with `self.error_rate`, and written
        to `self.output_file` with a defline recording the alias, read index,
        source entry id, start and stop. A per-file summary is reported
        through `self.run`.
        """
        output = fastalib.FastaOutput(self.output_file)
        num_files = len(self.fasta_files)

        for index_fasta, fasta_name in enumerate(self.fasta_files):
            f = self.fasta_files_dict[fasta_name]

            x = self.short_read_length
            c = f['coverage']

            self.progress.new(
                'Working on file %d of %d (%s) with expected coverage of %d' %
                (index_fasta + 1, num_files, f['alias'], c))

            fasta = fastalib.SequenceSource(f['path'])
            total_num_errors = 0
            total_num_reads = 0
            while next(fasta):
                L = len(fasta.seq)

                av_num_short_reads_needed = int(L / x * c)
                total_num_reads += av_num_short_reads_needed

                for index_short_read in range(av_num_short_reads_needed):
                    # refresh the progress line only every 100th read
                    if (index_short_read + 1) % 100 == 0:
                        self.progress.update(
                            'Entry %s :: %s nts :: reads %s of %s :: num errors: %s ...'
                            % (pp(fasta.pos + 1), pp(len(fasta.seq)),
                               pp(index_short_read + 1),
                               pp(av_num_short_reads_needed),
                               pp(total_num_errors)))

                    # NOTE(review): randint raises ValueError when the entry is
                    # shorter than the read length (L < x) while the coverage
                    # still asks for reads -- confirm how such entries should
                    # be handled.
                    start_pos = random.randint(0, L - x)
                    short_read, num_errors = simulate_errors(
                        self.error_rate, fasta.seq[start_pos:start_pos + x])
                    total_num_errors += num_errors

                    output.write_id('%s_%d|source:%s|start:%d|stop:%d' %
                                    (f['alias'], index_short_read, fasta.id,
                                     start_pos, start_pos + x))
                    output.write_seq(short_read)

            # remember the entry count before releasing the source
            num_contigs = fasta.pos
            fasta.close()

            self.progress.end()

            # a file that yields no reads at all must not turn the average
            # into a division by zero
            total_num_nts = total_num_reads * x
            if total_num_nts:
                avg_num_errors = total_num_errors * 1.0 / total_num_nts
            else:
                avg_num_errors = 0.0

            self.run.info(
                '%s w/ %d contigs' % (f['alias'], num_contigs),
                '%s reads with %s errors (avg %.4f) for %sX avg cov.' %
                (pp(total_num_reads), pp(total_num_errors), avg_num_errors,
                 pp(c)))

        output.close()
        self.run.info('Fasta output', self.output_file)
Exemple #11
0
    def sanity_check(self, skip_warnings=False):
        """Validate the working directory, the target FASTA and the run parameters.

        Ensures `self.directory` exists and is empty (creating it if needed),
        resolves the MODELLER executable unless `self.lazy_init` is set,
        checks that the target FASTA holds exactly one sequence whose defline
        is an integer, warns about an unusual `self.deviation`, and forces
        `self.num_models` to 1 when `self.very_fast` is set.

        Args:
            skip_warnings: accepted for interface compatibility; not used here.

        Raises:
            ModellerError: if `self.directory` exists and is not empty.
            ConfigError: if the target FASTA does not contain exactly one
                sequence, or its defline cannot be converted to an integer.
        """
        # the directory files will be dumped into (can exist but must be empty)
        if filesnpaths.is_file_exists(self.directory, dont_raise=True):
            filesnpaths.is_output_dir_writable(self.directory)
            if not filesnpaths.is_dir_empty(self.directory):
                raise ModellerError(
                    "You cannot give MODELLER a non-empty directory to work in."
                )
        else:
            filesnpaths.gen_output_directory(self.directory)

        if not self.lazy_init:
            self.executable = check_MODELLER(self.executable)

        # does target_fasta_path point to a fasta file?
        utils.filesnpaths.is_file_fasta_formatted(self.target_fasta_path)

        # make sure target_fasta is valid
        target_fasta = u.SequenceSource(self.target_fasta_path,
                                        lazy_init=False)
        try:
            if target_fasta.total_seq != 1:
                raise ConfigError(
                    "MODELLER :: The input FASTA file must have exactly one sequence. "
                    "You provided one with {}.".format(target_fasta.total_seq))
            try:
                while next(target_fasta):
                    int(target_fasta.id)
            except (ValueError, TypeError):
                # only a failed integer conversion means a bad defline; any
                # other error is left to propagate as itself
                raise ConfigError(
                    "MODELLER :: The defline of this fasta file must be an integer"
                )
        finally:
            # release the source on the error paths as well
            target_fasta.close()

        # parameter consistencies
        if self.deviation < 0.5 or self.deviation > 20:
            self.run.warning(
                "You realize that deviation is given in angstroms, right? You chose {}"
                .format(self.deviation))

        if self.very_fast and self.num_models > 1:
            self.num_models = 1
            self.run.warning(
                "Since you chose --very-fast, there will be little difference, if at all, between models. Anvi'o "
                "authoritatively sets --num-models to 1 to save you time.")
Exemple #12
0
    def use_external_gene_calls_to_populate_genes_in_contigs_table(
            self,
            input_file_path,
            gene_calls_dict=None,
            ignore_internal_stop_codons=False):
        """Add genes to the contigs database.

           Either provide an `input_file_path` for external gene calls, or provide an
           external gene calls dictionary (exactly one of the two). The format should
           follow this:

                {
                  "1": {
                      "contig": "contig_name",
                      "start": 20,
                      "stop": 1544,
                      "direction": "f",
                      "partial": 0,
                      "source": "source_name",
                      "version": "unknown"
                  },

                  "2": {
                    (...)
                  },

                (...)
                }

            If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
            need to make sure gene caller ids in your dict does not overlap with the ones in
            the database.

            Parameters:
                input_file_path: path of a TAB-delimited external gene calls file, or a
                    false value when `gene_calls_dict` is given.
                gene_calls_dict: gene calls keyed by gene caller id, or None/False when
                    `input_file_path` is given. An empty dict makes the function return
                    early without touching the database.
                ignore_internal_stop_codons: when True, internal stop codons ('*') in a
                    translated sequence are replaced with 'X' instead of raising.

            Returns None.

            Raises ConfigError when: `gene_calls_dict` is not a dict; both or neither of
            the two inputs are given; the input file yields no gene calls; a gene call
            names a contig with no known sequence; or a translation contains an internal
            stop codon while `ignore_internal_stop_codons` is False. In the last two
            cases the file at `self.db_path` is removed before raising.

        """

        # by default we assume that this is a pristine run. but if the user sends a dictionary
        # of gene calls, they are appended to the database instead (see the `else` branch below).
        append_to_the_db = False

        gene_calls_found = False
        # let's do a rigorous check whether the user provided a gene_calls_dict.
        if (gene_calls_dict is not None and gene_calls_dict is not False):
            if not isinstance(gene_calls_dict, dict):
                raise ConfigError(
                    "'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                    but it is of type '%s', and not '%s'" %
                    (type(gene_calls_dict), type({})))

            # congrats, we have a dict.
            gene_calls_found = True

            if not len(gene_calls_dict):
                # but it is empty ... silly user.
                self.run.info_single(
                    "'Use external gene calls' function found an empty gene calls dict, returning\
                                      prematurely and assuming you know what's up. If you don't, stop here and try to\
                                      identify what decisions you've made might have led you to this weird point your\
                                      workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                                      done great so far.",
                    nl_before=1,
                    nl_after=1)
                return

        # exactly one of the two inputs must be present: reject "neither" and "both"
        if (not input_file_path
                and not gene_calls_found) or (input_file_path
                                              and gene_calls_found):
            raise ConfigError(
                "You must provide either an input file, or an gene calls dict to process external\
                               gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
                               with wrong parameters.")

        # (re)initialize the Table base for the contigs database at self.db_path
        Table.__init__(self,
                       self.db_path,
                       anvio.__contigs__version__,
                       self.run,
                       self.progress,
                       simple=True)

        # take care of gene calls dict
        if not gene_calls_found:
            # NOTE(review): column_mapping presumably lists one converter per column, in the
            # order of t.genes_in_contigs_table_structure -- confirm against the helper.
            gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(
                input_file_path,
                expected_fields=t.genes_in_contigs_table_structure,
                only_expected_fields=True,
                column_mapping=[int, str, int, int, str, int, str, str])

            if not len(gene_calls_dict):
                raise ConfigError(
                    "You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                                   this is an error, anvi'o will stop here and complain. If this is not an error and you\
                                   in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                                   instead of providing an emtpy external gene calls file. You don't agree? You need this\
                                   for some weird step for you weird pipeline? Let us know, and we will consider changing\
                                   this.")

            self.run.info(
                "External gene calls",
                "%d gene calls recovered and will be processed." %
                len(gene_calls_dict))
        else:
            # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
            #        overwrite an existing gene call. Something like this would have returned the
            #        current max, which could be cross-checked with what's in the dict:
            #
            #            contigs_db = ContigsDatabase(self.db_path)
            #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
            #            contigs_db.disconnect()
            append_to_the_db = True

        # recover amino acid sequences. during this operation we are going to have to read all contig sequences
        # into the damn memory. anvi'o is doing a pretty bad job with memory management :(
        amino_acid_sequences = {}

        # contig sequences come from the contigs FASTA when one is set, otherwise from the
        # contig sequences table of the database; either way entries are read as
        # contig_sequences[name]['sequence'] below.
        contig_sequences = {}
        if self.contigs_fasta:
            fasta = u.SequenceSource(self.contigs_fasta)
            while next(fasta):
                contig_sequences[fasta.id] = {'sequence': fasta.seq}
            fasta.close()
        else:
            database = db.DB(self.db_path,
                             utils.get_required_version_for_db(self.db_path))
            contig_sequences = database.get_table_as_dict(
                t.contig_sequences_table_name)

        num_genes_with_internal_stops = 0
        number_of_impartial_gene_calls = 0
        for gene_callers_id in gene_calls_dict:
            gene_call = gene_calls_dict[gene_callers_id]
            contig_name = gene_call['contig']

            if contig_name not in contig_sequences:
                # remove the partial contigs database so things don't get screwed later
                os.remove(self.db_path)
                raise ConfigError(
                    "You are in big trouble :( The contig name '%s' in your external gene callers file\
                                    does not appear to be in the contigs FASTA file. How did this happen?"
                    % contig_name)

            # partial gene calls are not translated; an empty sequence is stored for them
            if gene_call['partial']:
                amino_acid_sequences[gene_callers_id] = ''
                number_of_impartial_gene_calls += 1
                continue

            # 'start' and 'stop' are used directly as Python slice bounds on the contig sequence
            sequence = contig_sequences[contig_name]['sequence'][
                gene_call['start']:gene_call['stop']]
            # genes with direction 'r' are reverse-complemented before translation
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            amino_acid_sequence = utils.get_DNA_sequence_translated(
                sequence, gene_callers_id)

            # check if there are any internal stops:
            if amino_acid_sequence.find('*') > -1:
                if ignore_internal_stop_codons:
                    amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                    num_genes_with_internal_stops += 1
                else:
                    # same cleanup as above: remove the database file before bailing out
                    os.remove(self.db_path)
                    raise ConfigError(
                        "Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s')\
                                       which had an internal stop codon :/ This usually indicates that your external gene calls\
                                       have problems. If you still want to continue, you can ask anvi'o to ignore internal stop\
                                       codons on your own risk. It will probably look very ugly on your screen, but here is the\
                                       DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since\
                                       anvi'o does not trust you either): %s" %
                        (str(gene_callers_id), sequence))

            amino_acid_sequences[gene_callers_id] = amino_acid_sequence

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict,
                                             amino_acid_sequences,
                                             append_to_the_db=append_to_the_db)

        # report how many genes had their internal stop codons replaced with 'X'
        if num_genes_with_internal_stops:
            percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(
                gene_calls_dict)
            self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal\
                              stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X'\
                              characters, and stored them in the contigs database that way. %d of your genes, which corresponded\
                              to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy."                                                                                                                 % \
                                        (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

        # report how many gene calls were flagged 'partial' and therefore stored without a translation
        if number_of_impartial_gene_calls:
            self.run.warning(
                '%d of your %d gene calls were impartial, hence the translated amino acid sequences for those\
                              were not stored in the database.' %
                (number_of_impartial_gene_calls, len(gene_calls_dict)))
Exemple #13
0
    def run_search_and_parse_results(self):
        """Align the protein against the database based on only sequence.

        Runs a DIAMOND blastp search of the target FASTA against the MODELLER
        database from inside `self.directory`, then filters and ranks the
        hits to choose template structures. The chosen (code, chain) pairs
        are stored in `self.list_of_template_code_and_chain_ids` and their
        statistics appended to `self.out["templates"]`.

        Raises:
            self.EndModeller: if the search returns no hits, or no hit passes
                the `self.percent_cutoff` and
                `self.alignment_fraction_cutoff` filters.
        """
        columns = [
            'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gaps',
            'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'
        ]

        # The search runs from the MODELLER working directory. Always change
        # back to the user directory, even if the search fails, so a failure
        # does not leave the process in the wrong directory.
        os.chdir(self.directory)
        try:
            driver = diamond.Diamond(
                query_fasta=self.target_fasta_path,
                target_fasta=J(self.database_dir,
                               self.modeller_database + '.dmnd'),
                outfmt=' '.join(['6'] + columns),
                run=terminal.Run(verbose=False),
                progress=terminal.Progress(verbose=False),
            )
            driver.blastp()
        finally:
            os.chdir(self.start_dir)

        search_df = driver.view_as_dataframe(
            J(self.directory, driver.tabular_output_path))

        matches_found = search_df.shape[0]

        if not matches_found:
            self.run.warning(
                "No proteins with homologous sequence were found for {}. No structure will be modelled"
                .format(self.corresponding_gene_call))
            raise self.EndModeller

        # We need the gene length for pident
        target_fasta = u.SequenceSource(self.target_fasta_path,
                                        lazy_init=False)
        try:
            while next(target_fasta):
                gene_length = len(target_fasta.seq)
        finally:
            target_fasta.close()

        # add some useful columns: the last character of sseqid is taken as
        # the chain, everything before it as the code
        search_df["code"] = search_df["sseqid"].str[:-1]
        search_df["chain"] = search_df["sseqid"].str[-1]
        search_df["align_fraction"] = (search_df["length"] -
                                       search_df["gaps"]) / gene_length
        search_df["proper_pident"] = search_df["pident"] * search_df[
            "align_fraction"]

        # Find best match for align fraction and pident (before any
        # filtering, so it can be reported if nothing survives the filters)
        code_chain_id_of_best = tuple(
            search_df.iloc[search_df['proper_pident'].argmax()][[
                'code', 'chain'
            ]].values)
        best_hit = search_df.loc[
            (search_df['code'] == code_chain_id_of_best[0]) &
            (search_df['chain'] == code_chain_id_of_best[1]),
            ['pident', 'align_fraction']].iloc[0]

        # filter results by self.percent_cutoff and self.alignment_fraction_cutoff
        search_df = search_df[search_df["pident"] >= self.percent_cutoff]
        search_df = search_df[
            search_df["align_fraction"] >= self.alignment_fraction_cutoff]

        # Rank by the alignment fraction times the percent id
        search_df = search_df.sort_values("proper_pident", ascending=False)

        # If more than 1 template in 1 PDB id, just choose 1
        search_df = search_df.drop_duplicates('code', keep='first')

        matches_after_filter = len(search_df)
        if not matches_after_filter:
            self.run.warning(
                "Gene {} did not have a search result with percent identicalness above or equal "
                "to {}% and alignment fraction above {}%. The best match was chain {} of https://www.rcsb.org/structure/{}, which had a "
                "percent identicalness of {:.2f}% and an alignment fraction of {:.3f}. No structure will be modelled."
                .format(self.corresponding_gene_call,
                        self.percent_cutoff,
                        self.alignment_fraction_cutoff,
                        code_chain_id_of_best[1],
                        code_chain_id_of_best[0],
                        best_hit['pident'],
                        best_hit['align_fraction']))
            raise self.EndModeller

        # Keep only templates whose proper_pident is within 5 of the best remaining match
        # http://merenlab.org/2018/09/04/getting-started-with-anvi-structure/#how-much-do-templates-matter
        search_df = search_df[search_df['proper_pident'] >= (
            search_df['proper_pident'].max() - 5)]

        # get up to self.max_number_templates of those with the highest proper_pident scores.
        search_df = search_df.iloc[:min(
            [len(search_df), self.max_number_templates])]

        # Get their chain and 4-letter ids
        self.list_of_template_code_and_chain_ids = list(
            zip(search_df["code"], search_df["chain"]))

        self.run.info("Max number of templates allowed",
                      self.max_number_templates)
        self.run.info("Number of candidate templates", matches_found)
        self.run.info(
            "After >{}% identical filter".format(self.percent_cutoff),
            matches_after_filter)
        self.run.info("Number accepted as templates",
                      len(self.list_of_template_code_and_chain_ids))

        # update user on which templates are used, and write the templates to self.out
        templates_out = self.out["templates"]
        for i, (pdb_id, chain_id) in enumerate(
                self.list_of_template_code_and_chain_ids):
            proper_percent_similarity = search_df["proper_pident"].iloc[i]
            percent_similarity = search_df["pident"].iloc[i]
            align_fraction = search_df["align_fraction"].iloc[i]

            templates_out["pdb_id"].append(pdb_id)
            templates_out["chain_id"].append(chain_id)
            templates_out["proper_percent_similarity"].append(
                proper_percent_similarity)
            templates_out["percent_similarity"].append(percent_similarity)
            templates_out["align_fraction"].append(align_fraction)

            self.run.info(
                "Template {}".format(i + 1),
                "Protein ID: {}, Chain {} ({:.1f}% identical, {:.2f} align fraction)"
                .format(pdb_id, chain_id, percent_similarity, align_fraction))
Exemple #14
0
    def run_search_and_parse_results(self):
        """Align the protein against the database based on only sequence.

        Runs a DIAMOND blastp search of the target FASTA against the MODELLER
        database from inside `self.directory`, then filters and ranks the
        hits by "proper" percent identity (`pident * length / gene_length`)
        to choose template structures. The chosen (code, chain) pairs are
        stored in `self.list_of_template_code_and_chain_ids` and their
        statistics appended to `self.out["templates"]`.

        Raises:
            ConfigError: if `self.percent_identical_cutoff` or
                `self.max_number_templates` is not set.
            self.EndModeller: if the search returns no hits, or no hit
                reaches `self.percent_identical_cutoff`.
        """
        if not self.percent_identical_cutoff or not self.max_number_templates:
            raise ConfigError(
                "run_search_and_parse_results :: You initiated this class without providing values for percent_identical_cutoff "
                "and max_number_templates, which is required for this function."
            )

        # The search runs from the MODELLER working directory. Always change
        # back to the user directory, even if the search fails, so a failure
        # does not leave the process in the wrong directory.
        os.chdir(self.directory)
        try:
            driver = diamond.Diamond(
                query_fasta=self.target_fasta_path,
                target_fasta=J(self.database_dir,
                               self.modeller_database + '.dmnd'),
                run=terminal.Run(verbose=False),
                progress=terminal.Progress(verbose=False),
            )
            driver.blastp()
        finally:
            os.chdir(self.start_dir)

        search_df = driver.view_as_dataframe(
            J(self.directory, driver.tabular_output_path))

        matches_found = search_df.shape[0]

        if not matches_found:
            self.run.warning(
                "No proteins with homologous sequence were found for {}. No structure will be modelled"
                .format(self.corresponding_gene_call))
            raise self.EndModeller

        # We need the gene length for proper_pident
        target_fasta = u.SequenceSource(self.target_fasta_path,
                                        lazy_init=False)
        try:
            while next(target_fasta):
                gene_length = len(target_fasta.seq)
        finally:
            target_fasta.close()

        # add some useful columns: the last character of sseqid is taken as
        # the chain, everything before it as the code
        search_df["proper_pident"] = search_df["pident"] * search_df[
            "length"] / gene_length
        search_df["code"] = search_df["sseqid"].str[:-1]
        search_df["chain"] = search_df["sseqid"].str[-1]

        # remember the best hit before filtering, so it can be reported if
        # nothing reaches the cutoff
        max_pident_found = search_df["proper_pident"].max()
        id_of_max_pident = tuple(
            search_df.loc[search_df["proper_pident"].idxmax(),
                          ["code", "chain"]].values)

        # filter results by self.percent_identical_cutoff.
        search_df = search_df[
            search_df["proper_pident"] >= self.percent_identical_cutoff]

        search_df = search_df.sort_values("proper_pident", ascending=False)

        # If more than 1 template in 1 PDB id, just choose 1
        search_df = search_df.drop_duplicates('code', keep='first')

        matches_after_filter = len(search_df)
        if not matches_after_filter:
            self.run.warning(
                "Gene {} did not have a search result with proper percent identicalness above or equal "
                "to {}%. The best match was chain {} of https://www.rcsb.org/structure/{}, which had a "
                "proper percent identicalness of {:.2f}%. No structure will be modelled."
                .format(self.corresponding_gene_call,
                        self.percent_identical_cutoff,
                        id_of_max_pident[1],
                        id_of_max_pident[0],
                        max_pident_found))
            raise self.EndModeller

        # get up to self.max_number_templates of those with the highest proper_pident scores.
        search_df = search_df.iloc[:min(
            [len(search_df), self.max_number_templates])]

        # Get their chain and 4-letter ids
        self.list_of_template_code_and_chain_ids = list(
            zip(search_df["code"], search_df["chain"]))

        self.run.info("Max number of templates allowed",
                      self.max_number_templates)
        self.run.info("Number of candidate templates", matches_found)
        self.run.info(
            "After >{}% identical filter".format(
                self.percent_identical_cutoff), matches_after_filter)
        self.run.info("Number accepted as templates",
                      len(self.list_of_template_code_and_chain_ids))

        # update user on which templates are used, and write the templates to self.out
        templates_out = self.out["templates"]
        for i, (pdb_id, chain_id) in enumerate(
                self.list_of_template_code_and_chain_ids):
            ppi = search_df["proper_pident"].iloc[i]

            templates_out["pdb_id"].append(pdb_id)
            templates_out["chain_id"].append(chain_id)
            templates_out["ppi"].append(ppi)

            self.run.info(
                "Template {}".format(i + 1),
                "Protein ID: {}, Chain {} ({:.1f}% identical)".format(
                    pdb_id, chain_id, ppi))
Exemple #15
0
    def check_database(self):
        """Set up the MODELLER database files, creating whichever ones are missing.

        All files live in `self.database_dir` and are named after `self.modeller_database`:

        - Downloads the .pir file if it is missing
        - Binarizes the .pir file (makes the .bin) if either the .bin or the .pir is missing
        - Creates the .dmnd file if it is missing. This goes through a temporary FASTA
          conversion of the .pir, which is removed once the database has been built.
        """

        bin_db_path = J(self.database_dir, self.modeller_database + ".bin")
        pir_db_path = J(self.database_dir, self.modeller_database + ".pir")
        bin_exists = utils.filesnpaths.is_file_exists(bin_db_path,
                                                      dont_raise=True)
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path,
                                                      dont_raise=True)

        if not (bin_exists and pir_exists):
            if not pir_exists:
                # Download .pir
                self.run.warning(
                    "Anvi'o looked in {} for a database with the name {} and with an extension "
                    "of either .bin or .pir, but didn't find anything matching that "
                    "criteria. Anvi'o will try and download the best database it knows of from "
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                    "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                    "database".format(self.database_dir,
                                      self.modeller_database))

                # NOTE(review): the archive is always saved as `pdb_95.pir.gz`, while the
                # binarize step below reads `<modeller_database>.pir` -- presumably
                # `self.modeller_database` is "pdb_95" whenever we get here; confirm.
                db_download_path = os.path.join(self.database_dir,
                                                "pdb_95.pir.gz")
                utils.download_file(
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz",
                    db_download_path)
                utils.run_command(
                    ['gzip', '-d', db_download_path],
                    log_file_path=filesnpaths.get_temp_file_path())

            # Binarize .pir (make .bin)
            self.run.warning(
                "Your database is not in binary format. That means accessing its contents is slower "
                "than it could be. Anvi'o is going to make a binary format. Just FYI"
            )
            self.run_binarize_database(pir_db_path, bin_db_path)

        dmnd_db_path = J(self.database_dir, self.modeller_database + '.dmnd')

        if os.path.exists(dmnd_db_path):
            return

        self.run.warning(
            "Your diamond database does not exist. It will be created.")

        script_name = "pir_to_fasta.py"

        self.copy_script_to_directory(script_name)

        fasta_path = J(self.database_dir, self.modeller_database + '.fa')
        # presumably diamond appends the `.dmnd` extension itself, which is why the output
        # path carries none -- TODO confirm against the diamond driver
        dmnd_path = J(self.database_dir, self.modeller_database)

        # the script converts the .pir (same path that was checked above) into a FASTA file
        command = [self.executable, script_name, pir_db_path, fasta_path]

        self.run_command(command, script_name=script_name, rename_log=False)

        # rewrite the FASTA into a temporary file, dropping every '-' and turning every '.'
        # into 'X' in the sequences
        temp = u.FastaOutput(filesnpaths.get_temp_file_path())
        fasta = u.SequenceSource(fasta_path)

        while next(fasta):
            temp.write_id(fasta.id)
            temp.write_seq(fasta.seq.replace('-', '').replace('.', 'X'))

        # close both handles BEFORE moving: if the temporary file sits on a different
        # filesystem, `shutil.move` copies it, and anything still buffered in an open
        # writer would not make it into the copy.
        fasta.close()
        temp.close()
        shutil.move(temp.output_file_path, fasta_path)

        driver = diamond.Diamond(
            query_fasta=fasta_path,
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.makedb(output_file_path=dmnd_path)

        # the FASTA was only an intermediate for building the diamond database
        os.remove(fasta_path)
Exemple #16
0
    def sanity_check(self):
        """Validate the working directory, the MODELLER setup, the target FASTA, and parameters.

        Besides checking, this sets two attributes:

        - `self.scripts_folder`: where anvi'o keeps its MODELLER scripts
        - `self.corresponding_gene_call`: the id of the (single) sequence in the target FASTA

        Raises
        ======
        ModellerError
            If `self.directory` exists but is not empty.
        ConfigError
            If the scripts folder is empty, if the MODELLER executable cannot be found, or if
            the target FASTA does not contain exactly one sequence.
        """

        # the directory files will be dumped into (can exist but must be empty)
        if filesnpaths.is_file_exists(self.directory, dont_raise=True):
            filesnpaths.is_output_dir_writable(self.directory)
            if not filesnpaths.is_dir_empty(self.directory):
                raise ModellerError("You cannot give MODELLER a non-empty directory to work in.")
        else:
            filesnpaths.gen_output_directory(self.directory)

        # All MODELLER scripts are housed in self.script_folder
        self.scripts_folder = J(os.path.dirname(anvio.__file__), 'data/misc/MODELLER/scripts')
        if utils.filesnpaths.is_dir_empty(self.scripts_folder):
            # the message has a placeholder for the folder, so it must be filled in
            raise ConfigError("Anvi'o houses all its MODELLER scripts in {}, but your directory \
                               contains no scripts. Why you do dat?".format(self.scripts_folder))

        # check that MODELLER exists. the wording of the feedback depends on whether the
        # user explicitly asked for a particular executable
        if self.args.__dict__.get('modeller_executable'):
            self.run.info_single("As per your request, anvi'o will use `%s` to run MODELLER." % self.executable, nl_before=1)
            utils.is_program_exists(self.executable)
        else:
            try:
                utils.is_program_exists(self.executable)
            except ConfigError:
                # replace the generic error with one that explains where the default came from
                raise ConfigError("Anvi'o needs a MODELLER program to be installed on your system. You didn't specify one\
                                   (which can be done with `--modeller-executable`), so anvi'o tried the most recent version\
                                   it knows about: '%s'. If you are certain you have it on your system (for instance you can run it\
                                   by typing '%s' in your terminal window), you may want to send a detailed bug report. If you\
                                   don't have it on your system, check out these installation instructions on our website:\
                                   http://merenlab.org/2016/06/18/installing-third-party-software/#modeller" % (self.executable, self.executable))

            self.run.info_single("Anvi'o found the default executable for MODELLER, `%s`, and will\
                                  use it." % self.executable, nl_before=1)
        self.is_executable_a_MODELLER_program()

        # does target_fasta_path point to a fasta file?
        utils.filesnpaths.is_file_fasta_formatted(self.target_fasta_path)

        # make sure target_fasta is valid
        target_fasta = u.SequenceSource(self.target_fasta_path, lazy_init=False)
        if target_fasta.total_seq != 1:
            raise ConfigError("MODELLER::The input FASTA file must have exactly one sequence.\
                               You provided one with {}.".format(target_fasta.total_seq))

        # (not sanity check but we get self.corresponding_gene_call since target_fasta is opened)
        while next(target_fasta):
            self.corresponding_gene_call = target_fasta.id
        target_fasta.close()

        # parameter consistencies (these only warn, they never stop the run)
        if self.deviation < 0.5 or self.deviation > 20:
            self.run.warning("You realize that deviation is given in angstroms, right? You chose {}".format(self.deviation))

        if self.very_fast and self.num_models > 1:
            self.run.warning("Since you chose --very-fast, there will be little difference, if at all, between models. You \
                              can potentially save a lot of time by setting --num-models to 1.")

        if self.percent_identical_cutoff <= 20:
            self.run.warning("Two completely unrelated sequences of same length can expect to have around 10% proper \
                              percent identicalness... Having this parameter below 20% is probably a bad idea.")
    def generate(self):
        """Simulate paired-end short reads from every input FASTA and write them as FASTQ.

        Two files are written, `<self.output_sample_name>-R1.fastq` and `-R2.fastq`. For each
        sequence of each FASTA file in `self.fasta_files`, the number of read pairs is derived
        from the sequence length, the read length (`self.short_read_length`) and the expected
        coverage of that file. Every pair gets an insert size drawn from a normal distribution
        (`self.insert_size`, `self.insert_size_std`), and both reads are passed through
        `simulate_errors` with `self.error_rate`. Read 2 is written reverse-complemented.
        All quality strings are a run of 'A' characters as long as a read.
        """
        r1_path = self.output_sample_name + '-R1.fastq'
        r2_path = self.output_sample_name + '-R2.fastq'

        # `with` makes sure both outputs are flushed and closed even if something fails midway
        with open(r1_path, 'w') as output_r1, open(r2_path, 'w') as output_r2:
            self.run.info('Read lenth', self.short_read_length)
            self.run.info('Insert size', self.insert_size)
            self.run.info('Insert size std', self.insert_size_std)

            x = self.short_read_length
            self.Q_str = 'A' * x

            for index_fasta, fasta_file in enumerate(self.fasta_files):
                f = self.fasta_files_dict[fasta_file]

                c = f['coverage']

                self.progress.new(
                    'Working on file %d of %d (%s) with expected coverage of %d' %
                    (index_fasta + 1, len(self.fasta_files), f['alias'], c))

                total_r1_errors = 0
                total_r2_errors = 0
                total_num_reads = 0

                fasta = fastalib.SequenceSource(f['path'])
                try:
                    while next(fasta):
                        L = len(fasta.seq)

                        av_num_short_reads_needed = int(L / x * c)
                        total_num_reads += av_num_short_reads_needed

                        av_num_pairs_needed = int(av_num_short_reads_needed / 2)

                        for index_pair in range(av_num_pairs_needed):
                            if (index_pair + 1) % 100 == 0:
                                self.progress.update(
                                    'Seq %s :: %s nts :: reads %s of %s :: num errors: %s ...'
                                    % (pp(fasta.pos + 1), pp(len(fasta.seq)),
                                       pp(index_pair + 1), pp(av_num_pairs_needed),
                                       pp(total_r1_errors + total_r2_errors)))

                            I = int(round(random.gauss(self.insert_size,
                                                       self.insert_size_std)))

                            # if the whole fragment (two reads + insert) fits, place it anywhere
                            # it fits; otherwise only make sure two reads' worth of sequence fit.
                            # NOTE(review): in the fallback case read 2 can start past the end
                            # of the sequence (its slice is then short or empty), and a sequence
                            # shorter than two reads makes `random.randint` raise ValueError.
                            if L - ((x * 2) + I) > 0:
                                start_pos = random.randint(0, L - ((x * 2) + I))
                            else:
                                start_pos = random.randint(0, L - (x * 2))

                            read_1_start = start_pos
                            read_1_stop = read_1_start + x

                            read_2_start = read_1_stop + I
                            read_2_stop = read_2_start + x

                            read_1, num_errors_r1 = simulate_errors(
                                self.error_rate, fasta.seq[read_1_start:read_1_stop])
                            read_2, num_errors_r2 = simulate_errors(
                                self.error_rate, fasta.seq[read_2_start:read_2_stop])

                            total_r1_errors += num_errors_r1
                            total_r2_errors += num_errors_r2

                            # both mates share the same pair of random numbers in their headers
                            c1, c2 = random.randint(1, 10000), random.randint(1, 10000)
                            output_r1.write(
                                '@%s:23:B02CBACXX:8:2315:%d:%d 1:N:0:GATCAG\n' %
                                (f['alias'], c1, c2))
                            output_r1.write(read_1 + '\n')
                            output_r1.write(
                                '+source:%s; start:%d; stop:%d; insert_size:%d\n' %
                                (fasta.id, read_1_start, read_1_stop, I))
                            output_r1.write('%s\n' % self.Q_str)

                            output_r2.write(
                                '@%s:23:B02CBACXX:8:2315:%d:%d 2:N:0:GATCAG\n' %
                                (f['alias'], c1, c2))
                            output_r2.write(u.rev_comp(read_2) + '\n')
                            output_r2.write(
                                '+source:%s; start:%d; stop:%d; insert_size:%d\n' %
                                (fasta.id, read_2_start, read_2_stop, I))
                            output_r2.write('%s\n' % self.Q_str)

                    # remember how far we got before letting go of the file
                    num_contigs = fasta.pos
                finally:
                    fasta.close()

                self.progress.end()
                total_num_errors = total_r1_errors + total_r2_errors

                # no reads at all (e.g. an empty FASTA or zero coverage) must not blow up the
                # summary with a division by zero
                total_num_bases = total_num_reads * x
                avg_errors = total_num_errors * 1.0 / total_num_bases if total_num_bases else 0.0

                self.run.info('%s w/ %d contigs' % (f['alias'], num_contigs),
                              '%s reads in %s pairs with %s errors (avg %.4f) for %sX avg cov.'
                              % (pp(total_num_reads),
                                 pp(total_num_reads / 2),
                                 pp(total_num_errors),
                                 avg_errors,
                                 pp(c)))

        self.run.info('FASTQ R1', r1_path)
        self.run.info('FASTQ R2', r2_path)
Exemple #18
0
    def get_amino_acid_sequences_for_genes_in_gene_calls_dict(self, gene_calls_dict, ignore_internal_stop_codons=False):
        """Recover amino acid sequences for gene calls in a gene_calls_dict.

        If 'aa_sequence' exists as keys in the gene_calls_dict[<key>] objects, this trivially
        reorganizes the data and returns a sequence dictionary. Otherwise, the sequence dictionary
        is created by reading all contig sequences into memory (from `self.contigs_fasta` if it is
        set, from the contigs database at `self.db_path` otherwise). Anvi'o is doing a pretty bad
        job with memory management in this case :(

        Parameters
        ==========
        gene_calls_dict : dict
            Gene caller ids mapped to gene call dictionaries. Unless 'aa_sequence' is present,
            the keys 'contig', 'start', 'stop', 'direction' and 'partial' are read from each.

        ignore_internal_stop_codons : bool, False
            If False, ConfigError will be raised if a stop codon is found inside any gene. If True,
            this is suppressed and the stop codon is replaced with the character `X`.

        Returns
        =======
        amino_acid_sequences : dict
            Gene caller ids mapped to amino acid sequences. Partial gene calls get an empty
            string. An empty `gene_calls_dict` yields an empty dictionary.

        Raises
        ======
        ConfigError
            If a gene call names a contig that is not among the contig sequences, or if an
            internal stop codon is found while `ignore_internal_stop_codons` is False. In both
            cases the file at `self.db_path` is removed first.
        """

        if not gene_calls_dict:
            # nothing to translate, and there is no first entry to inspect below
            return {}

        # whether amino acid sequences were provided is decided by looking at the first entry only
        if 'aa_sequence' in next(iter(gene_calls_dict.values())):
            # we already have AA sequences
            return {gene_caller_id: info['aa_sequence'] for gene_caller_id, info in gene_calls_dict.items()}

        amino_acid_sequences = {}

        # FIXME: this is a very poor practice for memory management:
        contig_sequences = {}

        if self.contigs_fasta:
            fasta = u.SequenceSource(self.contigs_fasta)
            while next(fasta):
                contig_sequences[fasta.id] = {'sequence': fasta.seq}
            fasta.close()
        else:
            # NOTE(review): the database handle opened here is never explicitly released --
            # confirm whether it needs to be.
            database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
            contig_sequences = database.get_table_as_dict(t.contig_sequences_table_name)

        num_genes_with_internal_stops = 0
        number_of_impartial_gene_calls = 0
        for gene_callers_id, gene_call in gene_calls_dict.items():
            contig_name = gene_call['contig']

            if contig_name not in contig_sequences:
                # remove the partial contigs database so things don't get screwed later
                os.remove(self.db_path)
                raise ConfigError("You are in big trouble :( The contig name '%s' in your external gene callers file "
                                   "does not appear to be in the contigs FASTA file. How did this happen?" % contig_name)

            if gene_call['partial']:
                # partial gene calls are not translated, they get an empty sequence
                amino_acid_sequences[gene_callers_id] = ''
                number_of_impartial_gene_calls += 1
                continue

            sequence = contig_sequences[contig_name]['sequence'][gene_call['start']:gene_call['stop']]
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            amino_acid_sequence = utils.get_DNA_sequence_translated(sequence, gene_callers_id)

            # check if there are any internal stops:
            if '*' in amino_acid_sequence:
                if ignore_internal_stop_codons:
                    amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                    num_genes_with_internal_stops += 1
                else:
                    os.remove(self.db_path)
                    raise ConfigError("Oops. Anvi'o run into an amino acid sequence (that corresponds to the gene callers id '%s') "
                                      "which had an internal stop codon :/ This usually indicates that your external gene calls "
                                      "have problems. If you still want to continue, you can ask anvi'o to ignore internal stop "
                                      "codons on your own risk. It will probably look very ugly on your screen, but here is the "
                                      "DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since "
                                      "anvi'o does not trust you either): %s" % (str(gene_callers_id), sequence))

            amino_acid_sequences[gene_callers_id] = amino_acid_sequence

        if num_genes_with_internal_stops:
            percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(gene_calls_dict)
            self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal "
                             "stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X' "
                             "characters, and stored them in the contigs database that way. %d of your genes, which corresponded "
                             "to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." % \
                                        (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

        if number_of_impartial_gene_calls:
            self.run.warning('%d of your %d gene calls were impartial, hence the translated amino acid sequences for those '
                             'were not stored in the database.' % (number_of_impartial_gene_calls, len(gene_calls_dict)))

        return amino_acid_sequences
Exemple #19
0
    def process(self, fasta_file_path, output_dir):
        """Run prodigal on a FASTA file of contigs and collect the genes it reports.

        Prodigal writes its gene coordinates, amino acid sequences and log into `output_dir`.
        The amino acid FASTA it produces is then read back: every header goes through
        `self.parser` to become a gene call, and every sequence is stored with its '*'
        characters removed. Entries are numbered from 0 in the order they appear.

        Returns
        =======
        (gene_calls_dict, amino_acid_sequences_dict) : tuple of dicts
            Both are keyed by the same integer ids. Both are empty if prodigal found no genes.

        Raises
        ======
        ConfigError
            If prodigal did not produce the amino acid sequences file.
        """
        # each gene call entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
        calls = {}
        translations = {}

        self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
        self.amino_acid_sequences_in_contigs = os.path.join(output_dir, 'contigs.amino_acid_sequences')
        prodigal_log = os.path.join(output_dir, '00_log.txt')

        self.run.warning('', header='Finding ORFs in contigs', lc='green')
        for label, path in (('Genes', self.genes_in_contigs),
                            ('Amino acid sequences', self.amino_acid_sequences_in_contigs),
                            ('Log file', prodigal_log)):
            self.run.info(label, path)

        prodigal_cmd = ['prodigal']
        prodigal_cmd += ['-i', fasta_file_path]
        prodigal_cmd += ['-o', self.genes_in_contigs]
        prodigal_cmd += ['-a', self.amino_acid_sequences_in_contigs]

        # without a user-defined translation table prodigal runs in the metagenomics mode
        if not self.prodigal_translation_table:
            prodigal_cmd += ['-p', 'meta']
        else:
            prodigal_cmd += ['-g', self.prodigal_translation_table]
            table_as_text = str(self.prodigal_translation_table)
            self.run.warning(
                "Prodigal translation table is set to '%s' (whatever you did has worked so far, but\
                              keep an eye for errors from prodigal in case it doesn't like your translation table\
                              parameter). This means we will not use prodigal in the metagenomics mode, due to this\
                              issue: https://github.com/hyattpd/Prodigal/issues/19. If that issue is closed, and you\
                              are reading this message, then please contact an anvi'o developer."
                % table_as_text)

        self.run.warning(
            "Anvi'o will use 'prodigal' by Hyatt et al (doi:10.1186/1471-2105-11-119) to identify open\
                          reading frames in your data. When you publish your findings, please do not forget to properly\
                          credit their work.",
            lc='green',
            header="CITATION")

        self.progress.new('Processing')
        self.progress.update('Identifying ORFs in contigs ...')

        utils.run_command(prodigal_cmd, prodigal_log)

        aa_path = self.amino_acid_sequences_in_contigs

        # prodigal left no output behind at all: point the user to the log file
        if not os.path.exists(aa_path):
            self.progress.end()
            raise ConfigError(
                "Something went wrong with prodigal, and it failed to generate the\
                               expected output :/ Fortunately, this log file should tell you what\
                               might be the problem: '%s'. Please do not forget to include this\
                               file if you were to ask for help." %
                prodigal_log)

        # an empty output means prodigal ran, but found nothing
        if filesnpaths.is_file_empty(aa_path):
            self.progress.end()
            nothing_found = 'Prodigal (%s) has identified no genes :/' % (self.installed_version)
            self.run.info('Result', nothing_found, nl_after=1, mc="red")
            return calls, translations

        self.progress.update('Processing gene calls ...')

        entries = fastalib.SequenceSource(aa_path)

        next_id = 0
        while next(entries):
            calls[next_id] = self.parser(entries.id)
            translations[next_id] = entries.seq.replace('*', '')
            next_id += 1

        entries.close()

        self.progress.end()

        summary = 'Prodigal (%s) has identified %d genes.' % (self.installed_version, len(calls))
        self.run.info('Result', summary, nl_after=1)

        return calls, translations
Exemple #20
0
    def get_amino_acid_sequences_for_genes_in_gene_calls_dict(
            self,
            gene_calls_dict,
            ignore_internal_stop_codons=False,
            skip_predict_frame=False):
        """Recover amino acid sequences for gene calls in a gene_calls_dict.

        If 'aa_sequence' exists as keys in the gene_calls_dict[<key>] objects, this function will take
        those seqeunces into consideration and use them without trying to predict frames even if the gene
        call is partial. So user-defined aa sequences in `aa_sequence` column will have priority.

        Please note this FIXME: By reading all contig sequences into memory, anvi'o does a pretty bad job
        at memory management throughout this function :(

        Parameters
        ==========
        ignore_internal_stop_codons : bool, False
            If False, ConfigError will be raised if a stop codon is found inside any gene. If True,
            this is suppressed and the stop codon is replaced with the character `X`.

        skip_predict_frame : bool, False
            If True, ConfigError will be raised if a gene is not divisible by 3. If False, anvi'o predicts
            the most likley open reading frame and trims the start/stop of the gene call to reflect this
            change so that the gene *is* divisible by 3. This flag allows the retention of amino acid
            sequences even if genes are not divisible by 3, or when it is flagged as partial.
        """

        predict_frame = (not skip_predict_frame)

        if predict_frame:
            # Preload the markov model to predict frames and assign null codon and stop codon transition probabilities
            model_path = os.path.join(
                os.path.dirname(anvio.__file__),
                'data/seq_transition_models/AA/fourth_order.npy')
            if not filesnpaths.is_file_exists(model_path, dont_raise=True):
                raise ConfigError(
                    "The task at hand calls for the use of the anvi'o Markov model to predict proper open reading "
                    "frames for external gene calls when necessary, but the model does not seem to be in the right "
                    "place in the anvi'o codebase. FAILING BIG HERE.")

            model = numpy.load(model_path)
            null_prob = numpy.median(model)
            stop_prob = model.min() / 1e6

        gene_caller_ids_with_user_provided_amino_acid_sequences = set([])

        # get all the amino acids sorted out. either we will start with an empty dict, or take user defined
        # aa seqs as starting material
        if 'aa_sequence' in gene_calls_dict[list(gene_calls_dict.keys())[0]]:
            # the external gene calls file include amino acid sequences for at least some of the gene calls
            # here we will learn about them, and then use them we already have AA sequences
            amino_acid_sequences = {
                gene_caller_id: info['aa_sequence'].strip()
                for gene_caller_id, info in gene_calls_dict.items()
            }

            for gene_callers_id in amino_acid_sequences:
                if len(amino_acid_sequences[gene_callers_id]):
                    gene_caller_ids_with_user_provided_amino_acid_sequences.add(
                        gene_callers_id)

                    gene_length = gene_calls_dict[gene_callers_id][
                        'stop'] - gene_calls_dict[gene_callers_id]['start']
                    estimated_length_for_aa = gene_length / 3
                    user_provided_aa_length = len(
                        amino_acid_sequences[gene_callers_id])

                    # there is already a sanity check for htis, but one can't be too careful
                    if gene_calls_dict[gene_callers_id][
                            'call_type'] != constants.gene_call_types[
                                'CODING'] and user_provided_aa_length:
                        raise ConfigError(
                            "You have provided an amino acid sequence for at least one gene call in your external gene calls "
                            "(%d) file that was not marked as CODING type :(" %
                            gene_callers_id)

                    if user_provided_aa_length > estimated_length_for_aa:
                        raise ConfigError(
                            "Bad news :( There seems to be at least one gene call in your external gene calls file "
                            "that has an aminio acid sequence that is longer than the expected length of it given the "
                            "start/stop positions of the gene call. This is certainly true for gene call number %d "
                            "but anvi'o doesn't know if there are more of these in your file or not :/"
                            % gene_callers_id)

            self.run.warning(
                "Anvi'o found amino acid sequences in your external gene calls file that match to %d of %d gene "
                "in it and will use these amino acid sequences for everything."
                % (len(amino_acid_sequences), len(gene_calls_dict)))
        else:
            amino_acid_sequences = {}

        # FIXME: this is a very poor practice for memory management:
        contig_sequences = {}
        if self.contigs_fasta:
            fasta = u.SequenceSource(self.contigs_fasta)
            while next(fasta):
                contig_sequences[fasta.id] = {'sequence': fasta.seq}
            fasta.close()
        else:
            database = db.DB(self.db_path,
                             utils.get_required_version_for_db(self.db_path))
            contig_sequences = database.get_table_as_dict(
                t.contig_sequences_table_name)

        # keep track of things to report later
        P = lambda x: x % ("partial_" if partial else "")
        report = {
            "num_non_coding_gene_calls": 0,
            "num_partial_gene_calls": 0,
            "num_partial_gene_calls_with_user_provided_amino_acid_sequences":
            0,
            "num_gene_calls_with_user_provided_amino_acid_sequences": 0,
            "num_partial_gene_calls_with_predicted_frame": 0,
            "num_gene_calls_with_predicted_frame": 0,
            "num_partial_gene_calls_with_no_predicted_frame": 0,
            "num_gene_calls_with_no_predicted_frame": 0,
            "num_partial_genes_with_internal_stops": 0,
            "num_genes_with_internal_stops": 0
        }

        # the main loop to go through all the gene calls.
        for gene_callers_id in gene_calls_dict:
            gene_call = gene_calls_dict[gene_callers_id]
            partial = gene_call['partial']
            contig_name = gene_call['contig']

            if contig_name not in contig_sequences:
                # remove the partial contigs database so things don't get screwed later
                os.remove(self.db_path)
                raise ConfigError(
                    "You are in big trouble :( The contig name '%s' in your external gene calls file "
                    "does not appear to be among your contigs. Rhetorical question time: "
                    "HOW DID THIS HAPPEN?" % contig_name)

            # if this is a gene call that is not CODING, we have no interest in trying to get amino acid seqeunces for it
            if gene_calls_dict[gene_callers_id][
                    'call_type'] != constants.gene_call_types['CODING']:
                report["num_non_coding_gene_calls"] += 1
                continue

            sequence = contig_sequences[contig_name]['sequence'][
                gene_call['start']:gene_call['stop']]
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            # a let's keep track of partial gene calls
            if partial:
                report["num_partial_gene_calls"] += 1

            if gene_callers_id in gene_caller_ids_with_user_provided_amino_acid_sequences:
                # FIXME / NOTE: Here we actually move on with the assumption that the start/stop positions
                #               for the gene call are appropriate, and the user-provided amino acid sequence actually
                #               matches to those essential information. It may have been a better strategy to
                #               spend just a little more here test the frame, and start/stop positions. This probably
                #               will explode at some point due to some user error, and some poor soul will spend hours
                #               in the codebase to figure out how the hell did it happen.
                report[P(
                    "num_%sgene_calls_with_user_provided_amino_acid_sequences"
                )] += 1
                amino_acid_sequence = amino_acid_sequences[gene_callers_id]
            elif predict_frame:
                # no amino acid sequence is provided, BUT USER WANTS FRAME TO BE PREDICTED
                # we may be good, if we can try to predict one for it.
                frame, amino_acid_sequence = utils.get_most_likely_translation_frame(
                    sequence,
                    model=model,
                    stop_prob=stop_prob,
                    null_prob=null_prob)

                if frame is None:
                    # we not good because we couldn't find a frame for it. because this gene call has no predicted frame,
                    # and no user-provided amino acid sequence, we will mark this one as noncoding. BAM:
                    gene_calls_dict[gene_callers_id][
                        'call_type'] = constants.gene_call_types['NONCODING']
                    report[P("num_%sgene_calls_with_no_predicted_frame")] += 1
                    continue
                else:
                    # we good. found the amino acid sequence. we will update the gene call so start/stop
                    # matches to the frame, and report the amino acid sequence
                    report[P("num_%sgene_calls_with_predicted_frame")] += 1
                    gene_calls_dict[gene_callers_id] = self.update_gene_call(
                        gene_call, frame)
                    amino_acid_sequences[gene_callers_id] = amino_acid_sequence
            elif not predict_frame:
                # no amino acid sequence is provided, AND USER DOES NOW WANTS FRAME TO BE PREDICTED (what an a-hole)
                # we will do the dumb thing, and try to translate the DNA sequence directly
                try:
                    amino_acid_sequence = utils.get_translated_sequence_for_gene_call(
                        sequence, gene_callers_id)
                except ConfigError as non_divisible_by_3_error:
                    raise ConfigError(
                        non_divisible_by_3_error.e +
                        ". Since you are creating a contigs database, "
                        "anvi'o is willing to strike you a deal, but it will require you to trust her a bit more and give her "
                        "the power power to modify the external gene calls you provided. In your external gene calls file you "
                        "have at least one gene call for which you did not provide an amino acid sequence, and marked it as a "
                        "CODING type gene call. But becasue YOU ELECTED TO SKIP anvi'o frame prediction to estimate a proper amino "
                        "acid sequence, anvi'o simply tried to translate your DNA sequence from the start to the end. But as you "
                        "can tell, it didn't go well. This may be happening because you simply didn't follow the instructions for "
                        "external gene calls file format. We hope it is not the case as we have described the format of this file "
                        "here: http://merenlab.org/software/anvio/help/artifacts/external-gene-calls/. But this may also happen "
                        "even if you have follwed it carefully, but your amino acid sequences are simply not translatable because "
                        "they are partial. In these cases you have two options. Either you provide amino acid sequences for these "
                        "gene calls explicitly, or do not use the `--skip-predict-frame` so anvi'o can do the following whenever "
                        "this problem arises using a Markov model: (1) translate all 3 possible amino acid sequences for the "
                        "gene (one for each frame), (2) determine which is the most likely based on the tendency that amino acids "
                        "tend to co-occur as neighbors [nerd speak: a 4th order markov state model trained on the uniprot50 dataset], "
                        "and finally (3) trim the start and/or stop of your gene to match the most likley frame. The trimming of your "
                        "start/stop positions will be reflected in the anvi'o contigs database, but will *not* be changed in the external "
                        "gene calls file you've provided (if you want the modified gene calls as they will appear in your contigs database, "
                        "you can use `anvi-export-gene-calls` after your contigs database has been created). If all this sounds good to you, "
                        "go ahead and remove the --skip-predict-frame flag. If not, well then you are on your own :( Find more info here: "
                        "http://merenlab.org/software/anvio/help/programs/anvi-gen-contigs-database/"
                    )

            else:
                raise ConfigError(
                    "You broke anvi'o and ended up somewhere no one should ever end up in its codebase. Not nice."
                )

            # when we are here, we one way or another recovered amino acid sequences either by predicting them or by relying
            # upon user provided data. we have one last control before moving on with our lives:
            if amino_acid_sequence.find('*') > -1:
                if ignore_internal_stop_codons:
                    amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                    report[P("num_%sgenes_with_internal_stops")] += 1
                else:
                    os.remove(self.db_path)
                    raise ConfigError(
                        "Oops. Anvi'o run into an amino acid sequence (that corresponds to the gene callers id '%s') "
                        "which had an internal stop codon :/ This is sometimes due to errors in the external gene "
                        "calls file, but more often it is due to the non-standard genetic code. You still can continue "
                        "by asking anvi'o to ignore internal stop codons via the flag `--ignore-internal-stop-codons`. "
                        "It will probably look very ugly on your screen, but here is the "
                        "DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since "
                        "anvi'o does not trust you either): '%s'. And here is the amino acid sequence of the "
                        "same gene call if you would like to BLAST it around and see whether it actually makes "
                        "sense as a gene call: '%s'." %
                        (str(gene_callers_id), sequence, amino_acid_sequence))

            amino_acid_sequences[gene_callers_id] = amino_acid_sequence

        # reporting time
        self.run.warning(None,
                         header="EXTERNAL GENE CALLS PARSER REPORT",
                         lc="cyan")
        self.run.info("Num gene calls in file", len(gene_calls_dict))
        self.run.info("Non-coding gene calls",
                      report["num_non_coding_gene_calls"])
        self.run.info("Partial gene calls", report["num_partial_gene_calls"])
        self.run.info(
            "Num amino acid sequences provided",
            report[
                "num_partial_gene_calls_with_user_provided_amino_acid_sequences"]
            + report["num_gene_calls_with_user_provided_amino_acid_sequences"],
            mc="green")
        self.run.info(
            "  - For complete gene calls",
            report["num_gene_calls_with_user_provided_amino_acid_sequences"])
        self.run.info(
            "  - For partial gene calls", report[
                "num_partial_gene_calls_with_user_provided_amino_acid_sequences"]
        )
        self.run.info(
            "Frames predicted",
            report["num_partial_gene_calls_with_predicted_frame"] +
            report["num_gene_calls_with_predicted_frame"])
        self.run.info("  - For complete gene calls",
                      report["num_gene_calls_with_predicted_frame"])
        self.run.info("  - For partial gene calls",
                      report["num_partial_gene_calls_with_predicted_frame"])
        self.run.info(
            "Gene calls marked as NONCODING",
            report["num_partial_gene_calls_with_no_predicted_frame"] +
            report["num_gene_calls_with_no_predicted_frame"],
            mc="red")
        self.run.info("  - For complete gene calls",
                      report["num_gene_calls_with_no_predicted_frame"],
                      mc="red")
        self.run.info("  - For partial gene calls",
                      report["num_partial_gene_calls_with_no_predicted_frame"],
                      mc="red")
        self.run.info(
            "Gene calls with internal stops",
            report["num_genes_with_internal_stops"] +
            report["num_partial_genes_with_internal_stops"])
        self.run.info("  - For complete gene calls",
                      report["num_genes_with_internal_stops"])
        self.run.info("  - For partial gene calls",
                      report["num_partial_genes_with_internal_stops"],
                      nl_after=1)

        return gene_calls_dict, amino_acid_sequences
Exemple #21
0
    def get_amino_acid_sequences_for_genes_in_gene_calls_dict(
            self, gene_calls_dict, ignore_internal_stop_codons=False):
        """Recover amino acid sequences for gene calls in a gene_calls_dict.

        All contig sequences are loaded into memory first: from the FASTA file
        at `self.contigs_fasta` when that attribute is set, otherwise from the
        contig sequences table of the database at `self.db_path`. Each
        non-partial gene call is then sliced out of its contig using its
        'start' and 'stop' values, reverse-complemented when its 'direction'
        is 'r', and translated.

        Parameters
        ==========
        gene_calls_dict : dict
            Maps gene callers ids to gene call dicts. The keys 'contig',
            'partial', 'start', 'stop', and 'direction' are read from each
            gene call.
        ignore_internal_stop_codons : bool
            When True, every '*' in a translated sequence is replaced with 'X'
            and a summary warning is reported at the end. When False, the
            first translation that contains '*' aborts the run.

        Returns
        =======
        amino_acid_sequences : dict
            Maps every gene callers id in `gene_calls_dict` to its amino acid
            sequence. Partial gene calls map to an empty string.

        Raises
        ======
        ConfigError
            If a gene call refers to a contig that is not among the loaded
            contig sequences, or if a translation contains '*' while
            `ignore_internal_stop_codons` is False. In both cases the file at
            `self.db_path` is removed before raising.
        """

        amino_acid_sequences = {}

        # FIXME: this is a very poor practice for memory management:
        contig_sequences = {}

        if self.contigs_fasta:
            fasta = u.SequenceSource(self.contigs_fasta)
            try:
                while next(fasta):
                    contig_sequences[fasta.id] = {'sequence': fasta.seq}
            finally:
                # make sure the file handle is released even if reading fails
                fasta.close()
        else:
            # NOTE(review): the database handle opened here is never explicitly
            # released in this method -- confirm whether db.DB needs that.
            database = db.DB(self.db_path,
                             utils.get_required_version_for_db(self.db_path))
            contig_sequences = database.get_table_as_dict(
                t.contig_sequences_table_name)

        num_gene_calls = len(gene_calls_dict)
        num_genes_with_internal_stops = 0
        num_partial_gene_calls = 0

        for gene_callers_id, gene_call in gene_calls_dict.items():
            contig_name = gene_call['contig']

            if contig_name not in contig_sequences:
                # remove the partial contigs database so things don't get screwed later
                os.remove(self.db_path)
                raise ConfigError(
                    "You are in big trouble :( The contig name '%s' in your external gene callers file "
                    "does not appear to be in the contigs FASTA file. How did this happen?"
                    % contig_name)

            if gene_call['partial']:
                # partial gene calls are not translated; they are recorded
                # with an empty amino acid sequence instead
                amino_acid_sequences[gene_callers_id] = ''
                num_partial_gene_calls += 1
                continue

            sequence = contig_sequences[contig_name]['sequence'][
                gene_call['start']:gene_call['stop']]
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            amino_acid_sequence = utils.get_DNA_sequence_translated(
                sequence, gene_callers_id)

            # check if there are any internal stops:
            if '*' in amino_acid_sequence:
                if not ignore_internal_stop_codons:
                    os.remove(self.db_path)
                    raise ConfigError(
                        "Oops. Anvi'o run into an amino acid sequence (that corresponds to the gene callers id '%s') "
                        "which had an internal stop codon :/ This usually indicates that your external gene calls "
                        "have problems. If you still want to continue, you can ask anvi'o to ignore internal stop "
                        "codons on your own risk. It will probably look very ugly on your screen, but here is the "
                        "DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since "
                        "anvi'o does not trust you either): %s" %
                        (str(gene_callers_id), sequence))

                amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                num_genes_with_internal_stops += 1

            amino_acid_sequences[gene_callers_id] = amino_acid_sequence

        if num_genes_with_internal_stops:
            # num_gene_calls cannot be zero here since at least one gene was counted
            percent_genes_with_internal_stops = (
                num_genes_with_internal_stops * 100.0 / num_gene_calls)
            self.run.warning(
                "Please read this carefully: Your external gene calls contained open reading frames with internal "
                "stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X' "
                "characters, and stored them in the contigs database that way. %d of your genes, which corresponded "
                "to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." %
                (num_genes_with_internal_stops,
                 percent_genes_with_internal_stops, num_gene_calls))

        if num_partial_gene_calls:
            self.run.warning(
                "%d of your %d gene calls were partial, hence the translated amino acid sequences for those "
                "were not stored in the database." %
                (num_partial_gene_calls, num_gene_calls))

        return amino_acid_sequences