コード例 #1
0
    def generate(self):
        """Simulate paired-end short reads for every input FASTA and write them as FASTQ.

        Two files are written: `<output_sample_name>-R1.fastq` and
        `<output_sample_name>-R2.fastq`. For each sequence of length L found in each
        FASTA file, `int(L / short_read_length * coverage)` reads are budgeted and half
        that many pairs are generated. For every pair an insert size is drawn from a
        normal distribution (`self.insert_size`, `self.insert_size_std`), errors are
        introduced through `simulate_errors` using `self.error_rate`, and the second
        mate is written reverse-complemented. Every base gets the same placeholder
        quality character ('A'); the quality line is stored in `self.Q_str`.

        Sequences shorter than two read lengths cannot hold a read pair and are skipped.
        """
        r1_path = self.output_sample_name + '-R1.fastq'
        r2_path = self.output_sample_name + '-R2.fastq'

        # the context manager closes both handles even if the simulation fails midway
        with open(r1_path, 'w') as output_r1, open(r2_path, 'w') as output_r2:
            self.run.info('Read lenth', self.short_read_length)
            self.run.info('Insert size', self.insert_size)
            self.run.info('Insert size std', self.insert_size_std)

            x = self.short_read_length
            self.Q_str = 'A' * x

            num_fasta_files = len(self.fasta_files)
            for index_fasta, fasta_name in enumerate(self.fasta_files):
                f = self.fasta_files_dict[fasta_name]

                c = f['coverage']

                self.progress.new(
                    'Working on file %d of %d (%s) with expected coverage of %d' %
                    (index_fasta + 1, num_fasta_files, f['alias'], c))

                fasta = fastalib.SequenceSource(f['path'])
                total_r1_errors = 0
                total_r2_errors = 0
                total_num_reads = 0
                while next(fasta):
                    L = len(fasta.seq)

                    if L < x * 2:
                        # two reads do not fit on this sequence; random.randint() below
                        # would be handed an empty range and raise ValueError
                        continue

                    av_num_short_reads_needed = int(L / x * c)
                    total_num_reads += av_num_short_reads_needed

                    av_num_pairs_needed = int(av_num_short_reads_needed / 2)

                    for index_pair in range(0, av_num_pairs_needed):
                        # refresh the progress line only every 100th pair
                        if (index_pair + 1) % 100 == 0:
                            self.progress.update(
                                'Seq %s :: %s nts :: reads %s of %s :: num errors: %s ...'
                                % (pp(fasta.pos + 1), pp(len(fasta.seq)),
                                   pp(index_pair + 1), pp(av_num_pairs_needed),
                                   pp(total_r1_errors + total_r2_errors)))

                        I = int(round(random.gauss(self.insert_size, self.insert_size_std)))

                        if L - ((x * 2) + I) > 0:
                            start_pos = random.randint(0, L - ((x * 2) + I))
                        else:
                            # NOTE(review): the insert does not fit here, so read 2 may run
                            # past the end of the sequence and come out shorter than the
                            # quality string -- confirm whether that is intended.
                            start_pos = random.randint(0, L - (x * 2))

                        read_1_start = start_pos
                        read_1_stop = read_1_start + x

                        read_2_start = read_1_stop + I
                        read_2_stop = read_2_start + x

                        read_1, num_errors_r1 = simulate_errors(
                            self.error_rate, fasta.seq[read_1_start:read_1_stop])
                        read_2, num_errors_r2 = simulate_errors(
                            self.error_rate, fasta.seq[read_2_start:read_2_stop])

                        total_r1_errors += num_errors_r1
                        total_r2_errors += num_errors_r2

                        # both mates share the same two random numbers in their read names
                        c1, c2 = random.randint(1, 10000), random.randint(1, 10000)
                        output_r1.write(
                            '@%s:23:B02CBACXX:8:2315:%d:%d 1:N:0:GATCAG\n' %
                            (f['alias'], c1, c2))
                        output_r1.write(read_1 + '\n')
                        output_r1.write(
                            '+source:%s; start:%d; stop:%d; insert_size:%d\n' %
                            (fasta.id, read_1_start, read_1_stop, I))
                        output_r1.write('%s\n' % self.Q_str)

                        output_r2.write(
                            '@%s:23:B02CBACXX:8:2315:%d:%d 2:N:0:GATCAG\n' %
                            (f['alias'], c1, c2))
                        output_r2.write(u.rev_comp(read_2) + '\n')
                        output_r2.write(
                            '+source:%s; start:%d; stop:%d; insert_size:%d\n' %
                            (fasta.id, read_2_start, read_2_stop, I))
                        output_r2.write('%s\n' % self.Q_str)

                self.progress.end()
                total_num_errors = total_r1_errors + total_r2_errors

                # no reads at all (empty FASTA, or only very short sequences) would
                # otherwise end in a ZeroDivisionError
                total_num_bases = total_num_reads * x
                avg_errors = total_num_errors * 1.0 / total_num_bases if total_num_bases else 0.0

                self.run.info('%s w/ %d contigs' % (f['alias'], fasta.pos),
                              '%s reads in %s pairs with %s errors (avg %.4f) for %sX avg cov.'
                              % (pp(total_num_reads),
                                 pp(total_num_reads / 2),
                                 pp(total_num_errors),
                                 avg_errors,
                                 pp(c)))

        self.run.info('FASTQ R1', r1_path)
        self.run.info('FASTQ R2', r2_path)
コード例 #2
0
ファイル: genecalls.py プロジェクト: meren/anvio
    def get_amino_acid_sequences_for_genes_in_gene_calls_dict(self, gene_calls_dict, ignore_internal_stop_codons=False):
        '''Translate the genes described in `gene_calls_dict` into amino acid sequences.

           Contig sequences come from `self.contigs_fasta` when it is set, otherwise from
           the contig sequences table of the database at `self.db_path`. Either way all of
           them are held in memory at once.

           Gene calls flagged as partial get an empty string. A translation that contains
           '*' either has those characters replaced with 'X' (when
           `ignore_internal_stop_codons` is True), or causes the file at `self.db_path` to
           be removed and a ConfigError to be raised. The same removal + ConfigError happens
           when a gene call names a contig that is not among the contig sequences.

           Returns a dictionary that maps each gene callers id to its amino acid sequence.
        '''

        # FIXME: holding every contig in memory is poor memory management
        if self.contigs_fasta:
            sequences_by_contig = {}
            fasta = u.SequenceSource(self.contigs_fasta)
            while next(fasta):
                sequences_by_contig[fasta.id] = {'sequence': fasta.seq}
            fasta.close()
        else:
            database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
            sequences_by_contig = database.get_table_as_dict(t.contig_sequences_table_name)

        translations = {}
        internal_stop_count = 0
        partial_count = 0

        for gene_callers_id in gene_calls_dict:
            gene_call = gene_calls_dict[gene_callers_id]
            contig_name = gene_call['contig']

            if contig_name not in sequences_by_contig:
                # get rid of the half-built contigs database so it cannot cause trouble later
                os.remove(self.db_path)
                raise ConfigError("You are in big trouble :( The contig name '%s' in your external gene callers file\
                                    does not appear to be in the contigs FASTA file. How did this happen?" % contig_name)

            if gene_call['partial']:
                # partial genes are recorded, but never translated
                translations[gene_callers_id] = ''
                partial_count += 1
                continue

            sequence = sequences_by_contig[contig_name]['sequence'][gene_call['start']:gene_call['stop']]
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            translated = utils.get_DNA_sequence_translated(sequence, gene_callers_id)

            if '*' in translated:
                # the translation carries at least one stop codon
                if not ignore_internal_stop_codons:
                    os.remove(self.db_path)
                    raise ConfigError("Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s')\
                                       which had an internal stop codon :/ This usually indicates that your external gene calls\
                                       have problems. If you still want to continue, you can ask anvi'o to ignore internal stop\
                                       codons on your own risk. It will probably look very ugly on your screen, but here is the\
                                       DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since\
                                       anvi'o does not trust you either): %s" % (str(gene_callers_id), sequence))

                translated = translated.replace('*', 'X')
                internal_stop_count += 1

            translations[gene_callers_id] = translated

        total_gene_calls = len(gene_calls_dict)

        if internal_stop_count:
            percent_with_stops = internal_stop_count * 100.0 / total_gene_calls
            self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal\
                              stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X'\
                              characters, and stored them in the contigs database that way. %d of your genes, which corresponded\
                              to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." % \
                                        (internal_stop_count, percent_with_stops, total_gene_calls))

        if partial_count:
            self.run.warning('%d of your %d gene calls were impartial, hence the translated amino acid sequences for those\
                              were not stored in the database.' % (partial_count, total_gene_calls))

        return translations
コード例 #3
0
    def get_amino_acid_sequences_for_genes_in_gene_calls_dict(self, gene_calls_dict, ignore_internal_stop_codons=False):
        '''Recover amino acid sequences for gene calls in a gene_calls_dict.

        If 'aa_sequence' exists as a key in the first gene_calls_dict[<key>] object, this trivially
        reorganizes the data and returns a sequence dictionary. Otherwise, the sequence dictionary
        is created by reading all contig sequences into memory (from `self.contigs_fasta` when it
        is set, from the database at `self.db_path` otherwise). Anvi'o is doing a pretty bad job
        with memory management in this case :(

        Parameters
        ==========
        gene_calls_dict : dict
            Gene calls keyed by gene callers id. Each value is read for 'contig', 'partial',
            'start', 'stop' and 'direction' (or for 'aa_sequence' on the shortcut path).
        ignore_internal_stop_codons : bool, False
            If False, ConfigError will be raised if a stop codon is found inside any gene. If True,
            this is suppressed and the stop codon is replaced with the character `X`.

        Returns
        =======
        amino_acid_sequences : dict
            Maps each gene callers id to its amino acid sequence. Partial gene calls map to an
            empty string. An empty `gene_calls_dict` yields an empty dictionary.

        Raises
        ======
        ConfigError
            When a gene call refers to an unknown contig, or when a translation contains a stop
            codon and `ignore_internal_stop_codons` is False. In both cases the file at
            `self.db_path` is removed first.
        '''

        if not gene_calls_dict:
            # nothing to translate. this also keeps the peek at the first entry below from
            # failing on an empty dictionary
            return {}

        first_gene_call = next(iter(gene_calls_dict.values()))
        if 'aa_sequence' in first_gene_call:
            # we already have AA sequences
            return {gene_caller_id: info['aa_sequence'] for gene_caller_id, info in gene_calls_dict.items()}

        amino_acid_sequences = {}

        # FIXME: this is a very poor practice for memory management:
        contig_sequences = {}

        if self.contigs_fasta:
            fasta = u.SequenceSource(self.contigs_fasta)
            while next(fasta):
                contig_sequences[fasta.id] = {'sequence': fasta.seq}
            fasta.close()
        else:
            database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
            contig_sequences = database.get_table_as_dict(t.contig_sequences_table_name)

        num_genes_with_internal_stops = 0
        number_of_impartial_gene_calls = 0
        for gene_callers_id, gene_call in gene_calls_dict.items():
            contig_name = gene_call['contig']

            if contig_name not in contig_sequences:
                # remove the partial contigs database so things don't get screwed later
                os.remove(self.db_path)
                raise ConfigError("You are in big trouble :( The contig name '%s' in your external gene callers file "
                                   "does not appear to be in the contigs FASTA file. How did this happen?" % contig_name)

            if gene_call['partial']:
                # partial gene calls are kept in the output, but with no translation
                amino_acid_sequences[gene_callers_id] = ''
                number_of_impartial_gene_calls += 1
                continue

            sequence = contig_sequences[contig_name]['sequence'][gene_call['start']:gene_call['stop']]
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            amino_acid_sequence = utils.get_DNA_sequence_translated(sequence, gene_callers_id)

            # check if there are any internal stops:
            if '*' in amino_acid_sequence:
                if ignore_internal_stop_codons:
                    amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                    num_genes_with_internal_stops += 1
                else:
                    os.remove(self.db_path)
                    raise ConfigError("Oops. Anvi'o run into an amino acid sequence (that corresponds to the gene callers id '%s') "
                                      "which had an internal stop codon :/ This usually indicates that your external gene calls "
                                      "have problems. If you still want to continue, you can ask anvi'o to ignore internal stop "
                                      "codons on your own risk. It will probably look very ugly on your screen, but here is the "
                                      "DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since "
                                      "anvi'o does not trust you either): %s" % (str(gene_callers_id), sequence))

            amino_acid_sequences[gene_callers_id] = amino_acid_sequence

        if num_genes_with_internal_stops:
            percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(gene_calls_dict)
            self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal "
                             "stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X' "
                             "characters, and stored them in the contigs database that way. %d of your genes, which corresponded "
                             "to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." % \
                                        (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

        if number_of_impartial_gene_calls:
            self.run.warning('%d of your %d gene calls were impartial, hence the translated amino acid sequences for those '
                             'were not stored in the database.' % (number_of_impartial_gene_calls, len(gene_calls_dict)))

        return amino_acid_sequences
コード例 #4
0
ファイル: hmmops.py プロジェクト: meren/anvio
    def get_sequences_dict_for_hmm_hits_in_splits(self, splits_dict, return_amino_acid_sequences=False, return_best_hits=False):
        """Build one sequence record per unique HMM hit found in the given splits.

           `splits_dict` is what you get from ccollections.GetSplitNamesInBins(args).get_dict(),
           and it is structured like this:

                {
                    'bin_x': set['split_a, split_b, ...'],
                    'bin_y': set['split_c, split_d, ...'],
                    ...
                }

            When `self.sources` is not empty, `self.hmm_hits_splits` and `self.hmm_hits` are
            first narrowed down (in place, on the instance) to those sources. When it is empty,
            `self.sources` is set to every key of `self.hmm_hits_info`.

            Records hold DNA sequences by default (reverse-complemented for genes that are not
            in the forward direction). If `return_amino_acid_sequences` is True, they hold the
            sequence stored in `self.aa_sequences` instead.

            The result is keyed by '<source>___<gene_unique_identifier>'. With
            `return_best_hits=True` it is handed to
            `filter_hmm_sequences_dict_for_splits_to_keep_only_best_hits` before being returned.
        """

        if not len(self.sources):
            # no sources were requested, so consider every source we know about
            self.sources = list(self.hmm_hits_info.keys())
        else:
            self.hmm_hits_splits = utils.get_filtered_dict(self.hmm_hits_splits, 'source', self.sources)
            self.hmm_hits = utils.get_filtered_dict(self.hmm_hits, 'source', self.sources)

        hits_in_splits, split_name_to_bin_id = self.get_hmm_hits_in_splits(splits_dict)

        records = {}
        seen_hit_ids = set()

        for entry in hits_in_splits.values():
            hit = self.hmm_hits[entry['hmm_hit_entry_id']]

            split_name = entry['split']
            source = hit['source']
            gene_name = hit['gene_name']
            e_value = hit['e_value']
            hit_unique_id = '___'.join((source, hit['gene_unique_identifier']))

            # the same hit can show up in more than one entry; only the first one counts
            if hit_unique_id in seen_hit_ids:
                continue
            seen_hit_ids.add(hit_unique_id)

            gene_callers_id = hit['gene_callers_id']
            gene_call = self.genes_in_contigs[gene_callers_id]

            contig_name = gene_call['contig']
            start = gene_call['start']
            stop = gene_call['stop']
            is_reverse = not (gene_call['direction'] == 'f')

            if return_amino_acid_sequences:
                sequence = self.aa_sequences[gene_callers_id]['sequence']
            else:
                sequence = self.contig_sequences[contig_name]['sequence'][start:stop]
                if is_reverse:
                    sequence = utils.rev_comp(sequence)

            records[hit_unique_id] = dict(sequence=sequence,
                                          source=source,
                                          bin_id=split_name_to_bin_id[split_name],
                                          gene_name=gene_name,
                                          e_value=e_value,
                                          contig=contig_name,
                                          start=start,
                                          stop=stop,
                                          gene_callers_id=gene_callers_id,
                                          rev_comped=is_reverse,
                                          length=stop - start)

        if not return_best_hits:
            return records

        return self.filter_hmm_sequences_dict_for_splits_to_keep_only_best_hits(records)
コード例 #5
0
ファイル: genecalls.py プロジェクト: AstrobioMike/anvio
    def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None, ignore_internal_stop_codons=False):
        """Store external gene calls, and their translations, in the contigs database.

           Gene calls come either from a TAB-delimited file (`input_file_path`) or from a
           dictionary (`gene_calls_dict`). Exactly one of the two must be provided, otherwise
           a ConfigError is raised. The dictionary should follow this format:

                {
                  "1": {
                      "contig": "contig_name",
                      "start": 20,
                      "stop": 1544,
                      "direction": "f",
                      "partial": 0,
                      "source": "source_name",
                      "version": "unknown"
                  },

                  "2": {
                    (...)
                  },

                (...)
                }

            Gene calls passed as a `gene_calls_dict` will be APPENDED to the database, so you
            need to make sure the gene caller ids in your dict do not overlap with the ones
            already in the database. An empty `gene_calls_dict` only produces a message, and
            the function returns without touching anything.

            Gene calls flagged as partial are stored with an empty amino acid sequence. If a
            translation contains '*', those characters are replaced with 'X' when
            `ignore_internal_stop_codons` is True. Otherwise the file at `self.db_path` is
            removed and a ConfigError is raised. The same happens when a gene call refers to
            a contig that is not among the contig sequences.

        """

        # `None` and `False` both mean "no dictionary was sent". anything else must be a dict.
        dict_was_given = not (gene_calls_dict is None or gene_calls_dict is False)

        if dict_was_given:
            if not isinstance(gene_calls_dict, dict):
                raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                    but it is of type '%s', and not '%s'" % (type(gene_calls_dict), dict))

            if not len(gene_calls_dict):
                # a dict, but an empty one: say so and leave
                self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning\
                                      prematurely and assuming you know what's up. If you don't, stop here and try to\
                                      identify what decisions you've made might have led you to this weird point your\
                                      workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                                      done great so far.", nl_before=1, nl_after=1)
                return

        # a file path and a dict are mutually exclusive, and one of them is mandatory
        if bool(input_file_path) == dict_was_given:
            raise ConfigError("You must provide either an input file, or an gene calls dict to process external\
                               gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
                               with wrong parameters.")

        Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

        if dict_was_given:
            # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
            #        overwrite an existing gene call. Something like this would have returned the
            #        current max, which could be cross-checked with what's in the dict:
            #
            #            contigs_db = ContigsDatabase(self.db_path)
            #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
            #            contigs_db.disconnect()
            append_to_the_db = True
        else:
            # gene calls that come from a file are not appended
            append_to_the_db = False

            gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                         expected_fields=t.genes_in_contigs_table_structure,
                                                                         only_expected_fields=True,
                                                                         column_mapping=[int, str, int, int, str, int, str, str])

            if not len(gene_calls_dict):
                raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                                   this is an error, anvi'o will stop here and complain. If this is not an error and you\
                                   in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                                   instead of providing an emtpy external gene calls file. You don't agree? You need this\
                                   for some weird step for you weird pipeline? Let us know, and we will consider changing\
                                   this.")

            self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))

        # to recover amino acid sequences, every contig sequence has to be read into memory.
        # anvi'o is doing a pretty bad job with memory management :(
        if self.contigs_fasta:
            sequences_by_contig = {}
            fasta = u.SequenceSource(self.contigs_fasta)
            while next(fasta):
                sequences_by_contig[fasta.id] = {'sequence': fasta.seq}
            fasta.close()
        else:
            database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
            sequences_by_contig = database.get_table_as_dict(t.contig_sequences_table_name)

        translations = {}
        internal_stop_count = 0
        partial_count = 0

        for gene_callers_id in gene_calls_dict:
            gene_call = gene_calls_dict[gene_callers_id]
            contig_name = gene_call['contig']

            if contig_name not in sequences_by_contig:
                # get rid of the half-built contigs database so it cannot cause trouble later
                os.remove(self.db_path)
                raise ConfigError("You are in big trouble :( The contig name '%s' in your external gene callers file\
                                    does not appear to be in the contigs FASTA file. How did this happen?" % contig_name)

            if gene_call['partial']:
                # partial genes are recorded, but never translated
                translations[gene_callers_id] = ''
                partial_count += 1
                continue

            sequence = sequences_by_contig[contig_name]['sequence'][gene_call['start']:gene_call['stop']]
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            translated = utils.get_DNA_sequence_translated(sequence, gene_callers_id)

            if '*' in translated:
                # the translation carries at least one stop codon
                if not ignore_internal_stop_codons:
                    os.remove(self.db_path)
                    raise ConfigError("Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s')\
                                       which had an internal stop codon :/ This usually indicates that your external gene calls\
                                       have problems. If you still want to continue, you can ask anvi'o to ignore internal stop\
                                       codons on your own risk. It will probably look very ugly on your screen, but here is the\
                                       DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since\
                                       anvi'o does not trust you either): %s" % (str(gene_callers_id), sequence))

                translated = translated.replace('*', 'X')
                internal_stop_count += 1

            translations[gene_callers_id] = translated

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict, translations, append_to_the_db=append_to_the_db)

        total_gene_calls = len(gene_calls_dict)

        if internal_stop_count:
            percent_with_stops = internal_stop_count * 100.0 / total_gene_calls
            self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal\
                              stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X'\
                              characters, and stored them in the contigs database that way. %d of your genes, which corresponded\
                              to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." % \
                                        (internal_stop_count, percent_with_stops, total_gene_calls))

        if partial_count:
            self.run.warning('%d of your %d gene calls were impartial, hence the translated amino acid sequences for those\
                              were not stored in the database.' % (partial_count, total_gene_calls))
コード例 #6
0
ファイル: genecalls.py プロジェクト: satish162/anvio
    def use_external_gene_calls_to_populate_genes_in_contigs_table(
            self,
            input_file_path,
            gene_calls_dict=None,
            ignore_internal_stop_codons=False):
        """Add genes to the contigs database.

           Either provide an `input_file_path` for external gene calls, or provide an
           external gene calls dictionary. The format should follow this:

                {
                  "1": {
                      "contig": "contig_name",
                      "start": 20,
                      "stop": 1544,
                      "direction": "f",
                      "partial": 0,
                      "source": "source_name",
                      "version": "unknown"
                  },

                  "2": {
                    (...)
                  },

                (...)
                }

            If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
            need to make sure gene caller ids in your dict does not overlap with the ones in
            the database.

        """

        # by default we assume that this is a pristine run. but if the user sends a dictionary
        append_to_the_db = False

        gene_calls_found = False
        # let's do a rigorous check whether the user provided a gene_calls_dict.
        if (gene_calls_dict is not None and gene_calls_dict is not False):
            if not isinstance(gene_calls_dict, dict):
                raise ConfigError(
                    "'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                    but it is of type '%s', and not '%s'" %
                    (type(gene_calls_dict), type({})))

            # congrats, we have a dict.
            gene_calls_found = True

            if not len(gene_calls_dict):
                # but it is empty ... silly user.
                self.run.info_single(
                    "'Use external gene calls' function found an empty gene calls dict, returning\
                                      prematurely and assuming you know what's up. If you don't, stop here and try to\
                                      identify what decisions you've made might have led you to this weird point your\
                                      workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                                      done great so far.",
                    nl_before=1,
                    nl_after=1)
                return

        if (not input_file_path
                and not gene_calls_found) or (input_file_path
                                              and gene_calls_found):
            raise ConfigError(
                "You must provide either an input file, or an gene calls dict to process external\
                               gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
                               with wrong parameters.")

        Table.__init__(self,
                       self.db_path,
                       anvio.__contigs__version__,
                       self.run,
                       self.progress,
                       simple=True)

        # take care of gene calls dict
        if not gene_calls_found:
            gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(
                input_file_path,
                expected_fields=t.genes_in_contigs_table_structure,
                only_expected_fields=True,
                column_mapping=[int, str, int, int, str, int, str, str])

            if not len(gene_calls_dict):
                raise ConfigError(
                    "You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                                   this is an error, anvi'o will stop here and complain. If this is not an error and you\
                                   in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                                   instead of providing an emtpy external gene calls file. You don't agree? You need this\
                                   for some weird step for you weird pipeline? Let us know, and we will consider changing\
                                   this.")

            self.run.info(
                "External gene calls",
                "%d gene calls recovered and will be processed." %
                len(gene_calls_dict))
        else:
            # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
            #        overwrite an existing gene call. Something like this would have returned the
            #        current max, which could be cross-checked with what's in the dict:
            #
            #            contigs_db = ContigsDatabase(self.db_path)
            #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
            #            contigs_db.disconnect()
            append_to_the_db = True

        # recover amino acid sequences. during this operation we are going to have to read all contig sequences
        # into the damn memory. anvi'o is doing a pretty bad job with memory management :(
        amino_acid_sequences = {}

        contig_sequences = {}
        if self.contigs_fasta:
            fasta = u.SequenceSource(self.contigs_fasta)
            while next(fasta):
                contig_sequences[fasta.id] = {'sequence': fasta.seq}
            fasta.close()
        else:
            database = db.DB(self.db_path,
                             utils.get_required_version_for_db(self.db_path))
            contig_sequences = database.get_table_as_dict(
                t.contig_sequences_table_name)

        num_genes_with_internal_stops = 0
        number_of_impartial_gene_calls = 0
        for gene_callers_id in gene_calls_dict:
            gene_call = gene_calls_dict[gene_callers_id]
            contig_name = gene_call['contig']

            if contig_name not in contig_sequences:
                # remove the partial contigs database so things don't get screwed later
                os.remove(self.db_path)
                raise ConfigError(
                    "You are in big trouble :( The contig name '%s' in your external gene callers file\
                                    does not appear to be in the contigs FASTA file. How did this happen?"
                    % contig_name)

            if gene_call['partial']:
                amino_acid_sequences[gene_callers_id] = ''
                number_of_impartial_gene_calls += 1
                continue

            sequence = contig_sequences[contig_name]['sequence'][
                gene_call['start']:gene_call['stop']]
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            amino_acid_sequence = utils.get_DNA_sequence_translated(
                sequence, gene_callers_id)

            # check if there are any internal stops:
            if amino_acid_sequence.find('*') > -1:
                if ignore_internal_stop_codons:
                    amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                    num_genes_with_internal_stops += 1
                else:
                    os.remove(self.db_path)
                    raise ConfigError(
                        "Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s')\
                                       which had an internal stop codon :/ This usually indicates that your external gene calls\
                                       have problems. If you still want to continue, you can ask anvi'o to ignore internal stop\
                                       codons on your own risk. It will probably look very ugly on your screen, but here is the\
                                       DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since\
                                       anvi'o does not trust you either): %s" %
                        (str(gene_callers_id), sequence))

            amino_acid_sequences[gene_callers_id] = amino_acid_sequence

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict,
                                             amino_acid_sequences,
                                             append_to_the_db=append_to_the_db)

        if num_genes_with_internal_stops:
            percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(
                gene_calls_dict)
            self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal\
                              stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X'\
                              characters, and stored them in the contigs database that way. %d of your genes, which corresponded\
                              to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy."                                                                                                                 % \
                                        (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

        if number_of_impartial_gene_calls:
            self.run.warning(
                '%d of your %d gene calls were impartial, hence the translated amino acid sequences for those\
                              were not stored in the database.' %
                (number_of_impartial_gene_calls, len(gene_calls_dict)))
コード例 #7
0
    def populate_search_tables(self, contigs_db_path):
        """Scan the contigs of a contigs database for tRNA genes and store the hits in it.

        The contig sequences are exported into the temporary directory, scanned via
        `self.run_trnascan_on_FASTA`, reshaped so they look like HMM hits, and finally
        written into the contigs database both as HMM hits and as gene functions.

        Parameters
        ==========
        contigs_db_path : str
            Path to the contigs database to read sequences from and to write results into.

        Raises
        ======
        ConfigError
            If the database already has results for `self.source_name` and
            `self.just_do_it` is not set.
        """

        utils.is_contigs_db(contigs_db_path)

        hmm_hits_info = hmmops.SequencesForHMMHits(contigs_db_path).hmm_hits_info

        if self.source_name in hmm_hits_info:
            # refuse to overwrite existing results unless the user explicitly asked for it
            if not self.just_do_it:
                raise ConfigError(
                    "There is already information for %s in the database :/ Anvi'o will not overwrite this "
                    "unless you ask for it explicitly. You can either use `anvi-delete-hmms` to remove it first, "
                    "or run `anvi-scan-trnas` with `--just-do-it` flag so anvi'o would remove it for you."
                    % (self.source_name))

            TablesForHMMHits(contigs_db_path, run=self.run, progress=self.progress).remove_source(self.source_name)

        filesnpaths.is_output_file_writable(contigs_db_path, ok_if_exists=True)

        # export the contig sequences and run the tRNA search on the resulting FASTA file
        fasta_path = os.path.join(self.tmp_directory_path, 'contig_sequences.fa')
        utils.export_sequences_from_contigs_db(contigs_db_path, fasta_path)
        search_results_dict = self.run_trnascan_on_FASTA(fasta_file_path=fasta_path)

        # The entries of the tRNA search results need to be reshaped so they can go through the
        # same machinery as HMM hits. Of each tRNA entry, the code below reads 'amino_acid',
        # 'anticodon', 'score', 'start' and 'stop'; an rRNA HMM hit, for comparison, looks like
        # this:
        #
        # {1: {'entry_id': 0,
        #      'gene_name': 'Bacterial_23S_rRNA',
        #      'gene_hmm_id': '-',
        #      'contig_name': 'Bfragilis_0100_000000000001',
        #      'start': 1110877,
        #      'stop': 1113757,
        #      'e_value': 0.0}}
        #
        # so 'gene_name', 'e_value' and 'gene_hmm_id' are added to every tRNA entry. Note that the
        # tRNA score is stored as the e-value, which may cause issues downstream.
        missing_amino_acids = Counter()
        missing_codons = Counter()
        entries_to_remove = set()
        for entry_id, entry in search_results_dict.items():
            aa = entry['amino_acid']
            codon = utils.rev_comp(entry['anticodon'])

            # entries with an unknown codon or amino acid are counted and discarded later
            if codon not in self.codons:
                missing_codons[codon] += 1
                entries_to_remove.add(entry_id)
                continue

            if aa not in self.amino_acids:
                missing_amino_acids[aa] += 1
                entries_to_remove.add(entry_id)
                continue

            entry['gene_name'] = '%s_%s' % (aa, codon)
            entry['e_value'] = entry['score']
            entry['gene_hmm_id'] = '-'

            if entry['stop'] > entry['start']:
                # forward hit: make the start pythonic
                entry['start'] -= 1
            else:
                # reverse hit: it is the stop that needs the adjustment
                entry['stop'] -= 1

            # coordinates must never be negative
            # (see https://github.com/merenlab/anvio/issues/1367 for details)
            for pos in ('start', 'stop'):
                if entry[pos] < 0:
                    entry[pos] = 0

        for entry_id in entries_to_remove:
            search_results_dict.pop(entry_id)

        self.run.info("Num tRNA genes recovered", len(search_results_dict))

        if missing_codons:
            codon_report = ', '.join(['%s (%d)' % (codon, count) for codon, count in missing_codons.items()])
            self.run.warning(
                "While anvi'o was trying to parse the output from tRNAScan-SE, it "
                "became clear that some of the codons the tool identified was not "
                "known to anvi'o, so we conservatively discareded those entries. "
                "Here is the list of codons that were discareded and their frequency "
                "among your contigs: '%s'." % (codon_report),
                header="WEIRD CODONS ALERT")

        if missing_amino_acids:
            amino_acid_report = ', '.join(['%s (%d)' % (amino_acid, count) for amino_acid, count in missing_amino_acids.items()])
            self.run.warning(
                "While anvi'o was trying to parse the output from tRNAScan-SE, it "
                "run into some amino acid names that were not known to anvi'o. "
                "All those entries are now gone :/ But here is the list of amino "
                "acids and their frequencies: '%s'." % (amino_acid_report),
                header="WEIRD AMINO ACIDS ALERT")

        search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)

        # register the hits as new gene calls, then store them as HMM hits
        tables_for_hmm_hits = TablesForHMMHits(contigs_db_path, run=self.run, progress=self.progress)
        search_results_dict = tables_for_hmm_hits.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
            self.kind_of_search, search_results_dict, skip_amino_acid_sequences=True)
        tables_for_hmm_hits.append(self.source_name, self.reference, self.kind_of_search, self.domain,
                                   self.all_genes_searched_against, search_results_dict)

        # at this point every entry carries a 'gene_callers_id' for the newly generated tRNA gene
        # call, e.g.:
        #
        #     {'contig_name': 'Bfragilis_0100_000000000001',
        #      'trna_no': '1',
        #      'start': 135361,
        #      'stop': 135433,
        #      'amino_acid': 'Thr',
        #      'anticodon': 'CGT',
        #      'score': 67.6,
        #      'gene_name': 'Thr_ACG',
        #      'e_value': 67.6,
        #      'gene_hmm_id': '-',
        #      'gene_callers_id': 4502}
        #
        # which is used to build a functions dict that goes into the contigs database as well.
        functions_dict = {}
        for entry_id, entry in search_results_dict.items():
            function_text = 'tRNA gene for amino acid %s (codon: %s; anticodon:%s; score:%.1f; intron_start:%d; intron_end:%d)' % (
                entry['amino_acid'],
                utils.rev_comp(entry['anticodon']),
                entry['anticodon'],
                entry['score'],
                entry['intron_start'],
                entry['intron_end'])

            accession = '%s_%s_%d' % (entry['amino_acid'], entry['anticodon'], entry['gene_callers_id'])

            functions_dict[entry_id] = {'gene_callers_id': entry['gene_callers_id'],
                                        'source': self.source_name,
                                        'accession': accession,
                                        'function': function_text,
                                        'e_value': 0.0}

        gene_function_calls_table = TableForGeneFunctions(contigs_db_path,
                                                          run=terminal.Run(verbose=False),
                                                          progress=terminal.Progress(verbose=False))
        gene_function_calls_table.create(functions_dict)

        if anvio.DEBUG:
            self.run.info_single(
                "Due to the `--debug` flag, anvi'o did not remove the temoporary files "
                "directory (which is still at '%s')." %
                (self.tmp_directory_path),
                nl_before=1)
        else:
            self.clean_tmp_directory()
            self.run.info_single(
                "Temp directory is now cleaned (if you would like to keep it the "
                "next time use the flag `--debug`).",
                nl_before=1)
コード例 #8
0
    def generate(self):
        """Simulate paired-end short reads from the configured FASTA files.

        For every sequence in every file of `self.fasta_files`, read pairs of length
        `self.short_read_length` are drawn from random positions until the expected
        coverage of that file is approximated. The gap between the two reads of a pair
        is drawn from a normal distribution described by `self.insert_size` and
        `self.insert_size_std`, and sequencing errors are introduced through
        `simulate_errors` with `self.error_rate`.

        Reads are written to `<output_sample_name>-R1.fastq` and
        `<output_sample_name>-R2.fastq`; read 2 is stored reverse-complemented, and
        every base gets the quality character 'A'.

        Sequences that are shorter than two reads cannot host a read pair and are skipped.
        """

        r1_path = self.output_sample_name + '-R1.fastq'
        r2_path = self.output_sample_name + '-R2.fastq'

        # the context manager makes sure both FASTQ files are flushed and closed even if
        # something goes wrong halfway through the simulation
        with open(r1_path, 'w') as output_r1, open(r2_path, 'w') as output_r2:
            self.run.info('Read lenth', self.short_read_length)
            self.run.info('Insert size', self.insert_size)
            self.run.info('Insert size std', self.insert_size_std)

            x = self.short_read_length
            self.Q_str = ''.join(['A'] * x)

            for index_fasta, fasta_file in enumerate(self.fasta_files):
                f = self.fasta_files_dict[fasta_file]

                c = f['coverage']

                self.progress.new('Working on file %d of %d (%s) with expected coverage of %d'
                                  % (index_fasta + 1, len(self.fasta_files), f['alias'], c))

                fasta = fastalib.SequenceSource(f['path'])
                total_r1_errors = 0
                total_r2_errors = 0
                total_num_reads = 0
                while next(fasta):
                    L = len(fasta.seq)

                    # a sequence shorter than two reads can't host a pair; random.randint below
                    # would be asked for an empty range and raise a ValueError
                    if L < x * 2:
                        continue

                    av_num_short_reads_needed = int(L / x * c)
                    total_num_reads += av_num_short_reads_needed

                    av_num_pairs_needed = int(av_num_short_reads_needed / 2)

                    for index_pair in range(av_num_pairs_needed):
                        if (index_pair + 1) % 100 == 0:
                            self.progress.update('Seq %s :: %s nts :: reads %s of %s :: num errors: %s ...'
                                                 % (pp(fasta.pos + 1), pp(len(fasta.seq)),
                                                    pp(index_pair + 1), pp(av_num_pairs_needed),
                                                    pp(total_r1_errors + total_r2_errors)))

                        I = int(round(random.gauss(self.insert_size, self.insert_size_std)))
                        if L - ((x * 2) + I) > 0:
                            start_pos = random.randint(0, L - ((x * 2) + I))
                        else:
                            # the pair does not fit with this insert size: place read 1 anywhere
                            # that leaves room for read 2, and shrink the insert so read 2 ends
                            # within the sequence instead of being truncated
                            start_pos = random.randint(0, L - (x * 2))
                            I = L - (x * 2) - start_pos

                        read_1_start = start_pos
                        read_1_stop = read_1_start + x

                        read_2_start = read_1_stop + I
                        read_2_stop = read_2_start + x

                        read_1, num_errors_r1 = simulate_errors(self.error_rate, fasta.seq[read_1_start:read_1_stop])
                        read_2, num_errors_r2 = simulate_errors(self.error_rate, fasta.seq[read_2_start:read_2_stop])

                        total_r1_errors += num_errors_r1
                        total_r2_errors += num_errors_r2

                        # random tile coordinates shared by both mates of the pair
                        c1, c2 = random.randint(1, 10000), random.randint(1, 10000)
                        output_r1.write('@%s:23:B02CBACXX:8:2315:%d:%d 1:N:0:GATCAG\n' % (f['alias'], c1, c2))
                        output_r1.write(read_1 + '\n')
                        output_r1.write('+source:%s; start:%d; stop:%d; insert_size:%d\n' % (fasta.id, read_1_start, read_1_stop, I))
                        output_r1.write('%s\n' % self.Q_str)

                        output_r2.write('@%s:23:B02CBACXX:8:2315:%d:%d 2:N:0:GATCAG\n' % (f['alias'], c1, c2))
                        output_r2.write(u.rev_comp(read_2) + '\n')
                        output_r2.write('+source:%s; start:%d; stop:%d; insert_size:%d\n' % (fasta.id, read_2_start, read_2_stop, I))
                        output_r2.write('%s\n' % self.Q_str)

                self.progress.end()

                total_num_errors = total_r1_errors + total_r2_errors
                total_num_bases = total_num_reads * x
                # avoid a ZeroDivisionError when no reads were simulated for this file
                avg_num_errors = total_num_errors * 1.0 / total_num_bases if total_num_bases else 0.0
                self.run.info('%s w/ %d contigs' % (f['alias'], fasta.pos),
                              '%s reads in %s pairs with %s errors (avg %.4f) for %sX avg cov.'
                              % (pp(total_num_reads),
                                 pp(total_num_reads / 2),
                                 pp(total_num_errors),
                                 avg_num_errors,
                                 pp(c)))

                # we are done with this FASTA file, release its file handle
                fasta.close()

        self.run.info('FASTQ R1', r1_path)
        self.run.info('FASTQ R2', r2_path)
コード例 #9
0
ファイル: splitter.py プロジェクト: AstrobioMike/anvio
    def store_locus_as_contigs_db(self, contig_name, sequence, gene_calls, output_path_prefix, reverse_complement=False):
        """Generates a contigs database and a blank profile for a given locus.

        The locus sequence and its gene calls are written into temporary files, which are
        used to create a new contigs database at `output_path_prefix + '.db'` and a blank
        profile in `output_path_prefix + '-PROFILE'`. Functions and amino acid sequences of
        the genes in the locus are then copied from `self.input_contigs_db_path` into the
        new contigs database.

        Parameters
        ==========
        contig_name : str
            Name to use for the locus sequence in the new contigs database.
        sequence : str
            Nucleotide sequence of the locus.
        gene_calls : dict
            Gene calls of the locus, keyed by their gene caller ids in the original
            contigs database.
        output_path_prefix : str
            Prefix for every output path generated here.
        reverse_complement : bool
            Whether the sequence and its gene calls should be reverse complemented first.

        Raises
        ======
        ConfigError
            If an output destination already exists and
            `self.overwrite_output_destinations` is not set.
        """

        # output file business. the FASTA and external gene calls files are only needed
        # to build the contigs database, hence they are temporary.
        locus_output_db_path = output_path_prefix + ".db"
        locus_sequence_fasta = output_path_prefix + "_sequence.fa"
        locus_external_gene_calls = output_path_prefix + "_external_gene_calls.txt"
        temporary_files = [locus_external_gene_calls, locus_sequence_fasta]

        # a blank profile database will be generated at the end, so the directory for it
        # needs to be out of the way.
        profile_output_dir = output_path_prefix + '-PROFILE'
        if os.path.exists(profile_output_dir):
            if not self.overwrite_output_destinations:
                raise ConfigError("The directory %s exists, which kinda messes things up here. Either remove\
                                   it manually, or use the flag  --overwrite-output-destinations so anvi'o can\
                                   do it for you." % profile_output_dir)

            filesnpaths.shutil.rmtree(profile_output_dir)

        # same story for the output path of the contigs database
        if filesnpaths.is_file_exists(locus_output_db_path, dont_raise=True):
            if not self.overwrite_output_destinations:
                raise ConfigError("There is already a contigs database at the output file path :( Either remove it first,\
                                   or use the --overwrite-output-destinations flag to give anvi'o full authority to wipe\
                                   your disk.")

            os.remove(locus_output_db_path)

        # gene calls get new gene caller ids in the new database. the conversion dict maps the
        # original ids to the new ones, so the data read from the original contigs database
        # can be re-linked later.
        gene_calls_list = list(gene_calls)
        if reverse_complement:
            sequence = utils.rev_comp(sequence)
            gene_calls, gene_caller_id_conversion_dict = utils.rev_comp_gene_calls_dict(gene_calls, sequence)
        else:
            gene_caller_id_conversion_dict = {old_id: new_id for new_id, old_id in enumerate(gene_calls_list)}
            gene_calls = {new_id: copy.deepcopy(gene_calls[old_id]) for new_id, old_id in enumerate(gene_calls_list)}

        # ContigsDatabase::create works with a FASTA file, so the sequence goes into a
        # temporary one
        with open(locus_sequence_fasta, 'w') as fasta_output:
            fasta_output.write('>%s\n%s\n' % (contig_name, sequence))

        # and the gene calls go into an external gene calls file, so there will be no gene
        # calling while the contigs database is generated
        headers = ['gene_callers_id', 'contig', 'start', 'stop', 'direction', 'partial', 'source', 'version']
        utils.store_dict_as_TAB_delimited_file(gene_calls, locus_external_gene_calls, headers=headers)

        # create the contigs database
        contigs_db_args = argparse.Namespace(contigs_fasta=locus_sequence_fasta,
                                             project_name=os.path.basename(output_path_prefix),
                                             split_length=sys.maxsize,
                                             kmer_size=4,
                                             external_gene_calls=locus_external_gene_calls)
        dbops.ContigsDatabase(locus_output_db_path, run=self.run_object).create(contigs_db_args)

        # create a blank profile, too, so the new contigs database can be visualized
        blank_profile_args = argparse.Namespace(blank_profile=True,
                                                contigs_db=locus_output_db_path,
                                                skip_hierarchical_clustering=False,
                                                output_dir=profile_output_dir,
                                                sample_name=os.path.basename(output_path_prefix))
        profiler.BAMProfiler(blank_profile_args, r=self.run_object)._run()

        # the new contigs database is pretty empty at this point. this where clause selects
        # the rows of the genes in the locus from the tables of the original one.
        quoted_gene_caller_ids = ['"%d"' % g for g in gene_caller_id_conversion_dict]
        where_clause = "gene_callers_id in (%s)" % ', '.join(quoted_gene_caller_ids)

        def read_rows_for_locus_genes(table_name):
            """Read the rows of `table_name` in the original contigs database for the genes of interest"""
            original_db = db.DB(self.input_contigs_db_path, None, ignore_version=True)
            return original_db.get_some_rows_from_table_as_dict(table_name,
                                                                where_clause=where_clause,
                                                                error_if_no_data=False)

        ############################################################################################
        # FUNCTIONS -- copied over with their gene caller ids updated
        ############################################################################################
        function_calls = read_rows_for_locus_genes(t.gene_function_calls_table_name)

        for function_call in function_calls.values():
            function_call['gene_callers_id'] = gene_caller_id_conversion_dict[function_call['gene_callers_id']]

        gene_function_calls_table = TableForGeneFunctions(locus_output_db_path, run=self.run_object)
        gene_function_calls_table.create(function_calls)

        self.run.info("Output contigs DB path", locus_output_db_path)
        self.run.info("Output blank profile DB path", os.path.join(profile_output_dir, 'PROFILE.db'))

        ############################################################################################
        # AMINO ACID SEQUENCES -- the sequences stored for these gene calls in the original
        #                         contigs database are inserted into the new one under their
        #                         new gene caller ids.
        ############################################################################################
        amino_acid_sequences = read_rows_for_locus_genes(t.gene_amino_acid_sequences_table_name)

        entries = [(gene_caller_id_conversion_dict[g], row['sequence']) for g, row in amino_acid_sequences.items()]
        db.DB(locus_output_db_path, None, ignore_version=True).insert_many(t.gene_amino_acid_sequences_table_name, entries=entries)

        ############################################################################################
        # TEMP FILES -- removed unless we are debugging
        ############################################################################################
        if anvio.DEBUG:
            self.run.info_single("Temp output files were kept for inspection due to --debug")
        else:
            for temporary_file_path in temporary_files:
                os.remove(temporary_file_path)
コード例 #10
0
ファイル: splitter.py プロジェクト: AstrobioMike/anvio
    def export_locus(self, gene_callers_id, output_path_prefix):
        """Takes a gene callers ID, and exports a contigs database.

           The locus spans `self.num_genes_list[0]` genes before and `self.num_genes_list[1]`
           genes after the target gene (relative to the direction of the target gene), and
           is clipped to the first/last gene of the contig if it runs over. Clipped loci are
           'premature', and they are skipped if `self.remove_partial_hits` is set.

           Output path prefix should be unique for every export locus call. If the prefix you provide
           looks like this:

                >>> output_path_prefix = '/path/to/dir/file_name_prefix'

           the output files will be stored as this:

                >>> '/path/to/dir/file_name_prefix.fa'
                >>> '/path/to/dir/file_name_prefix.db'

           The FASTA file is only reported if `self.include_fasta_output` is set.

           Raises a ConfigError if `output_path_prefix` is a directory.
           """

        if os.path.isdir(output_path_prefix):
            raise ConfigError("Output path prefix can't be a directory name...")

        filesnpaths.is_output_file_writable(output_path_prefix + '.fa')

        if not self.contigs_db:
            self.contigs_db = dbops.ContigsSuperclass(self.args, r=self.run_object)
            self.contigs_db.init_functions()

        gene_call = self.contigs_db.genes_in_contigs_dict[gene_callers_id]
        contig_name = gene_call['contig']
        genes_in_contig_sorted = sorted(self.contigs_db.contig_name_to_genes[contig_name])

        # 1 for genes on the forward strand, -1 for the ones on the reverse strand
        direction = 1 if gene_call['direction'] == 'f' else -1
        premature = False

        self.run.info("Contig name", contig_name)
        self.run.info("Contig length", self.contigs_db.contigs_basic_info[contig_name]['length'])
        self.run.info("Num genes in contig", len(genes_in_contig_sorted))
        self.run.info("Target gene call", gene_callers_id)
        self.run.info("Target gene direction", "Forward" if direction == 1 else "Reverse", mc='green' if direction == 1 else 'red')

        gene_1 = gene_callers_id - self.num_genes_list[0] * direction
        gene_2 = gene_callers_id + self.num_genes_list[1] * direction
        first_gene_of_the_block = min(gene_1, gene_2)
        last_gene_of_the_block = max(gene_1, gene_2)

        self.run.info("First and last gene of the locus (raw)", "%d and %d" % (first_gene_of_the_block, last_gene_of_the_block))

        # getting the ids for the first and last genes in the contig
        last_gene_in_contig = genes_in_contig_sorted[-1][0]
        first_gene_in_contig = genes_in_contig_sorted[0][0]

        # clip the locus if it goes beyond the genes found in the contig
        if last_gene_of_the_block > last_gene_in_contig:
            last_gene_of_the_block = last_gene_in_contig
            premature = True

        if first_gene_of_the_block < first_gene_in_contig:
            first_gene_of_the_block = first_gene_in_contig
            premature = True

        if premature:
            if self.remove_partial_hits:
                self.run.info_single("A premature locus is found .. the current configuration says 'skip'. Skipping.", mc="red", nl_before=1)
                return

            self.run.info_single("A premature locus is found .. the current configuration says 'whatevs'. Anvi'o will continue.", mc="yellow", nl_before=1, nl_after=1)

        self.run.info("First and last gene of the locus (final)", "%d and %d" % (first_gene_of_the_block, last_gene_of_the_block))

        locus_start = self.contigs_db.genes_in_contigs_dict[first_gene_of_the_block]['start']
        locus_stop = self.contigs_db.genes_in_contigs_dict[last_gene_of_the_block]['stop']

        # being a performance nerd here yes
        contig_sequence = db.DB(self.input_contigs_db_path, None, ignore_version=True) \
                            .get_some_rows_from_table(t.contig_sequences_table_name,
                                                      where_clause="contig='%s'" % contig_name)[0][1]
        locus_sequence = contig_sequence[locus_start:locus_stop]

        # here we will create a gene calls dict for genes that are specific to our locus. since we trimmed
        # the contig sequence to the locus of interest, we will have to adjust start and stop positions of
        # genes in the gene calls dict by the number of nucleotides trimmed from the beginning.
        excess = locus_start
        locus_gene_calls_dict = {}
        for g in range(first_gene_of_the_block, last_gene_of_the_block + 1):
            locus_gene_call = copy.deepcopy(self.contigs_db.genes_in_contigs_dict[g])
            locus_gene_call['start'] -= excess
            locus_gene_call['stop'] -= excess
            locus_gene_calls_dict[g] = locus_gene_call

        self.run.info("Locus gene call start/stops excess (nts)", excess)

        reverse_complement = bool(direction != 1 and self.reverse_complement_if_necessary)

        self.run.info('Reverse complementing everything', reverse_complement, mc='green')

        # report a stupid FASTA file.
        if self.include_fasta_output:
            fasta_file_path = output_path_prefix + ".fa"

            self.run.info("Output FASTA file", fasta_file_path)

            project_name = self.contigs_db.a_meta['project_name'].replace(' ', '_').replace("'", '_').replace('"', '_')

            # NOTE: the parentheses around `self.search_term or 'None'` matter: `%` binds tighter
            # than `or`, so without them the fallback would never be used.
            locus_header = contig_name + ' ' + \
                           '|'.join(['target:%s' % ','.join(self.targets),
                                     'sources:%s' % ','.join(self.sources),
                                     'query:%s' % (self.search_term or 'None'),
                                     'hit_contig:%s' % contig_name,
                                     'hit_gene_callers_id:%s' % str(gene_callers_id),
                                     'project_name:%s' % project_name,
                                     'locus:%s,%s' % (str(first_gene_of_the_block), str(last_gene_of_the_block)),
                                     'nt_positions_in_contig:%s:%s' % (str(locus_start), str(locus_stop)),
                                     'premature:%s' % str(premature),
                                     'reverse_complemented:%s' % str(reverse_complement)])

            # pick the sequence first and format it afterwards, so the forward sequence also
            # gets its trailing newline (`%` binds tighter than the conditional expression)
            sequence_to_report = utils.rev_comp(locus_sequence) if reverse_complement else locus_sequence

            with open(fasta_file_path, 'w') as f:
                f.write('>%s\n' % locus_header)
                f.write('%s\n' % sequence_to_report)

        # report a fancy anvi'o contigs database
        self.store_locus_as_contigs_db(contig_name,
                                       locus_sequence,
                                       locus_gene_calls_dict,
                                       output_path_prefix,
                                       reverse_complement)
コード例 #11
0
    def get_amino_acid_sequences_for_genes_in_gene_calls_dict(
            self, gene_calls_dict, ignore_internal_stop_codons=False):
        '''Recover amino acid sequences for gene calls in a gene_calls_dict.

           Contig sequences are read from `self.contigs_fasta` when it is set, and
           otherwise from the contig sequences table of the database at
           `self.db_path`. Either way, all contig sequences are held in memory at
           once, which is a poor practice for memory management.

           Parameters
           ==========
           gene_calls_dict : dict
               Maps each gene callers id to a gene call. The entries read here are
               'contig', 'partial', 'start', 'stop', and 'direction'.
           ignore_internal_stop_codons : bool
               When True, every '*' in a translated sequence is replaced with 'X'
               and a warning is issued at the end. When False, the first such
               sequence removes the database file and raises ConfigError.

           Returns
           =======
           dict
               Gene callers id -> amino acid sequence. Gene calls flagged as
               partial are not translated and map to an empty string.

           Raises
           ======
           ConfigError
               If a gene call names a contig that is not among the contig
               sequences, or if a translated sequence contains a '*' and
               `ignore_internal_stop_codons` is False. In both cases the file at
               `self.db_path` is removed first.
        '''

        def discard_partial_db():
            # remove the partial contigs database so things don't get screwed later.
            # A file that is already gone must not mask the ConfigError that follows.
            try:
                os.remove(self.db_path)
            except FileNotFoundError:
                pass

        amino_acid_sequences = {}

        # FIXME: this is a very poor practice for memory management:
        contig_sequences = {}

        if self.contigs_fasta:
            fasta = u.SequenceSource(self.contigs_fasta)
            try:
                while next(fasta):
                    contig_sequences[fasta.id] = {'sequence': fasta.seq}
            finally:
                # release the file handle even if reading fails part way through
                fasta.close()
        else:
            database = db.DB(self.db_path,
                             utils.get_required_version_for_db(self.db_path))
            try:
                contig_sequences = database.get_table_as_dict(
                    t.contig_sequences_table_name)
            finally:
                # everything we need is in memory now; do not keep the connection
                # open (it would also still be open if we remove the file below)
                database.disconnect()

        num_genes_with_internal_stops = 0
        number_of_impartial_gene_calls = 0
        for gene_callers_id, gene_call in gene_calls_dict.items():
            contig_name = gene_call['contig']

            if contig_name not in contig_sequences:
                discard_partial_db()
                raise ConfigError(
                    "You are in big trouble :( The contig name '%s' in your external gene callers file\
                                    does not appear to be in the contigs FASTA file. How did this happen?"
                    % contig_name)

            # partial gene calls are not translated; they are stored as empty strings
            if gene_call['partial']:
                amino_acid_sequences[gene_callers_id] = ''
                number_of_impartial_gene_calls += 1
                continue

            contig_sequence = contig_sequences[contig_name]['sequence']
            sequence = contig_sequence[gene_call['start']:gene_call['stop']]
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            amino_acid_sequence = utils.get_DNA_sequence_translated(
                sequence, gene_callers_id)

            # check if there are any internal stops:
            if '*' in amino_acid_sequence:
                if not ignore_internal_stop_codons:
                    discard_partial_db()
                    raise ConfigError(
                        "Oops. Anvi'o run into an amino acid sequence (that corresponds to the gene callers id '%s')\
                                       which had an internal stop codon :/ This usually indicates that your external gene calls\
                                       have problems. If you still want to continue, you can ask anvi'o to ignore internal stop\
                                       codons on your own risk. It will probably look very ugly on your screen, but here is the\
                                       DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since\
                                       anvi'o does not trust you either): %s" %
                        (str(gene_callers_id), sequence))

                amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                num_genes_with_internal_stops += 1

            amino_acid_sequences[gene_callers_id] = amino_acid_sequence

        total_num_genes = len(gene_calls_dict)

        if num_genes_with_internal_stops:
            percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / total_num_genes
            self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal\
                              stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X'\
                              characters, and stored them in the contigs database that way. %d of your genes, which corresponded\
                              to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." %
                             (num_genes_with_internal_stops, percent_genes_with_internal_stops, total_num_genes))

        if number_of_impartial_gene_calls:
            self.run.warning(
                '%d of your %d gene calls were impartial, hence the translated amino acid sequences for those\
                              were not stored in the database.' %
                (number_of_impartial_gene_calls, total_num_genes))

        return amino_acid_sequences