Esempio n. 1
0
    def generate_variabile_codons_table(self):
        """Profile single-codon variants (SCVs) and store them in the profile db.

        The (gene_callers_id, codon_order) pairs accumulated in
        `self.codons_in_genes_to_profile_SCVs` are grouped by gene. For each
        gene, `bamops.CodonFrequencies.process_gene_call` is asked for the codon
        frequencies at those codon positions, and one entry per reported codon
        is appended to a `TableForCodonFrequencies`, which is stored at the end.

        Returns immediately (without touching anything) when SNV profiling is
        skipped or SCV profiling was not requested. Otherwise, once the table is
        stored, `self.codons_in_genes_to_profile_SCVs` is emptied and a warning
        is sent through `self.run` if the profiler collected any items it did
        not report.
        """
        if self.skip_SNV_profiling or not self.profile_SCVs:
            return

        variable_codons_table = TableForCodonFrequencies(
            self.profile_db_path, progress=self.progress)

        codon_frequencies = bamops.CodonFrequencies()

        # group the codon positions to profile by the gene they belong to
        codons_per_gene = {}
        for gene_callers_id, codon_order in self.codons_in_genes_to_profile_SCVs:
            codons_per_gene.setdefault(gene_callers_id, set()).add(codon_order)

        for gene_callers_id, codons_to_profile in codons_per_gene.items():
            gene_call = self.genes_in_contigs_dict[gene_callers_id]
            contig_name = gene_call['contig']
            codon_frequencies_dict = codon_frequencies.process_gene_call(
                self.bam, gene_call,
                self.contig_sequences[contig_name]['sequence'],
                codons_to_profile)

            for codon_order, e in codon_frequencies_dict.items():
                # fixed columns first, then one column per codon
                db_entry = {
                    'sample_id': self.sample_id,
                    'corresponding_gene_call': gene_callers_id,
                    'reference': e['reference'],
                    'coverage': e['coverage'],
                    'departure_from_reference': e['departure_from_reference'],
                    'codon_order_in_gene': codon_order,
                }
                for codon in constants.codon_to_AA:
                    db_entry[codon] = e['frequencies'][codon]

                variable_codons_table.append(db_entry)

        variable_codons_table.store()

        # everything collected so far has been profiled; start from scratch
        self.codons_in_genes_to_profile_SCVs.clear()

        items = codon_frequencies.not_reported_items
        if items:
            self.run.warning(
                "The profiler of single-codon variants failed to report anything for a\
                              total of %d items, because they looked weird to anvi'o :( Here is a list\
                              of those that did ended up being ignored: '%s'."
                % (len(items), ', '.join(items)))
Esempio n. 2
0
    def profile_contig_worker(self, available_index_queue, output_queue):
        """Worker loop: profile contigs whose indices arrive on a queue.

        For every index taken from `available_index_queue` the worker builds a
        `contigops.Contig` together with its splits, analyzes its coverage from
        the BAM file at `self.input_file_path`, and puts exactly one item on
        `output_queue`:

          - `None` if the mean coverage of the contig is below
            `self.min_mean_coverage`, or
          - the profiled contig object otherwise. Unless
            `self.skip_SNV_profiling` is set, the contig also carries auxiliary
            (per-nucleotide) data annotated with gene call information, and
            codon frequencies for the codons of complete gene calls that
            contain profiled positions.

        The loop has no exit condition of its own; according to the original
        author's note the worker is killed by its parent. The BAM file is
        closed if the loop is left through an exception.
        """
        bam_file = pysam.Samfile(self.input_file_path, 'rb')

        try:
            while True:
                index = available_index_queue.get(True)
                contig_name = self.contig_names[index]

                contig = contigops.Contig(contig_name)
                contig.length = self.contig_lengths[index]
                contig.split_length = self.a_meta['split_length']
                contig.min_coverage_for_variability = self.min_coverage_for_variability
                contig.skip_SNV_profiling = self.skip_SNV_profiling
                contig.report_variability_full = self.report_variability_full
                contig.ignore_orphans = not self.include_orphans
                contig.max_coverage_depth = self.max_coverage_depth

                # populate the contig with one (still empty) split object per split
                for split_name in self.contig_name_to_splits[contig_name]:
                    s = self.splits_basic_info[split_name]
                    split_sequence = self.contig_sequences[contig_name][
                        'sequence'][s['start']:s['end']]
                    contig.splits.append(
                        contigops.Split(split_name, split_sequence,
                                        contig_name, s['order_in_parent'],
                                        s['start'], s['end']))

                # analyze coverage for each split
                contig.analyze_coverage(bam_file)

                # contigs with too little mean coverage are reported as `None`
                if contig.coverage.mean < self.min_mean_coverage:
                    output_queue.put(None)
                    continue

                if not self.skip_SNV_profiling:
                    contig.analyze_auxiliary(bam_file)

                    # (gene_callers_id, codon_order) pairs to profile for SCVs
                    codons_in_genes_to_profile_SCVs = set()
                    for split in contig.splits:
                        for column_profile in split.column_profiles.values():
                            pos_in_contig = column_profile['pos_in_contig']

                            in_partial, in_complete, base_pos_in_codon = \
                                self.get_nt_position_info(contig.name, pos_in_contig)
                            column_profile['in_partial_gene_call'] = in_partial
                            column_profile['in_complete_gene_call'] = in_complete
                            column_profile['base_pos_in_codon'] = base_pos_in_codon

                            column_profile['sample_id'] = self.sample_id

                            # -1 means "no gene call corresponds to this nt
                            # position" (gene caller ids start from 0, so 0 can't
                            # play that role). both values are updated below if a
                            # single matching gene call is found.
                            column_profile['corresponding_gene_call'] = -1
                            column_profile['codon_order_in_gene'] = -1

                            # only positions within a COMPLETE gene call are
                            # matched to gene caller ids
                            if not column_profile['in_complete_gene_call']:
                                continue

                            corresponding_gene_caller_ids = \
                                self.get_corresponding_gene_caller_ids_for_base_position(
                                    contig.name, pos_in_contig)

                            # more than one corresponding gene call usually
                            # indicates an assembly error. to be on the safe side,
                            # no gene callers id is reported for such positions.
                            if len(corresponding_gene_caller_ids) != 1:
                                continue

                            # store the gene callers id for this nt position, and
                            # the order of the codon it falls in within that gene
                            gene_callers_id = corresponding_gene_caller_ids[0]
                            codon_order = self.get_corresponding_codon_order_in_gene(
                                gene_callers_id, contig.name, pos_in_contig)
                            column_profile['corresponding_gene_call'] = gene_callers_id
                            column_profile['codon_order_in_gene'] = codon_order

                            # save this information for later use
                            codons_in_genes_to_profile_SCVs.add(
                                (gene_callers_id, codon_order))

                    codon_frequencies = bamops.CodonFrequencies()

                    # group the codon positions to profile by gene
                    codons_per_gene = {}
                    for gene_callers_id, codon_order in codons_in_genes_to_profile_SCVs:
                        codons_per_gene.setdefault(
                            gene_callers_id, set()).add(codon_order)

                    for gene_callers_id, codons_to_profile in codons_per_gene.items():
                        gene_call = self.genes_in_contigs_dict[gene_callers_id]
                        contig.codon_frequencies_dict[gene_callers_id] = \
                            codon_frequencies.process_gene_call(
                                bam_file, gene_call,
                                self.contig_sequences[gene_call['contig']]['sequence'],
                                codons_to_profile)

                output_queue.put(contig)

                # drop the bulky per-contig data and every local reference to it
                # before blocking on the next index.
                # NOTE(review): this strips the very object that was just put on
                # the queue, so it presumes `put` has already serialized / handed
                # it off by the time it returns -- confirm for the queue type the
                # caller passes in.
                for split in contig.splits:
                    del split.coverage
                    del split.auxiliary
                    del split
                del contig.splits[:]
                del contig.coverage
                del contig
        finally:
            # the loop itself never ends (the parent kills the worker), but if
            # anything raises on the way out we still release the BAM handle.
            bam_file.close()