Ejemplo n.º 1
0
    def process_gene_call(self,
                          bam_file_object,
                          gene_call,
                          contig_sequence,
                          codons_to_profile=None):
        """Summarize amino acid frequencies for the codons of one gene call.

        For each codon of the gene, the linkmers data for its nucleotide
        positions is requested from `bam_file_object` (complete links only),
        the bases of every read are joined in positional order into a codon,
        and the codons are translated and counted as amino acids.

        Args:
            bam_file_object: passed through to `LinkMersData.append` as is.
            gene_call: dict-like with at least 'partial', 'contig' and
                'direction'. it is also handed to
                `utils.get_codon_order_to_nt_positions_dict`.
            contig_sequence: sequence of the contig, sliced with the
                nucleotide positions of each codon.
            codons_to_profile: optional container of codon orders. when it
                is None or empty, every codon of the gene is profiled.

        Returns:
            None if the gene call is partial. otherwise a dict that maps
            codon order to a dict with the keys 'reference', 'coverage',
            'frequencies' and 'departure_from_reference'. codons whose
            reference sequence is not in `codon_to_AA`, or which end up with
            zero coverage, are not included.
        """
        if gene_call['partial']:
            return None

        contig_name = gene_call['contig']

        # this dict translates codons in a gene to nucleotide positions in the context of the contig.
        # thanks to it we can profile only a small number of codons from a gene if they are specified
        # in `codons_to_profile`.
        codon_order_to_nt_positions = utils.get_codon_order_to_nt_positions_dict(
            gene_call)

        # if the gene is reverse, we want to use the dict for reverse complementary conversions for
        # DNA to AA. this depends only on the gene call, so it is picked once rather than per codon.
        conv_dict = codon_to_AA_RC if gene_call[
            'direction'] == 'r' else codon_to_AA

        # here we generate the actual 'linkmers' information.
        d = {}
        linkmers = LinkMersData()
        linkmers.quiet = True
        for codon_order in codon_order_to_nt_positions:
            if codons_to_profile and codon_order not in codons_to_profile:
                continue

            nt_positions = codon_order_to_nt_positions[codon_order]

            reference_codon_sequence = contig_sequence[
                nt_positions[0]:nt_positions[2] + 1]

            # if the reference sequence contains unrecognized characters, we will not continue
            if reference_codon_sequence not in codon_to_AA:
                continue

            linkmers.data = []
            linkmers.append(bam_file_object,
                            'sample_id',
                            None,
                            contig_name,
                            nt_positions,
                            only_complete_links=True)
            data = linkmers.data[0][2]

            # collect the (position, base) pairs of each read in a single pass
            bases_per_read = {}
            for datum in data:
                bases_per_read.setdefault(datum.read_unique_id, []).append(
                    (datum.pos_in_contig, datum.base))

            # sorting the pairs orders the bases by their position in the contig
            nt_frequencies = Counter(
                ''.join([base for _, base in sorted(pairs)])
                for pairs in bases_per_read.values())

            reference_codon_AA = conv_dict[reference_codon_sequence]

            aa_frequencies = Counter()
            for nt, count in nt_frequencies.items():
                # `.get` instead of indexing: a read may carry a codon that is not a key of the
                # conversion dict (i.e., anything but [A, T, C, G]). such codons must be skipped
                # rather than raise a KeyError.
                amino_acid = conv_dict.get(nt)
                if amino_acid:
                    aa_frequencies[amino_acid] += count

            coverage = sum(aa_frequencies.values())

            if not coverage:
                # FIXME: there was at least one case where the coverage here in this context was 0,
                #        which crashed the profiling. we never went after this issue, and it is
                #        important to understand how often this happens, and why.
                continue

            # here we quantify the ratio of frequencies of non-reference-aas observed in this codon
            # to the overall coverage, and that is our `departure_from_reference`:
            total_frequency_of_non_reference_aas = sum(
                aa_frequencies[aa] for aa in aa_frequencies
                if aa != reference_codon_AA)
            departure_from_reference = total_frequency_of_non_reference_aas / coverage

            d[codon_order] = {
                'reference': reference_codon_AA,
                'coverage': coverage,
                'frequencies': aa_frequencies,
                'departure_from_reference': departure_from_reference
            }

        return d
Ejemplo n.º 2
0
    def recover_base_frequencies_for_all_samples(self):
        """Add entries to `self.data` for samples that lack one at a known position.

        Every (split, gene call, codon order) position that occurs in
        `self.data` is expected to have one entry per wanted sample. for each
        sample without one, a new entry is appended in which
        'departure_from_consensus' is 0, 'coverage' is the rounded mean of the
        split coverage values at the nucleotide positions of the codon, the
        consensus item of the position is set to that coverage, and every
        other item found in `codon_to_AA.values()` is set to 0.

        Reads `self.samples_of_interest`, `self.sample_ids`,
        `self.splits_of_interest`, `self.splits_basic_info`,
        `self.merged_split_coverage_values` and `self.genes_in_contigs_dict`.
        modifies `self.data` in place and returns nothing.
        """
        self.progress.new('Recovering AA frequencies for all')

        samples_wanted = self.samples_of_interest if self.samples_of_interest else self.sample_ids
        splits_wanted = self.splits_of_interest if self.splits_of_interest else set(
            self.splits_basic_info.keys())

        # new entries continue after the largest id in use. an empty table has no positions to
        # recover, so any start value works and we avoid calling max() on an empty sequence.
        next_available_entry_id = max(self.data.keys()) + 1 if self.data else 0

        unique_pos_identifier_str_to_consenus_codon = {}
        unique_pos_identifier_str_to_unique_pos_identifier = {}
        for e in self.data.values():
            upi = e['unique_pos_identifier_str']
            unique_pos_identifier_str_to_consenus_codon[upi] = e['consensus']
            unique_pos_identifier_str_to_unique_pos_identifier[upi] = e[
                'unique_pos_identifier']

        self.progress.update(
            'creating a dict to track missing AA frequencies for each sample / split / pos'
        )

        splits_to_consider = {}
        for split_name in splits_wanted:
            splits_to_consider[split_name] = {}

        self.progress.update(
            'populating the dict to track missing AA frequencies for each sample / split / pos'
        )
        for entry_id in self.data:
            v = self.data[entry_id]
            gene_codon_key = '%d_%d' % (v['corresponding_gene_call'],
                                        v['codon_order_in_gene'])
            d = splits_to_consider[v['split_name']]

            # every position starts with all wanted samples, and each sample that already has an
            # entry is crossed out. `in` is used instead of `has_key`, which Python 3 removed.
            if gene_codon_key not in d:
                d[gene_codon_key] = copy.deepcopy(samples_wanted)
            d[gene_codon_key].remove(v['sample_id'])

        # the items that every new entry carries a frequency for. computed once, since it does
        # not change between entries.
        all_items = set(codon_to_AA.values())

        counter = 0
        for split_name in splits_to_consider:
            counter += 1
            self.progress.update(
                'accessing split coverages and updating variable positions dict :: %s'
                % pp(counter))

            split_coverage_across_samples = self.merged_split_coverage_values.get(
                split_name)

            split_info = self.splits_basic_info[split_name]
            contig_name = split_info['parent']

            for gene_codon_key in splits_to_consider[split_name]:
                samples_missing = splits_to_consider[split_name][
                    gene_codon_key]

                # nothing to recover for this position if every sample already has an entry
                if not samples_missing:
                    continue

                corresponding_gene_call, codon_order_in_gene = [
                    int(k) for k in gene_codon_key.split('_')
                ]

                unique_pos_identifier_str = '_'.join([
                    split_name,
                    str(corresponding_gene_call),
                    str(codon_order_in_gene)
                ])
                consensus_codon = unique_pos_identifier_str_to_consenus_codon[
                    unique_pos_identifier_str]
                unique_pos_identifier = unique_pos_identifier_str_to_unique_pos_identifier[
                    unique_pos_identifier_str]

                # DEALING WITH POSITIONS #################################################################
                # what we want is to learn the coverage of this codon in each sample. all we have is the
                # corresponding gene call id, and the order of this codon in the gene. none of the
                # following depends on the sample, so it is done once per position instead of once
                # per sample:
                #
                # learn the gene call
                gene_call = self.genes_in_contigs_dict[
                    corresponding_gene_call]

                # the following dict converts codon orders into nt positions in contig for a given gene call
                codon_order_to_nt_positions_in_contig = utils.get_codon_order_to_nt_positions_dict(
                    gene_call)

                # so the nucleotide positions for this codon in the contig is the following:
                nt_positions_for_codon_in_contig = codon_order_to_nt_positions_in_contig[
                    codon_order_in_gene]

                # but we need to convert those positions to the context of this split. so here is the start pos:
                split_start = split_info['start']

                # here we map nt positions from the contig context to split context using the start position
                nt_positions_for_codon_in_split = [
                    p - split_start
                    for p in nt_positions_for_codon_in_contig
                ]

                for sample_name in samples_missing:
                    # DEALING WITH COVERAGE ##############################################################
                    # we acquire coverages that match to these positions. NOTE(review): indexing with a
                    # list of positions assumes an array type that supports it -- confirm.
                    coverages = split_coverage_across_samples[sample_name][
                        nt_positions_for_codon_in_split]
                    coverage = int(round(sum(coverages) / 3))

                    # the entry is built completely before it goes into the data table, so a failure
                    # above can not leave a half-filled entry behind.
                    entry = {
                        'unique_pos_identifier_str':
                        unique_pos_identifier_str,
                        'unique_pos_identifier':
                        unique_pos_identifier,
                        'sample_id':
                        sample_name,
                        'split_name':
                        split_name,
                        'contig_name':
                        contig_name,
                        'corresponding_gene_call':
                        corresponding_gene_call,
                        'codon_order_in_gene':
                        codon_order_in_gene,
                        'departure_from_consensus':
                        0,
                        'coverage':
                        coverage,
                        'consensus':
                        consensus_codon
                    }

                    # DEALING WITH AAs ###################################################################
                    # here we need to put all the items into the data table for this sample
                    for item in all_items:
                        entry[item] = 0

                    # and finally update the frequency of the consensus item with the coverage (WHICH IS
                    # VERY BAD, WE HAVE NO CLUE WHAT IS THE ACTUAL COVERAGE OF TRIPLICATE LINKMERS):
                    entry[consensus_codon] = coverage

                    self.data[next_available_entry_id] = entry
                    next_available_entry_id += 1

        self.progress.end()
Ejemplo n.º 3
0
    def recover_base_frequencies_for_all_samples(self):
        """Add entries to `self.data` for samples that lack one at a known position.

        Every (split, gene call, codon order) position that occurs in
        `self.data` is expected to have one entry per wanted sample. for each
        sample without one, a new entry is appended in which
        'departure_from_reference' is 0, 'coverage' is the rounded mean of the
        split coverage values at the nucleotide positions of the codon, the
        reference item of the position is set to that coverage, and every
        other item found in `codon_to_AA.values()` is set to 0. each new entry
        is then passed to `self.insert_additional_fields`.

        Reads `self.samples_of_interest`, `self.sample_ids`,
        `self.splits_of_interest`, `self.splits_basic_info`,
        `self.merged_split_coverage_values` and `self.genes_in_contigs_dict`.
        modifies `self.data` in place and returns nothing.
        """
        self.progress.new('Recovering AA frequencies for all')

        samples_wanted = self.samples_of_interest if self.samples_of_interest else self.sample_ids
        splits_wanted = self.splits_of_interest if self.splits_of_interest else set(self.splits_basic_info.keys())

        # new entries continue after the largest id in use. an empty table has no positions to
        # recover, so any start value works and we avoid calling max() on an empty sequence.
        next_available_entry_id = max(self.data.keys()) + 1 if self.data else 0

        unique_pos_identifier_str_to_consenus_codon = {}
        unique_pos_identifier_str_to_unique_pos_identifier = {}
        for e in self.data.values():
            upi = e['unique_pos_identifier_str']
            unique_pos_identifier_str_to_consenus_codon[upi] = e['reference']
            unique_pos_identifier_str_to_unique_pos_identifier[upi] = e['unique_pos_identifier']

        self.progress.update('creating a dict to track missing AA frequencies for each sample / split / pos')

        splits_to_consider = {split_name: {} for split_name in splits_wanted}

        self.progress.update('populating the dict to track missing AA frequencies for each sample / split / pos')
        for entry_id in self.data:
            v = self.data[entry_id]
            gene_codon_key = '%d_%d' % (v['corresponding_gene_call'], v['codon_order_in_gene'])
            d = splits_to_consider[v['split_name']]

            # every position starts with all wanted samples, and each sample that already has an
            # entry is crossed out.
            if gene_codon_key not in d:
                d[gene_codon_key] = copy.deepcopy(samples_wanted)
            d[gene_codon_key].remove(v['sample_id'])

        # the items that every new entry carries a frequency for. computed once, since it does
        # not change between entries.
        all_items = set(codon_to_AA.values())

        for counter, split_name in enumerate(splits_to_consider, 1):
            self.progress.update('accessing split coverages and updating variable positions dict :: %s' % pp(counter))

            split_coverage_across_samples = self.merged_split_coverage_values.get(split_name)

            split_info = self.splits_basic_info[split_name]
            contig_name = split_info['parent']

            for gene_codon_key, samples_missing in splits_to_consider[split_name].items():
                # nothing to recover for this position if every sample already has an entry
                if not samples_missing:
                    continue

                corresponding_gene_call, codon_order_in_gene = [int(k) for k in gene_codon_key.split('_')]

                unique_pos_identifier_str = '_'.join([split_name, str(corresponding_gene_call), str(codon_order_in_gene)])
                reference_codon = unique_pos_identifier_str_to_consenus_codon[unique_pos_identifier_str]
                unique_pos_identifier = unique_pos_identifier_str_to_unique_pos_identifier[unique_pos_identifier_str]

                # DEALING WITH POSITIONS #################################################################
                # what we want is to learn the coverage of this codon in each sample. all we have is the
                # corresponding gene call id, and the order of this codon in the gene. none of the
                # following depends on the sample, so it is done once per position instead of once
                # per sample:
                #
                # learn the gene call
                gene_call = self.genes_in_contigs_dict[corresponding_gene_call]

                # the following dict converts codon orders into nt positions in contig for a given gene call
                codon_order_to_nt_positions_in_contig = utils.get_codon_order_to_nt_positions_dict(gene_call)

                # so the nucleotide positions for this codon in the contig is the following:
                nt_positions_for_codon_in_contig = codon_order_to_nt_positions_in_contig[codon_order_in_gene]

                # but we need to convert those positions to the context of this split. so here is the start pos:
                split_start = split_info['start']

                # here we map nt positions from the contig context to split context using the start position
                nt_positions_for_codon_in_split = [p - split_start for p in nt_positions_for_codon_in_contig]

                for sample_name in samples_missing:
                    # DEALING WITH COVERAGE ##############################################################
                    # we acquire coverages that match to these positions. NOTE(review): indexing with a
                    # list of positions assumes an array type that supports it -- confirm.
                    coverages = split_coverage_across_samples[sample_name][nt_positions_for_codon_in_split]
                    coverage = int(round(sum(coverages) / 3))

                    # the entry is built completely before it goes into the data table, so a failure
                    # above can not leave a half-filled entry behind.
                    entry = {'unique_pos_identifier_str': unique_pos_identifier_str,
                             'unique_pos_identifier': unique_pos_identifier,
                             'sample_id': sample_name,
                             'split_name': split_name,
                             'contig_name': contig_name,
                             'corresponding_gene_call': corresponding_gene_call,
                             'codon_order_in_gene': codon_order_in_gene,
                             'departure_from_reference': 0,
                             'coverage': coverage,
                             'reference': reference_codon}

                    # DEALING WITH AAs ###################################################################
                    # here we need to put all the items into the data table for this sample
                    for item in all_items:
                        entry[item] = 0

                    # and finally update the frequency of the reference item with the coverage (WHICH IS
                    # VERY BAD, WE HAVE NO CLUE WHAT IS THE ACTUAL COVERAGE OF TRIPLICATE LINKMERS):
                    entry[reference_codon] = coverage

                    self.data[next_available_entry_id] = entry

                    # insert additional fields for this newly added data point. this has to happen
                    # after the entry is in the table, since only its id is passed along.
                    self.insert_additional_fields([next_available_entry_id])

                    next_available_entry_id += 1

        self.progress.end()
Ejemplo n.º 4
0
    def process_gene_call(self, bam_file_object, gene_call, contig_sequence, codons_to_profile=None, return_AA_frequencies_instead=False):
        """Summarize codon (or amino acid) frequencies for the codons of one gene call.

        For each codon of the gene, the linkmers data for its nucleotide
        positions is requested from `bam_file_object` (complete links only),
        the bases of every read are joined in positional order into a codon,
        and the codons are converted to the reported items and counted.
        codons that are not keys of the conversion dict are not reported;
        they are tallied in `self.not_reported_items` instead.

        Args:
            bam_file_object: passed through to `LinkMersData.append` as is.
            gene_call: dict-like with at least 'partial', 'contig' and
                'direction'. it is also handed to
                `utils.get_codon_order_to_nt_positions_dict`.
            contig_sequence: sequence of the contig, sliced with the
                nucleotide positions of each codon.
            codons_to_profile: optional container of codon orders. when it
                is None or empty, every codon of the gene is profiled.
            return_AA_frequencies_instead: when True the reported items come
                from the codon-to-amino-acid dicts in `constants`, otherwise
                from the codon-to-codon ones.

        Returns:
            None if the gene call is partial. otherwise a dict that maps
            codon order to a dict with the keys 'reference', 'coverage',
            'frequencies' and 'departure_from_reference'. codons whose
            reference sequence is not in `constants.codon_to_AA`, or which
            end up with zero coverage, are not included.
        """
        if gene_call['partial']:
            return None

        contig_name = gene_call['contig']

        # this dict translates codons in a gene to nucleotide positions in the context of the contig.
        # thanks to it we can profile only a small number of codons from a gene if they are specified
        # in `codons_to_profile`.
        codon_order_to_nt_positions = utils.get_codon_order_to_nt_positions_dict(gene_call)

        # our conversion dicts will differ if the user is asking for codons or AAs, and if the gene is
        # reverse or forward. neither changes from one codon to the next, so the dict is set up once
        # here rather than inside the loop.
        gene_is_reverse = gene_call['direction'] == 'r'
        if return_AA_frequencies_instead:
            conv_dict = constants.codon_to_AA_RC if gene_is_reverse else constants.codon_to_AA
        elif gene_is_reverse:
            conv_dict = constants.codon_to_codon_RC
        else:
            # no conversion is necessary, so this is a mock dictionary that returns the key. it still
            # serves to recognize which codons are reportable.
            conv_dict = dict(zip(constants.codons, constants.codons))

        # here we generate the actual 'linkmers' information.
        d = {}
        linkmers = LinkMersData()
        linkmers.quiet = True
        for codon_order in codon_order_to_nt_positions:
            if codons_to_profile and codon_order not in codons_to_profile:
                continue

            nt_positions = codon_order_to_nt_positions[codon_order]

            reference_codon_sequence = contig_sequence[nt_positions[0]:nt_positions[2] + 1]

            # if the reference sequence contains unrecognized characters, we will not continue
            if reference_codon_sequence not in constants.codon_to_AA:
                continue

            linkmers.data = []
            linkmers.append(bam_file_object, 'sample_id', None, contig_name, nt_positions, only_complete_links=True)
            data = linkmers.data[0][2]

            # collect the (position, base) pairs of each read in a single pass
            bases_per_read = {}
            for datum in data:
                bases_per_read.setdefault(datum.read_unique_id, []).append((datum.pos_in_contig, datum.base))

            # sorting the pairs orders the bases by their position in the contig
            codon_frequencies = Counter(''.join([base for _, base in sorted(pairs)]) for pairs in bases_per_read.values())

            # depending on what we want to return item frequencies will contain frequencies for amino
            # acids or codons.
            item_frequencies = Counter()

            # the magic happens here:
            for codon, count in codon_frequencies.items():
                if codon in conv_dict:
                    item_frequencies[conv_dict[codon]] += count
                else:
                    # so there is a way the programmer can learn that some weird stuff did not get
                    # reported. note that this goes up by one per distinct codon at this position,
                    # not by the number of reads that carry it.
                    self.not_reported_items[codon] += 1

            reference_item = conv_dict[reference_codon_sequence]
            coverage = sum(item_frequencies.values())

            if not coverage:
                # FIXME: there was at least one case where the coverage here in this context was 0,
                #        which crashed the profiling. we never went after this issue, and it is
                #        important to understand how often this happens, and why.
                continue

            # here we quantify the ratio of frequencies of non-reference items observed in this codon
            # to the overall coverage, and that is our `departure_from_reference`:
            total_frequency_of_non_reference_items = sum(item_frequencies[item] for item in item_frequencies if item != reference_item)
            departure_from_reference = total_frequency_of_non_reference_items / coverage

            d[codon_order] = {'reference': reference_item,
                              'coverage': coverage,
                              'frequencies': item_frequencies,
                              'departure_from_reference': departure_from_reference}

        return d
Ejemplo n.º 5
0
    def process_gene_call(self, bam_file_object, gene_call, contig_sequence, codons_to_profile=None):
        """Summarize amino acid frequencies for the codons of one gene call.

        For each codon of the gene, the linkmers data for its nucleotide
        positions is requested from `bam_file_object` (complete links only),
        the bases of every read are joined in positional order into a codon,
        and the codons are translated and counted as amino acids.

        Args:
            bam_file_object: passed through to `LinkMersData.append` as is.
            gene_call: dict-like with at least 'partial', 'contig' and
                'direction'. it is also handed to
                `utils.get_codon_order_to_nt_positions_dict`.
            contig_sequence: sequence of the contig, sliced with the
                nucleotide positions of each codon.
            codons_to_profile: optional container of codon orders. when it
                is None or empty, every codon of the gene is profiled.

        Returns:
            None if the gene call is partial. otherwise a dict that maps
            codon order to a dict with the keys 'reference', 'coverage',
            'frequencies' and 'departure_from_reference'. codons whose
            reference sequence is not in `codon_to_AA`, or which end up with
            zero coverage, are not included.
        """
        if gene_call['partial']:
            return None

        contig_name = gene_call['contig']

        # this dict translates codons in a gene to nucleotide positions in the context of the contig.
        # thanks to it we can profile only a small number of codons from a gene if they are specified
        # in `codons_to_profile`.
        codon_order_to_nt_positions = utils.get_codon_order_to_nt_positions_dict(gene_call)

        # if the gene is reverse, we want to use the dict for reverse complementary conversions for
        # DNA to AA. this depends only on the gene call, so it is picked once rather than per codon.
        conv_dict = codon_to_AA_RC if gene_call['direction'] == 'r' else codon_to_AA

        # here we generate the actual 'linkmers' information.
        d = {}
        linkmers = LinkMersData()
        linkmers.quiet = True
        for codon_order in codon_order_to_nt_positions:
            if codons_to_profile and codon_order not in codons_to_profile:
                continue

            nt_positions = codon_order_to_nt_positions[codon_order]

            reference_codon_sequence = contig_sequence[nt_positions[0]:nt_positions[2] + 1]

            # if the reference sequence contains unrecognized characters, we will not continue
            if reference_codon_sequence not in codon_to_AA:
                continue

            linkmers.data = []
            linkmers.append(bam_file_object, 'sample_id', None, contig_name, nt_positions, only_complete_links=True)
            data = linkmers.data[0][2]

            # collect the (position, base) pairs of each read in a single pass
            bases_per_read = {}
            for datum in data:
                bases_per_read.setdefault(datum.read_unique_id, []).append((datum.pos_in_contig, datum.base))

            # sorting the pairs orders the bases by their position in the contig
            nt_frequencies = Counter(''.join([base for _, base in sorted(pairs)]) for pairs in bases_per_read.values())

            reference_codon_AA = conv_dict[reference_codon_sequence]

            aa_frequencies = Counter()
            for nt, count in nt_frequencies.items():
                # `.get` instead of indexing: a read may carry a codon that is not a key of the
                # conversion dict (i.e., anything but [A, T, C, G]). such codons must be skipped
                # rather than raise a KeyError.
                amino_acid = conv_dict.get(nt)
                if amino_acid:
                    aa_frequencies[amino_acid] += count

            coverage = sum(aa_frequencies.values())

            if not coverage:
                # FIXME: there was at least one case where the coverage here in this context was 0,
                #        which crashed the profiling. we never went after this issue, and it is
                #        important to understand how often this happens, and why.
                continue

            # here we quantify the ratio of frequencies of non-reference-aas observed in this codon
            # to the overall coverage, and that is our `departure_from_reference`:
            total_frequency_of_non_reference_aas = sum(aa_frequencies[aa] for aa in aa_frequencies if aa != reference_codon_AA)
            departure_from_reference = total_frequency_of_non_reference_aas / coverage

            d[codon_order] = {'reference': reference_codon_AA,
                              'coverage': coverage,
                              'frequencies': aa_frequencies,
                              'departure_from_reference': departure_from_reference}

        return d