Exemple #1
0
    def increment(self, info):
        """
        Fold the single annotation <info> into the running mutation tallies.

        Fills self.mean_rates['all'] and self.mean_rates[<region>] with the values returned by utils.get_mutation_rate(),
        then updates the per-gene, per-germline-position base counts held in self.counts.
        """
        # whole-sequence rate first
        self.mean_rates['all'].fill(utils.get_mutation_rate(self.germline_seqs, info))

        for region in utils.regions:
            # mean rate restricted to this region
            self.mean_rates[region].fill(utils.get_mutation_rate(self.germline_seqs, info, restrict_to_region=region))

            # per-gene, per-position tallies
            gene = info[region + '_gene']
            if gene not in self.counts:
                self.counts[gene] = {}
            position_counts = self.counts[gene]  # shorthand for this gene's {position : counts} dict

            gl_seq = info[region + '_gl_seq']
            qr_seq = info[region + '_qr_seq']
            assert len(gl_seq) == len(qr_seq)
            for ipos, (gl_base, qr_base) in enumerate(zip(gl_seq, qr_seq)):
                igl = ipos + int(info[region + '_5p_del'])  # shift by the 5' deletion so positions index the germline gene
                if gl_base in utils.ambiguous_bases or qr_base in utils.ambiguous_bases:
                    continue
                if igl not in position_counts:  # first observation of this germline position
                    fresh = dict.fromkeys(utils.nukes + ['total', ], 0)
                    position_counts[igl] = fresh
                    fresh['gl_nuke'] = gl_base
                tally = position_counts[igl]
                tally['total'] += 1
                tally[qr_base] += 1  # KeyError here if <qr_base> isn't one of <utils.nukes>
Exemple #2
0
    def increment(self, info):
        """
        Add the annotation <info> to the mean-rate hists and to the per-gene per-position counts.

        self.mean_rates['all'] gets the value of utils.get_mutation_rate(info); each self.mean_rates[<region>] gets the
        region-restricted value. self.counts[<gene>][<germline position>] accumulates a 'total' plus one counter per
        observed query base, and records the germline base under 'gl_nuke'.
        """
        overall_freq = utils.get_mutation_rate(info)
        self.mean_rates['all'].fill(overall_freq)

        for region in utils.regions:
            # mean frequency for this region
            self.mean_rates[region].fill(utils.get_mutation_rate(info, restrict_to_region=region))

            # per-gene, per-position frequencies
            gene_name = info[region + '_gene']
            per_position = self.counts.setdefault(gene_name, {})

            gl_seq, qr_seq = info[region + '_gl_seq'], info[region + '_qr_seq']
            assert len(gl_seq) == len(qr_seq)

            for offset in range(len(gl_seq)):
                gl_index = offset + int(info[region + '_5p_del'])  # left-side deletions shift where we are in the germline gene
                gl_base = gl_seq[offset]
                qr_base = qr_seq[offset]

                # ambiguous base on either side: nothing to count at this position
                if gl_base in utils.ambiguous_bases or qr_base in utils.ambiguous_bases:
                    continue

                entry = per_position.get(gl_index)
                if entry is None:  # position not seen before, so set up its counters
                    entry = {n : 0 for n in utils.nukes + ['total', ]}
                    per_position[gl_index] = entry
                    entry['gl_nuke'] = gl_base

                entry['total'] += 1
                entry[qr_base] += 1  # raises KeyError if <qr_base> isn't among <utils.nukes>
    def evaluate(self, true_line, inf_line):
        """
        Compare the inferred annotation <inf_line> with the true one <true_line> and update the tallies.

        Columns in <bool_columns> are counted as 'right' or 'wrong' according to utils.are_alleles(). For every other
        column the (inferred - true) difference is counted in self.values[column]. Afterwards every hist in self.hists
        is filled with the inferred-minus-true value of utils.get_mutation_rate().
        """
        for column in self.values:
            tallies = self.values[column]

            if column in bool_columns:
                outcome = 'right' if utils.are_alleles(true_line[column], inf_line[column]) else 'wrong'  # NOTE you have to change this above as well!
                tallies[outcome] += 1
                continue

            if column[2:] == '_insertion':  # compare insertion lengths
                trueval, guessval = len(true_line[column]), len(inf_line[column])
            elif 'hamming_to_true_naive' in column:
                region_letter = column[0].replace('h', '')  # empty string (i.e. no restriction) if the column name starts with 'h'
                trueval = 0  # the "difference" is then just the hamming distance itself
                guessval = self.hamming_distance_to_true_naive(true_line, inf_line, inf_line['unique_id'], restrict_to_region=region_letter, normalize='_norm' in column)
            else:
                trueval, guessval = int(true_line[column]), int(inf_line[column])

            diff = guessval - trueval
            tallies[diff] = tallies.get(diff, 0) + 1

        for column in self.hists:
            true_rate = utils.get_mutation_rate(self.germlines, true_line)
            inferred_rate = utils.get_mutation_rate(self.germlines, inf_line)
            self.hists[column].fill(inferred_rate - true_rate)
    def evaluate(self, true_line, inf_line):
        """
        Tally how the inferred annotation <inf_line> differs from the true annotation <true_line>.

        Bool columns are delegated to self.set_bool_column(). For the remaining (integer-valued) columns the
        (inferred - true) difference is counted in self.values[column]. Per-gene support is handed to
        self.set_per_gene_support() for each region that has it, and self.hists['mute_freqs'] is filled with the
        difference in overall mutation rate.
        """
        true_mfreq = utils.get_mutation_rate(true_line, iseq=0)  # true value

        for column in self.values:
            if column in bool_columns:
                self.set_bool_column(true_line, inf_line, column, true_mfreq)  # this also sets the fraction-correct-vs-mute-freq hists
                continue

            # everything from here on should be integer-valued
            if column[2:] == '_insertion':  # insertion length
                trueval, guessval = len(true_line[column]), len(inf_line[column])
            elif 'hamming_to_true_naive' in column:
                region = column.replace('hamming_to_true_naive', '').replace('_', '')  # whatever is left of the column name once the metric name and underscores are stripped
                trueval = 0
                guessval = self.hamming_to_true_naive(true_line, inf_line, restrict_to_region=region)
            elif 'muted_bases' in column:
                region = column.replace('muted_bases', '').replace('_', '')
                # when we're evaluating on multi-seq hmm output, we synthesize single-sequence lines for each sequence (hence iseq=0)
                trueval = utils.get_n_muted(true_line, iseq=0, restrict_to_region=region)
                guessval = utils.get_n_muted(inf_line, iseq=0, restrict_to_region=region)
            else:
                trueval, guessval = int(true_line[column]), int(inf_line[column])

            delta = guessval - trueval
            column_counts = self.values[column]
            if delta in column_counts:
                column_counts[delta] += 1
            else:
                column_counts[delta] = 1

        for region in utils.regions:
            if region + '_per_gene_support' in inf_line:
                self.set_per_gene_support(true_line, inf_line, region)

        # when we're evaluating on multi-seq hmm output, we synthesize single-sequence lines for each sequence
        true_rate = utils.get_mutation_rate(true_line, iseq=0, restrict_to_region='')
        inferred_rate = utils.get_mutation_rate(inf_line, iseq=0, restrict_to_region='')
        self.hists['mute_freqs'].fill(inferred_rate - true_rate)
    def evaluate(self, true_line, inf_line, padfo=None):
        """
        Compare the inferred annotation <inf_line> to <true_line> and update self.values and self.hists (MIXCR variant).

        Columns in <bool_columns> are tallied as 'right'/'wrong' according to utils.are_alleles(), and the matching
        '<column>_{right,wrong}_vs_mute_freq' hist is filled with the true line's overall mutation rate. Insertion-length
        and hamming-to-true-naive columns get their (inferred - true) difference counted in self.values[column].
        <padfo> is passed straight through to self.hamming_distance_to_true_naive().

        NOTE for any other non-bool column this method returns immediately (the "CHANGES FOR MIXCR" hack), which skips
        both the remaining columns and the self.hists loop at the bottom.
        """
        # CHANGES FOR MIXCR: the overall mutation frequency is no longer computed unconditionally up front. The bool-column
        # branch below still needs it though (it used to hit a NameError), so work it out lazily the first time it's required.
        overall_mute_freq = None

        for column in self.values:
            if self.only_correct_gene_fractions and column not in bool_columns:
                continue
            if column in bool_columns:
                if overall_mute_freq is None:
                    overall_mute_freq = utils.get_mutation_rate(self.germlines, true_line)  # true value
                if utils.are_alleles(true_line[column], inf_line[column]):  # NOTE you have to change this above as well!
                    self.values[column]['right'] += 1
                    self.hists[column + '_right_vs_mute_freq'].fill(overall_mute_freq)  # NOTE this'll toss a KeyError if you add bool column that aren't [vdj]_gene
                else:
                    self.values[column]['wrong'] += 1
                    self.hists[column + '_wrong_vs_mute_freq'].fill(overall_mute_freq)
            else:
                trueval, guessval = 0, 0
                if column[2:] == '_insertion':  # insertion length
                    trueval = len(true_line[column])
                    guessval = len(inf_line[column])
                elif 'hamming_to_true_naive' in column:
                    trueval = 0  # NOTE this is a kind of weird way to do it, since diff ends up as really just the guessval, but it does the job
                    restrict_to_region = column[0].replace('h', '')  # if first char in <column> is not an 'h', restrict to that region
                    normalize = '_norm' in column
                    guessval = self.hamming_distance_to_true_naive(true_line, inf_line, inf_line['unique_id'], restrict_to_region=restrict_to_region, normalize=normalize, padfo=padfo)
                else:
                    # CHANGES FOR MIXCR: the plain integer comparison (int(true_line[column]) vs int(inf_line[column])) is disabled
                    # NOTE(review): this leaves the whole method, not just this column -- confirm that <continue> wasn't what was intended
                    return

                diff = guessval - trueval
                if diff not in self.values[column]:
                    self.values[column][diff] = 0
                self.values[column][diff] += 1

        for column in self.hists:
            if '_vs_mute_freq' in column:  # fill these above
                continue
            region_matches = re.findall('[vdj]_', column)
            if len(region_matches) == 1:  # exactly one '<region>_' in the column name: restrict to that region
                region = region_matches[0][0]
            else:
                region = ''
            trueval = utils.get_mutation_rate(self.germlines, true_line, restrict_to_region=region)
            guessval = utils.get_mutation_rate(self.germlines, inf_line, restrict_to_region=region)
            self.hists[column].fill(guessval - trueval)
Exemple #6
0
    def check_tree_simulation(self, mean_total_height, regional_heights, chosen_tree, scaled_trees, regional_naive_seqs, mseqs, reco_event, debug=False):
        """
        Record the requested ("in") and observed ("out") mutation levels for the event <reco_event>.

        The observed value is the mean over reco_event.final_seqs: reco_event.line['mut_freqs'] for 'all', and
        utils.get_mutation_rate() for each region. The requested value is <mean_total_height> (times
        self.args.mutation_multiplier if that's set) for 'all', and <regional_heights>[region] otherwise. Both are
        appended to self.validation_values['heights'][<name>].
        Finally calls treeutils.get_tree_difference_metrics() on <chosen_tree>.
        <scaled_trees>, <regional_naive_seqs> and <mseqs> are not used here.
        """
        assert reco_event.line is not None  # make sure we already set it

        # sum up the observed rates over all the final sequences
        rnames = ['all'] + utils.regions
        observed = dict.fromkeys(rnames, 0.0)
        n_seqs = len(reco_event.final_seqs)
        for iseq in range(n_seqs):
            observed['all'] += reco_event.line['mut_freqs'][iseq]
            for region in utils.regions:  # NOTE for simulating, we mash the insertions in with the D, but this isn't accounted for here
                observed[region] += utils.get_mutation_rate(reco_event.line, iseq=iseq, restrict_to_region=region)

        if debug:
            print('             in          out')

        # then turn the sums into means, and store them alongside the input values
        for rname in rnames:
            observed[rname] /= float(n_seqs)
            if rname != 'all':
                expected = regional_heights[rname]
            else:
                expected = mean_total_height
                if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
                    expected *= self.args.mutation_multiplier
            height_record = self.validation_values['heights'][rname]
            height_record['in'].append(expected)
            height_record['out'].append(observed[rname])
            if debug:
                print('  %4s    %7.3f     %7.3f' % (rname, expected, observed[rname]))

        treeutils.get_tree_difference_metrics('all', chosen_tree, reco_event.final_seqs, reco_event.line['naive_seq'])
    def evaluate(self, true_line, inf_line, padfo=None):
        """
        Compare the inferred annotation <inf_line> to the true annotation <true_line> and update self.values and self.hists.

        Bool columns go through self.set_bool_column(). Other columns are skipped if self.only_correct_gene_fractions is
        set, and otherwise get their (inferred - true) difference counted in self.values[column]. Per-gene support is
        handed to self.set_per_gene_support(), and the remaining hists are filled with the difference in mutation rate.
        <padfo> is passed straight through to self.hamming_distance_to_true_naive().
        """
        true_mute_freq = utils.get_mutation_rate(self.germlines, true_line)  # true value

        for column in self.values:
            is_bool = column in bool_columns
            if self.only_correct_gene_fractions and not is_bool:
                continue
            if is_bool:
                self.set_bool_column(true_line, inf_line, column, true_mute_freq)
                continue

            if column[2:] == '_insertion':  # insertion length
                trueval, guessval = len(true_line[column]), len(inf_line[column])
            elif 'hamming_to_true_naive' in column:
                region_letter = column[0].replace('h', '')  # empty (no restriction) if the column name starts with 'h', otherwise the region
                trueval = 0  # so the tallied difference is really just the hamming distance
                guessval = self.hamming_distance_to_true_naive(true_line, inf_line, inf_line['unique_id'], restrict_to_region=region_letter, normalize='_norm' in column, padfo=padfo)
            else:
                trueval, guessval = int(true_line[column]), int(inf_line[column])

            diff = guessval - trueval
            column_counts = self.values[column]
            column_counts[diff] = column_counts.get(diff, 0) + 1

        for region in utils.regions:
            if region + '_per_gene_support' in inf_line:
                self.set_per_gene_support(true_line, inf_line, region)

        for column in self.hists:
            if '_vs_mute_freq' in column or '_per_gene_support' in column:  # these were filled above
                continue
            region_matches = re.findall('[vdj]_', column)
            region = region_matches[0][0] if len(region_matches) == 1 else ''  # restrict only if exactly one '<region>_' appears in the name
            true_rate = utils.get_mutation_rate(self.germlines, true_line, restrict_to_region=region)
            inferred_rate = utils.get_mutation_rate(self.germlines, inf_line, restrict_to_region=region)
            self.hists[column].fill(inferred_rate - true_rate)
Exemple #8
0
    def choose_clonal_representatives(self, swfo, debug=False):
        """
        Choose one representative query per naive-sequence cluster, and work out a mutation-frequency threshold.

        Queries with v_5p or j_3p deletions are dropped, the rest are clustered with utils.collapse_naive_seqs(), and from
        each cluster we keep the query with the fewest mutations in <self.other_region> (as long as that number is less
        than self.max_mutations['j']).

        Side effects: sets self.all_j_mutations, self.gene_info, self.mfreqs and self.mean_mfreqs.

        Returns the pair (qr_seqs, threshold):
          qr_seqs: {query : <self.region> sequence from indelutils.get_qr_seqs_with_indels_reinstated()} for each representative
          threshold: mean j mutation frequency among the representatives, divided by 1.5
        or (None, None) if there are no full-length queries. (This early exit used to return three values, which
        didn't match the two returned on the normal path.)

        <debug> is currently unused.
        """
        # NOTE do *not* modify <self.glfo> (in the future it would be nice to just modify <self.glfo>, but for now we need it to be super clear in partitiondriver what is happening to <self.glfo>)

        # first remove non-full-length sequences
        full_length_queries = [q for q in swfo['queries'] if swfo[q]['v_5p_del'] == 0 and swfo[q]['j_3p_del'] == 0]
        print('   removing %d/%d sequences with v_5p or j_3p deletions' % (len(swfo['queries']) - len(full_length_queries), len(swfo['queries'])))
        if len(full_length_queries) == 0:
            return None, None  # same arity as the normal return below, so callers can always unpack two values

        # then cluster by full-length (v+d+j) naive sequence
        clusters = utils.collapse_naive_seqs(swfo, queries=full_length_queries)

        # then build <qr_seqs> from the v sequences corresponding to the least-j-mutated sequence in each of these clusters (skipping clusterings that are too mutated)
        qr_seqs = {}
        self.all_j_mutations = {}
        for cluster in clusters:
            j_mutations = {q : utils.get_n_muted(swfo[q], iseq=0, restrict_to_region=self.other_region) for q in cluster}
            best_query, smallest_j_mutations = sorted(j_mutations.items(), key=operator.itemgetter(1))[0]  # take the sequence with the lowest j mutation for each cluster, if it doesn't have too many j mutations NOTE choose_cluster_representatives() in allelefinder is somewhat similar
            if smallest_j_mutations < self.max_mutations['j']:
                qr_seqs[best_query] = indelutils.get_qr_seqs_with_indels_reinstated(swfo[best_query], iseq=0)[self.region]
            for query in cluster:
                self.all_j_mutations[query] = j_mutations[query]  # I don't think I can key by the cluster str, since here things correspond to the naive-seq-collapsed clusters, then we remove some of the clusters, and then cluster with vsearch
        print('   collapsed %d input sequences into %d representatives from %d clones (removed %d clones with >= %d j mutations)' % (len(full_length_queries), len(qr_seqs), len(clusters), len(clusters) - len(qr_seqs), self.max_mutations['j']))

        self.gene_info = {q : swfo[q][self.region + '_gene'] for q in qr_seqs}  # assigned gene for the clonal representative from each cluster that we used (i.e. *not* from every sequence in the sample)
        self.mfreqs = {  # NOTE only includes cluster representatives, i.e. it's biased towards sequences with low overall mutation, and low j mutation
            'v' : {q : utils.get_mutation_rate(swfo[q], iseq=0, restrict_to_region='v') for q in qr_seqs},
            'j' : {q : utils.get_mutation_rate(swfo[q], iseq=0, restrict_to_region='j') for q in qr_seqs},
        }
        self.mean_mfreqs = {r : numpy.mean(list(self.mfreqs[r].values())) for r in self.mfreqs}  # list() so numpy gets a real sequence whether .values() gives a list or a view
        print('    mutation among all cluster representatives:   v / j = %6.3f / %6.3f = %6.3f' % (self.mean_mfreqs['v'], self.mean_mfreqs['j'], self.mean_mfreqs['v'] / self.mean_mfreqs['j']))

        assert self.region == 'v'  # need to think about whether this should always be j, or if it should be self.other_region
        j_mfreqs = [utils.get_mutation_rate(swfo[q], iseq=0, restrict_to_region='j') for q in qr_seqs]
        threshold = numpy.mean(j_mfreqs) / 1.5  # v mut freq will be way off for any very different new alleles

        return qr_seqs, threshold
Exemple #9
0
    def increment(self, info):
        """
        Update the mutation-rate hists and the per-gene per-position counters with the annotation <info>.

        The overall and per-region values from utils.get_mutation_rate() go into self.mean_rates. For each region,
        self.counts[<gene>][<germline position>] tracks 'total' observations, a count per query base, and the germline
        base ('gl_nuke'). Positions where either sequence has an ambiguous base are skipped.
        """
        # overall mutation frequency
        overall = utils.get_mutation_rate(self.germline_seqs, info)
        self.mean_rates['all'].fill(overall)

        # then the per-region quantities
        for region in utils.regions:
            regional = utils.get_mutation_rate(self.germline_seqs, info, restrict_to_region=region)
            self.mean_rates[region].fill(regional)

            # per-gene per-position
            gene = info[region + '_gene']
            if gene not in self.counts:
                self.counts[gene] = {}
            per_position = self.counts[gene]  # avoids repeating the long dict access below

            germline_seq = info[region + '_gl_seq']
            query_seq = info[region + '_qr_seq']
            assert len(germline_seq) == len(query_seq)
            for inuke, gl_nuke in enumerate(germline_seq):
                i_germline = inuke + int(info[region + '_5p_del'])  # account for left-side deletions in the indexing
                qr_nuke = query_seq[inuke]
                if gl_nuke in utils.ambiguous_bases or qr_nuke in utils.ambiguous_bases:
                    continue
                if i_germline in per_position:
                    here = per_position[i_germline]
                else:  # not yet observed in any query sequence, so initialize it
                    here = {n: 0 for n in utils.nukes + ['total', ]}
                    per_position[i_germline] = here
                    here['gl_nuke'] = gl_nuke
                here['total'] += 1
                here[qr_nuke] += 1
Exemple #10
0
    def increment(self, info):
        """
        Add the annotation <info> to the mean-rate hists, the per-gene per-position counts, and (optionally) the tigger counts.

        self.mean_rates gets the overall and per-region values from utils.get_mutation_rate().
        self.counts[<gene>][<germline position>] accumulates 'total', a count per query base, and records 'gl_nuke'.
        If self.tigger is set and utils.get_region(gene) is 'v', then
        self.counts[<gene>][<position>]['tigger'][<n_mutes>] additionally tracks how many times the position was seen
        ('total') and seen mutated ('muted') in sequences with <n_mutes> mutations in the region.

        Raises Exception if (regional mutation frequency) * (length excluding ambiguous bases) isn't within 1e-6 of an integer.
        """
        self.mean_rates['all'].fill(utils.get_mutation_rate(self.germline_seqs, info))  # mean freq over whole sequence (excluding insertions)

        for region in utils.regions:
            regional_freq, len_excluding_ambig = utils.get_mutation_rate(self.germline_seqs, info, restrict_to_region=region, return_len_excluding_ambig=True)
            n_mutes_float = regional_freq * len_excluding_ambig  # total number of mutations in the region (for tigger stuff)
            # Round rather than truncate: the product of a frequency and a length can land just *below* the true integer
            # (e.g. (1/49.)*49 = 0.9999999999999999), in which case int() would undercount by one. The tolerance used to be
            # 1e6, which meant the check could never fire.
            n_mutes = int(round(n_mutes_float))
            if abs(n_mutes_float - n_mutes) > 1e-6:
                raise Exception('n mutated %f not an integer' % n_mutes_float)
            self.mean_rates[region].fill(regional_freq)  # per-region mean freq

            # per-gene per-position freqs
            gene = info[region + '_gene']
            if gene not in self.counts:
                self.counts[gene] = {}
            gcounts = self.counts[gene]  # temporary variable to avoid long dict access
            fill_tigger = self.tigger and utils.get_region(gene) == 'v'  # doesn't depend on position, so work it out once
            germline_seq = info[region + '_gl_seq']
            query_seq = info[region + '_qr_seq']
            assert len(germline_seq) == len(query_seq)
            for ipos in range(len(germline_seq)):
                igl = ipos + int(info[region + '_5p_del'])  # account for left-side deletions in the indexing
                if germline_seq[ipos] in utils.ambiguous_bases or query_seq[ipos] in utils.ambiguous_bases:
                    continue
                if igl not in gcounts:  # if we have not yet observed this position in a query sequence, initialize it
                    gcounts[igl] = {n : 0 for n in utils.nukes + ['total', ]}
                    gcounts[igl]['gl_nuke'] = germline_seq[ipos]
                    gcounts[igl]['tigger'] = {}  # always created here, so no need to check for it again below
                gcounts[igl]['total'] += 1
                gcounts[igl][query_seq[ipos]] += 1  # note that if <query_seq[ipos]> isn't among <utils.nukes>, this will toss a key error

                if fill_tigger:
                    tigger_counts = gcounts[igl]['tigger']
                    if n_mutes not in tigger_counts:
                        tigger_counts[n_mutes] = {'muted' : 0, 'total' : 0}
                    tigger_counts[n_mutes]['total'] += 1
                    if query_seq[ipos] != germline_seq[ipos]:  # if this position is mutated
                        tigger_counts[n_mutes]['muted'] += 1  # mark that we saw this germline position mutated once in a sequence with <n_mutes> regional mutations
    def add_partial_fail(self, true_line, line):
        """
        Tally the bool columns for a partially-failed annotation <line> against the true annotation <true_line>.

        A bool column counts as 'right' only if it is present in <line> and utils.are_alleles() says it matches the
        true value; otherwise it counts as 'wrong'. The matching '_vs_mute_freq' hist is filled with the true line's
        overall mutation rate. Non-bool columns are ignored.
        """
        true_mute_freq = utils.get_mutation_rate(self.germlines, true_line)  # true value

        for column in self.values:
            if column not in bool_columns:
                continue
            if column in line and utils.are_alleles(true_line[column], line[column]):  # NOTE you have to change this below as well!
                tally_key, hist_key = 'right', column + '_right_vs_mute_freq'  # NOTE this'll toss a KeyError if you add bool column that aren't [vdj]_gene
            else:
                tally_key, hist_key = 'wrong', column + '_wrong_vs_mute_freq'
            self.values[column][tally_key] += 1
            self.hists[hist_key].fill(true_mute_freq)
    def add_partial_fail(self, true_line, line):
        """
        Record right/wrong gene calls for the partially-failed annotation <line>.

        Only columns in <bool_columns> are considered. A column that is missing from <line>, or whose value
        utils.are_alleles() says doesn't match <true_line>, is counted as 'wrong'; anything else is 'right'. The
        corresponding '<column>_{right,wrong}_vs_mute_freq' hist is filled with the true overall mutation rate.
        """
        mute_freq = utils.get_mutation_rate(self.germlines, true_line)  # true value

        for column in (c for c in self.values if c in bool_columns):
            missed = column not in line or not utils.are_alleles(true_line[column], line[column])  # NOTE you have to change this below as well!
            if missed:
                self.values[column]['wrong'] += 1
                self.hists[column + '_wrong_vs_mute_freq'].fill(mute_freq)
            else:
                self.values[column]['right'] += 1
                self.hists[column + '_right_vs_mute_freq'].fill(mute_freq)  # NOTE this'll toss a KeyError if you add bool column that aren't [vdj]_gene
    def add_partial_fail(self, true_line, line):
        """
        Update the tallies for a partially-failed annotation <line>, given the true annotation <true_line>.

        For each bool column that is present in <line>, calls self.set_bool_column() with the true line's overall
        mutation rate. Then, for each region with a '<region>_per_gene_support' key in <line>, calls
        self.set_per_gene_support().

        NOTE does not fill all the hists ('cause it kind of can't, right?)
        """
        overall_mute_freq = utils.get_mutation_rate(self.germlines, true_line)  # true value

        for column in self.values:
            if column in bool_columns and column in line:  # non-bool columns, and bool columns missing from <line>, are skipped
                self.set_bool_column(true_line, line, column, overall_mute_freq)

        for region in utils.regions:
            # this used to refer to <inf_line>, which doesn't exist in this method (the inferred line is called <line> here)
            if region + '_per_gene_support' in line:
                self.set_per_gene_support(true_line, line, region)
    def evaluate(self, true_line, inf_line):
        """
        Update self.values and self.hists with the comparison of the inferred annotation <inf_line> to <true_line>.

        Bool columns are handled by self.set_bool_column(); all other columns have their (inferred - true) difference
        counted in self.values[column]. Regions with a '<region>_per_gene_support' key in <inf_line> are passed to
        self.set_per_gene_support(), and self.hists['mute_freqs'] gets the difference in overall mutation rate.
        """
        true_overall_freq = utils.get_mutation_rate(true_line, iseq=0)  # true value

        for column in self.values:
            if column not in bool_columns:  # these should all be integer-valued
                if column[2:] == '_insertion':  # insertion length
                    true_n, inferred_n = len(true_line[column]), len(inf_line[column])
                elif 'hamming_to_true_naive' in column:
                    which_region = column.replace('hamming_to_true_naive', '').replace('_', '')  # what's left of the column name after removing the metric name and underscores
                    true_n = 0
                    inferred_n = self.hamming_to_true_naive(true_line, inf_line, restrict_to_region=which_region)
                elif 'muted_bases' in column:
                    which_region = column.replace('muted_bases', '').replace('_', '')
                    true_n = utils.get_n_muted(true_line, iseq=0, restrict_to_region=which_region)  # when we're evaluating on multi-seq hmm output, we synthesize single-sequence lines for each sequence
                    inferred_n = utils.get_n_muted(inf_line, iseq=0, restrict_to_region=which_region)
                else:
                    true_n, inferred_n = int(true_line[column]), int(inf_line[column])

                offset = inferred_n - true_n
                self.values[column].setdefault(offset, 0)
                self.values[column][offset] += 1
            else:
                self.set_bool_column(true_line, inf_line, column, true_overall_freq)  # this also sets the fraction-correct-vs-mute-freq hists

        for region in utils.regions:
            support_key = region + '_per_gene_support'
            if support_key in inf_line:
                self.set_per_gene_support(true_line, inf_line, region)

        true_freq = utils.get_mutation_rate(true_line, iseq=0, restrict_to_region='')  # when we're evaluating on multi-seq hmm output, we synthesize single-sequence lines for each sequence
        inferred_freq = utils.get_mutation_rate(inf_line, iseq=0, restrict_to_region='')
        self.hists['mute_freqs'].fill(inferred_freq - true_freq)
Exemple #15
0
    def add_partial_fail(self, true_line, line):
        """
        Update the tallies for a partially-failed annotation <line>, given the true annotation <true_line>.

        For each bool column that is present in <line>, calls self.set_bool_column() with the true line's overall
        mutation rate (from utils.get_mutation_rate() with iseq=0). Then, for each region with a
        '<region>_per_gene_support' key in <line>, calls self.set_per_gene_support().

        NOTE does not fill all the hists ('cause it kind of can't, right?)
        """
        overall_mute_freq = utils.get_mutation_rate(true_line, iseq=0)  # true value

        for column in self.values:
            if column in bool_columns and column in line:  # non-bool columns, and bool columns missing from <line>, are skipped
                self.set_bool_column(true_line, line, column, overall_mute_freq)

        for region in utils.regions:
            # this used to refer to <inf_line>, which doesn't exist in this method (the inferred line is called <line> here)
            if region + '_per_gene_support' in line:
                self.set_per_gene_support(true_line, line, region)
Exemple #16
0
    def increment(self, info):
        """
        Add the annotation <info> to the per-gene, per-position, per-mutation-count tallies.

        First passes <info> on to self.mfreqer.increment(). Then, for each region, bumps self.gene_obs_counts[<gene>] and
        makes sure self.counts[<gene>][<germline position>] exists for every non-ambiguous position. If
        utils.get_region(gene) is 'v', self.counts[<gene>][<position>][<n_mutes>] additionally tracks 'total', 'muted',
        and a count per query base, where <n_mutes> is the number of mutations in the region.

        Raises Exception if (regional mutation frequency) * (length excluding ambiguous bases) isn't within 1e-6 of an integer.
        """
        self.mfreqer.increment(info)

        for region in utils.regions:
            regional_freq, len_excluding_ambig = utils.get_mutation_rate(info, restrict_to_region=region, return_len_excluding_ambig=True)
            n_mutes_float = regional_freq * len_excluding_ambig  # total number of mutations in the region (for allele finding stuff)
            # Round rather than truncate: the product of a frequency and a length can land just *below* the true integer
            # (e.g. (1/49.)*49 = 0.9999999999999999), in which case int() would undercount by one. The tolerance used to be
            # 1e6, which meant the check could never fire.
            n_mutes = int(round(n_mutes_float))
            if abs(n_mutes_float - n_mutes) > 1e-6:
                raise Exception('n mutated %f not an integer' % n_mutes_float)

            gene = info[region + '_gene']
            if gene not in self.counts:
                self.counts[gene] = {}
                self.gene_obs_counts[gene] = 0
            self.gene_obs_counts[gene] += 1

            gcts = self.counts[gene]  # shorthand name
            is_v_gene = utils.get_region(gene) == 'v'  # doesn't depend on position, so work it out once

            germline_seq = info[region + '_gl_seq']
            query_seq = info[region + '_qr_seq']
            assert len(germline_seq) == len(query_seq)

            for ipos in range(len(germline_seq)):
                igl = ipos + int(info[region + '_5p_del'])  # account for left-side deletions in the indexing

                if germline_seq[ipos] in utils.ambiguous_bases or query_seq[ipos] in utils.ambiguous_bases:  # skip if either germline or query sequence is ambiguous at this position
                    continue

                if igl not in gcts:  # if we have not yet observed this position in a query sequence, initialize it
                    gcts[igl] = {}

                if is_v_gene:
                    if n_mutes not in gcts[igl]:
                        gcts[igl][n_mutes] = {n : 0 for n in ['muted', 'total'] + utils.nukes}
                    gcts[igl][n_mutes]['total'] += 1
                    if query_seq[ipos] != germline_seq[ipos]:  # if this position is mutated
                        gcts[igl][n_mutes]['muted'] += 1  # mark that we saw this germline position mutated once in a sequence with <n_mutes> regional mutations
                    gcts[igl][n_mutes][query_seq[ipos]] += 1  # only used to work out what the snp'd base is if there's a new allele