def increment(self, info):
    """Add the mutation information from one annotation <info> to the running tallies.

    Fills the overall ('all') and per-region mean-rate histograms in <self.mean_rates>, then
    updates the per-gene, per-germline-position base counts stored in <self.counts>.
    """
    # overall mutation frequency
    self.mean_rates['all'].fill(utils.get_mutation_rate(self.germline_seqs, info))

    for region in utils.regions:
        # mean frequency restricted to this region
        self.mean_rates[region].fill(utils.get_mutation_rate(self.germline_seqs, info, restrict_to_region=region))

        # per-gene, per-position counts
        gene = info[region + '_gene']
        if gene not in self.counts:
            self.counts[gene] = {}
        position_counts = self.counts[gene]  # shorthand for this gene's table

        gl_seq = info[region + '_gl_seq']
        qr_seq = info[region + '_qr_seq']
        assert len(gl_seq) == len(qr_seq)
        for ipos, gl_base in enumerate(gl_seq):
            gl_index = ipos + int(info[region + '_5p_del'])  # shift by the left-side deletion to index the germline
            if gl_base in utils.ambiguous_bases or qr_seq[ipos] in utils.ambiguous_bases:
                continue  # ambiguous on either side: nothing to count
            if gl_index not in position_counts:  # first observation of this germline position
                position_counts[gl_index] = {n : 0 for n in utils.nukes + ['total', ]}
                position_counts[gl_index]['gl_nuke'] = gl_base
            position_counts[gl_index]['total'] += 1
            position_counts[gl_index][qr_seq[ipos]] += 1
def increment(self, info):
    """Fold one annotation <info> into the mean-rate histograms and the per-position base counts.

    <self.mean_rates> gets the overall and per-region mutation rates; <self.counts> is keyed by
    gene, then by germline position, and holds how often each base (plus 'total') was observed.
    """
    self.mean_rates['all'].fill(utils.get_mutation_rate(info))  # mean freq over whole sequence (excluding insertions)

    for region in utils.regions:
        # per-region mean freq
        self.mean_rates[region].fill(utils.get_mutation_rate(info, restrict_to_region=region))

        # per-gene per-position freqs
        site_table = self.counts.setdefault(info[region + '_gene'], {})
        gl_seq, qr_seq = info[region + '_gl_seq'], info[region + '_qr_seq']
        assert len(gl_seq) == len(qr_seq)
        for ipos, gl_base in enumerate(gl_seq):
            gl_index = ipos + int(info[region + '_5p_del'])  # account for left-side deletions in the indexing
            if gl_base in utils.ambiguous_bases or qr_seq[ipos] in utils.ambiguous_bases:
                continue  # skip positions that are ambiguous in either sequence
            if gl_index not in site_table:  # position not seen before: start its counters at zero
                site_table[gl_index] = {n : 0 for n in utils.nukes + ['total', ]}
                site_table[gl_index]['gl_nuke'] = gl_base
            site = site_table[gl_index]
            site['total'] += 1
            site[qr_seq[ipos]] += 1  # raises KeyError if the query base isn't among <utils.nukes>
def evaluate(self, true_line, inf_line):
    """Compare the inferred annotation <inf_line> to the true one <true_line> and record the result.

    For columns in <bool_columns>, increments the 'right' or 'wrong' tally. For every other column
    in <self.values>, increments the tally for the (inferred - true) difference. Finally fills every
    histogram in <self.hists> with the difference in overall mutation rate.
    """
    for column in self.values:
        tally = self.values[column]

        if column in bool_columns:
            # allele-level match counts as correct (the original code notes that a matching
            # comparison elsewhere has to be kept in sync with this one)
            if utils.are_alleles(true_line[column], inf_line[column]):
                tally['right'] += 1
            else:
                tally['wrong'] += 1
            continue

        if column[2:] == '_insertion':  # insertion length
            trueval, guessval = len(true_line[column]), len(inf_line[column])
        elif 'hamming_to_true_naive' in column:
            # true value is fixed at zero, so the recorded difference is just the inferred distance
            trueval = 0
            region = '' if column[0] == 'h' else column[0]  # a leading region letter restricts to that region
            guessval = self.hamming_distance_to_true_naive(true_line, inf_line, inf_line['unique_id'],
                                                           restrict_to_region=region, normalize='_norm' in column)
        else:
            trueval, guessval = int(true_line[column]), int(inf_line[column])

        diff = guessval - trueval
        tally[diff] = tally.get(diff, 0) + 1

    for column in self.hists:
        true_rate = utils.get_mutation_rate(self.germlines, true_line)
        inferred_rate = utils.get_mutation_rate(self.germlines, inf_line)
        self.hists[column].fill(inferred_rate - true_rate)
def evaluate(self, true_line, inf_line):
    """Score the inferred annotation <inf_line> against the true annotation <true_line>.

    Bool columns are delegated to self.set_bool_column(); all other columns in <self.values>
    get a tally of the integer difference (inferred - true). Per-gene support is recorded for
    each region that has it, and the 'mute_freqs' hist is filled with the mutation-rate difference.
    """
    overall_mute_freq = utils.get_mutation_rate(true_line, iseq=0)  # true value

    for column in self.values:
        if column in bool_columns:
            self.set_bool_column(true_line, inf_line, column, overall_mute_freq)  # also sets the fraction-correct-vs-mute-freq hists
            continue

        # everything below should be integer-valued
        if column[2:] == '_insertion':  # insertion length
            trueval = len(true_line[column])
            guessval = len(inf_line[column])
        elif 'hamming_to_true_naive' in column:
            region = column.replace('hamming_to_true_naive', '').replace('_', '')
            trueval = 0
            guessval = self.hamming_to_true_naive(true_line, inf_line, restrict_to_region=region)
        elif 'muted_bases' in column:
            region = column.replace('muted_bases', '').replace('_', '')
            # when evaluating multi-seq hmm output, single-sequence lines are synthesized for each sequence (hence iseq=0)
            trueval = utils.get_n_muted(true_line, iseq=0, restrict_to_region=region)
            guessval = utils.get_n_muted(inf_line, iseq=0, restrict_to_region=region)
        else:
            trueval = int(true_line[column])
            guessval = int(inf_line[column])

        diff = guessval - trueval
        column_tally = self.values[column]
        if diff not in column_tally:
            column_tally[diff] = 0
        column_tally[diff] += 1

    for region in utils.regions:
        if region + '_per_gene_support' in inf_line:
            self.set_per_gene_support(true_line, inf_line, region)

    true_rate = utils.get_mutation_rate(true_line, iseq=0, restrict_to_region='')
    inferred_rate = utils.get_mutation_rate(inf_line, iseq=0, restrict_to_region='')
    self.hists['mute_freqs'].fill(inferred_rate - true_rate)
def evaluate(self, true_line, inf_line, padfo=None):
    """Score the inferred annotation <inf_line> against the true annotation <true_line> (MIXCR variant).

    Bool columns increment their 'right'/'wrong' tally and fill the matching *_vs_mute_freq hist.
    Insertion-length and hamming-distance columns tally the (inferred - true) difference. Any other
    column makes the function return immediately (see note below). <padfo> is passed straight
    through to self.hamming_distance_to_true_naive().
    """
    # The MIXCR changes commented out the up-front computation of the true mutation frequency, but
    # the bool-column branch still uses it (which was a NameError). Compute it on first use instead,
    # so code paths that never reach a bool column still make no extra call.
    overall_mute_freq = None

    for column in self.values:
        if self.only_correct_gene_fractions and column not in bool_columns:
            continue

        if column in bool_columns:
            if overall_mute_freq is None:
                overall_mute_freq = utils.get_mutation_rate(self.germlines, true_line)  # true value
            # NOTE the hist lookups toss a KeyError if a bool column that isn't [vdj]_gene is added
            if utils.are_alleles(true_line[column], inf_line[column]):
                self.values[column]['right'] += 1
                self.hists[column + '_right_vs_mute_freq'].fill(overall_mute_freq)
            else:
                self.values[column]['wrong'] += 1
                self.hists[column + '_wrong_vs_mute_freq'].fill(overall_mute_freq)
            continue

        trueval, guessval = 0, 0
        if column[2:] == '_insertion':  # insertion length
            trueval = len(true_line[column])
            guessval = len(inf_line[column])
        elif 'hamming_to_true_naive' in column:
            trueval = 0  # the recorded difference is therefore just the inferred distance
            restrict_to_region = column[0].replace('h', '')  # if first char in <column> is not an 'h', restrict to that region
            normalize = '_norm' in column
            guessval = self.hamming_distance_to_true_naive(true_line, inf_line, inf_line['unique_id'],
                                                           restrict_to_region=restrict_to_region,
                                                           normalize=normalize, padfo=padfo)
        else:
            # CHANGES FOR MIXCR: the integer comparison of the remaining columns was disabled.
            # NOTE(review): this abandons the whole evaluation (remaining columns and all of the
            # hists below), not just this column -- confirm that is intended rather than `continue`.
            return

        diff = guessval - trueval
        if diff not in self.values[column]:
            self.values[column][diff] = 0
        self.values[column][diff] += 1

    for column in self.hists:
        if '_vs_mute_freq' in column:  # filled above
            continue
        region_matches = re.findall('[vdj]_', column)
        region = region_matches[0][0] if len(region_matches) == 1 else ''
        trueval = utils.get_mutation_rate(self.germlines, true_line, restrict_to_region=region)
        guessval = utils.get_mutation_rate(self.germlines, inf_line, restrict_to_region=region)
        self.hists[column].fill(guessval - trueval)
def check_tree_simulation(self, mean_total_height, regional_heights, chosen_tree, scaled_trees, regional_naive_seqs, mseqs, reco_event, debug=False):
    """Record requested vs. observed mutation levels for one simulated rearrangement event.

    Averages the observed mutation frequency over the event's final sequences (overall and per
    region), appends the input and observed values to <self.validation_values>['heights'], and
    then calls treeutils.get_tree_difference_metrics(). <scaled_trees>, <regional_naive_seqs> and
    <mseqs> are accepted but not used here.
    """
    assert reco_event.line is not None  # the annotation has to have been set already

    # check the height for each region
    region_names = ['all'] + utils.regions
    n_seqs = len(reco_event.final_seqs)
    observed = {name : 0.0 for name in region_names}
    for iseq in range(n_seqs):
        observed['all'] += reco_event.line['mut_freqs'][iseq]
        for region in utils.regions:  # NOTE for simulating, the insertions are mashed in with the D, but that isn't accounted for here
            observed[region] += utils.get_mutation_rate(reco_event.line, iseq=iseq, restrict_to_region=region)

    if debug:
        print(' in out')

    for rname in region_names:
        observed[rname] = observed[rname] / float(n_seqs)
        if rname != 'all':
            input_height = regional_heights[rname]
        else:
            input_height = mean_total_height
            if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
                input_height *= self.args.mutation_multiplier
        height_record = self.validation_values['heights'][rname]
        height_record['in'].append(input_height)
        height_record['out'].append(observed[rname])
        if debug:
            print(' %4s %7.3f %7.3f' % (rname, input_height, observed[rname]))

    treeutils.get_tree_difference_metrics('all', chosen_tree, reco_event.final_seqs, reco_event.line['naive_seq'])
def evaluate(self, true_line, inf_line, padfo=None):
    """Score the inferred annotation <inf_line> against the true annotation <true_line>.

    Bool columns go through self.set_bool_column(); other columns in <self.values> tally the
    (inferred - true) difference; per-gene support is recorded where present; and the remaining
    hists are filled with mutation-rate differences. <padfo> is forwarded unchanged to
    self.hamming_distance_to_true_naive().
    """
    overall_mute_freq = utils.get_mutation_rate(self.germlines, true_line)  # true value

    for column in self.values:
        is_bool = column in bool_columns
        if self.only_correct_gene_fractions and not is_bool:
            continue
        if is_bool:
            self.set_bool_column(true_line, inf_line, column, overall_mute_freq)
            continue

        if column[2:] == '_insertion':  # insertion length
            trueval, guessval = len(true_line[column]), len(inf_line[column])
        elif 'hamming_to_true_naive' in column:
            # the true value is pinned at zero, so the difference is just the inferred distance
            trueval = 0
            region = '' if column[0] == 'h' else column[0]  # a leading region letter restricts to that region
            guessval = self.hamming_distance_to_true_naive(true_line, inf_line, inf_line['unique_id'],
                                                           restrict_to_region=region,
                                                           normalize='_norm' in column, padfo=padfo)
        else:
            trueval, guessval = int(true_line[column]), int(inf_line[column])

        diff = guessval - trueval
        column_tally = self.values[column]
        if diff not in column_tally:
            column_tally[diff] = 0
        column_tally[diff] += 1

    for region in utils.regions:
        if region + '_per_gene_support' in inf_line:
            self.set_per_gene_support(true_line, inf_line, region)

    for column in self.hists:
        if '_vs_mute_freq' in column or '_per_gene_support' in column:  # already filled above
            continue
        region_matches = re.findall('[vdj]_', column)
        region = region_matches[0][0] if len(region_matches) == 1 else ''
        true_rate = utils.get_mutation_rate(self.germlines, true_line, restrict_to_region=region)
        inferred_rate = utils.get_mutation_rate(self.germlines, inf_line, restrict_to_region=region)
        self.hists[column].fill(inferred_rate - true_rate)
def choose_clonal_representatives(self, swfo, debug=False):
    """Pick one representative query per clone and return their <self.region> sequences.

    Steps: drop queries with v_5p or j_3p deletions, collapse the rest by naive sequence, and for
    each resulting cluster keep the query with the fewest mutations in <self.other_region>
    (provided it has fewer than self.max_mutations['j']).

    Side effects: sets self.all_j_mutations, self.gene_info, self.mfreqs and self.mean_mfreqs.

    Returns a 2-tuple (qr_seqs, threshold), where <qr_seqs> maps each representative query to its
    sequence and <threshold> is the mean j mutation frequency divided by 1.5. Returns (None, None)
    if there are no full-length queries. (The early exit previously returned three values, which
    could not be unpacked by a caller expecting the normal two.)

    <debug> is currently unused.
    """
    # NOTE do *not* modify <self.glfo> (it needs to stay clear in partitiondriver what happens to it)

    # first remove non-full-length sequences
    full_length_queries = [q for q in swfo['queries'] if swfo[q]['v_5p_del'] == 0 and swfo[q]['j_3p_del'] == 0]
    print(' removing %d/%d sequences with v_5p or j_3p deletions' % (len(swfo['queries']) - len(full_length_queries), len(swfo['queries'])))
    if len(full_length_queries) == 0:
        return None, None

    # then cluster by full-length (v+d+j) naive sequence
    clusters = utils.collapse_naive_seqs(swfo, queries=full_length_queries)

    # then build <qr_seqs> from the least-j-mutated sequence in each cluster (skipping clusters that are too mutated)
    qr_seqs = {}
    self.all_j_mutations = {}
    for cluster in clusters:
        j_mutations = {q : utils.get_n_muted(swfo[q], iseq=0, restrict_to_region=self.other_region) for q in cluster}
        # NOTE choose_cluster_representatives() in allelefinder is somewhat similar
        best_query, smallest_j_mutations = sorted(j_mutations.items(), key=operator.itemgetter(1))[0]
        if smallest_j_mutations < self.max_mutations['j']:
            qr_seqs[best_query] = indelutils.get_qr_seqs_with_indels_reinstated(swfo[best_query], iseq=0)[self.region]
        # keyed by query rather than cluster string, since these naive-seq-collapsed clusters get
        # filtered and then re-clustered later on
        self.all_j_mutations.update(j_mutations)
    print(' collapsed %d input sequences into %d representatives from %d clones (removed %d clones with >= %d j mutations)' % (len(full_length_queries), len(qr_seqs), len(clusters), len(clusters) - len(qr_seqs), self.max_mutations['j']))

    # assigned gene for the representative of each cluster that we used (i.e. *not* every sequence in the sample)
    self.gene_info = {q : swfo[q][self.region + '_gene'] for q in qr_seqs}
    # NOTE only includes cluster representatives, so it's biased towards low overall and low j mutation
    self.mfreqs = {
        'v' : {q : utils.get_mutation_rate(swfo[q], iseq=0, restrict_to_region='v') for q in qr_seqs},
        'j' : {q : utils.get_mutation_rate(swfo[q], iseq=0, restrict_to_region='j') for q in qr_seqs},
    }
    # list() so that numpy gets a real sequence whether .values() returns a list or a view
    self.mean_mfreqs = {r : numpy.mean(list(self.mfreqs[r].values())) for r in self.mfreqs}
    print(' mutation among all cluster representatives: v / j = %6.3f / %6.3f = %6.3f' % (self.mean_mfreqs['v'], self.mean_mfreqs['j'], self.mean_mfreqs['v'] / self.mean_mfreqs['j']))

    assert self.region == 'v'  # need to think about whether this should always be j, or if it should be self.other_region
    j_mfreqs = [utils.get_mutation_rate(swfo[q], iseq=0, restrict_to_region='j') for q in qr_seqs]
    threshold = numpy.mean(j_mfreqs) / 1.5  # v mut freq will be way off for any very different new alleles

    return qr_seqs, threshold
def increment(self, info):
    """Update the mutation-frequency tallies with a single annotation <info>.

    The overall and per-region rates go into the <self.mean_rates> histograms; the observed base
    at every unambiguous position goes into <self.counts>[gene][germline position].
    """
    # first the overall mutation frequency
    overall_freq = utils.get_mutation_rate(self.germline_seqs, info)
    self.mean_rates['all'].fill(overall_freq)

    # then everything that is per-region
    for region in utils.regions:
        region_freq = utils.get_mutation_rate(self.germline_seqs, info, restrict_to_region=region)
        self.mean_rates[region].fill(region_freq)

        gene_key = region + '_gene'
        if info[gene_key] not in self.counts:
            self.counts[info[gene_key]] = {}
        per_site = self.counts[info[gene_key]]  # this gene's position -> counts table

        germline = info[region + '_gl_seq']
        query = info[region + '_qr_seq']
        assert len(germline) == len(query)
        for ipos in range(len(germline)):
            site_index = ipos + int(info[region + '_5p_del'])  # account for left-side deletions in the indexing
            either_ambiguous = germline[ipos] in utils.ambiguous_bases or query[ipos] in utils.ambiguous_bases
            if either_ambiguous:
                continue
            if site_index not in per_site:  # not yet observed in any query sequence: initialize it
                per_site[site_index] = {n : 0 for n in utils.nukes + ['total', ]}
                per_site[site_index]['gl_nuke'] = germline[ipos]
            per_site[site_index]['total'] += 1
            per_site[site_index][query[ipos]] += 1
def increment(self, info):
    """Add one annotation <info> to the mean-rate histograms and per-position counts.

    Besides the per-base counts, each germline position carries a 'tigger' sub-table. When
    <self.tigger> is set and the gene is a V gene, that table records, for each regional mutation
    count <n_mutes>, how many sequences covered the position ('total') and how many were mutated
    there ('muted').

    Raises:
        Exception: if the regional mutation rate times the unambiguous length is not (to within
            floating-point error) an integer.
    """
    self.mean_rates['all'].fill(utils.get_mutation_rate(self.germline_seqs, info))  # mean freq over whole sequence (excluding insertions)

    for region in utils.regions:
        regional_freq, len_excluding_ambig = utils.get_mutation_rate(self.germline_seqs, info, restrict_to_region=region, return_len_excluding_ambig=True)

        # Total number of mutations in the region (for tigger stuff). Round rather than truncate,
        # since freq * length can come out a hair below the integer (e.g. 28.999999999999996), and
        # compare against a small tolerance (the previous 1e6 meant the check could never fire).
        raw_n_mutes = regional_freq * len_excluding_ambig
        n_mutes = int(round(raw_n_mutes))
        if abs(raw_n_mutes - n_mutes) > 1e-6:
            raise Exception('n mutated %f not an integer' % raw_n_mutes)

        self.mean_rates[region].fill(regional_freq)  # per-region mean freq

        # per-gene per-position freqs
        gene = info[region + '_gene']
        if gene not in self.counts:
            self.counts[gene] = {}
        gcounts = self.counts[gene]  # shorthand to avoid long dict access
        track_tigger = self.tigger and utils.get_region(gene) == 'v'  # doesn't change within the position loop

        germline_seq = info[region + '_gl_seq']
        query_seq = info[region + '_qr_seq']
        assert len(germline_seq) == len(query_seq)
        for ipos in range(len(germline_seq)):
            igl = ipos + int(info[region + '_5p_del'])  # account for left-side deletions in the indexing
            if germline_seq[ipos] in utils.ambiguous_bases or query_seq[ipos] in utils.ambiguous_bases:
                continue
            if igl not in gcounts:  # if we have not yet observed this position in a query sequence, initialize it
                gcounts[igl] = {n : 0 for n in utils.nukes + ['total', ]}
                gcounts[igl]['gl_nuke'] = germline_seq[ipos]
                gcounts[igl]['tigger'] = {}
            gcounts[igl]['total'] += 1
            gcounts[igl][query_seq[ipos]] += 1  # note that if <query_seq[ipos]> isn't among <utils.nukes>, this will toss a key error

            if track_tigger:
                tigger_counts = gcounts[igl]['tigger']
                if n_mutes not in tigger_counts:
                    tigger_counts[n_mutes] = {'muted' : 0, 'total' : 0}
                tigger_counts[n_mutes]['total'] += 1
                if query_seq[ipos] != germline_seq[ipos]:  # if this position is mutated
                    tigger_counts[n_mutes]['muted'] += 1  # saw this germline position mutated once in a sequence with <n_mutes> regional mutations
def add_partial_fail(self, true_line, line):
    """Record a partially-failed annotation <line> against the true annotation <true_line>.

    Only the bool columns are scored: a column counts as 'right' if it is present in <line> and
    its value is an allele-level match to the true one, and as 'wrong' otherwise (including when
    the column is missing). The matching *_vs_mute_freq hist is filled with the true mutation rate.
    """
    overall_mute_freq = utils.get_mutation_rate(self.germlines, true_line)  # true value

    for column in self.values:
        if column not in bool_columns:
            continue  # nothing to record for the other columns
        # NOTE the original code points out that a matching comparison elsewhere has to be kept in sync
        matched = column in line and utils.are_alleles(true_line[column], line[column])
        # NOTE the hist lookups toss a KeyError if a bool column that isn't [vdj]_gene is added
        if matched:
            self.values[column]['right'] += 1
            self.hists[column + '_right_vs_mute_freq'].fill(overall_mute_freq)
        else:
            self.values[column]['wrong'] += 1
            self.hists[column + '_wrong_vs_mute_freq'].fill(overall_mute_freq)
def add_partial_fail(self, true_line, line):
    """Record a partially-failed annotation <line> against the true annotation <true_line>.

    Scores only the bool columns that are actually present in <line> (via self.set_bool_column()),
    then records per-gene support for each region that has it.

    NOTE does not fill all the hists ('cause it kind of can't, right?)
    """
    overall_mute_freq = utils.get_mutation_rate(self.germlines, true_line)  # true value

    for column in self.values:
        if column in bool_columns and column in line:
            self.set_bool_column(true_line, line, column, overall_mute_freq)

    # This loop previously referred to <inf_line>, which isn't defined in this method (the
    # inferred annotation is called <line> here), so it raised a NameError.
    for region in utils.regions:
        if region + '_per_gene_support' in line:
            self.set_per_gene_support(true_line, line, region)
def evaluate(self, true_line, inf_line):
    """Compare the inferred annotation <inf_line> with the true annotation <true_line>.

    Bool columns are handled by self.set_bool_column(). For the remaining (integer-valued) columns
    in <self.values>, the count for the difference (inferred - true) is incremented. Afterwards,
    per-gene support is recorded for regions that have it, and the difference in overall mutation
    rate is added to the 'mute_freqs' hist.
    """
    true_mute_freq = utils.get_mutation_rate(true_line, iseq=0)  # true value

    for column in self.values:
        if column in bool_columns:
            # this also sets the fraction-correct-vs-mute-freq hists
            self.set_bool_column(true_line, inf_line, column, true_mute_freq)
        else:
            if column[2:] == '_insertion':  # insertion length
                true_and_guess = (len(true_line[column]), len(inf_line[column]))
            elif 'hamming_to_true_naive' in column:
                region = column.replace('hamming_to_true_naive', '').replace('_', '')
                # true value is zero, so the difference is simply the inferred distance
                true_and_guess = (0, self.hamming_to_true_naive(true_line, inf_line, restrict_to_region=region))
            elif 'muted_bases' in column:
                region = column.replace('muted_bases', '').replace('_', '')
                # iseq=0: for multi-seq hmm output, single-sequence lines are synthesized for each sequence
                true_and_guess = (utils.get_n_muted(true_line, iseq=0, restrict_to_region=region),
                                  utils.get_n_muted(inf_line, iseq=0, restrict_to_region=region))
            else:
                true_and_guess = (int(true_line[column]), int(inf_line[column]))

            trueval, guessval = true_and_guess
            diff = guessval - trueval
            if diff not in self.values[column]:
                self.values[column][diff] = 0
            self.values[column][diff] += 1

    for region in utils.regions:
        support_key = region + '_per_gene_support'
        if support_key in inf_line:
            self.set_per_gene_support(true_line, inf_line, region)

    tmptrueval = utils.get_mutation_rate(true_line, iseq=0, restrict_to_region='')
    tmpguessval = utils.get_mutation_rate(inf_line, iseq=0, restrict_to_region='')
    self.hists['mute_freqs'].fill(tmpguessval - tmptrueval)
def add_partial_fail(self, true_line, line):
    """Record a partially-failed annotation <line> against the true annotation <true_line>.

    Scores only the bool columns that are actually present in <line> (via self.set_bool_column()),
    then records per-gene support for each region that has it.

    NOTE does not fill all the hists ('cause it kind of can't, right?)
    """
    overall_mute_freq = utils.get_mutation_rate(true_line, iseq=0)  # true value

    for column in self.values:
        if column in bool_columns and column in line:
            self.set_bool_column(true_line, line, column, overall_mute_freq)

    # This loop previously referred to <inf_line>, which isn't defined in this method (the
    # inferred annotation is called <line> here), so it raised a NameError.
    for region in utils.regions:
        if region + '_per_gene_support' in line:
            self.set_per_gene_support(true_line, line, region)
def increment(self, info):
    """Add one annotation <info> to the allele-finding counts.

    Passes <info> on to self.mfreqer.increment(), bumps the observation count for each region's
    gene, and makes sure every unambiguous germline position has an entry in <self.counts>[gene].
    For V genes that entry is filled, per regional mutation count <n_mutes>, with how many
    sequences covered the position ('total'), how many were mutated there ('muted'), and which
    base was observed.

    Raises:
        Exception: if the regional mutation rate times the unambiguous length is not (to within
            floating-point error) an integer.
    """
    self.mfreqer.increment(info)

    for region in utils.regions:
        regional_freq, len_excluding_ambig = utils.get_mutation_rate(info, restrict_to_region=region, return_len_excluding_ambig=True)

        # Total number of mutations in the region (for allele finding stuff). Round rather than
        # truncate, since freq * length can land a hair below the integer, and compare against a
        # small tolerance (the previous 1e6 meant the check could never fire).
        raw_n_mutes = regional_freq * len_excluding_ambig
        n_mutes = int(round(raw_n_mutes))
        if abs(raw_n_mutes - n_mutes) > 1e-6:
            raise Exception('n mutated %f not an integer' % raw_n_mutes)

        gene = info[region + '_gene']
        if gene not in self.counts:
            self.counts[gene] = {}
            self.gene_obs_counts[gene] = 0
        self.gene_obs_counts[gene] += 1
        gcts = self.counts[gene]  # shorthand name
        is_v_gene = utils.get_region(gene) == 'v'  # doesn't change within the position loop

        germline_seq = info[region + '_gl_seq']
        query_seq = info[region + '_qr_seq']
        assert len(germline_seq) == len(query_seq)
        for ipos in range(len(germline_seq)):
            igl = ipos + int(info[region + '_5p_del'])  # account for left-side deletions in the indexing
            if germline_seq[ipos] in utils.ambiguous_bases or query_seq[ipos] in utils.ambiguous_bases:
                continue  # skip if either germline or query sequence is ambiguous at this position
            if igl not in gcts:  # if we have not yet observed this position in a query sequence, initialize it
                gcts[igl] = {}
            if not is_v_gene:
                continue

            if n_mutes not in gcts[igl]:
                gcts[igl][n_mutes] = {n : 0 for n in ['muted', 'total'] + utils.nukes}
            site_counts = gcts[igl][n_mutes]
            site_counts['total'] += 1
            if query_seq[ipos] != germline_seq[ipos]:  # if this position is mutated
                site_counts['muted'] += 1  # saw this germline position mutated once in a sequence with <n_mutes> regional mutations
            site_counts[query_seq[ipos]] += 1  # only used to work out what the snp'd base is if there's a new allele