def is_acceptable(scol, acceptable_values, lval):
    """Return True if <lval> is an acceptable value for the column <scol>.

    An exact member of <acceptable_values> always passes. Otherwise <lval> only
    passes when args.any_allele is set, <scol> is a gene column (contains
    '_gene'), and utils.are_alleles() pairs it with at least one acceptable value.
    """
    if lval in acceptable_values:
        return True
    if not (args.any_allele and '_gene' in scol):
        return False
    return any(utils.are_alleles(allowed, lval) for allowed in acceptable_values)
def set_bool_column(self, true_line, inf_line, column, overall_mute_freq):
    """Tally one right/wrong call for <column> and fill the matching '<column>_{right,wrong}_vs_mute_freq' hist.

    NOTE this doesn't require the allele to be correct (it only asks utils.are_alleles()), but set_per_gene_support() does
    NOTE this'll toss a KeyError if you add bool columns that don't have the two '_vs_mute_freq' hists
    """
    verdict = 'right' if utils.are_alleles(true_line[column], inf_line[column]) else 'wrong'
    self.values[column][verdict] += 1
    self.hists[column + '_' + verdict + '_vs_mute_freq'].fill(overall_mute_freq)
def evaluate(self, true_line, inf_line):
    """Compare the inferred annotation <inf_line> to the true one <true_line>, updating self.values and self.hists.

    For each column in self.values:
      - bool columns get a 'right'/'wrong' tally (allele-level agreement is enough)
      - '<xx>_insertion' columns compare insertion lengths
      - 'hamming_to_true_naive' columns use self.hamming_distance_to_true_naive() (true value is taken as zero)
      - everything else is compared as ints
    and for the non-bool columns the count for (inferred - true) is incremented.
    Every hist in self.hists is then filled with the difference in mutation rate.
    """
    for column in self.values:
        tally = self.values[column]

        if column in bool_columns:
            # NOTE the other places that score bool columns need to use the same criterion as this
            verdict = 'right' if utils.are_alleles(true_line[column], inf_line[column]) else 'wrong'
            tally[verdict] += 1
            continue

        if column[2:] == '_insertion':  # insertion length
            trueval, guessval = len(true_line[column]), len(inf_line[column])
        elif 'hamming_to_true_naive' in column:
            # NOTE this is a kind of weird way to do it, since diff ends up as really just the guessval, but it does the job
            trueval = 0
            region_to_use = column[0].replace('h', '')  # if first char in <column> is not an 'h', restrict to that region
            guessval = self.hamming_distance_to_true_naive(true_line, inf_line, inf_line['unique_id'],
                                                           restrict_to_region=region_to_use,
                                                           normalize='_norm' in column)
        else:
            trueval, guessval = int(true_line[column]), int(inf_line[column])

        diff = guessval - trueval
        if diff not in tally:
            tally[diff] = 0
        tally[diff] += 1

    for column in self.hists:
        true_rate = utils.get_mutation_rate(self.germlines, true_line)
        inferred_rate = utils.get_mutation_rate(self.germlines, inf_line)
        self.hists[column].fill(inferred_rate - true_rate)
def figure_out_which_damn_gene(germline_seqs, gene_name, seq, debug=False): region = utils.get_region(gene_name) seq = seq.replace(' ', '') if gene_name in germline_seqs[region]: # already have it, but maybe when we added it before it was a shorter match, so substitute with the new longer match if len(seq) > len(germline_seqs[region][gene_name]): print ' gl match longer than gl!' print ' ', seq print ' ', germline_seqs[region][gene_name] germline_seqs[region][gene_name] = seq return gene_name candidates = [] # if it doesn't specify an allele, see if any of the alleles we've got have the same sequence in the match region if gene_name.find('*') == -1: for candidate_gene in germline_seqs[region]: if candidate_gene.find(gene_name) == 0: if seq in germline_seqs[region][candidate_gene]: candidates.append(candidate_gene) # if it *does* specify an allele, see if any of the other allele have the same sequence in the match region if len(candidates) == 0: # didn't find anything... try other alleles for candidate_gene in germline_seqs[region]: if utils.are_alleles(candidate_gene, gene_name): if seq in germline_seqs[region][candidate_gene]: candidates.append(candidate_gene) # sometimes it's 3-9, but sometimes 3-09. *grrrrrr*. if len(candidates) == 0: for candidate_gene in germline_seqs[region]: if gene_name.replace('-0', '-') == candidate_gene: if seq in germline_seqs[region][candidate_gene]: candidates.append(candidate_gene) # try adding _F and _P to the end of j names if len(candidates) == 0: for candidate_gene in germline_seqs[region]: if gene_name + '_F' == candidate_gene or gene_name + '_P' == candidate_gene: if seq[ : len(germline_seqs[region][candidate_gene])] in germline_seqs[region][candidate_gene]: # shorten <seq> to account for extra bases on right of imgt j versions candidates.append(candidate_gene) # try removing the darn R at the end (and remove the zero). 
I hope it doesn't mean anything important if len(candidates) == 0: for candidate_gene in germline_seqs[region]: if gene_name.replace('R', '').replace('-0', '-') == candidate_gene: if seq in germline_seqs[region][candidate_gene]: candidates.append(candidate_gene) if len(candidates) == 0: print ' ERROR didn\'t find jack for', gene_name, seq assert False # elif len(candidates) > 1: # print 'NOTE found',len(candidates),'candidates, just using the first one' if debug: print ' swapping', gene_name, '-->', candidates[0] return candidates[0]
def add_partial_fail(self, true_line, line):
    """Score a partially failed annotation <line> against <true_line>.

    Only bool columns are tallied. A column that's missing from <line> counts as
    'wrong', otherwise it's 'right' if utils.are_alleles() accepts the pair.
    Non-bool columns are ignored.
    """
    for column in self.values:
        if column not in bool_columns:
            continue
        # NOTE the scoring of fully successful annotations has to use the same criterion as this
        got_it = column in line and utils.are_alleles(true_line[column], line[column])
        self.values[column]['right' if got_it else 'wrong'] += 1
def add_partial_fail(self, true_line, line):
    """Score a partially failed annotation <line> against <true_line>.

    Only bool columns are tallied: missing from <line> counts as 'wrong', otherwise
    'right' if utils.are_alleles() accepts the pair. The matching
    '<column>_{right,wrong}_vs_mute_freq' hist is filled with the true mutation rate.
    NOTE this'll toss a KeyError if you add bool columns that don't have those hists
    """
    true_mute_freq = utils.get_mutation_rate(self.germlines, true_line)  # true value
    for column in self.values:
        if column not in bool_columns:
            continue
        # NOTE the scoring of fully successful annotations has to use the same criterion as this
        if column in line and utils.are_alleles(true_line[column], line[column]):
            verdict = 'right'
        else:
            verdict = 'wrong'
        self.values[column][verdict] += 1
        self.hists[column + '_' + verdict + '_vs_mute_freq'].fill(true_mute_freq)
def evaluate(self, true_line, inf_line, padfo=None):
    """Compare the inferred annotation <inf_line> to the true one <true_line>, updating self.values and self.hists.

    For each column in self.values (only bool columns if self.only_correct_gene_fractions is set):
      - bool columns get a 'right'/'wrong' tally, and fill the matching '_vs_mute_freq' hist with the true mutation rate
      - '<xx>_insertion' columns compare insertion lengths
      - 'hamming_to_true_naive' columns use self.hamming_distance_to_true_naive() (<padfo> is passed through to it)
      - CHANGES FOR MIXCR: any other column makes the whole function return immediately
    The remaining hists (those without '_vs_mute_freq' in their name) are then filled with
    the difference in mutation rate, restricted to a region if the hist name contains
    exactly one of 'v_', 'd_', 'j_'.

    The true mutation rate used for the bool columns was commented out along with the
    MIXCR changes while still being used, which is a NameError. It's now worked out the
    first time a bool column needs it, the same way add_partial_fail() does.
    """
    overall_mute_freq = None  # true value, filled in when the first bool column needs it
    for column in self.values:
        if self.only_correct_gene_fractions and column not in bool_columns:
            continue

        if column in bool_columns:
            if overall_mute_freq is None:
                overall_mute_freq = utils.get_mutation_rate(self.germlines, true_line)
            # NOTE add_partial_fail() has to use the same criterion as this
            verdict = 'right' if utils.are_alleles(true_line[column], inf_line[column]) else 'wrong'
            self.values[column][verdict] += 1
            self.hists[column + '_' + verdict + '_vs_mute_freq'].fill(overall_mute_freq)  # NOTE this'll toss a KeyError if you add bool columns that aren't [vdj]_gene
            continue

        if column[2:] == '_insertion':  # insertion length
            trueval = len(true_line[column])
            guessval = len(inf_line[column])
        elif 'hamming_to_true_naive' in column:
            trueval = 0  # NOTE this is a kind of weird way to do it, since diff ends up as really just the guessval, but it does the job
            restrict_to_region = column[0].replace('h', '')  # if first char in <column> is not an 'h', restrict to that region
            normalize = '_norm' in column
            guessval = self.hamming_distance_to_true_naive(true_line, inf_line, inf_line['unique_id'],
                                                           restrict_to_region=restrict_to_region,
                                                           normalize=normalize, padfo=padfo)
        else:
            # CHANGES FOR MIXCR: used to compare int(true_line[column]) to int(inf_line[column])
            # NOTE(review): this skips all remaining columns *and* the hist filling below, so the result depends on
            # the iteration order of self.values -- confirm that 'continue' wasn't what was meant
            return

        diff = guessval - trueval
        if diff not in self.values[column]:
            self.values[column][diff] = 0
        self.values[column][diff] += 1

    for column in self.hists:
        if '_vs_mute_freq' in column:  # fill these above
            continue
        region_tags = re.findall('[vdj]_', column)
        region = region_tags[0][0] if len(region_tags) == 1 else ''
        trueval = utils.get_mutation_rate(self.germlines, true_line, restrict_to_region=region)
        guessval = utils.get_mutation_rate(self.germlines, inf_line, restrict_to_region=region)
        self.hists[column].fill(guessval - trueval)
def parse_detail(self, fk, unique_id): assert fk.iline < len(fk.lines) while fk.line[1] != "Details": fk.increment() if fk.eof: return fk.increment() info = {} info["unique_id"] = unique_id for begin_line, column, index, required, default in line_order: if fk.line[0].find(begin_line) != 0: if required: print "oop", begin_line, fk.line sys.exit() else: info[column] = default continue if column != "": info[column] = clean_value(column, fk.line[index]) # if '[' in info[column]: # print 'added', column, clean_value(column, fk.line[index]) if column.find("_gene") == 1: region = column[0] info[region + "_5p_del"] = ( int(fk.line[fk.line.index("start:") + 1]) - 1 ) # NOTE their indices are 1-based gl_length = int(fk.line[fk.line.index("gene:") + 1]) - 1 match_end = int(fk.line[fk.line.index("end:") + 1]) - 1 assert gl_length >= match_end info[region + "_3p_del"] = gl_length - match_end fk.increment() if unique_id not in self.sim_need: while not fk.eof and fk.line[1] != "Details": # skip stuff until start of next Detail block fk.increment() return info["fv_insertion"] = "" info["jf_insertion"] = "" info["seq"] = ( info["v_qr_seq"] + info["vd_insertion"] + info["d_qr_seq"] + info["dj_insertion"] + info["j_qr_seq"] ) if "-" in info["seq"]: print "ERROR found a dash in %s, returning failure" % unique_id while not fk.eof and fk.line[1] != "Details": # skip stuff until start of next Detail block fk.increment() return if ( info["seq"] not in self.siminfo[unique_id]["seq"] ): # arg. 
I can't do != because it tacks on v left and j right deletions print "ERROR didn't find the right sequence for %s" % unique_id print " ", info["seq"] print " ", self.siminfo[unique_id]["seq"] sys.exit() if self.args.debug: print unique_id for region in utils.regions: infer_gene = info[region + "_gene"] true_gene = self.siminfo[unique_id][region + "_gene"] if utils.are_alleles(infer_gene, true_gene): regionstr = utils.color("bold", utils.color("blue", region)) truestr = "" #'(originally %s)' % match_name else: regionstr = utils.color("bold", utils.color("red", region)) truestr = "(true: %s)" % utils.color_gene(true_gene).replace(region, "") print " %s %s %s" % (regionstr, utils.color_gene(infer_gene).replace(region, ""), truestr) utils.print_reco_event(self.germline_seqs, self.siminfo[unique_id], label="true:", extra_str=" ") utils.print_reco_event(self.germline_seqs, info, label="inferred:", extra_str=" ") for region in utils.regions: if info[region + "_gene"] not in self.germline_seqs[region]: print "ERROR %s not in germlines" % info[region + "_gene"] assert False gl_seq = info[region + "_gl_seq"] if "[" in gl_seq: # ambiguous for nuke in utils.nukes: gl_seq = gl_seq.replace("[", nuke) if gl_seq in self.germline_seqs[region][info[region + "_gene"]]: print " replaced [ with %s" % nuke break info[region + "_gl_seq"] = gl_seq if info[region + "_gl_seq"] not in self.germline_seqs[region][info[region + "_gene"]]: print "ERROR gl match not found for %s in %s" % (info[region + "_gene"], unique_id) print " ", info[region + "_gl_seq"] print " ", self.germline_seqs[region][info[region + "_gene"]] self.perfplotter.add_partial_fail(self.siminfo[unique_id], info) while not fk.eof and fk.line[1] != "Details": # skip stuff until start of next Detail block fk.increment() return self.perfplotter.evaluate(self.siminfo[unique_id], info) self.details[unique_id] = info self.sim_need.remove(unique_id) while not fk.eof and fk.line[1] != "Details": # skip stuff until start of next Detail 
block fk.increment()
def process_query(self, qr_info, query_name, query_lines): # split query_lines up into blocks blocks = [] for line in query_lines: if line.find('Query_') == 0: blocks.append([]) if len(line) == 0: continue if len(re.findall('<a name=#_[0-9][0-9]*_IGH', line)) == 0 and line.find('Query_') != 0: continue if len(blocks) == 0: print 'wtf? %s' % query_name # it's probably kicking a reverse match self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) # NOTE that's really a total failure self.n_partially_failed += 1 return blocks[-1].append(line) # then process each block for block in blocks: self.process_single_block(block, query_name, qr_info) if 'skip_gene' in qr_info: self.n_skipped += 1 return if 'fail' in qr_info: self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return for region in utils.regions: if region + '_gene' not in qr_info: print ' %d: no %s match' % (query_name, region) self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return # expand v match to left end and j match to right end qr_info['v_5p_del'] = 0 qr_info['fv_insertion'] = '' if qr_info['match_start'] > 0: if self.args.debug: print ' add to v left:', self.seqinfo[query_name]['seq'][ : qr_info['match_start']] qr_info['seq'] = self.seqinfo[query_name]['seq'][ : qr_info['match_start']] + qr_info['seq'] qr_info['j_3p_del'] = 0 qr_info['jf_insertion'] = '' if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']: if self.args.debug: print ' add to j right:', self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ] qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ] for boundary in utils.boundaries: start = qr_info[boundary[0] + '_qr_bounds'][1] end = qr_info[boundary[1] + '_qr_bounds'][0] qr_info[boundary + '_insertion'] = qr_info['seq'][start : end] for region in 
utils.regions: start = qr_info[region + '_qr_bounds'][0] end = qr_info[region + '_qr_bounds'][1] qr_info[region + '_qr_seq'] = qr_info['seq'][start : end] try: resolve_overlapping_matches(qr_info, self.args.debug, self.germline_seqs) except AssertionError: print ' %s: apportionment failed' % query_name self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return if self.args.debug: print ' query seq:', qr_info['seq'] for region in utils.regions: true_gene = self.seqinfo[query_name][region + '_gene'] infer_gene = qr_info[region + '_gene'] if utils.are_alleles(infer_gene, true_gene): regionstr = utils.color('bold', utils.color('blue', region)) truestr = '' #'(originally %s)' % match_name else: regionstr = utils.color('bold', utils.color('red', region)) truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '') # print ' %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr) print ' %s %3d %3d %s %s %s' % (regionstr, qr_info[region + '_qr_bounds'][0], qr_info[region + '_qr_bounds'][1], utils.color_gene(infer_gene).replace(region, ''), truestr, qr_info[region + '_gl_seq']) for boundary in utils.boundaries: start = qr_info[boundary[0] + '_qr_bounds'][1] end = qr_info[boundary[1] + '_qr_bounds'][0] qr_info[boundary + '_insertion'] = qr_info['seq'][start : end] if self.args.debug: print ' ', boundary, qr_info[boundary + '_insertion'] self.perfplotter.evaluate(self.seqinfo[query_name], qr_info) # for key, val in qr_info.items(): # print key, val if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[query_name], label='true:', extra_str=' ') utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')
def figure_out_which_damn_gene(germline_seqs, gene_name, seq, debug=False): region = utils.get_region(gene_name) seq = seq.replace(' ', '') if gene_name in germline_seqs[ region]: # already have it, but maybe when we added it before it was a shorter match, so substitute with the new longer match if len(seq) > len(germline_seqs[region][gene_name]): print ' gl match longer than gl!' print ' ', seq print ' ', germline_seqs[region][gene_name] germline_seqs[region][gene_name] = seq return gene_name candidates = [] # if it doesn't specify an allele, see if any of the alleles we've got have the same sequence in the match region if gene_name.find('*') == -1: for candidate_gene in germline_seqs[region]: if candidate_gene.find(gene_name) == 0: if seq in germline_seqs[region][candidate_gene]: candidates.append(candidate_gene) # if it *does* specify an allele, see if any of the other allele have the same sequence in the match region if len(candidates) == 0: # didn't find anything... try other alleles for candidate_gene in germline_seqs[region]: if utils.are_alleles(candidate_gene, gene_name): if seq in germline_seqs[region][candidate_gene]: candidates.append(candidate_gene) # sometimes it's 3-9, but sometimes 3-09. *grrrrrr*. if len(candidates) == 0: for candidate_gene in germline_seqs[region]: if gene_name.replace('-0', '-') == candidate_gene: if seq in germline_seqs[region][candidate_gene]: candidates.append(candidate_gene) # try adding _F and _P to the end of j names if len(candidates) == 0: for candidate_gene in germline_seqs[region]: if gene_name + '_F' == candidate_gene or gene_name + '_P' == candidate_gene: if seq[:len( germline_seqs[region][candidate_gene] )] in germline_seqs[region][ candidate_gene]: # shorten <seq> to account for extra bases on right of imgt j versions candidates.append(candidate_gene) # try removing the darn R at the end (and remove the zero). 
I hope it doesn't mean anything important if len(candidates) == 0: for candidate_gene in germline_seqs[region]: if gene_name.replace('R', '').replace('-0', '-') == candidate_gene: if seq in germline_seqs[region][candidate_gene]: candidates.append(candidate_gene) if len(candidates) == 0: print ' ERROR didn\'t find jack for', gene_name, seq assert False # elif len(candidates) > 1: # print 'NOTE found',len(candidates),'candidates, just using the first one' if debug: print ' swapping', gene_name, '-->', candidates[0] return candidates[0]
def process_query(self, qr_info, query_name, query_lines): # split query_lines up into blocks blocks = [] for line in query_lines: if line.find('Query_') == 0: blocks.append([]) if len(line) == 0: continue if len(re.findall('<a name=#_[0-9][0-9]*_IGH', line)) == 0 and line.find('Query_') != 0: continue if len(blocks) == 0: print 'wtf? %s' % query_name # it's probably kicking a reverse match self.perfplotter.add_partial_fail( self.seqinfo[query_name], qr_info) # NOTE that's really a total failure self.n_partially_failed += 1 return blocks[-1].append(line) # then process each block for block in blocks: self.process_single_block(block, query_name, qr_info) if 'skip_gene' in qr_info: self.n_skipped += 1 return if 'fail' in qr_info: self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return for region in utils.regions: if region + '_gene' not in qr_info: print ' %d: no %s match' % (query_name, region) self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return # expand v match to left end and j match to right end qr_info['v_5p_del'] = 0 qr_info['fv_insertion'] = '' if qr_info['match_start'] > 0: if self.args.debug: print ' add to v left:', self.seqinfo[query_name][ 'seq'][:qr_info['match_start']] qr_info['seq'] = self.seqinfo[query_name][ 'seq'][:qr_info['match_start']] + qr_info['seq'] qr_info['j_3p_del'] = 0 qr_info['jf_insertion'] = '' if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']: if self.args.debug: print ' add to j right:', self.seqinfo[query_name][ 'seq'][qr_info['match_end'] - len(self.seqinfo[query_name]['seq']):] qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']):] for boundary in utils.boundaries: start = qr_info[boundary[0] + '_qr_bounds'][1] end = qr_info[boundary[1] + '_qr_bounds'][0] qr_info[boundary + '_insertion'] = qr_info['seq'][start:end] for region in utils.regions: 
start = qr_info[region + '_qr_bounds'][0] end = qr_info[region + '_qr_bounds'][1] qr_info[region + '_qr_seq'] = qr_info['seq'][start:end] try: resolve_overlapping_matches(qr_info, self.args.debug, self.germline_seqs) except AssertionError: print ' %s: apportionment failed' % query_name self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return if self.args.debug: print ' query seq:', qr_info['seq'] for region in utils.regions: true_gene = self.seqinfo[query_name][region + '_gene'] infer_gene = qr_info[region + '_gene'] if utils.are_alleles(infer_gene, true_gene): regionstr = utils.color('bold', utils.color('blue', region)) truestr = '' #'(originally %s)' % match_name else: regionstr = utils.color('bold', utils.color('red', region)) truestr = '(true: %s)' % utils.color_gene( true_gene).replace(region, '') # print ' %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr) print ' %s %3d %3d %s %s %s' % ( regionstr, qr_info[region + '_qr_bounds'][0], qr_info[region + '_qr_bounds'][1], utils.color_gene(infer_gene).replace( region, ''), truestr, qr_info[region + '_gl_seq']) for boundary in utils.boundaries: start = qr_info[boundary[0] + '_qr_bounds'][1] end = qr_info[boundary[1] + '_qr_bounds'][0] qr_info[boundary + '_insertion'] = qr_info['seq'][start:end] if self.args.debug: print ' ', boundary, qr_info[boundary + '_insertion'] self.perfplotter.evaluate(self.seqinfo[query_name], qr_info) # for key, val in qr_info.items(): # print key, val if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[query_name], label='true:', extra_str=' ') utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')
def parse_query_text(self, unique_id, query_info): if len(query_info ) == 0: # one for the query sequence, then one for v, d, and j print 'no info for', unique_id return {} elif len(query_info) < 4: regions_ok = '' for info in query_info: for region in utils.regions: if 'IGH' + region.upper() in info: regions_ok += region for region in utils.regions: if region not in regions_ok: print ' ERROR no %s matches' % region return {} assert False # shouldn't get here elif len(query_info) != 4: print 'info for', unique_id, 'all messed up' for info in query_info: print info sys.exit() full_qr_seq = query_info[0].replace('>', '').replace( unique_id, '') # strip off the unique id full_qr_seq = ''.join(full_qr_seq.split()).upper( ) # strip off white space and uppercase it assert full_qr_seq == self.seqinfo[unique_id]['seq'] line = {} line['unique_id'] = unique_id line['seq'] = full_qr_seq for ireg in range(len(utils.regions)): region = utils.regions[ireg] info = query_info[ireg + 1].splitlines() while unique_id not in info[ 0]: # remove the line marking cdr3 and framework regions info.pop(0) if len(info) <= 1: print info assert len(info) > 1 assert len(info[0].split()) == 2 qr_seq = info[0].split()[1].upper( ) # this line should be '<unique_id> .............<query_seq>' true_gene = self.seqinfo[unique_id][region + '_gene'] imatch = 1 # which match to take match_name = str(info[imatch].split()[2]) while match_name in just_always_friggin_skip and len( info) > imatch + 1 and len(info[imatch + 1].split()) > 2: imatch += 1 old_one = match_name match_name = str(info[imatch].split()[2]) if self.args.debug: print ' %s: taking next match: %s --> %s)' % ( unique_id, utils.color_gene(old_one), utils.color_gene(match_name)) infer_gene = match_name for gset in equivalent_genes: if match_name in gset and true_gene in gset and match_name != true_gene: # if the true gene and the inferred gene are in the same equivalence set, treat it as correct, i.e. 
just pretend it inferred the right name if self.args.debug: print ' %s: replacing name %s with true name %s' % ( unique_id, match_name, true_gene) infer_gene = true_gene # ---------------------------------------------------------------------------------------- # skipping bullshit def skip_gene(gene): print ' %s in list of genes to skip' % utils.color_gene( gene) if gene not in genes_actually_skipped: genes_actually_skipped[gene] = 0 genes_actually_skipped[gene] += 1 line['skip_gene'] = True if infer_gene not in self.germline_seqs[region]: print ' couldn\'t find %s in germlines (skipping)' % infer_gene skip_gene(infer_gene) return line if infer_gene in just_always_friggin_skip: skip_gene(infer_gene) return line if true_gene in just_always_friggin_skip: skip_gene(true) return line if not self.args.dont_skip_or15_genes and '/OR1' in true_gene: skip_gene(true_gene) return line if self.args.skip_missing_genes: if infer_gene in genes_to_skip: skip_gene(infer_gene) return line if true_gene in genes_to_skip: skip_gene(true_gene) return line gl_seq = info[imatch].split()[4].upper() if qr_seq.replace('.', '') not in self.seqinfo[unique_id]['seq']: # if self.args.debug: print ' qr_seq not found in seqinfo' line['failed'] = True return line if self.args.debug: if utils.are_alleles(infer_gene, true_gene): regionstr = utils.color('bold', utils.color('blue', region)) truestr = '(originally %s)' % match_name else: regionstr = utils.color('bold', utils.color('red', region)) truestr = '(true: %s)' % utils.color_gene( true_gene).replace(region, '') print ' %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace( region, ''), truestr) print ' gl', gl_seq print ' ', qr_seq # replace the dots (gaps) in the gl match new_qr_seq, new_gl_seq = [], [] for inuke in range(min(len(qr_seq), len(gl_seq))): if gl_seq[inuke] == '.': pass else: new_qr_seq.append( qr_seq[inuke] ) # this should only be out of range if the v match extends through the whole query sequence, i.e. 
friggin never new_gl_seq.append(gl_seq[inuke]) for inuke in range(len(gl_seq), len(qr_seq)): new_qr_seq.append(qr_seq[inuke]) for inuke in range(len(qr_seq), len(gl_seq)): new_gl_seq.append(gl_seq[inuke]) qr_seq = ''.join(new_qr_seq) gl_seq = ''.join(new_gl_seq) # work out the erosions qr_ldots = qr_seq.rfind( '.') + 1 # first strip off any dots on the left of query seq qr_seq = qr_seq[qr_ldots:] gl_seq = gl_seq[qr_ldots:] gl_ldots = gl_seq.rfind( '.') + 1 # then remove dots on the left of the germline seq qr_seq = qr_seq[gl_ldots:] gl_seq = gl_seq[gl_ldots:] del_5p = qr_ldots + gl_ldots jf_insertion = '' if region == 'j': jf_insertion = qr_seq[len(gl_seq):] qr_seq = qr_seq[:len( gl_seq )] # then strip the right-hand portion of the query sequence that isn't aligned to the germline del_3p = len(gl_seq) - len( qr_seq ) # then do the same for the germline overhanging on the right of the query gl_seq = gl_seq[:len(qr_seq)] assert len(gl_seq) == len(qr_seq) new_gl_seq = [] for inuke in range(len(gl_seq)): # replace dashes (matched bases) assert gl_seq[inuke] != '.' # hoping there's no gaps in here if gl_seq[inuke] == '-': new_gl_seq.append(qr_seq[inuke]) else: new_gl_seq.append(gl_seq[inuke]) gl_seq = ''.join(new_gl_seq) if self.germline_seqs[region][infer_gene].find( gl_seq ) != del_5p: # why the *@*!! can't they make this consistent? 
if self.germline_seqs[region][infer_gene].find(gl_seq) < 0: print 'whooooaa' print self.germline_seqs[region][infer_gene] print gl_seq line['failed'] = True return line del_5p += self.germline_seqs[region][infer_gene].find(gl_seq) try: assert del_5p + len(gl_seq) + del_3p + len( jf_insertion) == len( self.germline_seqs[region][infer_gene]) except: print ' ERROR lengths failed for %s' % unique_id # print del_5p, len(gl_seq), del_3p, del_5p + len(gl_seq) + del_3p , len(self.germline_seqs[region][infer_gene]) # print gl_seq # print self.germline_seqs[region][infer_gene] line['failed'] = True return line # assert False if self.args.debug: utils.color_mutants(gl_seq, qr_seq, ref_label='gl ', extra_str=' ', print_result=True, post_str=' del: %d %d' % (del_5p, del_3p)) # try: # infer_gene = joinparser.figure_out_which_damn_gene(self.germline_seqs, infer_gene, gl_seq, debug=self.args.debug) # except: # print 'ERROR couldn\'t figure out the gene for %s' % infer_gene # return {} line[region + '_gene'] = infer_gene line[region + '_qr_seq'] = qr_seq line[region + '_gl_seq'] = gl_seq line[region + '_5p_del'] = del_5p line[region + '_3p_del'] = del_3p if region == 'j': line['jf_insertion'] = jf_insertion return line
def parse_detail(self, fk, unique_id): assert fk.iline < len(fk.lines) while fk.line[1] != 'Details': fk.increment() if fk.eof: return fk.increment() info = {} info['unique_id'] = unique_id for begin_line, column, index, required, default in line_order: if fk.line[0].find(begin_line) != 0: if required: print 'oop', begin_line, fk.line sys.exit() else: info[column] = default continue if column != '': info[column] = clean_value(column, fk.line[index]) # if '[' in info[column]: # print 'added', column, clean_value(column, fk.line[index]) if column.find('_gene') == 1: region = column[0] info[region + '_5p_del'] = int(fk.line[fk.line.index('start:') + 1]) - 1 # NOTE their indices are 1-based gl_length = int(fk.line[fk.line.index('gene:') + 1]) - 1 match_end = int(fk.line[fk.line.index('end:') + 1]) - 1 assert gl_length >= match_end info[region + '_3p_del'] = gl_length - match_end fk.increment() if unique_id not in self.sim_need: while not fk.eof and fk.line[1] != 'Details': # skip stuff until start of next Detail block fk.increment() return info['fv_insertion'] = '' info['jf_insertion'] = '' info['seq'] = info['v_qr_seq'] + info['vd_insertion'] + info['d_qr_seq'] + info['dj_insertion'] + info['j_qr_seq'] if '-' in info['seq']: print 'ERROR found a dash in %s, returning failure' % unique_id while not fk.eof and fk.line[1] != 'Details': # skip stuff until start of next Detail block fk.increment() return if info['seq'] not in self.siminfo[unique_id]['seq']: # arg. 
I can't do != because it tacks on v left and j right deletions print 'ERROR didn\'t find the right sequence for %s' % unique_id print ' ', info['seq'] print ' ', self.siminfo[unique_id]['seq'] sys.exit() if self.args.debug: print unique_id for region in utils.regions: infer_gene = info[region + '_gene'] true_gene = self.siminfo[unique_id][region + '_gene'] if utils.are_alleles(infer_gene, true_gene): regionstr = utils.color('bold', utils.color('blue', region)) truestr = '' #'(originally %s)' % match_name else: regionstr = utils.color('bold', utils.color('red', region)) truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '') print ' %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr) utils.print_reco_event(self.germline_seqs, self.siminfo[unique_id], label='true:', extra_str=' ') utils.print_reco_event(self.germline_seqs, info, label='inferred:', extra_str=' ') for region in utils.regions: if info[region + '_gene'] not in self.germline_seqs[region]: print 'ERROR %s not in germlines' % info[region + '_gene'] assert False gl_seq = info[region + '_gl_seq'] if '[' in gl_seq: # ambiguous for nuke in utils.nukes: gl_seq = gl_seq.replace('[', nuke) if gl_seq in self.germline_seqs[region][info[region + '_gene']]: print ' replaced [ with %s' % nuke break info[region + '_gl_seq'] = gl_seq if info[region + '_gl_seq'] not in self.germline_seqs[region][info[region + '_gene']]: print 'ERROR gl match not found for %s in %s' % (info[region + '_gene'], unique_id) print ' ', info[region + '_gl_seq'] print ' ', self.germline_seqs[region][info[region + '_gene']] self.perfplotter.add_partial_fail(self.siminfo[unique_id], info) while not fk.eof and fk.line[1] != 'Details': # skip stuff until start of next Detail block fk.increment() return self.perfplotter.evaluate(self.siminfo[unique_id], info) self.details[unique_id] = info self.sim_need.remove(unique_id) while not fk.eof and fk.line[1] != 'Details': # skip stuff until start of next 
Detail block fk.increment()
def parse_query_text(self, unique_id, query_info): if len(query_info) == 0: # one for the query sequence, then one for v, d, and j print 'no info for',unique_id return {} elif len(query_info) < 4: regions_ok = '' for info in query_info: for region in utils.regions: if 'IGH' + region.upper() in info: regions_ok += region for region in utils.regions: if region not in regions_ok: print ' ERROR no %s matches' % region return {} assert False # shouldn't get here elif len(query_info) != 4: print 'info for', unique_id, 'all messed up' for info in query_info: print info sys.exit() full_qr_seq = query_info[0].replace('>', '').replace(unique_id, '') # strip off the unique id full_qr_seq = ''.join(full_qr_seq.split()).upper() # strip off white space and uppercase it assert full_qr_seq == self.seqinfo[unique_id]['seq'] line = {} line['unique_id'] = unique_id line['seq'] = full_qr_seq for ireg in range(len(utils.regions)): region = utils.regions[ireg] info = query_info[ireg + 1].splitlines() while unique_id not in info[0]: # remove the line marking cdr3 and framework regions info.pop(0) if len(info) <= 1: print info assert len(info) > 1 assert len(info[0].split()) == 2 qr_seq = info[0].split()[1].upper() # this line should be '<unique_id> .............<query_seq>' true_gene = self.seqinfo[unique_id][region + '_gene'] imatch = 1 # which match to take match_name = str(info[imatch].split()[2]) while match_name in just_always_friggin_skip and len(info) > imatch+1 and len(info[imatch+1].split()) > 2: imatch += 1 old_one = match_name match_name = str(info[imatch].split()[2]) if self.args.debug: print ' %s: taking next match: %s --> %s)' % (unique_id, utils.color_gene(old_one), utils.color_gene(match_name)) infer_gene = match_name for gset in equivalent_genes: if match_name in gset and true_gene in gset and match_name != true_gene: # if the true gene and the inferred gene are in the same equivalence set, treat it as correct, i.e. 
just pretend it inferred the right name if self.args.debug: print ' %s: replacing name %s with true name %s' % (unique_id, match_name, true_gene) infer_gene = true_gene # ---------------------------------------------------------------------------------------- # skipping bullshit def skip_gene(gene): print ' %s in list of genes to skip' % utils.color_gene(gene) if gene not in genes_actually_skipped: genes_actually_skipped[gene] = 0 genes_actually_skipped[gene] += 1 line['skip_gene'] = True if infer_gene not in self.germline_seqs[region]: print ' couldn\'t find %s in germlines (skipping)' % infer_gene skip_gene(infer_gene) return line if infer_gene in just_always_friggin_skip: skip_gene(infer_gene) return line if true_gene in just_always_friggin_skip: skip_gene(true) return line if not self.args.dont_skip_or15_genes and '/OR1' in true_gene: skip_gene(true_gene) return line if self.args.skip_missing_genes: if infer_gene in genes_to_skip: skip_gene(infer_gene) return line if true_gene in genes_to_skip: skip_gene(true_gene) return line gl_seq = info[imatch].split()[4].upper() if qr_seq.replace('.', '') not in self.seqinfo[unique_id]['seq']: # if self.args.debug: print ' qr_seq not found in seqinfo' line['failed'] = True return line if self.args.debug: if utils.are_alleles(infer_gene, true_gene): regionstr = utils.color('bold', utils.color('blue', region)) truestr = '(originally %s)' % match_name else: regionstr = utils.color('bold', utils.color('red', region)) truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '') print ' %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr) print ' gl', gl_seq print ' ', qr_seq # replace the dots (gaps) in the gl match new_qr_seq, new_gl_seq = [], [] for inuke in range(min(len(qr_seq), len(gl_seq))): if gl_seq[inuke] == '.': pass else: new_qr_seq.append(qr_seq[inuke]) # this should only be out of range if the v match extends through the whole query sequence, i.e. 
friggin never new_gl_seq.append(gl_seq[inuke]) for inuke in range(len(gl_seq), len(qr_seq)): new_qr_seq.append(qr_seq[inuke]) for inuke in range(len(qr_seq), len(gl_seq)): new_gl_seq.append(gl_seq[inuke]) qr_seq = ''.join(new_qr_seq) gl_seq = ''.join(new_gl_seq) # work out the erosions qr_ldots = qr_seq.rfind('.') + 1 # first strip off any dots on the left of query seq qr_seq = qr_seq[qr_ldots : ] gl_seq = gl_seq[qr_ldots : ] gl_ldots = gl_seq.rfind('.') + 1 # then remove dots on the left of the germline seq qr_seq = qr_seq[gl_ldots : ] gl_seq = gl_seq[gl_ldots : ] del_5p = qr_ldots + gl_ldots jf_insertion = '' if region == 'j': jf_insertion = qr_seq[len(gl_seq) : ] qr_seq = qr_seq[ : len(gl_seq)] # then strip the right-hand portion of the query sequence that isn't aligned to the germline del_3p = len(gl_seq) - len(qr_seq) # then do the same for the germline overhanging on the right of the query gl_seq = gl_seq[ : len(qr_seq)] assert len(gl_seq) == len(qr_seq) new_gl_seq = [] for inuke in range(len(gl_seq)): # replace dashes (matched bases) assert gl_seq[inuke] != '.' # hoping there's no gaps in here if gl_seq[inuke] == '-': new_gl_seq.append(qr_seq[inuke]) else: new_gl_seq.append(gl_seq[inuke]) gl_seq = ''.join(new_gl_seq) if self.germline_seqs[region][infer_gene].find(gl_seq) != del_5p: # why the *@*!! can't they make this consistent? 
if self.germline_seqs[region][infer_gene].find(gl_seq) < 0: print 'whooooaa' print self.germline_seqs[region][infer_gene] print gl_seq line['failed'] = True return line del_5p += self.germline_seqs[region][infer_gene].find(gl_seq) try: assert del_5p + len(gl_seq) + del_3p + len(jf_insertion) == len(self.germline_seqs[region][infer_gene]) except: print ' ERROR lengths failed for %s' % unique_id # print del_5p, len(gl_seq), del_3p, del_5p + len(gl_seq) + del_3p , len(self.germline_seqs[region][infer_gene]) # print gl_seq # print self.germline_seqs[region][infer_gene] line['failed'] = True return line # assert False if self.args.debug: utils.color_mutants(gl_seq, qr_seq, ref_label='gl ', extra_str=' ', print_result=True, post_str=' del: %d %d' % (del_5p, del_3p)) # try: # infer_gene = joinparser.figure_out_which_damn_gene(self.germline_seqs, infer_gene, gl_seq, debug=self.args.debug) # except: # print 'ERROR couldn\'t figure out the gene for %s' % infer_gene # return {} line[region + '_gene'] = infer_gene line[region + '_qr_seq'] = qr_seq line[region + '_gl_seq'] = gl_seq line[region + '_5p_del'] = del_5p line[region + '_3p_del'] = del_3p if region == 'j': line['jf_insertion'] = jf_insertion return line
def parse_detail(self, fk, unique_id): assert fk.iline < len(fk.lines) while fk.line[1] != 'Details': fk.increment() if fk.eof: return fk.increment() info = {} info['unique_id'] = unique_id for begin_line, column, index, required, default in line_order: if fk.line[0].find(begin_line) != 0: if required: print 'oop', begin_line, fk.line sys.exit() else: info[column] = default continue if column != '': info[column] = clean_value(column, fk.line[index]) # if '[' in info[column]: # print 'added', column, clean_value(column, fk.line[index]) if column.find('_gene') == 1: region = column[0] info[region + '_5p_del'] = int( fk.line[fk.line.index('start:') + 1]) - 1 # NOTE their indices are 1-based gl_length = int(fk.line[fk.line.index('gene:') + 1]) - 1 match_end = int(fk.line[fk.line.index('end:') + 1]) - 1 assert gl_length >= match_end info[region + '_3p_del'] = gl_length - match_end fk.increment() if unique_id not in self.sim_need: while not fk.eof and fk.line[ 1] != 'Details': # skip stuff until start of next Detail block fk.increment() return info['fv_insertion'] = '' info['jf_insertion'] = '' info['seq'] = info['v_qr_seq'] + info['vd_insertion'] + info[ 'd_qr_seq'] + info['dj_insertion'] + info['j_qr_seq'] if '-' in info['seq']: print 'ERROR found a dash in %s, returning failure' % unique_id while not fk.eof and fk.line[ 1] != 'Details': # skip stuff until start of next Detail block fk.increment() return if info['seq'] not in self.siminfo[unique_id][ 'seq']: # arg. 
I can't do != because it tacks on v left and j right deletions print 'ERROR didn\'t find the right sequence for %s' % unique_id print ' ', info['seq'] print ' ', self.siminfo[unique_id]['seq'] sys.exit() if self.args.debug: print unique_id for region in utils.regions: infer_gene = info[region + '_gene'] true_gene = self.siminfo[unique_id][region + '_gene'] if utils.are_alleles(infer_gene, true_gene): regionstr = utils.color('bold', utils.color('blue', region)) truestr = '' #'(originally %s)' % match_name else: regionstr = utils.color('bold', utils.color('red', region)) truestr = '(true: %s)' % utils.color_gene( true_gene).replace(region, '') print ' %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace( region, ''), truestr) utils.print_reco_event(self.germline_seqs, self.siminfo[unique_id], label='true:', extra_str=' ') utils.print_reco_event(self.germline_seqs, info, label='inferred:', extra_str=' ') for region in utils.regions: if info[region + '_gene'] not in self.germline_seqs[region]: print 'ERROR %s not in germlines' % info[region + '_gene'] assert False gl_seq = info[region + '_gl_seq'] if '[' in gl_seq: # ambiguous for nuke in utils.nukes: gl_seq = gl_seq.replace('[', nuke) if gl_seq in self.germline_seqs[region][info[region + '_gene']]: print ' replaced [ with %s' % nuke break info[region + '_gl_seq'] = gl_seq if info[region + '_gl_seq'] not in self.germline_seqs[region][info[ region + '_gene']]: print 'ERROR gl match not found for %s in %s' % ( info[region + '_gene'], unique_id) print ' ', info[region + '_gl_seq'] print ' ', self.germline_seqs[region][info[region + '_gene']] self.perfplotter.add_partial_fail(self.siminfo[unique_id], info) while not fk.eof and fk.line[ 1] != 'Details': # skip stuff until start of next Detail block fk.increment() return self.perfplotter.evaluate(self.siminfo[unique_id], info) self.details[unique_id] = info self.sim_need.remove(unique_id) while not fk.eof and fk.line[ 1] != 'Details': # skip stuff until start of next 
Detail block fk.increment()