def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, padfo=None, debug=False):
    """
    Hamming distance between the inferred naive sequence and the true naive sequence.

    Arguments:
      true_line: annotation of the true rearrangement; its 'seq' entry is read here, and it is
        passed to the utils helpers that build the naive sequence and the regional bounds.
      line: inferred annotation; read in the same way as <true_line>.
      query_name: only used to label the error message.
      restrict_to_region: if set, restrict the comparison to the section of the *true* sequence
        assigned to the given region. NOTE this will not in general correspond to the
        similarly-assigned region in the inferred naive sequence.
      normalize: if set, return the distance as an integer percentage of the length of the
        compared true sequence (truncated), rather than as a raw count.
      padfo: optional dict with integer 'padleft' and 'padright' entries, giving the number of
        padding characters to strip from each end of the inferred naive sequence.
      debug: print the intermediate sequences.

    Returns an int: the raw distance, or the percentage if <normalize> is set; 0 (after printing
    a warning) if the compared sequence has zero length.

    Raises Exception if the two sequences still differ in length after all adjustments.
    """
    true_naive_seq = utils.get_full_naive_seq(self.germlines, true_line)
    inferred_naive_seq = utils.get_full_naive_seq(self.germlines, line)

    true_seq, inferred_seq = true_line['seq'], line['seq']
    if len(true_seq) > len(inferred_seq):
        # ihhhmmm doesn't report the bits of the sequence it erodes off the ends, so we have to add them back on
        start = true_seq.find(inferred_seq)
        assert start >= 0
        end = start + len(inferred_seq)
        left_hack_add_on = true_seq[:start]
        right_hack_add_on = true_seq[end:]
        # the missing flanks are filled with 'N'
        # NOTE(review): presumably hamming_fraction() treats 'N' as ambiguous and skips it -- confirm
        inferred_naive_seq = 'N' * len(left_hack_add_on) + inferred_naive_seq + 'N' * len(right_hack_add_on)
        if debug:
            print(' adding to inferred naive seq')

    if padfo is not None:  # remove N padding from the inferred sequence
        inferred_naive_seq = inferred_naive_seq[padfo['padleft']:]
        if padfo['padright'] > 0:  # a [:-0] slice would throw away the whole sequence
            inferred_naive_seq = inferred_naive_seq[:-padfo['padright']]

    bounds = None
    if restrict_to_region != '':
        bounds = utils.get_regional_naive_seq_bounds(restrict_to_region, self.germlines, true_line)  # get the bounds of this *true* region
        true_naive_seq = true_naive_seq[bounds[0]:bounds[1]]
        inferred_naive_seq = inferred_naive_seq[bounds[0]:bounds[1]]

    if debug:
        print('%s region, bounds %s' % (restrict_to_region, bounds))
        print(' true  %s' % (true_naive_seq,))
        print(' infer %s' % (inferred_naive_seq,))

    if len(true_naive_seq) != len(inferred_naive_seq):
        raise Exception('still not the same lengths for %s\n %s\n %s' % (query_name, true_naive_seq, inferred_naive_seq))

    # bail out on empty sequences *before* asking the helper for a fraction of nothing
    if len(true_naive_seq) == 0:
        print('WARNING zero length sequence in hamming_distance_to_true_naive')
        return 0

    fraction, len_excluding_ambig = utils.hamming_fraction(true_naive_seq, inferred_naive_seq, return_len_excluding_ambig=True)
    # presumably <fraction> is distance / len_excluding_ambig (verify against utils); the product
    # is a float that can land just below the integer (e.g. 1/49.*49 == 0.9999999999999999), so
    # round before converting rather than truncating
    total_distance = int(round(fraction * len_excluding_ambig))

    if normalize:
        return int(100 * (float(total_distance) / len(true_naive_seq)))
    return total_distance
def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, debug=False):
    """
    Hamming distance between the inferred naive sequence and the true naive sequence.

    Arguments:
      true_line: annotation of the true rearrangement; its 'seq' entry is read here, and it is
        passed to the utils helpers that build the naive sequence and the regional bounds.
      line: inferred annotation; read in the same way as <true_line>.
      query_name: only used to label the error message.
      restrict_to_region: if set, restrict the comparison to the section of the *true* sequence
        assigned to the given region. NOTE this will not in general correspond to the
        similarly-assigned region in the inferred naive sequence.
      normalize: if set, return the distance as an integer percentage of the length of the
        compared true sequence (truncated), rather than as a raw count.
      debug: print the intermediate sequences.

    Returns the value of utils.hamming() on the two sequences, or the integer percentage if
    <normalize> is set; 0 (after printing a warning) if the compared sequence has zero length.

    Exits the process with a nonzero status (SystemExit) if the two sequences still differ in
    length after all adjustments.
    """
    true_naive_seq = utils.get_full_naive_seq(self.germlines, true_line)
    inferred_naive_seq = utils.get_full_naive_seq(self.germlines, line)

    true_seq, inferred_seq = true_line['seq'], line['seq']
    if len(true_seq) > len(inferred_seq):
        # ihhhmmm doesn't report the bits of the sequence it erodes off the ends, so we have to add them back on
        start = true_seq.find(inferred_seq)
        assert start >= 0
        end = start + len(inferred_seq)
        left_hack_add_on = true_seq[:start]
        right_hack_add_on = true_seq[end:]
        # the missing flanks are filled with 'x'
        # NOTE(review): presumably 'x' never matches a base, so each missing position counts as a mismatch -- confirm
        inferred_naive_seq = 'x' * len(left_hack_add_on) + inferred_naive_seq + 'x' * len(right_hack_add_on)
        if debug:
            print(' adding to inferred naive seq')

    bounds = None
    if restrict_to_region != '':
        bounds = utils.get_regional_naive_seq_bounds(restrict_to_region, self.germlines, true_line)  # get the bounds of this *true* region
        true_naive_seq = true_naive_seq[bounds[0]:bounds[1]]
        inferred_naive_seq = inferred_naive_seq[bounds[0]:bounds[1]]

    if debug:
        print('%s region, bounds %s' % (restrict_to_region, bounds))
        print(' true  %s' % (true_naive_seq,))
        print(' infer %s' % (inferred_naive_seq,))

    if len(true_naive_seq) != len(inferred_naive_seq):
        print('ERROR still not the same lengths for %s' % query_name)
        print(' true  %s' % (true_naive_seq,))
        print(' infer %s' % (inferred_naive_seq,))
        # a bare sys.exit() reports success (status 0) to the calling process; this is a failure
        sys.exit(1)

    if len(true_naive_seq) == 0:
        print('WARNING zero length sequence in hamming_distance_to_true_naive')
        return 0

    total_distance = utils.hamming(true_naive_seq, inferred_naive_seq)

    if normalize:
        return int(100 * (float(total_distance) / len(true_naive_seq)))
    return total_distance
def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, padfo=None, debug=False):
    """
    Hamming distance between the inferred naive sequence and the true naive sequence.

    Arguments:
      true_line: annotation of the true rearrangement (dict); passed to the utils helpers that
        build the naive sequence and the regional bounds.
      line: inferred annotation (dict); its 'unique_id' entry is used in the error message.
      query_name: only used to label the second error message.
      restrict_to_region: if set, restrict the comparison to the section of the *true* sequence
        assigned to the given region. NOTE this will not in general correspond to the
        similarly-assigned region in the inferred naive sequence.
      normalize: if set, return the distance as an integer percentage of the length of the
        compared true sequence (truncated), rather than as a raw count.
      padfo: optional dict with integer 'padleft' and 'padright' entries, giving the number of
        N padding characters to strip from each end of the inferred naive sequence. An end is
        only stripped if it actually consists of that many Ns.
      debug: print the intermediate sequences.

    Returns an int: the raw distance, or the percentage if <normalize> is set; 0 (after printing
    a warning) if the compared sequence has zero length.

    Raises Exception if the true and inferred naive sequences differ in length, either as
    built or after the padding/region adjustments.
    """
    true_naive_seq = utils.get_full_naive_seq(self.germlines, true_line)
    inferred_naive_seq = utils.get_full_naive_seq(self.germlines, line)

    if len(true_naive_seq) != len(inferred_naive_seq):
        # dump both annotations side by side before giving up
        print('%20s true inf' % '')
        for k in true_line:
            inferred_val = line[k] if k in line else 'NOPE'
            # values go inside an explicit tuple so that tuple-valued entries format cleanly
            print('%20s %s  %s' % (k, true_line[k], inferred_val))
        for k in line:
            if k not in true_line:
                print(' not in true line %20s %s' % (k, line[k]))
        raise Exception('%s true and inferred sequences not the same length\n %s\n %s\n' % (line['unique_id'], true_naive_seq, inferred_naive_seq))

    if padfo is not None:  # remove N padding from the inferred sequence
        n_left, n_right = padfo['padleft'], padfo['padright']
        if debug:
            print('removing padfo')
            print(inferred_naive_seq)
        # only strip an end if it really is padding: look at the <n_left> leading characters
        # themselves (not at the remainder of the sequence)
        if inferred_naive_seq[:n_left].count('N') == n_left:  # this fails to happen if reset_effective_erosions_and_effective_insertions already removed the Ns
            inferred_naive_seq = inferred_naive_seq[n_left:]
        elif debug:  # NOTE if no debug, we just fall through, which is ok
            print('tried to remove non Ns!\n %s\n padleft %d\n' % (inferred_naive_seq, n_left))
        if n_right > 0:  # a [-0:] / [:-0] slice would not mean "the last zero characters"
            # likewise, check the <n_right> trailing characters
            if inferred_naive_seq[-n_right:].count('N') == n_right:  # this fails to happen if reset_effective_erosions_and_effective_insertions already removed the Ns
                inferred_naive_seq = inferred_naive_seq[:-n_right]
            elif debug:  # NOTE if no debug, we just fall through, which is ok
                print('tried to remove non Ns!\n %s\n padright %d\n' % (inferred_naive_seq, n_right))
        if debug:
            print(n_left * ' ' + inferred_naive_seq + n_left * ' ')

    bounds = None
    if restrict_to_region != '':
        bounds = utils.get_regional_naive_seq_bounds(restrict_to_region, self.germlines, true_line)  # get the bounds of this *true* region
        if debug:
            print('restrict to %s' % restrict_to_region)
            utils.color_mutants(true_naive_seq, inferred_naive_seq, print_result=True, extra_str=' ')
            utils.color_mutants(true_naive_seq[bounds[0]:bounds[1]], inferred_naive_seq[bounds[0]:bounds[1]], print_result=True, extra_str=' ' + bounds[0] * ' ')
        true_naive_seq = true_naive_seq[bounds[0]:bounds[1]]
        inferred_naive_seq = inferred_naive_seq[bounds[0]:bounds[1]]

    if len(true_naive_seq) != len(inferred_naive_seq):
        raise Exception('still not the same lengths for %s\n %s\n %s' % (query_name, true_naive_seq, inferred_naive_seq))

    # bail out on empty sequences *before* asking the helper for a fraction of nothing
    if len(true_naive_seq) == 0:
        print('WARNING zero length sequence in hamming_distance_to_true_naive')
        return 0

    fraction, len_excluding_ambig = utils.hamming_fraction(true_naive_seq, inferred_naive_seq, return_len_excluding_ambig=True)
    # presumably <fraction> is distance / len_excluding_ambig (verify against utils); the product
    # is a float that can land just below the integer (e.g. 1/49.*49 == 0.9999999999999999), so
    # round before converting rather than truncating
    total_distance = int(round(fraction * len_excluding_ambig))

    if normalize:
        return int(100 * (float(total_distance) / len(true_naive_seq)))
    return total_distance