def add_single_indel(indelfo, pos, length, gapped_codon_positions, keep_in_frame=False, debug=False):
    """Add one randomly chosen insertion or deletion of <length> at <pos> to <indelfo> (modified in place).

    With equal probability either inserts <length> random bases from utils.nukes
    into the qr gap seq (padding the gl gap seq with gap characters), or replaces
    <length> positions of the qr gap seq, starting at <pos>, with gap characters.

    For insertions, entries of <gapped_codon_positions> to the right of <pos>, and
    the positions of existing indels to the right of <pos>, are shifted by <length>.
    The new indel is added to indelfo['indels'], which is kept sorted by position.

    Always returns None; gives up early (after printing a message) if the indel's
    sequence string contains gaps.

    NOTE(review): <keep_in_frame> is not used anywhere in this function.
    """
    # NOTE the order of the random number calls (numpy first, then random) is kept the same on purpose
    is_insertion = numpy.random.uniform(0, 1) < 0.5  # fifty-fifty chance of insertion and deletion
    if is_insertion:
        indel_type = 'insertion'
        seqstr = ''.join([utils.nukes[random.randint(0, len(utils.nukes) - 1)] for _ in range(length)])
    else:
        indel_type = 'deletion'
        # NOTE it's kind of unclear whether this should be the bit in the qr or gl seq. Using the gl like this
        # probably makes more sense, since it corresponds to what we would infer in s-w (i.e., if we _do_ delete
        # some SHMd positions, we will never know about it, so who cares)
        seqstr = indelfo['gl_gap_seq'][pos : pos + length]
    ifo = {'type' : indel_type, 'pos' : pos, 'len' : length, 'seqstr' : seqstr}

    # this is a backup for the uncommon cases where overlaps() in the calling fcn doesn't catch something
    # NOTE(review): for insertions this inspects the freshly generated bases, which presumably never contain gap chars -- confirm
    if utils.gap_len(seqstr) > 0:
        print(' failed adding indel (overlaps with previous one)')
        return

    old_qr = indelfo['qr_gap_seq']
    gap_str = length * utils.gap_chars[0]
    if is_insertion:
        old_gl = indelfo['gl_gap_seq']
        indelfo['qr_gap_seq'] = old_qr[:pos] + seqstr + old_qr[pos:]
        indelfo['gl_gap_seq'] = old_gl[:pos] + gap_str + old_gl[pos:]
        for region in gapped_codon_positions:
            # this isn't right if the indel is actually in the codon, but in that case we just let the messed up codon through below
            if gapped_codon_positions[region] > pos:
                gapped_codon_positions[region] += length
        for otherfo in indelfo['indels']:  # correct the positions of any existing indels that're to the right of this one
            if otherfo['pos'] > pos:
                otherfo['pos'] += length
    else:
        indelfo['qr_gap_seq'] = old_qr[:pos] + gap_str + old_qr[pos + length : ]

    # the codon check is always run (not only when debugging), we just don't do anything but print with the result
    cyst_ok = utils.codon_unmutated('cyst', indelfo['qr_gap_seq'], gapped_codon_positions['v'])
    if debug and not cyst_ok:
        print(' adding indel within %s codon' % 'cyst')

    indelfo['indels'].append(ifo)
    indelfo['indels'] = sorted(indelfo['indels'], key=lambda tmpfo: tmpfo['pos'])

    if debug:
        print(get_dbg_str(indelfo))
def reconstruct_indelfo_from_gap_seqs_and_naive_seq(qr_gap_seq, gl_gap_seq, indel_genes, line, iseq, debug=False):
    """Build an indelfo for sequence <iseq> of <line> from a pair of gapped sequences.

    Used either for a <line> that doesn't already have <indelfos> in it (from a
    new-style file), or for one that does when we want to reconstruct the
    indelfos to make sure we get the same thing.

    NOTE the gap seqs are passed separately on purpose! Don't use any that might be in <line>.
    NOTE(review): <indel_genes> is not used in this function.

    Returns the empty indelfo if neither gap seq has any gaps, otherwise the
    indelfo built from the cigar string derived from the two gap seqs.
    """
    if utils.gap_len(qr_gap_seq) != 0 or utils.gap_len(gl_gap_seq) != 0:
        # make a new cigar str using the gapped sequences, then combine that cigar str with info from <line> to make a new indelfo
        cigarstr = get_cigarstr_from_gap_seqs(qr_gap_seq, gl_gap_seq, debug=debug)
        return get_indelfo_from_cigar_and_line(cigarstr, line, iseq, debug=debug)

    return get_empty_indel()
def connect_cost(self, connector, line1, line2, pos_offset=None):
    '''
    Cost of connector connecting line1 and line2
    Assumes the lines are approx colinear

    The cost is the sum of three terms:
      * the gap between line1 and line2, weighted by the connector count plus
        the mean of the two line counts (i.e. the gap is double counted)
      * the distance between the start of the connector and the start of
        line1, weighted by the mean of their counts
      * the distance between the end of the connector and the end of line2,
        weighted by the mean of their counts

    :param connector: line connecting line1/2
    :param line1/2: line
    :param pos_offset: pos offset: connector.pos coords = line1/2.pos coords + offset
        (defaults to (0, 0))
    :return: the cost (a number; lower presumably means a better connection)
    '''
    if pos_offset is None:
        pos_offset = (0, 0)

    # index of the coordinate the lines are compared along (the one that is *not* connector.orien)
    axis = 1 - connector.orien

    if line1.pos[axis] > line2.pos[axis]:
        # make sure the line1 has the smaller position
        line1, line2 = line2, line1

    connector_range = connector.length_range()
    line1_range = line1.length_range(offset=pos_offset[axis])
    line2_range = line2.length_range(offset=pos_offset[axis])

    gap_len = utils.gap_len(line1_range, line2_range)
    # double count the gap
    # NOTE divide by 2.0 (not 2) so the mean isn't truncated by integer
    # division for integer counts, consistent with the two terms below
    cost = gap_len * (connector.count + (line1.count + line2.count) / 2.0)

    # count the cost if line1 and line2 don't align perfectly with the ends of connector
    start_diff = abs(connector_range[0] - line1_range[0])
    cost += start_diff * ((connector.count + line1.count) / 2.0)

    end_diff = abs(connector_range[1] - line2_range[1])
    cost += end_diff * ((connector.count + line2.count) / 2.0)

    return cost
def overlaps(pos, length):
    """Return True if an indel of <length> at <pos> would be too close to existing gaps (or to the end of the sequence).

    See if there's any existing indels close to where we're thinking of putting this one.
    NOTE in practice this _really_ shouldn't happen much -- there should be only a couple
    of indels per sequence at most -- this just keeps other things (e.g. indelfo
    consistency checks) from getting confused and crashing.

    Both gap seqs of <indelfo> are checked. <indelfo> is not a parameter: it is read
    from the enclosing scope.
    """
    # clamp the start of the window at zero: a negative start would wrap around to the
    # end of the string, typically yielding an empty slice, so gaps near the start of the
    # sequence would go undetected whenever pos < length
    window_start = max(0, pos - length)
    window_stop = pos + length
    for gapseq in (indelfo['qr_gap_seq'], indelfo['gl_gap_seq']):
        if len(gapseq) < window_stop + 1:  # not enough sequence to the right of <pos>
            return True
        if utils.gap_len(gapseq[window_start : window_stop]) > 0:  # this leaves a pretty, albeit inexact, large buffer
            return True
    return False
def combine_indels(regional_indelfos, full_qrseq, qrbounds, uid=None, debug=False):
    """Merge the per-region indelfos in <regional_indelfos> into one indelfo for the whole of <full_qrseq>.

    Goes through utils.regions in order, collecting for each region either its
    gapped qr/gl sequences (if it has an entry in <regional_indelfos>) or the
    corresponding slice of <full_qrseq> paired with ambiguous bases, and also
    the stretch between each region and the next one. Positions of indels taken
    from a region are shifted by the number of gaps in the qr gap seqs of the
    preceding regions that have indels. Finally the bits of <full_qrseq> outside
    the v and j bounds are added on either side.

    Returns the combined indelfo, or None if the non-gap length of some region's
    qr gap seq doesn't match the length implied by its <qrbounds> (see issue #310).
    Raises Exception if the <qrbounds> aren't in ascending order, or if the j
    bound extends past the end of <full_qrseq>.

    NOTE <uid> is currently not used (it was only used in the exception that got replaced by returning None).
    """
    # debug = 2
    joint_indelfo = get_empty_indel()

    # make sure the regional qrbounds consist of a nice orderly progression
    flat_bounds = []
    for reg in utils.regions:
        flat_bounds.extend(qrbounds[reg])
    if sorted(flat_bounds) != flat_bounds:
        bound_str = ' '.join([('%s %s' % (reg, qrbounds[reg])) for reg in utils.regions])
        raise Exception('messed up qrbounds %s for qr sequence with length %d:\n %s' % (bound_str, len(full_qrseq), full_qrseq))
    if len(full_qrseq) < qrbounds['j'][1]:  # only the j bound needs checking, since we just made sure they're in order
        raise Exception('qrbounds %s extend beyond sequence with len %d:\n %s' % (qrbounds, len(full_qrseq), full_qrseq))

    verbose = debug > 1
    if verbose:
        n_fos = len(regional_indelfos)
        print('combining %d indelfo%s from %s' % (n_fos, utils.plural(n_fos), ' '.join([reg for reg in utils.regions if reg in regional_indelfos])))
        print(' qrbounds: %s' % ' '.join([('%s %s' % (reg, qrbounds[reg])) for reg in utils.regions]))
        print(' full qr %s' % full_qrseq)

    qr_pieces, gl_pieces = [], []  # consecutive chunks of the gapped sequences, joined together at the end
    for region in utils.regions:
        ireg = utils.regions.index(region)
        reg_start, reg_stop = qrbounds[region][0], qrbounds[region][1]
        if verbose:
            print(' %s' % region)

        if region not in regional_indelfos:  # no indels in this region, so just use the bit of the qr seq
            qr_pieces.append(full_qrseq[reg_start:reg_stop])
            gl_pieces.append(utils.ambig_base * (reg_stop - reg_start))
        else:
            rfo = regional_indelfos[region]
            assert has_indels(rfo)  # calling fcn needs to not add it if it doesn't have indels
            joint_indelfo['genes'][region] = rfo['genes'][region]
            if utils.non_gap_len(rfo['qr_gap_seq']) != reg_stop - reg_start:  # should be fixed by overlapping boundary shifter
                # UPDATE eh screw it this managed to happen *again* (see issue #310), so give up rather than raising an exception
                return None
            qr_pieces.append(rfo['qr_gap_seq'])
            gl_pieces.append(rfo['gl_gap_seq'])

            shifted_indels = copy.deepcopy(rfo['indels'])  # copy, so we don't modify the positions in the regional indelfo
            for prev_reg in utils.regions[:ireg]:  # loop over previous regions
                if prev_reg in regional_indelfos:  # don't need to do anything if the previous region didn't have indels
                    prev_reg_gaps = utils.gap_len(regional_indelfos[prev_reg]['qr_gap_seq'])  # number of gaps in the previous region's qr gap seq
                    for ifo in shifted_indels:
                        ifo['pos'] += prev_reg_gaps
                        if verbose:
                            print(' add %d to pos for gaps in %s' % (prev_reg_gaps, prev_reg))
            joint_indelfo['indels'] += shifted_indels

        if verbose:
            gap_char = utils.gap_chars[0]
            red_gap = utils.color('red', gap_char)
            print(' %s\n %s' % (qr_pieces[-1].replace(gap_char, red_gap), gl_pieces[-1].replace(gap_char, red_gap)))

        if ireg < len(utils.regions) - 1:  # add the bit between this region and the next one
            next_reg = utils.regions[ireg + 1]
            assert region + next_reg in utils.boundaries
            next_start = qrbounds[next_reg][0]
            between_qr = full_qrseq[reg_stop:next_start]
            between_gl = utils.ambig_base * (next_start - reg_stop)
            qr_pieces.append(between_qr)
            gl_pieces.append(between_gl)
            if verbose:
                print(' %s%s' % (region, next_reg))
                print(' %s\n %s' % (between_qr, between_gl))

    if verbose:
        print('combined gap seqs:')
        print(' qr %s' % ' '.join(qr_pieces))
        print(' gl %s' % ' '.join(gl_pieces))

    inner_qr = ''.join(qr_pieces)
    inner_gl = ''.join(gl_pieces)
    joint_indelfo['qr_gap_seq'] = inner_qr
    joint_indelfo['gl_gap_seq'] = inner_gl
    assert len(inner_qr) == len(inner_gl)

    # bits of the query sequence to the left of v and to the right of j
    v_start, j_stop = qrbounds['v'][0], qrbounds['j'][1]
    left_flank, right_flank = full_qrseq[:v_start], full_qrseq[j_stop:]
    joint_indelfo['reversed_seq'] = get_reversed_seq(inner_qr, inner_gl, left_flank, right_flank)
    # assert 'N' not in joint_indelfo['reversed_seq']  # this happens if there's Ns in the initial sequence

    joint_indelfo['qr_gap_seq'] = left_flank + inner_qr + right_flank
    joint_indelfo['gl_gap_seq'] = utils.ambig_base * v_start + inner_gl + utils.ambig_base * (len(full_qrseq) - j_stop)

    if debug:
        print('combined')
        print(get_dbg_str(joint_indelfo))

    return joint_indelfo