def get_splice_sites(self, block, left=False, right=False, whole=False):
    """Reports splice site sequence and motif for the junctions next to a block

    'left'=donor, 'right'=acceptor, 'whole'=donor + acceptor

    Args:
        block: 1-based block number; anything int() accepts.
        left: report the junction before the block (skipped for block 1).
        right: report the junction after the block (skipped for the last
            block of self.align.blocks).
        whole: upper-case all four reported bases.  Otherwise the two bases
            on the far side of the junction are lower-cased.

    Returns:
        List of strings formatted as '<2 bases><2 bases>(<motif>)'.  The motif
        is '?' when the orientation is unknown or the site is not a key of
        Event.splice_motifs.  The list is empty when the alignment carries no
        splice sites (a message is printed in that case).
    """
    splice_sites = self.align.splice_sites
    if not splice_sites:
        print('cannot extract splice sites: %s' % (self.align.query))
        return []

    # Convert once so that comparisons and indexing agree on the type
    block = int(block)

    # The transcript strand wins over the orientation inferred for the alignment
    orient = None
    if self.txt:
        if self.txt.strand in ('+', '-'):
            orient = self.txt.strand
    elif self.align.orient:
        orient = self.align.orient

    def motif_of(splice_site):
        """Looks up the motif name of one splice site, '?' if unknown"""
        if orient == '+':
            key = splice_site
        elif orient == '-':
            key = reverse_complement(splice_site)
        else:
            return '?'
        return Event.splice_motifs.get(key, '?')

    ss = []
    if left and block > 1:
        site = splice_sites[block - 2]
        head = site[:2].upper() if whole else site[:2].lower()
        # motif is resolved per site so one junction never inherits another's
        ss.append('%s%s(%s)' % (head, site[-2:].upper(), motif_of(site)))

    if right and block < len(self.align.blocks):
        site = splice_sites[block - 1]
        tail = site[-2:].upper() if whole else site[-2:].lower()
        ss.append('%s%s(%s)' % (site[:2].upper(), tail, motif_of(site)))

    return ss
def set_orient(self, splice_motifs):
    """Sets orientation(+/-) of contig based on splice motifs

    Every entry of self.splice_sites is classified as matching a motif as
    given ("forward"), matching the reverse complement of a motif
    ("backward"), or neither ("unknown").  self.orient becomes '+' or '-'
    only when one direction has hits, the other has none, and the hits
    outnumber the unknown sites; otherwise it becomes None.

    Returns None without touching self.orient when splice_motifs is empty.
    """
    if not splice_motifs:
        return None

    forward_motifs = []
    backward_motifs = []
    for motif in splice_motifs.keys():
        forward_motifs.append(motif.lower())
        backward_motifs.append(tools.reverse_complement(motif).lower())

    tally = {"forward": 0, "backward": 0, "unknown": 0}
    for site in self.splice_sites:
        # forward is tested first, so a site present in both lists counts as forward
        if site in forward_motifs:
            tally["forward"] += 1
        elif site in backward_motifs:
            tally["backward"] += 1
        else:
            tally["unknown"] += 1

    forward, backward, unknown = tally["forward"], tally["backward"], tally["unknown"]
    if forward > 0 and backward == 0 and forward > unknown:
        self.orient = '+'
    elif backward > 0 and forward == 0 and backward > unknown:
        self.orient = '-'
    else:
        self.orient = None
def fix_neighbor_gaps(self, tblock1, tblock2, tblock3, qblock1, qblock2, qblock3, splice_motifs, refseq, query_strand):
    """Fix neighboring gaps if necessary to see if canonical splice sites can be re-established.

    In case of consecutive gaps that don't have canonical splice sites, the
    middle block (tblock2) is moved to either end to see if a canonical
    splice site can be achieved.  When neither move works, the merged gap is
    handed to fix_single_gap() with the middle block's query positions as
    extra sequence.

    tblock1/qblock1 or tblock3/qblock3 are modified in place when a move is
    accepted.

    Returns:
        (False, None) when the middle block is longer than 10 bases,
        otherwise (splice_site, new_block) where new_block is None unless
        fix_single_gap() produced a block with query coordinates.
    """
    max_shuffle_size = 10
    middle_block_size = tblock2[1] - tblock2[0] + 1
    if middle_block_size > max_shuffle_size:
        return False, None

    gap_start = tblock1[1] + 1
    gap_end = tblock3[0] - 1

    candidates = []
    for shift in (-1 * middle_block_size, middle_block_size):
        # a positive shift moves the left edge of the gap, anything else the right edge
        if shift > 0:
            move_left, move_right = shift, 0
        else:
            move_left, move_right = 0, shift

        gap_seq = refseq.GetSequence(self.target, gap_start + move_left, gap_end + move_right)
        ss = gap_seq[:2] + gap_seq[-2:]
        if ss.lower() in splice_motifs or tools.reverse_complement(ss).lower() in splice_motifs:
            candidates.append({
                'motif': ss.lower(),
                'left': move_left,
                'right': move_right,
                'shuffle_size': abs(move_left) + abs(move_right),
            })

    best = None
    if candidates:
        # cmp-style sort (Python 2) using the class's own comparison
        candidates.sort(self.compare_shuffles)
        best = candidates[0]

    if best is not None and best['motif']:
        if best['right'] == 0:
            # move bases from right to left
            tblock1[1] += best['left']
            if query_strand == '+':
                qblock1[1] += best['left']
            else:
                qblock1[1] -= best['left']
        else:
            tblock3[0] += best['right']
            if query_strand == '+':
                qblock3[0] += best['right']
            else:
                qblock3[0] -= best['right']
        # moving the middle block never creates a new block
        return best['motif'], None

    splice_site, new_block = self.fix_single_gap(tblock1, tblock3, qblock1, qblock3, splice_motifs, refseq, query_strand, extra_query=range(qblock2[0], qblock2[1]+1))
    if new_block is not None and new_block['query']:
        return splice_site, new_block
    return splice_site, None
def fix_single_gap(self, tblock1, tblock2, qblock1, qblock2, splice_motifs, refseq, query_strand, extra_query=None):
    """Shuffles sequence from end to end to see if canonical splice sites can be achieved

    Both gap edges are moved by -10..+10 bases (never in opposite
    directions) and every placement whose first two and last two gap bases
    form a key of splice_motifs, directly or reverse-complemented, becomes a
    candidate.  Candidates are ordered with self.compare_shuffles and passed
    to self.shuffle() until one succeeds.

    Args:
        tblock1, tblock2: target blocks flanking the gap.
        qblock1, qblock2: matching query blocks.
        splice_motifs: mapping keyed by lower-case splice motif.
        refseq: object providing GetSequence(target, start, end).
        query_strand: passed through to self.shuffle().
        extra_query: query positions available to be moved; when empty,
            placements that keep one gap edge fixed are skipped.

    Returns:
        (False, None) when the gap is shorter than 10 or longer than 100000
        bases or no candidate exists, otherwise (splice_site, new_block).
    """
    # None instead of a shared mutable default list
    if extra_query is None:
        extra_query = []

    max_shuffle_size = 10
    min_size = 10
    max_size = 100000

    tgap = [tblock1[1] + 1, tblock2[0] - 1]
    tsize = tgap[1] - tgap[0] + 1
    if tsize < min_size or tsize > max_size:
        return False, None

    has_extra = len(extra_query) > 0
    shifts = range(-1 * max_shuffle_size, max_shuffle_size + 1)
    possible_shuffles = []
    for left_shuffle in shifts:
        for right_shuffle in shifts:
            # skip shuffle in opposite directions, and cases where one side
            # is 0 but there is no extra sequence to move
            product = left_shuffle * right_shuffle
            if product < 0 or (product == 0 and not has_extra):
                continue

            gap_seq = refseq.GetSequence(self.target, tgap[0] + left_shuffle, tgap[1] + right_shuffle)
            ss = gap_seq[:2] + gap_seq[-2:]
            if ss.lower() in splice_motifs or tools.reverse_complement(ss).lower() in splice_motifs:
                possible_shuffles.append({
                    'motif': ss.lower(),
                    'left': left_shuffle,
                    'right': right_shuffle,
                    'shuffle_size': abs(left_shuffle) + abs(right_shuffle),
                })

    if not possible_shuffles:
        return False, None

    # cmp-style sort (Python 2) using the class's own comparison
    possible_shuffles.sort(self.compare_shuffles)

    # NOTE(review): splice_site starts as the best candidate's motif, so when
    # no shuffle succeeds the motif is still returned together with the last
    # new_block from self.shuffle().  Kept as-is because callers may rely on
    # it -- confirm whether a failed run should return (False, None) instead.
    splice_site = possible_shuffles[0]['motif']
    new_block = {'query': [], 'target': []}
    for candidate in possible_shuffles:
        # each attempt gets its own copy of the extra query positions
        success, new_block = self.shuffle(candidate['left'], candidate['right'], list(extra_query), qblock1, qblock2, tblock1, tblock2, query_strand)
        if success:
            splice_site = candidate['motif']
            print(' '.join(str(item) for item in ('success', splice_site, new_block, qblock1, qblock2, tblock1, tblock2)))
            break

    if splice_site:
        return splice_site, new_block
    return False, None
def parse_line(line):
    """Parses individual line of UCSC dbSNP file

    The tab-separated columns are paired with the module-level `fields`
    names (surplus columns are dropped) and the record is extended with:
    "type" (snv/del/ins/NA), "start", "end", "allele" (dict keyed by
    lower-case allele, reverse-complemented for '-' strand records) and
    "size".

    Returns the record dict, or an empty dict when the observed alleles are
    reported as "lengthTooLong".
    """
    cols = line.rstrip("\n").split("\t")
    data = dict(zip(fields, cols))

    snp_class = data["class"]
    if snp_class == "single":
        data["type"] = "snv"
    # sometimes 'class' is 'named' and '(LARGEDELETION)' is reported instead
    # of actual allele
    elif snp_class == "deletion" or "deletion" in data["observed"].lower():
        data["type"] = "del"
    elif snp_class == "insertion":
        data["type"] = "ins"
    else:
        data["type"] = "NA"

    # insertions keep chromStart as-is, everything else is shifted by one
    chrom_start = int(data["chromStart"])
    data["start"] = chrom_start if data["type"] == "ins" else chrom_start + 1
    data["end"] = int(data["chromEnd"])

    data["allele"] = {}
    data["size"] = 0
    for a in data["observed"].split("/"):
        if a == "-":
            continue
        if data["strand"] == "+":
            data["allele"][a.lower()] = True
        elif data["strand"] == "-":
            data["allele"][reverse_complement(a).lower()] = True
        # size comes from the first allele that is not '-'
        if data["size"] == 0:
            data["size"] = len(a)

    # make sure deletion size is correct, as sometimes '(LARGEDELETION)'
    # will be put as allele
    if data["type"] == "del":
        data["size"] = int(data["end"]) - int(data["start"]) + 1

    if data["observed"] == "lengthTooLong":
        return {}
    return data
def parse_line(line):
    """Parses individual line of UCSC dbSNP file

    Builds a dict from the tab-separated columns using the module-level
    `fields` names, then derives 'type', 'start', 'end', 'allele' and 'size'
    from the 'class', 'chromStart', 'chromEnd', 'observed' and 'strand'
    columns.  An empty dict is returned for records whose 'observed' column
    is 'lengthTooLong'.
    """
    values = line.rstrip('\n').split('\t')
    data = {}
    # columns beyond the known field names are ignored
    for idx, value in enumerate(values[:len(fields)]):
        data[fields[idx]] = value

    if data['class'] == 'single':
        variant_type = 'snv'
    # sometimes 'class' is 'named' and '(LARGEDELETION)' is reported instead
    # of actual allele
    elif data['class'] == 'deletion' or 'deletion' in data['observed'].lower():
        variant_type = 'del'
    elif data['class'] == 'insertion':
        variant_type = 'ins'
    else:
        variant_type = 'NA'
    data['type'] = variant_type

    offset = 0 if data['type'] == 'ins' else 1
    data['start'] = int(data['chromStart']) + offset
    data['end'] = int(data['chromEnd'])
    data['allele'] = {}
    data['size'] = 0

    for allele in data['observed'].split('/'):
        if allele != '-':
            if data['strand'] == '+':
                data['allele'][allele.lower()] = True
            elif data['strand'] == '-':
                data['allele'][reverse_complement(allele).lower()] = True

            # only the first real allele sets the size
            if data['size'] == 0:
                data['size'] = len(allele)

    # make sure deletion size is correct, as sometimes '(LARGEDELETION)'
    # will be put as allele
    if data['type'] == 'del':
        data['size'] = int(data['end']) - int(data['start']) + 1

    if data['observed'] == 'lengthTooLong':
        data = {}

    return data
def find_pep_change(self, refseq):
    """Finds effect on protein sequence given event

    For a 'novel_transcript' event the result of self.longest_orf() is
    returned directly.  Otherwise the contig sequence spanning the event's
    alignment blocks is taken as the variant, the cDNA is constructed with
    and without it, both are translated on the transcript's strand, and the
    outcome of pep_change() is stored in self.orf.

    Returns:
        self.longest_orf() for 'novel_transcript'; 'na' when either
        translation is empty or the event is 'read-through'; None otherwise
        (the result is left in self.orf).
    """
    if self.event_type == 'novel_transcript':
        return self.longest_orf()

    coord = re.split('[:-]', self.coordinate())

    first_block = self.align_blocks[0]
    last_block = self.align_blocks[-1]
    if self.event_type == 'novel_utr':
        # include the block that's 'matching' (partially)
        if min(self.align_blocks) == 1:
            last_block += 1
        else:
            first_block -= 1
    qcoord1 = self.align.query_blocks[first_block - 1][0]
    qcoord2 = self.align.query_blocks[last_block - 1][1]

    contig_sequence = self.align.contig.sequence
    if qcoord1 < qcoord2:
        variant = contig_sequence[qcoord1 - 1:qcoord2]
    else:
        # query coordinates run backwards: take the span and flip it
        variant = reverse_complement(contig_sequence[qcoord2 - 1:qcoord1])

    # constructs cDNA sequence of both reference and sequence with event
    cdna_original = self.construct_cdna(coord, self.txt, refseq)
    cdna_changed = self.construct_cdna(coord, self.txt, refseq, variant=variant, change=self.event_type, exons=self.exons)

    frame = None if self.event_type == 'novel_utr' else 0
    orient = '+' if self.txt.strand == '+' else '-'
    pep_original = translate(cdna_original, orient=orient, frame=0)
    pep_changed = translate(cdna_changed, orient=orient, frame=frame)

    if not pep_changed or not pep_original or self.event_type == 'read-through':
        return 'na'

    self.orf = pep_change(pep_original, pep_changed)
def get_sequence(self, refseq, fasta=False, chrom=None):
    """Extracts transcript sequence

    Args:
        refseq: object providing GetSequence(chrom, start, end).
        fasta: when true, prefix the sequence with a '>name_alias' header
            line.
        chrom: sequence name to query; defaults to self.chrom.

    Returns:
        The upper-cased exon sequences joined in the order of self.exons,
        passed through tools.reverse_complement() for '-' strand
        transcripts.
    """
    if chrom is None:
        chrom = self.chrom

    # first and last element of each exon are its start and end coordinates
    sequence = ''.join(
        [refseq.GetSequence(chrom, int(exon[0]), int(exon[-1])).upper() for exon in self.exons]
    )

    if self.strand == '-':
        sequence = tools.reverse_complement(sequence)

    if fasta:
        sequence = '>%s_%s\n%s' % (self.name, self.alias, sequence)

    return sequence
def match_blocks(self, align, query_seq):
    """Identifies SNVs

    For every alignment block the query segment (reverse-complemented for
    '-' strand queries) is compared with the reference segment through
    self.find_mismatches(), and one SNV object is created per reported
    mismatch.

    Args:
        align: alignment exposing blocks, query_blocks, query_strand,
            target and query.
        query_seq: full query sequence; block coordinates are 1-based.

    Returns:
        List of SNV objects (empty when nothing mismatches).
    """
    snvs = []
    for i, tblock in enumerate(align.blocks):
        qblock = align.query_blocks[i]
        # Convert before doing arithmetic so string coordinates are accepted
        # on both strands
        tstart, tend = int(tblock[0]), int(tblock[1])
        qstart, qend = int(qblock[0]), int(qblock[1])

        if align.query_strand == '+':
            qseq = query_seq[qstart - 1:qend]
        else:
            qseq = tools.reverse_complement(query_seq[qend - 1:qstart])
        tseq = self.refseq.GetSequence(align.target, tstart, tend)

        # find_mismatches() maps block offset -> pair of bases; the first
        # element is reported on the target side, the second on the query side
        mismatches = self.find_mismatches(qseq, tseq)
        for pos, change in mismatches.items():
            tpos = tstart + pos
            # query coordinates decrease along the block for reversed queries
            if qstart < qend:
                qpos = qstart + pos
            else:
                qpos = qstart - pos
            snvs.append(SNV('psl', 'snv', align.target, tpos, tpos, change[0], align.query_strand, align.query, qpos, qpos, change[1]))

    return snvs
def gap_snv(self, align, splice_motifs, query_seq, cutoff=None):
    """Identifies insertions, deletions, inversion from gapped alignments

    Every gap between two adjacent alignment blocks is examined.  For
    'transcriptome' samples, gaps whose splice site passes match_intron()
    are treated as introns and skipped.  The remaining gaps are classified
    as 'del' (gap only in the target), 'ins' (gap only in the query) or
    'indel' (anything else); an 'indel' is then reported as an inversion, a
    single-base SNV, or a deletion plus an insertion.

    Args:
        align: alignment exposing blocks, query_blocks, splice_sites,
            query_strand, target and query.
        splice_motifs: motif mapping handed to match_intron().
        query_seq: query sequence, sliced with 1-based query coordinates.
        cutoff: optional maximum event size; larger events are skipped.

    Returns:
        List of SNV objects.  Calls sys.exit(100) when the reference
        sequence of a non-empty target gap cannot be extracted.
    """
    if self.debug:
        print align.target, align.blocks
        print align.query, align.query_blocks
        print align.splice_sites

    snvs = []

    # cannot identify indels without splice site information
    if self.sample_type == 'transcriptome' and not align.splice_sites:
        return snvs

    for i in range(len(align.blocks)-1):
        if self.sample_type != 'transcriptome' or not self.match_intron(align.splice_sites[i], splice_motifs):
            # unaligned query bases between block i and block i+1
            if align.query_strand == '+':
                qstart = align.query_blocks[i][1]+1
                qend = align.query_blocks[i+1][0]-1
                query = query_seq[qstart-1:qend]
            else:
                qend = align.query_blocks[i][1]-1
                qstart = align.query_blocks[i+1][0]+1
                query = query_seq[qstart-1:qend]
                query = tools.reverse_complement(query)

            # target strand always + from psl
            tstart = align.blocks[i][1]+1
            tend = align.blocks[i+1][0]-1
            target = ''
            if tstart <= tend:
                target = self.refseq.GetSequence(align.target, tstart, tend)

                #if code cannot extract sequence from reference, there must be a disagreement between alignment and reference - abort analysis
                if tend > tstart-1 and len(target) < 1:
                    sys.stderr.write("cannot extract reference sequence, abort: %s %s %s\n" % (align.target, tstart-1, tend))
                    sys.exit(100)

            snv_type = None
            if qstart > qend and (tend - tstart) >= 0:
                # no query bases in the gap but target bases present
                size = tend - tstart + 1
                # collapse the query span to a single coordinate
                if align.query_strand == '+':
                    qstart = qend
                else:
                    qend = qstart
                snv_type = "del"
            elif tstart > tend and (qend - qstart) >= 0:
                # no target bases in the gap but query bases present
                size = qend - qstart + 1
                tstart = tend
                snv_type = "ins"
            else:
                # NOTE(review): min() caps size at 1 here, so cutoff can only
                # reject an 'indel' when cutoff < 1 -- confirm this is intended
                size = min(1, tend - tstart + 1)
                snv_type = "indel"

            # skip if 0 or negative size event detected (or larger than cutoff)
            if size <= 0 or (cutoff and size > cutoff):
                continue

            target = target.lower()
            query = query.lower()

            # would not report event with non-AGCT characters
            if not re.search('[^agtcATGC]', target) and not re.search('[^agtcATGC]', query):
                if snv_type != 'indel':
                    snv = SNV('psl', snv_type, align.target, tstart, tend, target, align.query_strand, align.query, qstart, qend, query)
                    snvs.append(snv)

                # resolves indels
                else:
                    # same length and query is the reverse or the reverse complement of target
                    if len(query) == len(target) and\
                       (query[::-1].lower() == target.lower() or tools.reverse_complement(query).lower() == target.lower()):
                        # inversion must be longer than 1 base
                        if len(query) > 1:
                            snv = SNV('psl', 'inv', align.target, tstart, tend, target, align.query_strand, align.query, qstart, qend, query)
                            snvs.append(snv)
                        # 1 bp gap in both query and target == snv
                        else:
                            snv = SNV('psl', 'snv', align.target, tstart, tend, target, align.query_strand, align.query, qstart, qend, query)
                            snvs.append(snv)

                    # breaks up indel into ins and del
                    else:
                        # the deletion is anchored on a single query coordinate
                        if align.query_strand == '+':
                            qcoord = qstart
                        else:
                            qcoord = qend
                        snv = SNV('psl', 'del', align.target, tstart, tend, target, align.query_strand, align.query, qcoord, qcoord, query)
                        snvs.append(snv)

                        # the insertion is anchored on a single target coordinate
                        tcoord = tstart
                        snv = SNV('psl', 'ins', align.target, tcoord, tcoord, target, align.query_strand, align.query, qstart, qend, query)
                        snvs.append(snv)

    return snvs
def match_intron(self, ss, splice_motifs):
    """Determines whether a splice site corresponds to an intron

    A site matches when its lower-cased sequence, or the lower-cased
    reverse complement of it, is a key of splice_motifs.  An empty or
    missing site never matches.
    """
    if not ss:
        return False
    if ss.lower() in splice_motifs:
        return True
    return tools.reverse_complement(ss).lower() in splice_motifs
def expand_contig_region(self, contig_sequence, query_strand):
    """Expand read-support checking region if repeats are involved

    Only 'ins', 'dup', 'ITD', 'PTD' and 'del' events are handled.  The
    inserted (var_seq) or deleted (ref_seq) sequence is used as repeat
    unit; the contig is scanned away from the event in both directions for
    as long as it keeps repeating that unit, and
    self.confirm_contig_region is widened in place by self.snv_len for
    every whole copy found.  For queries not on the '+' strand each contig
    base is passed through tools.reverse_complement() before comparing and
    the scan directions are mirrored.

    When the region changes, self.expansion is set to the region length in
    units of self.snv_len and a message is written to stderr.

    Args:
        contig_sequence: sequence of the contig carrying the event.
        query_strand: '+' or anything else for the opposite strand.

    Returns:
        None in all cases.
    """
    if self.snv_type != 'del' and self.snv_type not in ('ins', 'dup', 'ITD', 'PTD'):
        return None
    is_deletion = self.snv_type == 'del'

    seq = self.ref_seq[:] if is_deletion else self.var_seq[:]
    # skip if deleted/inserted sequence is longer than contig sequence, or empty
    if len(seq) > len(contig_sequence) or len(seq) == 0:
        return None

    # a single base is trivially a homopolymer; check that first
    homopolymer = len(seq) == 1 or tools.is_homopolymer(seq)

    # keep a record of previous value for reporting expansion
    region = self.confirm_contig_region
    region_before = region[:]

    # arbitrary big number
    limit = 100000
    contig_len = len(contig_sequence)
    plus = query_strand == '+'
    var_start = int(self.var_start)
    var_end = int(self.var_end)

    def base_at(index):
        """Returns the upper-case base at a 0-based contig index, None if outside"""
        if 0 <= index < contig_len:
            base = contig_sequence[index]
            if not plus:
                base = tools.reverse_complement(base)
            return base.upper()
        return None

    def repeat_run(unit, origin, step):
        """Counts consecutive contig bases from origin that keep repeating unit"""
        run = 0
        while run < limit:
            expected = unit[0] if homopolymer else unit[run % len(unit)]
            if expected.upper() != base_at(origin + step * run):
                break
            run += 1
        return run

    # (first 0-based contig index, step) of the scan in each direction
    if plus:
        downstream = (var_end, 1)
        upstream = (var_start - 1, -1) if is_deletion else (var_start - 2, -1)
    elif is_deletion:
        downstream = (var_end - 2, -1)
        upstream = (var_start - 1, 1)
    else:
        downstream = (var_start - 2, -1)
        upstream = (var_end, 1)

    # forward
    multiples = repeat_run(seq, downstream[0], downstream[1]) // len(seq)
    if multiples > 0:
        if plus:
            region[1] += multiples * self.snv_len
        else:
            region[0] -= multiples * self.snv_len

    # reverse: the unit is read backwards when walking upstream
    multiples = repeat_run(seq[::-1], upstream[0], upstream[1]) // len(seq)
    if multiples > 0:
        if plus:
            region[0] -= multiples * self.snv_len
        else:
            region[1] += multiples * self.snv_len

    # coordinate given in 1-based
    expanded_sequence = contig_sequence[region[0] - 1:region[1]]

    if region_before[0] != region[0] or region_before[1] != region[1]:
        self.expansion = (region[1] - region[0] + 1) // self.snv_len
        sys.stderr.write("expand confirm contig region %s %s -> %s %s %s %sx\n" % (self.var, region_before, region, expanded_sequence, len(expanded_sequence), self.expansion))
def correct_neighbor_gaps(self, splice_motifs, refseq):
    """Post-process blocks using fix_neighbor_gaps()

    Collects the gaps whose splice site is not a key of splice_motifs
    (directly or reverse-complemented) and whose flanking query blocks are
    adjacent (query coordinates differ by exactly 1).  For each of them,
    fix_neighbor_gaps() is tried on the block triple starting at the gap
    and, if that fails, on the triple ending at it.  Accepted fixes are
    applied to copies of the block lists, which replace self.blocks,
    self.query_blocks and self.splice_sites only when check_corrections()
    approves.  self.mismatch is set to 1 if it was empty or 0.

    Args:
        splice_motifs: mapping keyed by splice motif.
        refseq: reference sequence accessor passed to fix_neighbor_gaps().
    """
    # gap index -> 0 (still to fix) or 1 (fixed)
    gaps = {}
    for i in range(len(self.blocks)-1):
        ss = self.splice_sites[i]
        # NOTE(review): ss is looked up as-is while its reverse complement is
        # lower-cased -- presumably splice sites are stored lower-case; confirm
        if ss and not splice_motifs.has_key(ss) and not splice_motifs.has_key(tools.reverse_complement(ss).lower()):
            if abs(self.query_blocks[i+1][0] - self.query_blocks[i][1]) == 1:
                gaps[i] = 0

    if gaps:
        # work on copies; originals are replaced only after validation
        target_blocks = self.blocks[:]
        query_blocks = self.query_blocks[:]
        splice_sites = self.splice_sites[:]

        gap_indices = gaps.keys()
        gap_indices.sort(lambda x,y: x-y)

        # fix by moving exon and then shuffle
        # replaced: 'i j k' block-index key -> tuple of
        # (tblock1, tblock3, qblock1, qblock3, splice_site[, new_block])
        replaced = {}
        replaced_ordered = []
        for i in range(len(gap_indices)):
            i1 = gap_indices[i]
            i2 = i1 + 1
            i0 = i1 - 1

            # already fixed as part of a neighboring gap
            if gaps[i1] == 1:
                continue

            # forward: blocks i1, i1+1, i1+2
            if i2 + 1 < len(target_blocks):
                tblock1 = target_blocks[i1][:]
                tblock2 = target_blocks[i2][:]
                tblock3 = target_blocks[i2+1][:]
                qblock1 = query_blocks[i1][:]
                qblock2 = query_blocks[i2][:]
                qblock3 = query_blocks[i2+1][:]

                splice_site, new_block = self.fix_neighbor_gaps(tblock1, tblock2, tblock3, qblock1, qblock2, qblock3, splice_motifs, refseq, self.query_strand)

                if splice_site:
                    idx = ' '.join((str(i1), str(i2), str(i2+1)))
                    if new_block:
                        replaced[idx] = tblock1, tblock3, qblock1, qblock3, splice_site, new_block
                    else:
                        replaced[idx] = tblock1, tblock3, qblock1, qblock3, splice_site
                    gaps[i1] = 1
                    if gaps.has_key(i2):
                        gaps[i2] = 1
                    replaced_ordered.append(idx)

            # if not fixed, try backward
            if gaps[i1] == 0 and i0 >= 0 and i2 <= len(target_blocks) and (not gaps.has_key(i0) or gaps[i0] == 0):
                tblock1 = target_blocks[i0][:]
                tblock2 = target_blocks[i1][:]
                tblock3 = target_blocks[i2][:]
                qblock1 = query_blocks[i0][:]
                qblock2 = query_blocks[i1][:]
                qblock3 = query_blocks[i2][:]

                splice_site, new_block = self.fix_neighbor_gaps(tblock1, tblock2, tblock3, qblock1, qblock2, qblock3, splice_motifs, refseq, self.query_strand)

                if splice_site:
                    idx = ' '.join((str(i0), str(i1), str(i2)))
                    if new_block:
                        replaced[idx] = tblock1, tblock3, qblock1, qblock3, splice_site, new_block
                    else:
                        replaced[idx] = tblock1, tblock3, qblock1, qblock3, splice_site
                    gaps[i1] = 1
                    if gaps.has_key(i0):
                        gaps[i0] = 1
                    replaced_ordered.append(idx)

        # make sure delete from back to front
        replaced_ordered.reverse()
        for indices in replaced_ordered:
            new_blocks = replaced[indices]

            # NOTE(review): gaps only ever holds 0 or 1 in this method, so the
            # "> 1" test below cannot fail and ok stays True
            ok = True
            for index in indices.split(' '):
                if gaps.has_key(int(index)) and gaps[int(index)] > 1:
                    ok = False
                    break

            if ok:
                idx = [int(i) for i in indices.split(' ')]
                sys.stderr.write("Type3 %s changed blocks %s %s to %s %s\n" % (self.query, self.target, self.blocks[idx[0]], new_blocks[0], new_blocks[4]))
                sys.stderr.write("Type3 %s changed blocks %s %s to %s %s\n" % (self.query, self.target, self.blocks[idx[2]], new_blocks[1], new_blocks[4]))
                # a 5-tuple carries no new_block: the middle block is dropped
                if len(new_blocks) == 5:
                    sys.stderr.write("Type3 %s removed block %s %s\n" % (self.query, self.target, self.blocks[idx[1]]))
                else:
                    sys.stderr.write("Type3 %s changed blocks %s %s to %s\n" % (self.query, self.target, self.blocks[idx[1]], new_blocks[-1]['target']))

                target_blocks[idx[0]] = new_blocks[0]
                target_blocks[idx[2]] = new_blocks[1]
                query_blocks[idx[0]] = new_blocks[2]
                query_blocks[idx[2]] = new_blocks[3]

                if len(new_blocks) == 5:
                    del target_blocks[idx[1]]
                    del query_blocks[idx[1]]
                    del splice_sites[idx[1]]
                    splice_sites[idx[0]] = new_blocks[4]
                else:
                    target_blocks[idx[1]] = new_blocks[-1]['target']
                    query_blocks[idx[1]] = new_blocks[-1]['query']
                    # the new splice site goes to the wider side of the middle
                    # block, the other side is marked 'NA'
                    if target_blocks[idx[1]][0] - target_blocks[idx[0]][1] < target_blocks[idx[2]][0] - target_blocks[idx[1]][0]:
                        splice_sites[idx[0]] = 'NA'
                        splice_sites[idx[1]] = new_blocks[4]
                    else:
                        splice_sites[idx[0]] = new_blocks[4]
                        splice_sites[idx[1]] = 'NA'

        # commit only if something changed and the corrected blocks validate
        if target_blocks != self.blocks and self.check_corrections(query_blocks, target_blocks, self.query_strand, '+', self.query):
            self.blocks = target_blocks[:]
            self.query_blocks = query_blocks[:]
            self.splice_sites = splice_sites
            if not self.mismatch or int(self.mismatch) == 0:
                self.mismatch = 1
def correct_single_gaps(self, splice_motifs, refseq):
    """Post-process blocks using fix_single_gap()

    For every gap whose splice site is not a key of splice_motifs (directly
    or reverse-complemented), the unaligned query positions inside the gap
    are collected and fix_single_gap() is asked to re-establish a canonical
    splice site.  Adjusted and newly created blocks are applied to copies
    of the block lists, which replace self.blocks, self.query_blocks and
    self.splice_sites only when check_corrections() approves.
    self.mismatch is set to 1 if it was empty or 0.

    Args:
        splice_motifs: mapping keyed by splice motif.
        refseq: reference sequence accessor passed to fix_single_gap().
    """
    # gap index -> list of query positions lying strictly between the two blocks
    gaps = {}
    for i in range(len(self.blocks)-1):
        ss = self.splice_sites[i]
        # NOTE(review): ss is looked up as-is while its reverse complement is
        # lower-cased -- presumably splice sites are stored lower-case; confirm
        if ss and not splice_motifs.has_key(ss) and not splice_motifs.has_key(tools.reverse_complement(ss).lower()):
            gaps[i] = []
            if self.query_blocks[i][1] < self.query_blocks[i+1][0]:
                for j in range(self.query_blocks[i][1]+1, self.query_blocks[i+1][0]):
                    gaps[i].append(j)
            else:
                # query coordinates decrease from block i to block i+1
                for j in range(self.query_blocks[i][1]-1, self.query_blocks[i+1][0], -1):
                    gaps[i].append(j)

    if gaps:
        # work on copies; originals are replaced only after validation
        target_blocks = self.blocks[:]
        query_blocks = self.query_blocks[:]
        splice_sites = self.splice_sites[:]

        gap_indices = gaps.keys()
        gap_indices.sort(lambda x,y: x-y)

        # gap index -> new block returned by fix_single_gap()
        changed_blocks = {}
        for i in gap_indices:
            # copies are handed over and written back below once a splice
            # site is reported
            tblock1 = target_blocks[i][:]
            tblock2 = target_blocks[i+1][:]
            qblock1 = query_blocks[i][:]
            qblock2 = query_blocks[i+1][:]

            splice_site, new_block = self.fix_single_gap(tblock1, tblock2, qblock1, qblock2, splice_motifs, refseq, self.query_strand, gaps[i])

            if splice_site:
                sys.stderr.write("Type2a %s changed blocks %s %s to %s %s\n" % (self.query, self.target, target_blocks[i], tblock1, splice_site))
                sys.stderr.write("Type2a %s changed blocks %s %s to %s %s\n" % (self.query, self.target, target_blocks[i+1], tblock2, splice_site))
                target_blocks[i] = tblock1
                target_blocks[i+1] = tblock2
                query_blocks[i] = qblock1
                query_blocks[i+1] = qblock2
                splice_sites[i] = splice_site

                if new_block:
                    changed_blocks[i] = new_block

        if changed_blocks:
            # highest index first so insertions do not shift pending indices
            changed_blocks_indices = changed_blocks.keys()
            changed_blocks_indices.sort(lambda x,y: y-x)
            for i in changed_blocks_indices:
                # NOTE(review): -1 is presumably a "remove block" sentinel
                # produced outside this method -- confirm where it originates
                if changed_blocks[i] != -1:
                    sys.stderr.write("Type2b %s add block %s %s %s %s\n" % (self.query, self.target, changed_blocks[i]['target'], changed_blocks[i]['query'], 'NA'))
                    # the 'NA' splice site goes on the side where the new
                    # block lies closer to its neighbor
                    if abs(target_blocks[i][1] - changed_blocks[i]['target'][0]) < abs(target_blocks[i+1][0] - changed_blocks[i]['target'][1]):
                        splice_sites.insert(i, 'NA')
                    else:
                        splice_sites.insert(i+1, 'NA')

                    query_blocks.insert(i+1, changed_blocks[i]['query'])
                    target_blocks.insert(i+1, changed_blocks[i]['target'])
                else:
                    del query_blocks[i]

        # commit only if something changed and the corrected blocks validate
        if target_blocks != self.blocks and self.check_corrections(query_blocks, target_blocks, self.query_strand, '+', self.query):
            self.blocks = target_blocks[:]
            self.query_blocks = query_blocks[:]
            self.splice_sites = splice_sites
            if not self.mismatch or int(self.mismatch) == 0:
                self.mismatch = 1