def upshift(self, refseq):
    """Shift an indel to the most upstream equivalent position in a repeat.

    Only insertion-like events ('ins', 'dup', 'ITD', 'PTD') and deletions
    ('del') are processed; any other event type is left untouched.  The
    repeat unit is the inserted sequence (``var_seq``) or the deleted
    sequence (``ref_seq``), collapsed to a single base when the sequence is
    a homopolymer.  The reference is walked upstream one unit at a time for
    as long as it keeps matching the unit (case-insensitive).

    Coordinate convention, as implied by the assignments at the end of this
    method: an insertion has ``ref_start == ref_end`` (the reference base
    the inserted sequence follows), a deletion spans
    ``ref_start..ref_end`` inclusive.

    Args:
        refseq: object exposing ``GetSequence(ref, start, end)``.  It is
            called with ``start + 1`` and ``start + size`` to fetch ``size``
            bases, i.e. the coordinates are treated as inclusive.

    Returns:
        None.  ``self.ref_start`` / ``self.ref_end`` are updated in place
        when the event moves, and a message is written to stderr.
    """
    if self.snv_type in ('ins', 'dup', 'ITD', 'PTD'):
        is_insertion = True
        event_seq = self.var_seq
    elif self.snv_type == 'del':
        is_insertion = False
        event_seq = self.ref_seq
    else:
        return

    # skip if 0-size event; checked before anything indexes into the sequence
    if not event_seq:
        sys.stderr.write("error in upshift size 0 contig:%s %s\n" % (self.var, self.ref_seq))
        return

    # A single base is its own repeat unit, so the homopolymer helper is only
    # needed for longer sequences.
    if len(event_seq) == 1 or tools.is_homopolymer(event_seq):
        unit = event_seq[0].upper()
    else:
        unit = event_seq.upper()
    size = len(unit)

    # ref_start may not be an int yet (the original code converted it for the
    # arithmetic only); normalise once so every comparison below is int vs int.
    ref_start = int(self.ref_start)

    # "anchor" is the last reference base upstream of the event: the base the
    # insertion follows, or the base just before the first deleted base.
    # Anchoring deletions on ref_start itself would make the first window
    # overlap the deleted sequence and misalign multi-base repeat units.
    if is_insertion:
        anchor = ref_start
    else:
        anchor = ref_start - 1

    # keep checking upstream sequence for as long as it repeats the unit
    start = anchor - size
    while start > 1:
        upstream = refseq.GetSequence(self.ref, start + 1, start + size)
        if unit != upstream.upper():
            break
        start -= size

    new_anchor = start + size
    if new_anchor >= anchor:
        # no upstream copy of the unit was found: nothing moves
        return

    # changed reference start coordinate (and end coordinate too if deletion)
    if is_insertion:
        sys.stderr.write("shifted %s %s %s %s to %d\n" % (self.var, self.snv_type, self.ref, self.ref_start, new_anchor))
        self.ref_start = self.ref_end = new_anchor
    else:
        new_start = new_anchor + 1
        sys.stderr.write("shifted %s %s %s %s to %d\n" % (self.var, self.snv_type, self.ref, self.ref_start, new_start))
        self.ref_start = new_start
        self.ref_end = self.ref_start + len(self.ref_seq) - 1
def expand_contig_region(self, contig_sequence, query_strand):
    """Expand the read-support checking region if repeats are involved.

    For insertion-like events ('ins', 'dup', 'ITD', 'PTD') the repeat unit is
    ``var_seq``; for deletions ('del') it is ``ref_seq``.  The contig is
    scanned base by base on both sides of the event for further copies of
    that unit, and ``self.confirm_contig_region`` (a 1-based ``[start, end]``
    pair) is widened in place by ``self.snv_len`` for every *complete* copy
    found.  When the region changed, ``self.expansion`` is set to the region
    length divided by ``self.snv_len`` and a message is written to stderr.

    Args:
        contig_sequence: contig sequence the event coordinates refer to.
        query_strand: '+' scans the contig as is; any other value scans in
            the opposite direction and reverse-complements each contig base
            before comparing.

    Returns:
        None in every case.  Nothing is modified when the event is not an
        indel, when the event sequence is empty, or when it is longer than
        the contig.
    """
    if self.snv_type in ('ins', 'dup', 'ITD', 'PTD'):
        is_insertion = True
        seq = self.var_seq[:]
    elif self.snv_type == 'del':
        is_insertion = False
        seq = self.ref_seq[:]
    else:
        return None

    # skip if deleted/inserted sequence is empty or longer than contig sequence
    if len(seq) == 0 or len(seq) > len(contig_sequence):
        return None

    # a single base trivially repeats itself; longer sequences need the helper
    homopolymer = len(seq) == 1 or tools.is_homopolymer(seq)

    # keep a record of previous value for reporting expansion
    region_before = self.confirm_contig_region[:]

    # arbitrary big number bounding how far either scan may run
    limit = 100000

    forward_strand = query_strand == '+'
    contig_len = len(contig_sequence)
    var_start = int(self.var_start)
    var_end = int(self.var_end)

    def count_matches(unit, origin, step):
        """Count consecutive contig bases from ``origin`` that continue ``unit``."""
        matched = 0
        for i in range(limit):
            if homopolymer:
                expected = unit[0].upper()
            else:
                expected = unit[i % len(unit)].upper()
            index = origin + step * i
            # running off either end of the contig ends the scan
            if index < 0 or index >= contig_len:
                break
            base = contig_sequence[index]
            if not forward_strand:
                base = tools.reverse_complement(base)
            if expected != base.upper():
                break
            matched += 1
        return matched

    # 0-based contig index of the first base examined on each side, and the
    # direction in which the scan proceeds
    if forward_strand:
        downstream = (var_end, 1)
        if is_insertion:
            upstream = (var_start - 2, -1)
        else:
            upstream = (var_start - 1, -1)
    else:
        if is_insertion:
            downstream = (var_start - 2, -1)
            upstream = (var_end, 1)
        else:
            downstream = (var_end - 2, -1)
            upstream = (var_start - 1, 1)

    # forward: only whole copies of the unit count, hence floor division
    # (plain "/" would produce a float, and float bounds, on Python 3)
    multiples = count_matches(seq, downstream[0], downstream[1]) // len(seq)
    if multiples > 0:
        if forward_strand:
            self.confirm_contig_region[1] += multiples * self.snv_len
        else:
            self.confirm_contig_region[0] -= multiples * self.snv_len

    # reverse: walking away from the event reads the unit back to front
    multiples = count_matches(seq[::-1], upstream[0], upstream[1]) // len(seq)
    if multiples > 0:
        if forward_strand:
            self.confirm_contig_region[0] -= multiples * self.snv_len
        else:
            self.confirm_contig_region[1] += multiples * self.snv_len

    # coordinate given in 1-based
    expanded_sequence = contig_sequence[self.confirm_contig_region[0] - 1:self.confirm_contig_region[1]]

    if region_before[0] != self.confirm_contig_region[0] or region_before[1] != self.confirm_contig_region[1]:
        self.expansion = (self.confirm_contig_region[1] - self.confirm_contig_region[0] + 1) // self.snv_len
        sys.stderr.write("expand confirm contig region %s %s -> %s %s %s %sx\n" % (self.var, region_before, self.confirm_contig_region, expanded_sequence, len(expanded_sequence), self.expansion))