Esempio n. 1
0
 def upshift(self, refseq):
     """Shift the event start coordinate upstream for a repeat-involved event.

     Only insertion-like events ('ins', 'dup', 'ITD', 'PTD') and deletions
     ('del') are processed; any other ``snv_type`` is left untouched.  The
     inserted sequence (``var_seq``) or deleted sequence (``ref_seq``) is
     compared against the reference immediately upstream, one repeat unit
     at a time, and the event is moved to the most upstream position where
     the unit still repeats.

     Args:
         refseq: object exposing ``GetSequence(ref, start, end)``; the
             returned string is compared case-insensitively with the
             repeat unit (coordinates presumably 1-based inclusive --
             TODO confirm against the refseq implementation).

     Returns:
         None.  ``self.ref_start`` and ``self.ref_end`` are updated in
         place when a shift occurs, and a message is written to stderr.
     """
     is_insertion = self.snv_type in ('ins', 'dup', 'ITD', 'PTD')
     if not is_insertion and self.snv_type != 'del':
         return

     event_seq = self.var_seq if is_insertion else self.ref_seq

     # skip if 0-size event (checked before any indexing into the sequence)
     if not event_seq:
         sys.stderr.write("error in upshift size 0 contig:%s %s\n" % (self.var, self.ref_seq))
         return

     # a homopolymer is walked one base at a time; a single base is its
     # own repeat unit, so the helper is only consulted for longer events
     if len(event_seq) > 1 and tools.is_homopolymer(event_seq):
         unit = event_seq[0]
     else:
         unit = event_seq
     size = len(unit)
     unit_upper = unit.upper()

     # normalise once: ref_start may be stored as a string, and comparing
     # an int against a string gives a meaningless (Python 2) or failing
     # (Python 3) result
     original_start = int(self.ref_start)
     start = original_start - size

     # keep stepping upstream while the preceding unit-sized window is
     # another copy of the repeat unit
     while start > 1:
         upstream = refseq.GetSequence(self.ref, start + 1, start + size)
         if unit_upper != upstream.upper():
             break
         start -= size

     # changed reference start coordinate (and end coordinate too if deletion)
     shifted = start + size
     if shifted < original_start:
         if is_insertion:
             sys.stderr.write("shifted %s %s %s %s to %d\n" % (self.var, self.snv_type, self.ref, self.ref_start, shifted))
             self.ref_start = self.ref_end = shifted
         else:
             sys.stderr.write("shifted %s %s %s %s to %d\n" % (self.var, self.snv_type, self.ref, self.ref_start, shifted + 1))
             self.ref_start = shifted + 1
             self.ref_end = self.ref_start + len(self.ref_seq) - 1
Esempio n. 2
0
    def expand_contig_region(self, contig_sequence, query_strand):
        """Expand read-support checking region if repeats are involved.

        For insertion-like events ('ins', 'dup', 'ITD', 'PTD') the inserted
        sequence (``var_seq``) is used as the repeat unit; for deletions
        ('del') the deleted sequence (``ref_seq``) is used.  The contig is
        scanned base by base on both sides of the event, and for every
        complete copy of the unit found, ``self.confirm_contig_region``
        (a 1-based ``[start, end]`` pair) is widened by ``self.snv_len``.

        Args:
            contig_sequence: contig sequence, indexed 0-based.
            query_strand: '+' scans the contig as is; any other value scans
                in the opposite direction and reverse-complements each
                contig base before comparing.

        Returns:
            None.  ``self.confirm_contig_region`` is modified in place and,
            when it changed, ``self.expansion`` is set and a message is
            written to stderr.
        """
        if self.snv_type in ('ins', 'dup', 'ITD', 'PTD'):
            is_insertion = True
            seq = self.var_seq
        elif self.snv_type == 'del':
            is_insertion = False
            seq = self.ref_seq
        else:
            return None

        # skip if the event is empty or longer than the contig sequence
        unit_len = len(seq)
        contig_len = len(contig_sequence)
        if unit_len == 0 or unit_len > contig_len:
            return None

        # a single base is trivially a homopolymer, no need to ask the helper
        homo = unit_len == 1 or tools.is_homopolymer(seq)

        # keep a record of previous value for reporting expansion
        region_before = self.confirm_contig_region[:]

        # arbitrary big number capping how far a scan may run
        limit = 100000

        plus_strand = query_strand == '+'
        var_start = int(self.var_start)
        var_end = int(self.var_end)

        def count_matches(unit, origin, step):
            # Count consecutive contig bases, starting at 0-based index
            # `origin` and moving by `step`, that continue the cyclic
            # repetition of `unit`.  Stops at the first mismatch or as soon
            # as the index leaves the contig.
            matched = 0
            pos = origin
            while matched < limit and 0 <= pos < contig_len:
                expected = unit[0] if homo else unit[matched % unit_len]
                base = contig_sequence[pos]
                if not plus_strand:
                    base = tools.reverse_complement(base)
                if expected.upper() != base.upper():
                    break
                matched += 1
                pos += step
            return matched

        # forward: bases following the event in query orientation
        if plus_strand:
            origin, step = var_end, 1
        elif is_insertion:
            origin, step = var_start - 2, -1
        else:
            origin, step = var_end - 2, -1

        # floor division: only complete copies of the unit count, and the
        # region bounds must stay integers to remain usable as slice indices
        multiples = count_matches(seq, origin, step) // unit_len
        if multiples > 0:
            if plus_strand:
                self.confirm_contig_region[1] += multiples * self.snv_len
            else:
                self.confirm_contig_region[0] -= multiples * self.snv_len

        # reverse: bases preceding the event, matched against the reversed unit
        if not is_insertion:
            origin = var_start - 1
        elif plus_strand:
            origin = var_start - 2
        else:
            origin = var_end
        step = -1 if plus_strand else 1

        multiples = count_matches(seq[::-1], origin, step) // unit_len
        if multiples > 0:
            if plus_strand:
                self.confirm_contig_region[0] -= multiples * self.snv_len
            else:
                self.confirm_contig_region[1] += multiples * self.snv_len

        # coordinate given in 1-based
        region_start, region_end = self.confirm_contig_region[0], self.confirm_contig_region[1]
        expanded_sequence = contig_sequence[region_start - 1:region_end]

        if region_before[0] != region_start or region_before[1] != region_end:
            self.expansion = (region_end - region_start + 1) // self.snv_len
            sys.stderr.write("expand confirm contig region %s %s -> %s %s %s %sx\n" % (self.var, region_before, self.confirm_contig_region, expanded_sequence, len(expanded_sequence), self.expansion))