Example #1
0
    def get_splice_sites(self, block, left=False, right=False, whole=False):
        """Report the splice-site bases and motif name flanking an alignment block.

        Args:
            block: 1-based block number within ``self.align.blocks``.
            left: report the junction before the block
                (``self.align.splice_sites[block - 2]``); skipped for block 1.
            right: report the junction after the block
                (``self.align.splice_sites[block - 1]``); skipped for the
                last block.
            whole: upper-case all four bases instead of lower-casing the two
                bases on the side of the junction away from the block.

        Returns:
            A list of strings shaped like ``gtAG(motif)``, left junction
            first.  The motif is ``'?'`` when no orientation is known or the
            (orientation-corrected) site is not a key of
            ``Event.splice_motifs``.  The list is empty, and a message is
            printed, when the alignment carries no splice sites.
        """
        splice_sites = self.align.splice_sites
        if not splice_sites:
            print('cannot extract splice sites: %s' % (self.align.query))
            return []

        # the transcript strand, when a transcript is attached, takes
        # precedence over the orientation stored on the alignment
        orient = None
        if self.txt:
            if self.txt.strand in ('+', '-'):
                orient = self.txt.strand
        elif self.align.orient:
            orient = self.align.orient

        def lookup_motif(splice_site):
            # resolved separately for every junction so that a hit on the
            # left junction is never reported for an unmatched right junction
            if orient == '+':
                key = splice_site
            elif orient == '-':
                key = reverse_complement(splice_site)
            else:
                return '?'
            if key in Event.splice_motifs:
                return Event.splice_motifs[key]
            return '?'

        ss = []
        if left and int(block) > 1:
            splice_site = splice_sites[int(block) - 2]
            first_two = splice_site[:2].upper() if whole else splice_site[:2].lower()
            ss.append('%s%s(%s)' % (first_two, splice_site[-2:].upper(), lookup_motif(splice_site)))

        if right and int(block) < len(self.align.blocks):
            splice_site = splice_sites[int(block) - 1]
            last_two = splice_site[-2:].upper() if whole else splice_site[-2:].lower()
            ss.append('%s%s(%s)' % (splice_site[:2].upper(), last_two, lookup_motif(splice_site)))

        return ss
Example #2
0
    def set_orient(self, splice_motifs):
        """Derive the contig orientation ('+', '-' or None) from its splice sites.

        Every entry of ``self.splice_sites`` is classified as forward (equal
        to a lower-cased motif key), backward (equal to the lower-cased
        reverse complement of a motif key) or unknown.  ``self.orient``
        becomes '+' when there are forward hits, no backward hits, and more
        forward hits than unknowns; '-' in the mirrored situation; None
        otherwise.  When ``splice_motifs`` is empty the method returns
        without touching ``self.orient``.
        """
        if not splice_motifs:
            return None

        motif_keys = splice_motifs.keys()
        sense = [key.lower() for key in motif_keys]
        antisense = [tools.reverse_complement(key).lower() for key in motif_keys]

        n_forward = n_backward = n_unknown = 0
        for site in self.splice_sites:
            if site in sense:
                n_forward += 1
            elif site in antisense:
                n_backward += 1
            else:
                n_unknown += 1

        # n_unknown is never negative, so "more hits than unknowns" already
        # guarantees at least one hit on the winning side
        if n_backward == 0 and n_forward > n_unknown:
            self.orient = '+'
        elif n_forward == 0 and n_backward > n_unknown:
            self.orient = '-'
        else:
            self.orient = None
Example #3
0
 def fix_neighbor_gaps(self, tblock1, tblock2, tblock3, qblock1, qblock2, qblock3, splice_motifs, refseq, query_strand):
     """Fix neighboring gaps if necessary to see if canonical splice sites can be re-established.

     In case of consecutive gaps that don't have canonical splice sites,
     move the middle block to either end to see if a canonical splice site
     can be achieved (with or without shuffling after movement).

     tblock1/tblock3 and qblock1/qblock3 are modified in place when the
     middle block can simply be merged into one of its neighbors; otherwise
     the work is delegated to fix_single_gap() with the middle block's query
     positions passed as extra sequence.

     Returns a (splice_site, new_block) tuple.  splice_site is falsy
     (False or None) when nothing could be fixed; new_block is None unless
     fix_single_gap() produced a block with a non-empty 'query' entry.
     """
     from functools import cmp_to_key

     middle_block_size = tblock2[1] - tblock2[0] + 1
     max_shuffle_size = 10
     if middle_block_size > max_shuffle_size:
         return False, None

     tgap = [tblock1[1]+1, tblock3[0]-1]
     new_block = {'query':[], 'target':[]}

     # try pulling the whole middle block to the right neighbor (negative
     # shuffle shortens the gap's end) and to the left neighbor (positive
     # shuffle moves the gap's start)
     possible_shuffles = []
     for shuffle in (-1 * middle_block_size, middle_block_size):
         if shuffle > 0:
             left_shuffle, right_shuffle = shuffle, 0
         else:
             left_shuffle, right_shuffle = 0, shuffle
         coord = tgap[0] + left_shuffle, tgap[1] + right_shuffle
         gap_seq = refseq.GetSequence(self.target, coord[0], coord[1])
         ss = gap_seq[:2] + gap_seq[-2:]

         if ss.lower() in splice_motifs or tools.reverse_complement(ss).lower() in splice_motifs:
             possible_shuffles.append({'motif':ss.lower(), 'left':left_shuffle, 'right':right_shuffle, 'shuffle_size':abs(left_shuffle) + abs(right_shuffle)})

     splice_site = None
     left_shuffle = right_shuffle = None

     if possible_shuffles:
         # compare_shuffles is a cmp-style comparator; cmp_to_key keeps the
         # same ordering on both Python 2.7 and Python 3
         possible_shuffles.sort(key=cmp_to_key(self.compare_shuffles))
         best = possible_shuffles[0]
         splice_site = best['motif']
         left_shuffle = best['left']
         right_shuffle = best['right']

     if splice_site and left_shuffle is not None and right_shuffle is not None:
         if right_shuffle == 0:
             # middle block joins the left neighbor
             tblock1[1] += left_shuffle
             if query_strand == '+':
                 qblock1[1] += left_shuffle
             else:
                 qblock1[1] -= left_shuffle
         else:
             # middle block joins the right neighbor
             tblock3[0] += right_shuffle
             if query_strand == '+':
                 qblock3[0] += right_shuffle
             else:
                 qblock3[0] -= right_shuffle
     else:
         splice_site, new_block = self.fix_single_gap(tblock1, tblock3, qblock1, qblock3, splice_motifs, refseq, query_strand, extra_query=range(qblock2[0], qblock2[1]+1))

     if new_block is not None and new_block['query']:
         return splice_site, new_block
     return splice_site, None
Example #4
0
    def fix_single_gap(self, tblock1, tblock2, qblock1, qblock2, splice_motifs, refseq, query_strand, extra_query=None):
        """Shuffles sequence from end to end to see if canonical splice sites can be achieved.

        Every combination of moving the gap start and gap end by up to 10
        bases in the same direction is checked against splice_motifs (either
        strand).  Matching candidates are ordered with self.compare_shuffles
        and handed to self.shuffle() one by one until one succeeds.

        Args:
            tblock1, tblock2: target blocks on either side of the gap.
            qblock1, qblock2: the corresponding query blocks.
            splice_motifs: mapping keyed by lower-case 4-base splice motif.
            refseq: object providing GetSequence(target, start, end).
            query_strand: passed through to self.shuffle().
            extra_query: query positions that may be redistributed; when
                empty, candidates that leave one end unmoved are skipped.

        Returns:
            (motif, new_block) for the first candidate self.shuffle()
            accepts, otherwise (False, None).  Gaps shorter than 10 or
            longer than 100000 bases are never attempted.
        """
        from functools import cmp_to_key

        if extra_query is None:
            extra_query = []

        max_shuffle_size = 10

        tgap = [tblock1[1]+1, tblock2[0]-1]
        tsize = tgap[1] - tgap[0] + 1

        min_size = 10
        max_size = 100000
        if tsize < min_size or tsize > max_size:
            return False, None

        has_extra_query = len(extra_query) > 0
        shuffle_sizes = range(-1 * max_shuffle_size, max_shuffle_size + 1)

        possible_shuffles = []
        for left_shuffle in shuffle_sizes:
            for right_shuffle in shuffle_sizes:
                product = left_shuffle * right_shuffle
                # skip shuffle in opposite directions, and cases where one
                # side is 0 but there is no extra sequence to move
                if product < 0 or (product == 0 and not has_extra_query):
                    continue

                coord = tgap[0] + left_shuffle, tgap[1] + right_shuffle
                gap_seq = refseq.GetSequence(self.target, coord[0], coord[1])
                ss = gap_seq[:2] + gap_seq[-2:]

                if ss.lower() in splice_motifs or tools.reverse_complement(ss).lower() in splice_motifs:
                    possible_shuffles.append({'motif':ss.lower(), 'left':left_shuffle, 'right':right_shuffle, 'shuffle_size':abs(left_shuffle) + abs(right_shuffle)})

        if not possible_shuffles:
            return False, None

        # compare_shuffles is a cmp-style comparator
        possible_shuffles.sort(key=cmp_to_key(self.compare_shuffles))

        for candidate in possible_shuffles:
            # each attempt gets its own copy of the extra query positions
            success, new_block = self.shuffle(candidate['left'], candidate['right'], list(extra_query), qblock1, qblock2, tblock1, tblock2, query_strand)
            if success:
                splice_site = candidate['motif']
                print('success %s %s %s %s %s %s' % (splice_site, new_block, qblock1, qblock2, tblock1, tblock2))
                return splice_site, new_block

        # a motif is only reported when a shuffle was actually applied
        return False, None
Example #5
0
def parse_line(line):
    """Turn one tab-separated UCSC dbSNP record into a dict.

    Columns are paired positionally with the module-level ``fields`` names
    (surplus columns or surplus names are ignored).  The derived keys
    ``type``, ``start``, ``end``, ``allele`` and ``size`` are then added.
    A record whose ``observed`` column is ``lengthTooLong`` yields ``{}``.
    """
    values = line.rstrip("\n").split("\t")
    data = dict(zip(fields, values))

    variant_class = data["class"]
    if variant_class == "single":
        variant_type = "snv"
    elif variant_class == "deletion" or "deletion" in data["observed"].lower():
        # 'class' may be 'named' with '(LARGEDELETION)' given instead of the
        # actual allele
        variant_type = "del"
    elif variant_class == "insertion":
        variant_type = "ins"
    else:
        variant_type = "NA"
    data["type"] = variant_type

    # every type except insertions gets chromStart shifted by one
    chrom_start = int(data["chromStart"])
    data["start"] = chrom_start if variant_type == "ins" else chrom_start + 1
    data["end"] = int(data["chromEnd"])
    data["allele"] = {}
    data["size"] = 0

    alleles = data["allele"]
    for observed in data["observed"].split("/"):
        if observed == "-":
            continue

        if data["strand"] == "+":
            alleles[observed.lower()] = True
        elif data["strand"] == "-":
            alleles[reverse_complement(observed).lower()] = True

        # size comes from the first allele that is not '-'
        if data["size"] == 0:
            data["size"] = len(observed)

    # the span is authoritative for deletions because the allele text may
    # be the '(LARGEDELETION)' placeholder
    if variant_type == "del":
        data["size"] = data["end"] - data["start"] + 1

    if data["observed"] == "lengthTooLong":
        return {}

    return data
Example #6
0
def parse_line(line):
    """Parses individual line of UCSC dbSNP file.

    The tab-separated columns are matched positionally against the
    module-level ``fields`` names; columns beyond the known fields are
    dropped.  On top of the raw columns the returned dict carries:

    * ``type``   - 'snv', 'del', 'ins' or 'NA', derived from ``class``
      (and from ``observed`` for deletions)
    * ``start``  - ``chromStart`` for insertions, ``chromStart + 1`` otherwise
    * ``end``    - ``chromEnd`` as int
    * ``allele`` - dict keyed by lower-case allele (reverse-complemented
      when ``strand`` is '-')
    * ``size``   - length of the first allele that is not '-', or the
      start/end span for deletions

    Returns an empty dict when ``observed`` is 'lengthTooLong'.
    """
    cols = line.rstrip('\n').split('\t')

    # zip stops at the shorter sequence, so surplus columns are ignored
    data = dict(zip(fields, cols))

    if data['class'] == 'single':
        data['type'] = 'snv'
    # sometimes 'class' is 'named' and '(LARGEDELETION)' is reported instead
    # of actual allele
    elif data['class'] == 'deletion' or 'deletion' in data['observed'].lower():
        data['type'] = 'del'
    elif data['class'] == 'insertion':
        data['type'] = 'ins'
    else:
        data['type'] = 'NA'

    if data['type'] == 'ins':
        data['start'] = int(data['chromStart'])
    else:
        data['start'] = int(data['chromStart']) + 1
    data['end'] = int(data['chromEnd'])
    data['allele'] = {}
    data['size'] = 0

    for a in data['observed'].split('/'):
        if a != '-':
            if data['strand'] == '+':
                data['allele'][a.lower()] = True
            elif data['strand'] == '-':
                data['allele'][reverse_complement(a).lower()] = True

            if data['size'] == 0:
                data['size'] = len(a)

    # make sure deletion size is correct, as sometimes '(LARGEDELETION)'
    # will be put as allele
    if data['type'] == 'del':
        data['size'] = int(data['end']) - int(data['start']) + 1

    if data['observed'] == 'lengthTooLong':
        data = {}

    return data
Example #7
0
    def find_pep_change(self, refseq):
        """Finds effect on protein sequence given event.

        For 'novel_transcript' events the result of self.longest_orf() is
        returned directly.  For every other event the contig sequence
        spanning the event's alignment blocks is spliced into the
        transcript's cDNA, both the original and the changed cDNA are
        translated, and the comparison from pep_change() is stored in
        self.orf.

        Returns:
            self.longest_orf() for novel transcripts; 'na' when either
            translation is empty or the event is a 'read-through';
            otherwise None (the outcome is left in self.orf).
        """
        if self.event_type == 'novel_transcript':
            return self.longest_orf()

        is_novel_utr = self.event_type == 'novel_utr'
        coord = re.split('[:-]', self.coordinate())

        first_block = self.align_blocks[0]
        last_block = self.align_blocks[-1]
        if is_novel_utr:
            # include the block that's 'matching' (partially)
            if min(self.align_blocks) == 1:
                last_block += 1
            else:
                first_block -= 1
        qcoord1 = self.align.query_blocks[first_block-1][0]
        qcoord2 = self.align.query_blocks[last_block-1][1]

        # query coordinates run backwards when the contig aligns in reverse
        if qcoord1 < qcoord2:
            variant = self.align.contig.sequence[qcoord1-1:qcoord2]
        else:
            variant = reverse_complement(self.align.contig.sequence[qcoord2-1:qcoord1])

        # constructs cDNA sequence of both reference and sequence with event
        cdna_original = self.construct_cdna(coord, self.txt, refseq)
        cdna_changed = self.construct_cdna(coord, self.txt, refseq, variant=variant, change=self.event_type, exons=self.exons)

        # novel UTR events are translated with frame=None instead of frame 0
        frame = None if is_novel_utr else 0
        orient = '+' if self.txt.strand == '+' else '-'
        pep_original = translate(cdna_original, orient=orient, frame=0)
        pep_changed = translate(cdna_changed, orient=orient, frame=frame)

        if not pep_changed or not pep_original or self.event_type == 'read-through':
            return 'na'

        self.orf = pep_change(pep_original, pep_changed)
Example #8
0
    def get_sequence(self, refseq, fasta=False, chrom=None):
        """Extracts transcript sequence.

        Args:
            refseq: object providing GetSequence(chrom, start, end).
            fasta: when true, prefix the sequence with a
                '>name_alias' header line.
            chrom: chromosome name to query; defaults to self.chrom.

        Returns:
            The upper-cased exon sequences concatenated in the order of
            self.exons, reverse-complemented when self.strand is '-'.
        """
        if chrom is None:
            chrom = self.chrom

        # each exon is a sequence whose first and last items are its bounds
        sequence = ''.join(
            refseq.GetSequence(chrom, int(exon[0]), int(exon[-1])).upper()
            for exon in self.exons
        )

        if self.strand == '-':
            sequence = tools.reverse_complement(sequence)

        if fasta:
            sequence = '>%s_%s\n%s' % (self.name, self.alias, sequence)

        return sequence
Example #9
0
 def match_blocks(self, align, query_seq):
     """Identifies SNVs by comparing every aligned block with the reference.

     For each block the query slice (reverse-complemented unless
     align.query_strand is '+') and the reference slice are passed to
     self.find_mismatches(), which is expected to return a mapping of
     offset -> change pair.  change[0] and change[1] are forwarded to the
     SNV constructor in that order.

     Returns a list of SNV objects (empty when nothing differs).
     """
     snvs = []

     for i, tblock in enumerate(align.blocks):
         tstart, tend = int(tblock[0]), int(tblock[1])
         qblock = align.query_blocks[i]
         qfirst, qlast = int(qblock[0]), int(qblock[1])

         if align.query_strand == '+':
             qseq = query_seq[qfirst-1:qlast]
         else:
             qseq = tools.reverse_complement(query_seq[qlast-1:qfirst])
         tseq = self.refseq.GetSequence(align.target, tstart, tend)

         mismatches = self.find_mismatches(qseq, tseq)

         for pos, change in mismatches.items():
             tpos = tstart + pos
             # query coordinates decrease along the block when it is
             # stored in descending order
             if qfirst < qlast:
                 qpos = qfirst + pos
             else:
                 qpos = qfirst - pos
             snv = SNV('psl', 'snv', align.target, tpos, tpos, change[0], align.query_strand, align.query, qpos, qpos, change[1])
             snvs.append(snv)

     return snvs
Example #10
0
    def gap_snv(self, align, splice_motifs, query_seq, cutoff=None):
        """Identifies insertions, deletions, inversion from gapped alignments

        Every gap between two adjacent alignment blocks is examined.  For
        'transcriptome' samples a gap whose splice site passes
        self.match_intron() is skipped; for other sample types every gap
        is examined.  A gap present only in the target becomes a 'del', a
        gap present only in the query becomes an 'ins', and anything else is
        treated as an 'indel' that is further resolved into an inversion, a
        single-base 'snv', or a separate 'del' + 'ins' pair.

        align         -- alignment exposing target, query, blocks,
                         query_blocks, query_strand and splice_sites
        splice_motifs -- passed through to self.match_intron()
        query_seq     -- full query (contig) sequence
        cutoff        -- when truthy, events whose size exceeds it are skipped

        Returns a list of SNV objects.  Terminates the process with exit
        status 100 when a non-empty target gap yields no reference sequence.
        """
        if self.debug:
            print align.target, align.blocks
            print align.query, align.query_blocks
            print align.splice_sites
            
        snvs = []
        # cannot identify indels without splice site information
        if self.sample_type == 'transcriptome' and not align.splice_sites:
            return snvs
            
        # i indexes the gap between block i and block i+1
        for i in range(len(align.blocks)-1):
            if self.sample_type != 'transcriptome' or not self.match_intron(align.splice_sites[i], splice_motifs):
                # query bases strictly between the two blocks
                if align.query_strand == '+':
                    qstart = align.query_blocks[i][1]+1
                    qend = align.query_blocks[i+1][0]-1
                    query = query_seq[qstart-1:qend]
                else:
                    # on the other strand the gap is bounded by the next
                    # block's first coordinate and this block's second one,
                    # and the gap sequence is reverse-complemented
                    qend = align.query_blocks[i][1]-1
                    qstart = align.query_blocks[i+1][0]+1
                    query = query_seq[qstart-1:qend]
                    query = tools.reverse_complement(query)
                    
                # target strand always + from psl
                tstart = align.blocks[i][1]+1
                tend = align.blocks[i+1][0]-1
                target = ''
                if tstart <= tend:
                    target = self.refseq.GetSequence(align.target, tstart, tend)
                
                #if code cannot extract sequence from reference, there must be a disagreement between alignment and reference - abort analysis
                if tend > tstart-1 and len(target) < 1:
                    sys.stderr.write("cannot extract reference sequence, abort: %s %s %s\n" % (align.target, tstart-1, tend))
                    sys.exit(100)

                snv_type = None               
                if qstart > qend and (tend - tstart) >= 0:
                    # no query bases in the gap but target bases are present
                    size = tend - tstart + 1
                    # collapse the query coordinates onto a single position
                    if align.query_strand == '+':
                        qstart = qend
                    else:
                        qend = qstart
                    snv_type = "del"                    
                elif tstart > tend and (qend - qstart) >= 0:
                    # no target bases in the gap but query bases are present
                    size = qend - qstart + 1
                    tstart = tend
                    snv_type = "ins"                    
                else:
                    # size is capped at 1 here, so it only tells whether the
                    # target gap is non-empty (it is <= 0 otherwise)
                    size = min(1, tend - tstart + 1)
                    snv_type = "indel"
                    
                # skip if 0 or negative size event detected (or larger than cutoff)
                if size <= 0 or (cutoff and size > cutoff):
                    continue

                target = target.lower()
                query = query.lower()
                # would not report event with non-AGCT characters
                if not re.search('[^agtcATGC]', target) and not re.search('[^agtcATGC]', query):
                    if snv_type != 'indel':
                        snv = SNV('psl', snv_type, align.target, tstart, tend, target, align.query_strand, align.query, qstart, qend, query)
                        snvs.append(snv)
                    # resolves indels
                    else:
                        # equal-length gaps whose query is the reverse or the
                        # reverse complement of the target
                        if len(query) == len(target) and\
                           (query[::-1].lower() == target.lower() or tools.reverse_complement(query).lower() == target.lower()):
                            # inversion must be longer than 1 base
                            if len(query) > 1:
                                snv = SNV('psl', 'inv', align.target, tstart, tend, target, align.query_strand, align.query, qstart, qend, query)
                                snvs.append(snv)
                            # 1 bp gap in both query and target == snv
                            else:
                                snv = SNV('psl', 'snv', align.target, tstart, tend, target, align.query_strand, align.query, qstart, qend, query)
                                snvs.append(snv)
                        # breaks up indel into ins and del
                        else:
                            if align.query_strand == '+':
                                qcoord = qstart
                            else:
                                qcoord = qend
                            
                            # deletion: full target span, single query position
                            snv = SNV('psl', 'del', align.target, tstart, tend, target, align.query_strand, align.query, qcoord, qcoord, query)
                            snvs.append(snv)
                            
                            # insertion: single target position, full query span
                            tcoord = tstart
                            snv = SNV('psl', 'ins', align.target, tcoord, tcoord, target, align.query_strand, align.query, qstart, qend, query)
                            snvs.append(snv)
                                            
        return snvs
Example #11
0
 def match_intron(self, ss, splice_motifs):
     """Determines whether a splice site corresponds to an intron.

     Returns True when the lower-cased splice site, or the lower-cased
     reverse complement of it, is a key of splice_motifs; False otherwise,
     including when ss is empty or None.
     """
     if not ss:
         return False

     return ss.lower() in splice_motifs or tools.reverse_complement(ss).lower() in splice_motifs
Example #12
0
    def expand_contig_region(self, contig_sequence, query_strand):
        """Expand read-support checking region if repeats are involved.

        The inserted bases (self.var_seq, for 'ins', 'dup', 'ITD', 'PTD') or
        the deleted bases (self.ref_seq, for 'del') are used as a repeat
        unit.  The contig is scanned away from the event in both directions
        for consecutive bases that continue the unit, and
        self.confirm_contig_region is widened by self.snv_len for every
        complete copy found.  When the region changed, self.expansion is set
        and a message is written to stderr.

        Args:
            contig_sequence: contig sequence; self.var_start / self.var_end
                are positions within it.
            query_strand: '+' scans the contig as is; any other value scans
                in the opposite direction and reverse-complements each base.

        Returns:
            None in all cases.  Nothing is modified when the event type is
            not handled, the unit is empty, or the unit is longer than the
            contig.
        """
        insertion_types = ('ins', 'dup', 'ITD', 'PTD')
        is_deletion = self.snv_type == 'del'
        if not is_deletion and self.snv_type not in insertion_types:
            return None

        if is_deletion:
            seq = self.ref_seq[:]
        else:
            seq = self.var_seq[:]

        # skip if deleted/inserted sequence is empty or longer than contig sequence
        if len(seq) > len(contig_sequence) or len(seq) == 0:
            return None

        # a single base is trivially a homopolymer; test that first so the
        # helper is only consulted for longer units
        is_homopolymer = len(seq) == 1 or tools.is_homopolymer(seq)

        # keep a record of previous value for reporting expansion
        region_before = self.confirm_contig_region[:]

        # arbitrary big number
        limit = 100000

        var_start = int(self.var_start)
        var_end = int(self.var_end)
        on_plus_strand = query_strand == '+'
        contig_length = len(contig_sequence)

        def count_repeat_bases(unit, origin, step):
            """Count consecutive contig bases, walking from index origin in
            direction step, that keep repeating unit."""
            matched = 0
            for i in range(limit):
                if is_homopolymer:
                    expected = unit[0].upper()
                else:
                    expected = unit[i % len(unit)].upper()

                index = origin + step * i
                observed = None
                if 0 <= index < contig_length:
                    observed = contig_sequence[index]
                    if not on_plus_strand:
                        observed = tools.reverse_complement(observed)
                    observed = observed.upper()

                if expected != observed:
                    break
                matched += 1
            return matched

        # forward: bases following the event
        if on_plus_strand:
            origin, step = var_end, 1
        elif is_deletion:
            origin, step = var_end - 2, -1
        else:
            origin, step = var_start - 2, -1

        # only complete copies of the unit widen the region
        multiples = count_repeat_bases(seq, origin, step) // len(seq)
        if multiples > 0:
            if on_plus_strand:
                self.confirm_contig_region[1] += multiples * self.snv_len
            else:
                self.confirm_contig_region[0] -= multiples * self.snv_len

        # reverse: bases preceding the event, matched against the reversed unit
        seq = seq[::-1]
        if on_plus_strand:
            if is_deletion:
                origin, step = var_start - 1, -1
            else:
                origin, step = var_start - 2, -1
        else:
            if is_deletion:
                origin, step = var_start - 1, 1
            else:
                origin, step = var_end, 1

        multiples = count_repeat_bases(seq, origin, step) // len(seq)
        if multiples > 0:
            if on_plus_strand:
                self.confirm_contig_region[0] -= multiples * self.snv_len
            else:
                self.confirm_contig_region[1] += multiples * self.snv_len

        # coordinate given in 1-based
        expanded_sequence = contig_sequence[self.confirm_contig_region[0]-1:self.confirm_contig_region[1]]

        if region_before[0] != self.confirm_contig_region[0] or region_before[1] != self.confirm_contig_region[1]:
            self.expansion = (self.confirm_contig_region[1] - self.confirm_contig_region[0] + 1) // self.snv_len
            sys.stderr.write("expand confirm contig region %s %s -> %s %s %s %sx\n" % (self.var, region_before, self.confirm_contig_region, expanded_sequence, len(expanded_sequence), self.expansion))
0
    def correct_neighbor_gaps(self, splice_motifs, refseq):
        """Repair neighboring non-canonical gaps with fix_neighbor_gaps().

        A junction is a candidate when its splice site is set, is not a
        known motif on either strand, and the query blocks on both sides are
        adjacent (coordinates differ by exactly 1).  For each candidate the
        triple of blocks starting at the junction is tried first; if that
        fails, the triple ending at the junction is tried.  Accepted
        replacements are applied from the back of the alignment to the
        front, and the result is committed to self.blocks,
        self.query_blocks and self.splice_sites only when
        self.check_corrections() approves it.  self.mismatch is set to 1 on
        commit if it was empty or 0.

        Returns None.
        """
        # junction index -> 0 (not fixed yet) or 1 (fixed)
        gaps = {}

        for i in range(len(self.blocks)-1):
            ss = self.splice_sites[i]

            if ss and ss not in splice_motifs and tools.reverse_complement(ss).lower() not in splice_motifs:
                if abs(self.query_blocks[i+1][0] - self.query_blocks[i][1]) == 1:
                    gaps[i] = 0

        if gaps:
            target_blocks = self.blocks[:]
            query_blocks = self.query_blocks[:]
            splice_sites = self.splice_sites[:]

            gap_indices = sorted(gaps)

            # fix by moving exon and then shuffle
            replaced = {}
            replaced_ordered = []
            for i1 in gap_indices:
                i2 = i1 + 1
                i0 = i1 - 1

                # already fixed as part of an earlier triple
                if gaps[i1] == 1:
                    continue

                # forward: blocks i1, i2, i2+1 (block i2 is the middle one)
                if i2 + 1 < len(target_blocks):
                    tblock1 = target_blocks[i1][:]
                    tblock2 = target_blocks[i2][:]
                    tblock3 = target_blocks[i2+1][:]
                    qblock1 = query_blocks[i1][:]
                    qblock2 = query_blocks[i2][:]
                    qblock3 = query_blocks[i2+1][:]
                    splice_site, new_block = self.fix_neighbor_gaps(tblock1, tblock2, tblock3, qblock1, qblock2, qblock3, splice_motifs, refseq, self.query_strand)
                    if splice_site:
                        idx = ' '.join((str(i1), str(i2), str(i2+1)))
                        # a 6-tuple carries a replacement middle block, a
                        # 5-tuple means the middle block is removed
                        if new_block:
                            replaced[idx] = tblock1, tblock3, qblock1, qblock3, splice_site, new_block
                        else:
                            replaced[idx] = tblock1, tblock3, qblock1, qblock3, splice_site

                        gaps[i1] = 1
                        if i2 in gaps:
                            gaps[i2] = 1
                        replaced_ordered.append(idx)

                # if not fixed, try backward: blocks i0, i1, i2
                if gaps[i1] == 0 and i0 >= 0 and i2 <= len(target_blocks) and (i0 not in gaps or gaps[i0] == 0):
                    tblock1 = target_blocks[i0][:]
                    tblock2 = target_blocks[i1][:]
                    tblock3 = target_blocks[i2][:]
                    qblock1 = query_blocks[i0][:]
                    qblock2 = query_blocks[i1][:]
                    qblock3 = query_blocks[i2][:]
                    splice_site, new_block = self.fix_neighbor_gaps(tblock1, tblock2, tblock3, qblock1, qblock2, qblock3, splice_motifs, refseq, self.query_strand)
                    if splice_site:
                        idx = ' '.join((str(i0), str(i1), str(i2)))
                        if new_block:
                            replaced[idx] = tblock1, tblock3, qblock1, qblock3, splice_site, new_block
                        else:
                            replaced[idx] = tblock1, tblock3, qblock1, qblock3, splice_site

                        gaps[i1] = 1
                        if i0 in gaps:
                            gaps[i0] = 1
                        replaced_ordered.append(idx)

            # make sure delete from back to front
            replaced_ordered.reverse()
            for indices in replaced_ordered:
                new_blocks = replaced[indices]

                # NOTE(review): gaps only ever holds 0 or 1 in this method,
                # so this guard never rejects a replacement as written
                ok = True
                for index in indices.split(' '):
                    if int(index) in gaps and gaps[int(index)] > 1:
                        ok = False
                        break

                if ok:
                    idx = [int(index) for index in indices.split(' ')]
                    sys.stderr.write("Type3 %s changed blocks %s %s to %s %s\n" % (self.query, self.target, self.blocks[idx[0]], new_blocks[0], new_blocks[4]))
                    sys.stderr.write("Type3 %s changed blocks %s %s to %s %s\n" % (self.query, self.target, self.blocks[idx[2]], new_blocks[1], new_blocks[4]))
                    if len(new_blocks) == 5:
                        sys.stderr.write("Type3 %s removed block %s %s\n" % (self.query, self.target, self.blocks[idx[1]]))
                    else:
                        sys.stderr.write("Type3 %s changed blocks %s %s to %s\n" % (self.query, self.target, self.blocks[idx[1]], new_blocks[-1]['target']))
                    target_blocks[idx[0]] = new_blocks[0]
                    target_blocks[idx[2]] = new_blocks[1]
                    query_blocks[idx[0]] = new_blocks[2]
                    query_blocks[idx[2]] = new_blocks[3]

                    if len(new_blocks) == 5:
                        del target_blocks[idx[1]]
                        del query_blocks[idx[1]]
                        del splice_sites[idx[1]]
                        splice_sites[idx[0]] = new_blocks[4]
                    else:
                        target_blocks[idx[1]] = new_blocks[-1]['target']
                        query_blocks[idx[1]] = new_blocks[-1]['query']

                        # the junction nearer to the new middle block is
                        # marked 'NA'; the other one receives the motif
                        if target_blocks[idx[1]][0] - target_blocks[idx[0]][1] < target_blocks[idx[2]][0] - target_blocks[idx[1]][0]:
                            splice_sites[idx[0]] = 'NA'
                            splice_sites[idx[1]] = new_blocks[4]
                        else:
                            splice_sites[idx[0]] = new_blocks[4]
                            splice_sites[idx[1]] = 'NA'

            if target_blocks != self.blocks and self.check_corrections(query_blocks, target_blocks, self.query_strand, '+', self.query):
                self.blocks = target_blocks[:]
                self.query_blocks = query_blocks[:]
                self.splice_sites = splice_sites

                if not self.mismatch or int(self.mismatch) == 0:
                    self.mismatch = 1
Example #14
0
    def correct_single_gaps(self, splice_motifs, refseq):
        """Repair individual non-canonical gaps with fix_single_gap().

        Every junction whose splice site is set but is not a known motif on
        either strand is handed to fix_single_gap(), together with the query
        positions lying strictly between the two flanking query blocks.
        Successful fixes replace the flanking blocks and the splice site;
        any extra block returned by fix_single_gap() is inserted afterwards,
        working from the back of the alignment to the front.  The result is
        committed to self.blocks, self.query_blocks and self.splice_sites
        only when self.check_corrections() approves it.  self.mismatch is
        set to 1 on commit if it was empty or 0.

        Returns None.
        """
        # junction index -> unaligned query positions between its blocks
        gaps = {}

        for i in range(len(self.blocks)-1):
            ss = self.splice_sites[i]
            if ss and ss not in splice_motifs and tools.reverse_complement(ss).lower() not in splice_motifs:
                if self.query_blocks[i][1] < self.query_blocks[i+1][0]:
                    gaps[i] = list(range(self.query_blocks[i][1]+1, self.query_blocks[i+1][0]))
                else:
                    # descending query coordinates
                    gaps[i] = list(range(self.query_blocks[i][1]-1, self.query_blocks[i+1][0], -1))

        if gaps:
            target_blocks = self.blocks[:]
            query_blocks = self.query_blocks[:]
            splice_sites = self.splice_sites[:]

            changed_blocks = {}

            for i in sorted(gaps):
                # work on copies; fix_single_gap() receives these lists and
                # they are written back only after it reports success
                tblock1 = target_blocks[i][:]
                tblock2 = target_blocks[i+1][:]
                qblock1 = query_blocks[i][:]
                qblock2 = query_blocks[i+1][:]

                splice_site, new_block = self.fix_single_gap(tblock1, tblock2, qblock1, qblock2, splice_motifs, refseq, self.query_strand, gaps[i])

                if splice_site:
                    sys.stderr.write("Type2a %s changed blocks %s %s to %s %s\n" % (self.query, self.target, target_blocks[i], tblock1, splice_site))
                    sys.stderr.write("Type2a %s changed blocks %s %s to %s %s\n" % (self.query, self.target, target_blocks[i+1], tblock2, splice_site))
                    target_blocks[i] = tblock1
                    target_blocks[i+1] = tblock2
                    query_blocks[i] = qblock1
                    query_blocks[i+1] = qblock2
                    splice_sites[i] = splice_site

                    if new_block:
                        changed_blocks[i] = new_block

            if changed_blocks:
                # insert from back to front so earlier indices stay valid
                for i in sorted(changed_blocks, reverse=True):
                    # NOTE(review): nothing in this method stores -1, so the
                    # else branch below is not reached as written
                    if changed_blocks[i] != -1:
                        sys.stderr.write("Type2b %s add block %s %s %s %s\n" % (self.query, self.target, changed_blocks[i]['target'], changed_blocks[i]['query'], 'NA'))
                        # the junction on the side nearer to the new block
                        # gets the 'NA' splice site
                        if abs(target_blocks[i][1] - changed_blocks[i]['target'][0]) < abs(target_blocks[i+1][0] - changed_blocks[i]['target'][1]):
                            splice_sites.insert(i, 'NA')
                        else:
                            splice_sites.insert(i+1, 'NA')

                        query_blocks.insert(i+1, changed_blocks[i]['query'])
                        target_blocks.insert(i+1, changed_blocks[i]['target'])
                    else:
                        del query_blocks[i]

            if target_blocks != self.blocks and self.check_corrections(query_blocks, target_blocks, self.query_strand, '+', self.query):
                self.blocks = target_blocks[:]
                self.query_blocks = query_blocks[:]
                self.splice_sites = splice_sites
                if not self.mismatch or int(self.mismatch) == 0:
                    self.mismatch = 1