Beispiel #1
0
 def localize_feature(self, coord, txt):
     """Name the transcript feature that contains a single coordinate.

     Introns are tested first; they are numbered in transcript order and
     tagged with "(splice-donor)" or "(splice-acceptor)" when the coordinate
     falls on the first or last two bases of the intron.  Exons are tested
     next, and a coordinate found in neither is reported as '5utr' or '3utr'
     depending on the strand and its position relative to the CDS bounds.
     """
     def contains(span):
         # single-base interval test of ``coord`` against ``span``
         return subsume([coord, coord], span)

     n_exons = len(txt.exons)
     forward = txt.strand == '+'

     # introns take precedence over exons
     for left in range(n_exons - 1):
         first = int(txt.exons[left][1]) + 1
         last = int(txt.exons[left + 1][0]) - 1
         if not contains([first, last]):
             continue

         # exons are stored in genomic order, so numbering flips on '-'
         ordinal = left + 1 if forward else n_exons - left - 1
         label = 'intron' + str(ordinal)

         if contains([first, first + 1]):
             label += "(%s)" % ('splice-donor' if forward else 'splice-acceptor')
         elif contains([last - 1, last]):
             label += "(%s)" % ('splice-acceptor' if forward else 'splice-donor')
         return label

     # then exons
     for idx, exon in enumerate(txt.exons):
         if contains(exon):
             return 'exon' + str(idx + 1 if forward else n_exons - idx)

     # neither intron nor exon: classify as UTR by side of the CDS
     position = int(coord)
     if (position < int(txt.cdsStart) and forward) or (position > int(txt.cdsEnd) and txt.strand == '-'):
         return '5utr'
     return '3utr'
Beispiel #2
0
    def construct_cdna(self, coord, txt, refseq, variant=None, change=None, exons=None):
        """Construct the coding sequence of a transcript after a splicing event.

        Only exons of a 'CODING' transcript that overlap the CDS contribute;
        each is trimmed to the CDS bounds, fetched from ``refseq`` and then
        modified according to ``change``.

        Args:
            coord: sequence of (name passed to ``refseq.GetSequence``, start,
                end) describing the event span.
            txt: transcript exposing ``exons``, ``cdsStart``, ``cdsEnd`` and
                ``coding_type()``.
            refseq: object exposing ``GetSequence(name, start, end)``.
            variant: sequence used by 'novel_exon' (appended to the exon) and
                'novel_utr' (replaces the exon).
            change: event type, compared case-insensitively: retained_intron,
                novel_exon, skipped_exon, novel_intron, AS5/AS3/AS53, novel_utr.
            exons: 1-based exon numbers affected by the event; required
                whenever ``change`` is given.

        Returns:
            The concatenated (modified) coding exon sequences.
        """
        pieces = []
        for i in range(len(txt.exons)):
            if txt.coding_type() != 'CODING' or not overlap(txt.exons[i], [txt.cdsStart, txt.cdsEnd]):
                continue

            # clip the exon to the CDS; the +1 presumably converts a 0-based
            # cdsStart to the 1-based coordinates GetSequence expects -- TODO confirm
            if subsume([txt.cdsStart, txt.cdsStart], txt.exons[i]):
                start = int(txt.cdsStart) + 1
            else:
                start = int(txt.exons[i][0])

            if subsume([txt.cdsEnd, txt.cdsEnd], txt.exons[i]):
                end = int(txt.cdsEnd)
            else:
                end = int(txt.exons[i][1])

            exon = refseq.GetSequence(coord[0], start, end)
            if change:
                event = change.lower()
                exon_number = i + 1

                if event == 'skipped_exon':
                    # every listed exon is dropped, not just the first one
                    if exon_number in exons:
                        exon = ''

                elif exon_number == exons[0]:
                    if event == 'retained_intron':
                        # append the intron that follows this exon; coerce the
                        # next exon's start like every other coordinate here
                        intron_end = int(txt.exons[i + 1][0]) - 1
                        exon += refseq.GetSequence(coord[0], end + 1, intron_end)

                    elif event == 'novel_exon':
                        exon += variant

                    elif event == 'novel_intron':
                        # cut the event span (inclusive) out of the exon
                        first = int(coord[1]) - start
                        last = int(coord[2]) - start
                        exon = exon[:first] + exon[last + 1:]

                    elif event in ('as5', 'as3', 'as53'):
                        # alternative splice site: the event span is the new exon
                        exon = refseq.GetSequence(coord[0], int(coord[1]), int(coord[2]))

                    elif event == 'novel_utr':
                        exon = variant

            pieces.append(exon)

        return "".join(pieces)
Beispiel #3
0
def find_overlaps(test, repeat_overlaps):
    """Report which annotated repeats a region hits, per repeat track.

    For the 'simple_repeats' and 'segdup' tracks a repeat counts only when
    ``subsume(region, repeat)`` holds; for 'rmsk' any ``overlap`` counts.
    Other tracks are still queried but always yield an empty result.

    Args:
        test: mapping with 'chrom', 'start' and 'end' keys.
        repeat_overlaps: mapping of track name to an object exposing
            ``overlap(chrom, start, end, parse_line=...)`` that yields
            mappings with 'start', 'end' and 'type' keys.

    Returns:
        ``{track name: {repeat type: True}}`` with one entry per track.
    """
    overlaps = {}
    # .items() instead of .iteritems() so this runs on Python 2 and 3 alike
    for repeat_type, repeat_overlap in repeat_overlaps.items():
        hits = overlaps[repeat_type] = {}
        repeats = repeat_overlap.overlap(test['chrom'], test['start'], test['end'], parse_line=parse_line)

        if repeat_type in ('simple_repeats', 'segdup'):
            is_hit = subsume
        elif repeat_type == 'rmsk':
            is_hit = overlap
        else:
            continue

        for repeat in repeats:
            if is_hit([test['start'], test['end']], [repeat['start'], repeat['end']]):
                hits[repeat['type']] = True

    return overlaps
Beispiel #4
0
    def construct_cdna(self, coord, txt, refseq, variant=None, change=None):
        """Construct the coding sequence of a transcript carrying a variant.

        Each exon overlapping the CDS is trimmed to the CDS bounds, fetched
        from ``refseq`` and modified according to ``change``.

        Args:
            coord: sequence of (name passed to ``refseq.GetSequence``, start,
                end) giving the variant span.
            txt: transcript exposing ``exons``, ``cdsStart`` and ``cdsEnd``.
            refseq: object exposing ``GetSequence(name, start, end)``.
            variant: replacement/inserted sequence for 'snv' and the
                insertion-like changes.
            change: variant type, compared case-insensitively: 'snv', 'ins',
                'dup', 'ITD', 'PTD' or 'del'.

        Returns:
            The concatenated (modified) coding exon sequences.
        """
        pieces = []
        # True while walking exons that lie inside a multi-exon deletion
        remove = False
        for i in range(len(txt.exons)):
            if not overlap(txt.exons[i], [txt.cdsStart, txt.cdsEnd]):
                continue

            # extract the reference exon sequence, clipped to the CDS; the +1
            # presumably converts a 0-based cdsStart to 1-based -- TODO confirm
            if subsume([txt.cdsStart, txt.cdsStart], txt.exons[i]):
                start = int(txt.cdsStart) + 1
            else:
                start = int(txt.exons[i][0])
            if subsume([txt.cdsEnd, txt.cdsEnd], txt.exons[i]):
                end = int(txt.cdsEnd)
            else:
                end = int(txt.exons[i][1])
            exon = refseq.GetSequence(coord[0], start, end)

            # modify the exon sequence based on the variant
            if change:
                kind = change.lower()
                span = [start, end]

                if kind == 'snv' and subsume(coord[1:], span):
                    first = int(coord[1]) - start
                    last = int(coord[2]) - start
                    exon = exon[:first] + variant + exon[last + 1:]

                # compared in lower case: the original listed 'ITD'/'PTD',
                # which a lower-cased string could never equal
                elif kind in ('ins', 'dup', 'itd', 'ptd') and subsume(coord[1:], span):
                    insert_at = int(coord[1]) - start + 1
                    exon = exon[:insert_at] + variant + exon[insert_at:]

                elif kind == 'del':
                    if subsume(coord[1:], span):
                        # deletion entirely inside this exon
                        first = int(coord[1]) - start
                        last = int(coord[2]) - start
                        exon = exon[:first] + exon[last + 1:]

                    elif subsume([coord[1], coord[1]], span):
                        # deletion starts here and runs past the exon end
                        exon = exon[:int(coord[1]) - start]
                        remove = True

                    elif subsume([coord[2], coord[2]], span):
                        # deletion ends here after starting further upstream
                        exon = exon[int(coord[2]) - start + 1:]
                        remove = False

                    else:
                        endpoint_in_upstream_intron = False
                        if i > 0:
                            upstream_intron = [int(txt.exons[i - 1][1]) + 1, int(txt.exons[i][0]) - 1]
                            endpoint_in_upstream_intron = (
                                subsume([coord[1], coord[1]], upstream_intron)
                                or subsume([coord[2], coord[2]], upstream_intron)
                            )

                        if endpoint_in_upstream_intron:
                            # a deletion endpoint in the preceding intron
                            # toggles removal; this exon is dropped only when
                            # removal starts here
                            if not remove:
                                remove = True
                                exon = ''
                            else:
                                remove = False

                        elif remove:
                            exon = ''

            pieces.append(exon)

        return "".join(pieces)
Beispiel #5
0
    def is_read_through(self, txts, mm):
        """Flag this event as a read-through into a neighbouring transcript.

        A partner transcript must share strand and model with ``self.txt``,
        differ from it in both name and alias, overlap the contig alignment
        span without overlapping ``self.txt`` itself, and not overlap the
        block last matched to ``self.txt``.  Every alignment block must then
        match one of the partner's exons with flush boundaries (a single
        boundary suffices for the last block), no other transcript may lie
        within the gap between the two transcripts, and the two aliases must
        be strings that are not of the same family.

        Args:
            txts: candidate transcripts.
            mm: matcher exposing ``match_exons(...)``; its result provides
                ``matched_blocks`` and ``matched_exons``, used here as
                1-based indices.

        Returns:
            None.  On success ``self.event_type`` is set to 'read-through'
            and ``self.txt2`` to the partner transcript.
        """
        # only the matched block is needed here
        last_matched_block, _ = self.last_matched()
        for txt2 in txts:
            if txt2.strand != self.txt.strand:
                continue

            if txt2.model != self.txt.model:
                continue

            if txt2.name == self.txt.name or txt2.alias == self.txt.alias:
                continue

            txt2_span = [txt2.txStart, txt2.txEnd]
            if not overlap([self.align_coords[0][0], self.align_coords[-1][1]], txt2_span) or \
               overlap([self.txt.txStart, self.txt.txEnd], [txt2.txStart, txt2.txEnd]):
                continue

            if overlap(last_matched_block, [txt2.txStart, txt2.txEnd]):
                continue

            result = mm.match_exons(self.contig, txt2.full_name(), self.align_coords, txt2.exons, txt2.chrom, strand=txt2.strand)
            if not result or len(result.matched_blocks) != len(self.align_blocks):
                continue

            self_is_left = self.txt.txStart < txt2.txStart
            terminal = len(self.align_blocks) - 1
            exon_bounds_matched = True
            for i in range(len(result.matched_blocks)):
                block = self.align_coords[result.matched_blocks[i] - 1]
                exon = txt2.exons[result.matched_exons[i] - 1]
                if i == terminal:
                    # only 1 boundary has to be flush if it's the terminal block
                    edge = 0 if self_is_left else 1
                    flush = block[edge] == exon[edge]
                else:
                    # both boundaries have to be flush otherwise
                    flush = block[0] == exon[0] and block[1] == exon[1]
                if not flush:
                    exon_bounds_matched = False
                    break

            if not exon_bounds_matched:
                continue

            # genomic gap separating the two transcripts
            if self_is_left:
                txt_span = [int(self.txt.txEnd) + 1, int(txt2.txStart) - 1]
            else:
                txt_span = [int(txt2.txEnd) + 1, int(self.txt.txStart) - 1]

            # make sure there are no transcripts between the 1st and 2nd
            # transcripts; the verdict is taken only after ALL transcripts
            # have been scanned (it used to be taken inside the scan loop,
            # which made the flag ineffective)
            has_txt_between = False
            for t in txts:
                if t.name == self.txt.name or t.name == txt2.name:
                    continue

                if subsume([t.txStart, t.txEnd], txt_span):
                    has_txt_between = True
                    break

            if has_txt_between:
                continue

            if self.txt.alias and txt2.alias and isinstance(self.txt.alias, str) and isinstance(txt2.alias, str):
                if not Transcript.same_family(self.txt.alias, txt2.alias):
                    self.event_type = 'read-through'
                    self.txt2 = txt2
Beispiel #6
0
    def set_novelty(self, txts, matches=None):
        """Decide whether this event is novel relative to the given transcripts.

        ``self.novel`` is updated in place.  For 'novel_exon', 'AS53' and
        'novel_utr' events, alignment blocks already explained by an exon are
        also removed from ``self.align_blocks`` and ``self.align_coords``.
        For 'retained_intron' events, the parts of each retained intron not
        covered by known exons are returned as new events.

        Args:
            txts: transcripts to compare against; each exposes ``exons`` (a
                list of [start, end] pairs) and ``full_name()``.
            matches: unused; kept for interface compatibility.

        Returns:
            A list of event dicts (only ever non-empty for 'retained_intron').
        """
        novel_events = []

        if self.event_type in ("novel_exon", "AS53", "novel_utr"):
            for txt in txts:
                blocks_to_delete = []

                for b in range(len(self.align_blocks)):
                    for exon in txt.exons:
                        block = self.align_coords[b]
                        known = subsume(block, exon)
                        # novel utr - requires just one edge to align
                        if not known and self.event_type == 'novel_utr':
                            known = (int(block[0]) == int(exon[0])
                                     or int(block[1]) == int(exon[1]))
                        if known:
                            blocks_to_delete.append(b)
                            break

                # delete from the back so earlier indices stay valid
                for b in sorted(blocks_to_delete, reverse=True):
                    del self.align_blocks[b]
                    del self.align_coords[b]

            if not self.align_blocks:
                self.novel = False

        elif self.event_type == 'read-through':
            if len(self.align_coords) == 1:
                first_block = self.align_blocks[0]
                start = self.align.blocks[first_block - 2][1]
                end = self.align.blocks[first_block - 1][0]
                # see if any single transcript contains the exon junction
                for txt in txts:
                    for i in range(len(txt.exons) - 1):
                        if int(txt.exons[i][1]) == start and int(txt.exons[i + 1][0]) == end:
                            self.novel = False
                            break

                    if not self.novel:
                        break

        elif self.event_type == "retained_intron":
            # an event spanning more than two consecutive exons is never
            # reported as novel as a whole
            multi = int(self.exons[-1]) - int(self.exons[0]) > 1
            if multi:
                self.novel = False

            for i in range(len(self.exon_coords) - 1):
                retained_intron = [int(self.exon_coords[i][1]) + 1, int(self.exon_coords[i + 1][0]) - 1]

                # known exons, per transcript, that coincide with the intron
                middle_exons = {}
                for txt in txts:
                    exons_txt = []
                    last_index = len(txt.exons) - 1
                    for j, exon in enumerate(txt.exons):
                        if j == 0 or j == last_index:
                            # terminal exon, require subsume
                            if subsume(retained_intron, exon):
                                exons_txt.append(exon)
                        # middle exons, require just overlap
                        elif overlap(exon, (retained_intron[0], retained_intron[1])):
                            exons_txt.append(exon)

                    if exons_txt:
                        middle_exons[txt] = exons_txt

                # only time when original event is novel WITHOUT testing is
                # when it's a single ri and it is clear of overlapping exons
                if not middle_exons and not multi:
                    self.novel = True
                    continue

                self.novel = False

                # subtract overlapping exons
                if middle_exons:
                    true_retained_intron = subtract(retained_intron, list(middle_exons.values()))
                else:
                    true_retained_intron = [retained_intron]

                # nothing of the intron left after subtraction
                if not true_retained_intron:
                    continue

                # create new events
                for ri in true_retained_intron:
                    event = {'contig': self.contig,
                             'chrom': self.chrom,
                             'align_blocks': self.align_blocks,
                             'align_coords': self.align_coords,
                             'type': self.event_type,
                             'novel': True,
                             }

                    # group transcripts by the coordinates of the exon pair
                    # that exactly flanks this piece of intron
                    flanks = {}
                    for txt in txts:
                        for e in range(len(txt.exons) - 1):
                            left = txt.exons[e]
                            right = txt.exons[e + 1]
                            if left[1] + 1 == ri[0] and right[0] - 1 == ri[1]:
                                coord = ",".join((str(left[0]), str(left[1]), str(right[0]), str(right[1])))
                                flanks.setdefault(coord, []).append([txt, e, e + 1])

                    if not flanks:
                        continue

                    # use the most frequent flanking exon pair (first wins on ties)
                    best = max(flanks, key=lambda key: len(flanks[key]))
                    candidates = flanks[best]
                    exon_coords = best.split(',')

                    # use the originally assigned transcript if possible
                    chosen = candidates[0]
                    for candidate in candidates:
                        if candidate[0].full_name() == self.transcript:
                            chosen = candidate
                            break

                    txt, e1, e2 = chosen
                    event['transcript'] = txt.full_name()
                    event['exons'] = [e1 + 1, e2 + 1]
                    event['exon_coords'] = [exon_coords[:2], exon_coords[2:]]
                    event['txt'] = txt

                    novel_events.append(event)

        elif self.event_type == "skipped_exon":
            for txt in txts:
                for e in range(len(txt.exons) - 1):
                    intron_span = [int(txt.exons[e][1]) + 1, int(txt.exons[e + 1][0]) - 1]
                    if self.align_coords and self.exon_coords:
                        if subsume([int(self.exon_coords[0][0]), int(self.exon_coords[-1][1])], intron_span):
                            self.novel = False
                            break

                if not self.novel:
                    break

        elif self.event_type == "novel_intron":
            novel_intron_span = [int(self.align_coords[0][1]) + 1, int(self.align_coords[1][0]) - 1]
            for txt in txts:
                for e in range(len(txt.exons) - 1):
                    intron_span = [int(txt.exons[e][1]) + 1, int(txt.exons[e + 1][0]) - 1]

                    # an identical annotated intron means the event is known
                    if novel_intron_span[0] == intron_span[0] and novel_intron_span[1] == intron_span[1]:
                        self.novel = False
                        break

                if not self.novel:
                    break

        elif 'AS' in self.event_type and self.edge == 'left':
            for txt in txts:
                for exon in txt.exons:
                    if int(self.align_coords[0]) == int(exon[0]):
                        self.novel = False
                        break

                if not self.novel:
                    break

        elif 'AS' in self.event_type and self.edge == 'right':
            for txt in txts:
                for exon in txt.exons:
                    if int(self.align_coords[1]) == int(exon[1]):
                        self.novel = False
                        break

                if not self.novel:
                    break

        return novel_events