Exemple #1
0
    def last_matched(self):
        """Return the last-matched alignment block and its exon.

        Only used by is_read_through().  The block is looked up from the
        1-based index stored in self.last_matched_block.  The exon is normally
        the neighbour of the first flanking exon (self.exons[0], 1-based): the
        following exon when the flanking exon is exon 1, otherwise the
        preceding one.  When the transcript has a single exon, or the block
        does not overlap that neighbour, the flanking exon itself is returned.

        Returns:
            tuple (last_matched_block, last_matched_exon)
        """
        block = self.align.blocks[self.last_matched_block - 1]
        first_flank = self.exons[0]
        txt_exons = self.txt.exons

        # 0-based positions of the flanking exon and of its candidate neighbour
        flank_idx = first_flank - 1
        if first_flank == 1:
            neighbour_idx = flank_idx + 1
        else:
            neighbour_idx = flank_idx - 1

        if len(txt_exons) > 1 and overlap(block, txt_exons[neighbour_idx]):
            return block, txt_exons[neighbour_idx]

        # the block does not reach the neighbour: fall back to the flanking exon
        return block, txt_exons[flank_idx]
Exemple #2
0
 def del_is_intron(self, start, end, txt):
     """Tell whether the span [start, end] coincides with an intron of txt.

     Every pair of consecutive exons in txt.exons defines an intron.  The span
     is reported as an intron as soon as its overlap with one intron covers
     more than 90% of the span AND more than 90% of that intron.

     Returns:
         True on the first intron satisfying both ratios, False otherwise
         (including transcripts with fewer than two exons).
     """
     for right_idx in range(1, len(txt.exons)):
         upstream = txt.exons[right_idx - 1]
         downstream = txt.exons[right_idx]
         intron = [int(upstream[1]) + 1, int(downstream[0]) - 1]

         shared = float(overlap([start, end], intron))

         # the deletion itself must be mostly covered by the intron ...
         if not (shared / float(cardinality([start, end])) > 0.9):
             continue
         # ... and the intron mostly covered by the deletion
         if shared / float(cardinality(intron)) > 0.9:
             return True

     return False
Exemple #3
0
 def overlap(self, chrom, start, end, buffer=20000):
     """Reports copy numbers in segments overlapping the given coordinate.

     The query span is widened by `buffer` bases on both sides (the lower
     bound is clamped to 1) before being compared with the segments stored
     for the chromosome in self.segments.

     Args:
         chrom: chromosome name, converted with tools.get_chr_number().
         start, end: coordinate bounds (anything int() accepts).
         buffer: number of bases added on each side of the span.

     Returns:
         The third field of every overlapping segment joined with ',',
         or 'na' when the chromosome is unknown or nothing overlaps.
     """
     span = [max(1, int(start) - buffer), int(end) + buffer]
     chr_num = tools.get_chr_number(chrom)

     copy_numbers = []
     # membership test instead of dict.has_key(), which no longer exists in Python 3
     if chr_num in self.segments:
         copy_numbers = [
             segment[2]
             for segment in self.segments[chr_num]
             if intspan.overlap([segment[0], segment[1]], span)
         ]

     if copy_numbers:
         return ','.join(copy_numbers)
     return 'na'
Exemple #4
0
    def construct_cdna(self, coord, txt, refseq, variant=None, change=None, exons=None):
        """Build the coding cDNA of txt with a splicing event applied.

        Only exons of a 'CODING' transcript that overlap the CDS contribute.
        Each exon is clipped to the CDS boundaries, fetched through
        refseq.GetSequence() and, when `change` is given, modified according to
        the event type (compared case-insensitively):

            retained_intron  append the following intron to exon exons[0]
            novel_exon       append `variant` after exon exons[0]
            skipped_exon     drop every exon whose number is in `exons`
            novel_intron     cut coord[1]..coord[2] out of exon exons[0]
            as5/as3/as53     replace exon exons[0] by the sequence at coord
            novel_utr        replace exon exons[0] by `variant`

        Exon numbers in `exons` are 1-based.  Returns the concatenated sequence.
        """
        cdna = ""
        for idx, exon_span in enumerate(txt.exons):
            # non-coding transcripts and exons outside the CDS contribute nothing
            if not (txt.coding_type() == 'CODING' and overlap(exon_span, [txt.cdsStart, txt.cdsEnd])):
                continue

            # clip the exon to the CDS on either side
            seq_start = exon_span[0]
            if subsume([txt.cdsStart, txt.cdsStart], exon_span):
                seq_start = int(txt.cdsStart) + 1

            if subsume([txt.cdsEnd, txt.cdsEnd], exon_span):
                seq_end = int(txt.cdsEnd)
            else:
                seq_end = int(exon_span[1])

            segment = refseq.GetSequence(coord[0], int(seq_start), int(seq_end))

            if change:
                event = change.lower()
                exon_number = idx + 1

                if event == 'retained_intron' and exon_number == exons[0]:
                    segment += refseq.GetSequence(coord[0], seq_end + 1, txt.exons[idx + 1][0] - 1)

                elif event == 'novel_exon' and exon_number == exons[0]:
                    segment += variant

                elif event == 'skipped_exon' and exon_number in exons:
                    segment = ''

                elif event == 'novel_intron' and exon_number == exons[0]:
                    # offsets of the removed stretch relative to the exon sequence
                    cut_from = int(coord[1]) - int(seq_start)
                    cut_to = int(coord[2]) - int(seq_start)
                    segment = segment[:cut_from] + segment[cut_to + 1:]

                elif event in ['as5', 'as3', 'as53'] and exon_number == exons[0]:
                    segment = refseq.GetSequence(coord[0], int(coord[1]), int(coord[2]))

                elif event == 'novel_utr' and exon_number == exons[0]:
                    segment = variant

            cdna += segment

        return cdna
Exemple #5
0
def find_overlaps(test, repeat_overlaps):
    """Overlaps given coordinates with repeats.

    Simple repeats and segdups are reported only when the test span and the
    repeat satisfy subsume(); rmsk repeats are reported when they merely
    overlap().  Any other repeat type yields an empty entry.

    Args:
        test: dict with 'chrom', 'start' and 'end' keys.
        repeat_overlaps: dict mapping repeat type to an object whose
            overlap(chrom, start, end, parse_line=...) returns repeat records
            (dicts with 'start', 'end' and 'type').

    Returns:
        dict mapping each repeat type to {repeat['type']: True} for the
        repeats that matched.
    """
    overlaps = {}
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for repeat_type, repeat_overlap in repeat_overlaps.items():
        hits = overlaps[repeat_type] = {}
        repeats = repeat_overlap.overlap(test['chrom'], test['start'], test['end'], parse_line=parse_line)

        # pick the comparison that applies to this kind of repeat
        if repeat_type in ('simple_repeats', 'segdup'):
            matches = subsume
        elif repeat_type == 'rmsk':
            matches = overlap
        else:
            continue

        for repeat in repeats:
            if matches([test['start'], test['end']], [repeat['start'], repeat['end']]):
                hits[repeat['type']] = True

    return overlaps
Exemple #6
0
    def construct_cdna(self, coord, txt, refseq, variant=None, change=None):
        """Constructs the coding transcript sequence with a variant applied.

        Exons overlapping the CDS are clipped to the CDS boundaries, fetched
        with refseq.GetSequence() and concatenated.  When `change` is given
        (compared case-insensitively) the affected exon is edited:

            snv                   replace coord[1]..coord[2] by `variant`
            ins / dup / itd / ptd insert `variant` after coord[1]
            del                   remove coord[1]..coord[2]; a deletion may
                                  span several exons and introns

        Args:
            coord: [chromosome, start, end] of the variant.
            txt: transcript exposing exons, cdsStart and cdsEnd.
            refseq: object exposing GetSequence(chrom, start, end).
            variant: replacement/inserted sequence (snv and insertions).
            change: variant type, or None for the reference sequence.

        Returns:
            The (possibly modified) concatenated coding sequence.
        """
        cdna = ""
        # True while the exons being visited lie inside a deletion that
        # started upstream and has not ended yet
        remove = False
        # lower-cased once; the original compared the lower-cased value with
        # 'ITD'/'PTD', which could never match
        event = change.lower() if change else None

        for i in range(len(txt.exons)):
            if not overlap(txt.exons[i], [txt.cdsStart, txt.cdsEnd]):
                continue

            # extracts reference exon sequence, clipped to the CDS
            if subsume([txt.cdsStart, txt.cdsStart], txt.exons[i]):
                start = int(txt.cdsStart) + 1
            else:
                start = int(txt.exons[i][0])
            if subsume([txt.cdsEnd, txt.cdsEnd], txt.exons[i]):
                end = int(txt.cdsEnd)
            else:
                end = int(txt.exons[i][1])
            exon = refseq.GetSequence(coord[0], start, end)
            exon_span = [start, end]

            # modifies exon sequence based on variant
            if event == 'snv' and subsume(coord[1:], exon_span):
                first_changed = int(coord[1]) - start
                last_changed = int(coord[2]) - start
                exon = exon[:first_changed] + variant + exon[last_changed + 1:]

            elif event in ('ins', 'dup', 'itd', 'ptd') and subsume(coord[1:], exon_span):
                insert_at = int(coord[1]) - start + 1
                exon = exon[:insert_at] + variant + exon[insert_at:]

            elif event == 'del':
                if subsume(coord[1:], exon_span):
                    # deletion entirely inside this exon
                    first_deleted = int(coord[1]) - start
                    last_deleted = int(coord[2]) - start
                    exon = exon[:first_deleted] + exon[last_deleted + 1:]

                elif subsume([coord[1], coord[1]], exon_span):
                    # deletion starts here and continues downstream
                    exon = exon[:int(coord[1]) - start]
                    remove = True

                elif subsume([coord[2], coord[2]], exon_span):
                    # deletion ends here
                    exon = exon[int(coord[2]) - start + 1:]
                    remove = False

                else:
                    # does either end of the deletion fall in the intron
                    # immediately upstream of this exon?
                    in_upstream_intron = False
                    if i > 0:
                        # int() applied before subtracting so string
                        # coordinates work (was int(x - 1))
                        intron = [int(txt.exons[i - 1][1]) + 1, int(txt.exons[i][0]) - 1]
                        in_upstream_intron = (subsume([coord[1], coord[1]], intron) or
                                              subsume([coord[2], coord[2]], intron))

                    if in_upstream_intron:
                        if not remove:
                            remove = True
                            exon = ''
                        else:
                            remove = False
                    elif remove:
                        exon = ''

            cdna += exon

        return cdna
Exemple #7
0
    def get_feature(self, coord, gene_only=False, gene_strand=False, exact=False, refseq=None, variant=None, change=None, txt_obj=False, chrom=None, strand=None, all_overlaps=False):
        """Finds the gene feature(s) at a coordinate.

        `coord` is a whitespace-separated "target start end" string.  Both end
        points are examined (only one when start == end); for each, the best
        overlapping transcript of the annotation file is selected and its
        feature (as reported by self.identify_feature) recorded as
        "gene:txt:feature", or "NA:NA:NA" when nothing is found.

        Return value, in order of precedence:
        - all_overlaps: list of ("gene:txt:feature", txt) tuples for every
          transcript that produced a feature.
        - gene_only: (features[0][0], features[0][1]), i.e. the first two
          items recorded for the first end point, plus the last 'best'
          transcript when txt_obj is set.
          NOTE(review): the previous docstring promised [gene1, gene2] /
          [gene1, strand1, gene2, strand2]; the code does not appear to
          return that shape - confirm against callers.
        - otherwise: "feature1|feature2|protein_change" (or
          "feature1|protein_change" for a single end point), paired with the
          last 'best' transcript when txt_obj is set.  protein_change is only
          computed when both ends hit the same transcript, the first feature
          contains 'exon' and refseq is given; it is 'NA' otherwise.

        Other parameters:
        - exact: passed through to self.identify_feature.
        - variant, change: passed through to self.find_pep_change.
        - chrom: chromosome name used for sequence extraction instead of the
          target parsed from coord.
        - strand: when given, transcripts on a different strand are skipped.
        """
        features = []
        overlaps = []
        target, start, end = coord.split()
        # go through both end points of coordinate given
        for base in (start, end):
            # single-base coordinate: the second end point is the same base
            if len(features) == 1 and start == end:
                break

            # the 'best' candidate transcript and feature
            the_txt, the_feature = None, None

            # go through every overlapping transcript
            txt_lines = self.get_txt_lines([target, base, base])
            for line_num in txt_lines.keys():
                line = linecache.getline(self.annot_file, int(line_num))
                # pick the annotation parser by the one-letter model code
                txt = {
                    'e': ensembl.parse_line,
                    'r': refGene.parse_line,
                    'k': knownGene.parse_line,
                    'a': aceview.parse_line,
                    'x': ensg.parse_line,
                    'n': ensembl.parse_line,
                    't': ensembl.parse_line,
                    'g': ensembl.parse_line,
                    }[self.model](line)

                if not overlap([txt.txStart, txt.txEnd], [base, base]):
                    continue
                if strand and txt.strand and txt.strand != strand:
                    continue

                # note: the feature is identified for the whole (start, end)
                # span, not just the end point being examined
                feature = self.identify_feature(start, end, txt, exact=exact)
                if feature:
                    # keep all transcripts and features if 'all_overlaps' is True
                    if all_overlaps:
                        ff = ':'.join((txt.alias, txt.name, feature))
                        overlaps.append((ff, txt))

                    # once the first end point resolved to something, the
                    # second end point only accepts the transcript with the
                    # same name; the elif below is then never evaluated
                    # NOTE(review): with gene_only the first item recorded is
                    # a gene-name string, so `.name` looks like it would fail
                    # here for two-ended coordinates - confirm
                    if features and features[0][0] != None:
                        if txt.name == features[0][0].name:
                            the_txt = txt
                            the_feature = ':'.join((txt.alias, txt.name, feature))

                    # updates best candidate if
                    # - best transcript not defined
                    # - best feature not defined
                    # - new feature is exonic but best candidate isn't
                    # - current candidate is exonic but current candidate's CDS is longer
                    # - current candidate is not exonic but current candidate's CDS is longer
                    elif not the_txt or \
                         ((not the_feature or not self.is_exon(the_feature)) and self.is_exon(feature)) or\
                         (the_txt and self.is_exon(feature) and int(the_txt.cds_length()) < int(txt.cds_length())) or\
                         (the_txt and int(the_txt.cds_length()) < int(txt.cds_length())):
                        the_txt = txt
                        the_feature = ':'.join((txt.alias, txt.name, feature))

            # novel feature of known gene
            # NOTE(review): the_txt and the_feature are always assigned
            # together above, so this block (and in particular its inner
            # else) looks unreachable - confirm before relying on 'novel'
            if the_txt and not the_feature:
                if the_txt:
                    the_feature = ':'.join((the_txt.alias, the_txt.name, 'novel'))
                else:
                    the_feature = ':'.join(('NA', 'NA'))

            if the_feature:
                if gene_only:
                    if gene_strand:
                        # NOTE(review): this rebinds the `strand` parameter,
                        # so the strand filter above uses this value when the
                        # second end point is processed - confirm intended
                        if the_txt:
                            strand = the_txt.strand
                        else:
                            strand = 'NA'
                        features.append([the_feature.split(':')[0], strand])
                    else:
                        features.append([the_feature.split(':')[0]])
                else:
                    # txt_name is computed but not used afterwards in this method
                    if the_txt:
                        txt_name = the_txt.name
                    else:
                        txt_name = 'NA'

                    features.append([the_txt, the_feature])
            else:
                if gene_only:
                    if gene_strand:
                        features.append(['NA', 'NA'])
                    else:
                        features.append(['NA'])
                else:
                    features.append([None, 'NA:NA:NA'])

        if all_overlaps:
            return overlaps

        if gene_only:
            # NOTE(review): without gene_strand each entry holds a single
            # item, so features[0][1] looks like it would raise IndexError -
            # confirm how callers combine gene_only and gene_strand
            if txt_obj:
                return features[0][0], features[0][1], the_txt
            else:
                return features[0][0], features[0][1]

        else:
            pepchange = 'NA'

            # chromosome name for extracting sequence - maybe be different from target
            if chrom is not None:
                target = chrom

            # protein change is only attempted for a single end point or when
            # both end points resolved to the same transcript name
            if len(features) == 1 or (features[0][0] != None and features[1][0] != None and features[0][0].name == features[1][0].name):
                if 'exon' in features[0][1] and not gene_only and refseq:
                    pepchange = self.find_pep_change([target, start, end], features[0][0], refseq, variant, change)

            if len(features) > 1:
                feature = '|'.join([features[0][1], features[1][1], pepchange])
            else:
                feature = '|'.join([features[0][1], pepchange])

            # the_txt is the best transcript of the last end point processed
            if txt_obj:
                return feature, the_txt
            else:
                return feature
Exemple #8
0
    def is_read_through(self, txts, mm):
        """Determines if event is read-through.

        Looks for a second transcript (txt2) among `txts` that
        - is on the same strand and from the same model as self.txt, but has
          a different name and alias,
        - overlaps the alignment span without overlapping self.txt or the
          last-matched alignment block,
        - has every aligned block matched to one of its exons by
          mm.match_exons(), with flush exon boundaries (only the boundary
          facing self.txt for the terminal block),
        - has no other transcript lying between it and self.txt, and
        - does not belong to the same family as self.txt
          (Transcript.same_family on the two aliases).

        When such a transcript is found, self.event_type is set to
        'read-through' and self.txt2 to that transcript.  Nothing is returned.
        """
        last_matched_block = self.last_matched()[0]

        for txt2 in txts:
            if txt2.strand != self.txt.strand:
                continue

            if txt2.model != self.txt.model:
                continue

            if txt2.name == self.txt.name or txt2.alias == self.txt.alias:
                continue

            # the alignment must reach txt2, and txt2 must not overlap the first transcript
            if not overlap([self.align_coords[0][0], self.align_coords[-1][1]], [txt2.txStart, txt2.txEnd]) or \
               overlap([self.txt.txStart, self.txt.txEnd], [txt2.txStart, txt2.txEnd]):
                continue

            if overlap(last_matched_block, [txt2.txStart, txt2.txEnd]):
                continue

            result = mm.match_exons(self.contig, txt2.full_name(), self.align_coords, txt2.exons, txt2.chrom, strand=txt2.strand)
            if not result or len(result.matched_blocks) != len(self.align_blocks):
                continue

            txt2_downstream = self.txt.txStart < txt2.txStart

            exon_bounds_matched = True
            for i in range(len(result.matched_blocks)):
                block = self.align_coords[result.matched_blocks[i] - 1]
                exon = txt2.exons[result.matched_exons[i] - 1]

                if i == len(self.align_blocks) - 1:
                    # only 1 boundary has to be flush if it's terminal block
                    side = 0 if txt2_downstream else 1
                    flush = block[side] == exon[side]
                else:
                    # both boundaries have to be flush if it's not terminal block
                    flush = block[0] == exon[0] and block[1] == exon[1]

                if not flush:
                    exon_bounds_matched = False
                    break

            if not exon_bounds_matched:
                continue

            # region separating the two transcripts
            if txt2_downstream:
                txt_span = [int(self.txt.txEnd) + 1, int(txt2.txStart) - 1]
            else:
                txt_span = [int(txt2.txEnd) + 1, int(self.txt.txStart) - 1]

            # make sure there is no transcripts in between the 1st and 2nd transcripts
            has_txt_between = False
            for t in txts:
                if t.name == self.txt.name or t.name == txt2.name:
                    continue

                if subsume([t.txStart, t.txEnd], txt_span):
                    has_txt_between = True
                    break

            # decided only after ALL transcripts were examined; this check
            # previously sat inside the loop above, so the event was flagged
            # before later transcripts were looked at and never flagged when
            # no third transcript existed
            if has_txt_between:
                continue

            if self.txt.alias and txt2.alias and isinstance(self.txt.alias, str) and isinstance(txt2.alias, str):
                if not Transcript.same_family(self.txt.alias, txt2.alias):
                    self.event_type = 'read-through'
                    self.txt2 = txt2
Exemple #9
0
    def set_novelty(self, txts, matches=None):
        """Determines if event is novel with respect to the given transcripts.

        Updates self.novel according to self.event_type:

        - novel_exon / AS53 / novel_utr: alignment blocks explained by a known
          exon (subsume(); for novel_utr a single shared edge is enough) are
          removed from self.align_blocks / self.align_coords; the event is not
          novel when no block is left.
        - read-through (single alignment coordinate): not novel when some
          transcript has the same exon junction.
        - retained_intron: novel only for a single retained intron free of
          overlapping exons; otherwise the known exons are subtracted from the
          intron and new events are proposed for what remains.
        - skipped_exon / novel_intron: not novel when a known intron matches.
        - other AS events: not novel when a known exon shares the edge
          selected by self.edge ('left' or 'right').

        Args:
            txts: transcripts to compare against.
            matches: unused.

        Returns:
            List of dicts describing new retained-intron events (empty for
            every other event type).
        """
        novel_events = []

        if self.event_type in ("novel_exon", "AS53", "novel_utr"):
            for txt in txts:
                blocks_to_delete = []

                for b in range(len(self.align_blocks)):
                    for exon in txt.exons:
                        block = self.align_coords[b]
                        known = subsume(block, exon)
                        # novel utr - requires just one edge to align
                        if not known and self.event_type == 'novel_utr':
                            known = (int(block[0]) == int(exon[0]) or
                                     int(block[1]) == int(exon[1]))
                        if known:
                            blocks_to_delete.append(b)
                            break

                # indices were collected in ascending order; delete from the
                # back so the remaining indices stay valid
                for b in reversed(blocks_to_delete):
                    del self.align_blocks[b]
                    del self.align_coords[b]

            if not self.align_blocks:
                self.novel = False

        elif self.event_type == 'read-through':
            if len(self.align_coords) == 1:
                start = self.align.blocks[self.align_blocks[0] - 2][1]
                end = self.align.blocks[self.align_blocks[0] - 1][0]
                # see if any single transcript contains the exon junction
                for txt in txts:
                    for i in range(len(txt.exons) - 1):
                        if int(txt.exons[i][1]) == start and int(txt.exons[i + 1][0]) == end:
                            self.novel = False
                            break

                    if not self.novel:
                        break

        elif self.event_type == "retained_intron":
            # more than one intron retained: the event as a whole is not novel
            multi = int(self.exons[-1]) - int(self.exons[0]) > 1
            if multi:
                self.novel = False

            for i in range(len(self.exon_coords) - 1):
                retained_intron = [int(self.exon_coords[i][1]) + 1, int(self.exon_coords[i + 1][0]) - 1]

                # known exons, per transcript, that cover part of the intron
                middle_exons = {}
                for txt in txts:
                    exons_txt = []
                    last = len(txt.exons) - 1
                    for j, exon in enumerate(txt.exons):
                        if j == 0 or j == last:
                            # terminal exon, require subsume
                            if subsume(retained_intron, exon):
                                exons_txt.append(exon)
                        # middle exons, require just overlap
                        elif overlap(exon, (retained_intron[0], retained_intron[1])):
                            exons_txt.append(exon)

                    if exons_txt:
                        middle_exons[txt] = exons_txt

                # only time when original event is novel WITHOUT testing is when
                # it's a single ri and it is clear of overlapping exons
                if not middle_exons and not multi:
                    self.novel = True
                    continue

                self.novel = False

                # subtract overlapping exons (every stored value is non-empty)
                if middle_exons:
                    true_retained_intron = subtract(retained_intron, list(middle_exons.values()))
                else:
                    true_retained_intron = [retained_intron]

                # nothing of the intron left after subtraction
                if not true_retained_intron:
                    continue

                # create new events
                for ri in true_retained_intron:
                    # determine flanking exons, and transcript by frequency of
                    # flanking coordinates
                    flanks = {}
                    for flank_txt in txts:
                        for k in range(len(flank_txt.exons) - 1):
                            left = flank_txt.exons[k]
                            right = flank_txt.exons[k + 1]
                            if left[1] + 1 == ri[0] and right[0] - 1 == ri[1]:
                                coord = ",".join((str(left[0]), str(left[1]), str(right[0]), str(right[1])))
                                flanks.setdefault(coord, []).append([flank_txt, k, k + 1])

                    if not flanks:
                        continue

                    # use the most frequent flanking exons; max() returns the
                    # first maximal key, as the former stable descending sort did
                    best = max(flanks, key=lambda c: len(flanks[c]))
                    exon_coords = best.split(',')

                    # use the originally assigned transcript if possible,
                    # otherwise the first transcript with these flanks
                    chosen = flanks[best][0]
                    for candidate in flanks[best]:
                        if candidate[0].full_name() == self.transcript:
                            chosen = candidate
                            break
                    event_txt, e1, e2 = chosen

                    novel_events.append({
                        'contig': self.contig,
                        'chrom': self.chrom,
                        'align_blocks': self.align_blocks,
                        'align_coords': self.align_coords,
                        'type': self.event_type,
                        'novel': True,
                        'transcript': event_txt.full_name(),
                        'exons': [e1 + 1, e2 + 1],
                        'exon_coords': [exon_coords[:2], exon_coords[2:]],
                        'txt': event_txt,
                    })

        elif self.event_type == "skipped_exon":
            for txt in txts:
                for e in range(len(txt.exons) - 1):
                    intron_span = [int(txt.exons[e][1]) + 1, int(txt.exons[e + 1][0]) - 1]
                    if self.align_coords and self.exon_coords:
                        if subsume([int(self.exon_coords[0][0]), int(self.exon_coords[-1][1])], intron_span):
                            self.novel = False
                            break

                if not self.novel:
                    break

        elif self.event_type == "novel_intron":
            novel_intron_span = [int(self.align_coords[0][1]) + 1, int(self.align_coords[1][0]) - 1]
            for txt in txts:
                for e in range(len(txt.exons) - 1):
                    intron_span = [int(txt.exons[e][1]) + 1, int(txt.exons[e + 1][0]) - 1]

                    if novel_intron_span[0] == intron_span[0] and novel_intron_span[1] == intron_span[1]:
                        self.novel = False
                        break

                if not self.novel:
                    break

        elif 'AS' in self.event_type and self.edge in ('left', 'right'):
            # compare the event's left edge with exon starts, or its right
            # edge with exon ends
            side = 0 if self.edge == 'left' else 1
            for txt in txts:
                for exon in txt.exons:
                    if int(self.align_coords[side]) == int(exon[side]):
                        self.novel = False
                        break

                if not self.novel:
                    break

        return novel_events