def is_fusion_compatible(r1, r2, max_fusion_point_dist, allow_extra_5_exons):
    """
    Helper function for: merge_fusion_exons()

    Decide whether two fusion-part records r1 and r2 can be merged.

    Check that:
    (1) r1, r2 are both in the 5' portion, or both in the 3' portion
    (2) if single-exon, fusion point must be close by
        if multi-exon, every junction identical (plus below is True)
    (3) if allow_extra_5_exons is False, num exons must be the same
        if allow_extra_5_exons is True, only allow additional 5' exons

    r1, r2 --- records exposing .flag.strand, .qStart, .sStart, .sEnd, .segments
    max_fusion_point_dist --- max distance allowed between the compared ends
    allow_extra_5_exons --- whether a 'super'/'subset' junction match may be accepted

    Returns True if the two records are compatible, False otherwise.
    Raises AssertionError if the strands differ, and Exception if the junction
    comparison reports 'exact' for a single-exon vs. multi-exon pair.
    """
    MAX_QSTART_FOR_5 = 100

    assert r1.flag.strand == r2.flag.strand

    # A record is in the 5' portion iff its query start is below the cutoff.
    # The same test is applied to both records so the result does not depend
    # on argument order (qStart == cutoff always counts as 3').
    in_5_portion = r1.qStart < MAX_QSTART_FOR_5
    if in_5_portion != (r2.qStart < MAX_QSTART_FOR_5):
        return False

    plus_is_5end = (r1.flag.strand == '+')

    match_type = compare_junctions.compare_junctions(r1, r2)

    if match_type == 'exact':
        if len(r1.segments) != 1:
            # multi-exon case, must be OK
            return True
        if len(r2.segments) != 1:
            raise Exception("Not possible case for multi-exon transcript and " +
                            "single-exon transcript to be exact!")
        # single exon case, check fusion point is close enough
        if in_5_portion and plus_is_5end:
            dist = abs(r1.sStart - r2.sStart)
        else:
            dist = abs(r1.sEnd - r2.sEnd)
        return dist <= max_fusion_point_dist

    if match_type == 'super' or match_type == 'subset':
        # extra exons are only tolerated when explicitly allowed, and only
        # for records in the 5' portion
        if not allow_extra_5_exons or not in_5_portion:
            return False
        # check that the 3' junction is identical
        # also check that the 3' end is relatively close
        if plus_is_5end:
            same_junction = r1.segments[-1].start == r2.segments[-1].start
            end_dist = abs(r1.segments[-1].end - r2.segments[-1].end)
        else:
            same_junction = r1.segments[0].end == r2.segments[0].end
            end_dist = abs(r1.segments[0].start - r2.segments[0].start)
        # explicit boolean result: previously a passing pair fell off the end
        # of the function and returned None
        return same_junction and end_dist <= max_fusion_point_dist

    # ex: partial, nomatch, etc...
    return False
def is_fusion_compatible(r1, r2, max_fusion_point_dist, max_exon_end_dist, allow_extra_5_exons):
    """
    Helper function for: merge_fusion_exons()

    Decide whether two fusion-part records r1 and r2 can be merged.

    Check that:
    (1) r1, r2 are both in the 5' portion, or both in the 3' portion
    (2) if single-exon, fusion point must be close by
        if multi-exon, every junction identical (plus below is True)
    (3) if allow_extra_5_exons is False, num exons must be the same
        if allow_extra_5_exons is True, only allow additional 5' exons

    r1, r2 --- records exposing .flag.strand, .qStart, .qLen, .sStart, .sEnd, .segments
    max_fusion_point_dist --- max distance allowed between the compared outer ends
    max_exon_end_dist --- max distance allowed between the compared junction-side exon ends
    allow_extra_5_exons --- whether a 'super'/'subset' junction match may be accepted

    Returns True if the two records are compatible, False otherwise.
    Raises AssertionError if the strands differ, and Exception if the junction
    comparison reports 'exact' for a single-exon vs. multi-exon pair.
    """
    assert r1.flag.strand == r2.flag.strand

    # first need to figure out ends: a record whose query start lies in the
    # first half of the query is treated as the 5' portion
    if r1.qStart <= .5 * r1.qLen:
        if r2.qStart > .5 * r2.qLen:
            # r1 is 5' but r2 is 3', reject
            return False
        in_5_portion = True
    else:
        if r2.qStart <= .5 * r2.qLen:
            # r1 is 3' but r2 is 5', reject
            return False
        in_5_portion = False

    plus_is_5end = (r1.flag.strand == '+')

    match_type = compare_junctions.compare_junctions(r1, r2)

    if match_type == 'exact':
        if len(r1.segments) != 1:
            # multi-exon case, must be OK
            return True
        if len(r2.segments) != 1:
            # call-style raise is valid on both Python 2 and Python 3
            raise Exception("Not possible case for multi-exon transcript and " +
                            "single-exon transcript to be exact!")
        # single exon case, check fusion point is close enough
        if in_5_portion and plus_is_5end:
            dist = abs(r1.sStart - r2.sStart)
        else:
            dist = abs(r1.sEnd - r2.sEnd)
        return dist <= max_fusion_point_dist

    if match_type not in ('super', 'subset'):
        # ex: partial, nomatch, etc...
        return False

    if not allow_extra_5_exons:
        # not OK because number of exons must be the same
        return False

    if not in_5_portion:
        # extra exons are never accepted for the 3' portion
        return False

    # check that the 3' junction is (nearly) identical
    # also check that the 3' end is relatively close
    if plus_is_5end:
        junction_dist = abs(r1.segments[-1].start - r2.segments[-1].start)
        end_dist = abs(r1.segments[-1].end - r2.segments[-1].end)
    else:
        junction_dist = abs(r1.segments[0].end - r2.segments[0].end)
        end_dist = abs(r1.segments[0].start - r2.segments[0].start)
    return junction_dist <= max_exon_end_dist and end_dist <= max_fusion_point_dist
def match_record_to_tree(self, r):
    """
    Look up r in self.tree and return the stored record with identical junctions.

    r --- GMAPRecord
    tree --- dict of chromosome --> strand --> IntervalTree

    If exact match (every exon junction), return the matching GMAPRecord
    Otherwise return None

    *NOTE*: the tree should be non-redundant so can return as soon as exact match is found!
    """
    overlapping = self.tree[r.chr][r.strand].find(r.start, r.end)
    for candidate in overlapping:
        # both records get .segments pointed at their ref_exons before the
        # comparison (presumably what compare_junctions reads -- confirm)
        r.segments = r.ref_exons
        candidate.segments = candidate.ref_exons
        verdict = compare_junctions.compare_junctions(
            r, candidate, internal_fuzzy_max_dist=self.internal_fuzzy_max_dist)
        if verdict == 'exact':
            # is a match!
            return candidate
    return None
def match_record_to_tree(self, r):
    """
    Find the record already stored in self.tree whose junctions exactly match r.

    r --- GMAPRecord
    tree --- dict of chromosome --> strand --> IntervalTree

    Returns the first overlapping GMAPRecord for which the junction comparison
    is 'exact', or None if there is none.

    *NOTE*: the tree should be non-redundant, so the search stops at the first exact match.
    """
    fuzzy = self.internal_fuzzy_max_dist

    def is_exact(other):
        # point .segments at ref_exons on both sides before comparing
        r.segments = r.ref_exons
        other.segments = other.ref_exons
        result = compare_junctions.compare_junctions(r, other, internal_fuzzy_max_dist=fuzzy)
        return result == 'exact'

    hits = self.tree[r.chr][r.strand].find(r.start, r.end)
    # lazy scan: evaluation stops as soon as one hit is exact
    return next((hit for hit in hits if is_exact(hit)), None)
def filter_out_subsets(recs, internal_fuzzy_max_dist):
    """
    Remove, in place, records that can be merged into an overlapping record.

    recs --- list of records exposing .start, .end, .ref_exons; must be sorted
             by start because that's the order they are written
    internal_fuzzy_max_dist --- passed through to compare_junctions and can_merge

    For each overlapping pair (recs[i], recs[j]) that can_merge accepts:
    if the comparison is 'super', recs[j] is dropped; otherwise recs[i] is dropped.

    Returns None; recs is modified in place.
    """
    i = 0
    while i < len(recs) - 1:
        removed_i = False
        j = i + 1
        while j < len(recs):
            if recs[j].start > recs[i].end:
                # sorted by start, so nothing further can overlap recs[i]
                break
            recs[i].segments = recs[i].ref_exons
            recs[j].segments = recs[j].ref_exons
            m = compare_junctions.compare_junctions(recs[i], recs[j], internal_fuzzy_max_dist)
            if not can_merge(m, recs[i], recs[j], internal_fuzzy_max_dist):
                j += 1
            elif m == 'super':
                # pop recs[j]; the next candidate slides into slot j, so j stays put
                recs.pop(j)
            else:
                # pop recs[i]; a new record now sits in slot i and must be
                # scanned from scratch, so restart the inner loop for it
                recs.pop(i)
                removed_i = True
                break
        if not removed_i:
            i += 1
def is_fusion_compatible(r1, r2, max_fusion_point_dist, max_exon_end_dist, allow_extra_5_exons):
    """
    Helper function for: merge_fusion_exons()

    Return True if records r1 and r2 are compatible for merging, else False.

    Check that:
    (1) r1, r2 are both in the 5' portion, or both in the 3' portion
    (2) if single-exon, fusion point must be close by
        if multi-exon, every junction identical (plus below is True)
    (3) if allow_extra_5_exons is False, num exons must be the same
        if allow_extra_5_exons is True, only allow additional 5' exons

    r1, r2 --- records exposing .flag.strand, .qStart, .qLen, .sStart, .sEnd, .segments
    max_fusion_point_dist --- max distance allowed between the compared outer ends
    max_exon_end_dist --- max distance allowed between the compared junction-side exon ends
    allow_extra_5_exons --- whether a 'super'/'subset' junction match may be accepted

    Raises AssertionError if the strands differ, and Exception if the junction
    comparison reports 'exact' for a single-exon vs. multi-exon pair.
    """
    assert r1.flag.strand == r2.flag.strand

    # first need to figure out ends: query start in the first half of the
    # query means the record is in the 5' portion; both must agree
    r1_in_5 = r1.qStart <= .5 * r1.qLen
    r2_in_5 = r2.qStart <= .5 * r2.qLen
    if r1_in_5 != r2_in_5:
        return False
    in_5_portion = r1_in_5

    plus_is_5end = (r1.flag.strand == '+')

    junction_match = compare_junctions.compare_junctions(r1, r2)

    if junction_match == 'exact':
        if len(r1.segments) == 1:
            if len(r2.segments) == 1:
                # single exon case, check fusion point is close enough
                if in_5_portion and plus_is_5end:
                    dist = abs(r1.sStart - r2.sStart)
                else:
                    dist = abs(r1.sEnd - r2.sEnd)
                return dist <= max_fusion_point_dist
            # call-style raise is valid on both Python 2 and Python 3
            raise Exception("Not possible case for multi-exon transcript and " +
                            "single-exon transcript to be exact!")
        # multi-exon case, must be OK
        return True

    if junction_match == 'super' or junction_match == 'subset':
        if not allow_extra_5_exons:
            # not OK because number of exons must be the same
            return False
        if not in_5_portion:
            return False
        # check that the 3' junction is (nearly) identical
        # also check that the 3' end is relatively close
        if plus_is_5end:
            last1, last2 = r1.segments[-1], r2.segments[-1]
            if abs(last1.start - last2.start) > max_exon_end_dist:
                return False
            return abs(last1.end - last2.end) <= max_fusion_point_dist
        first1, first2 = r1.segments[0], r2.segments[0]
        if abs(first1.end - first2.end) > max_exon_end_dist:
            return False
        return abs(first1.start - first2.start) <= max_fusion_point_dist

    # ex: partial, nomatch, etc...
    return False
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon, internal_fuzzy_max_dist):
    """
    Collapse records whose junctions agree (within internal_fuzzy_max_dist).

    gff_filename --- collapsed GFF to read; output goes to <gff_filename>.fuzzy
    group_filename --- tab-delimited "<pbid>\\t<comma-separated members>" file;
                       output goes to <group_filename>.fuzzy
    allow_extra_5exon --- if True, 'super'/'subset' matches may also be merged
    internal_fuzzy_max_dist --- passed to compare_junctions and used by can_merge

    For each fuzzy group, the representative written out is the record with
    the most exons (if tie, then most FL); its group line lists the members
    of every record in the fuzzy group.

    Returns fuzzy_match: dict of seqid of the first record seen in a fuzzy
    group --> list of all seqids in that group.
    """
    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        # sums the number between 'f' and 'p' in the second '/'-field
        return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if not allow_extra_5exon or m not in ('super', 'subset'):
            return False
        if m == 'subset':
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        n2 = len(r2.ref_exons)
        # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
        # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
        if r1.strand == '+':
            return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                   r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
        # NOTE(review): the upper bound uses exon n2 while the lower bound uses
        # exon n2-1; kept as-is -- confirm this asymmetry is intended
        return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
               r1.ref_exons[n2 - 1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end

    d = {}  # seqid --> record
    recs = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    fuzzy_match = defaultdict(list)
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(
                r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            # first of its kind: becomes the anchor of a new fuzzy group
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = members.split(',')

    # order groups by the numeric parts after the first '.' of the id;
    # sorted() + list key works on both Python 2 and Python 3
    keys = sorted(fuzzy_match, key=lambda x: [int(part) for part in x.split('.')[1:]])

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    with open(gff_filename + '.fuzzy', 'w') as f_gff, \
            open(group_filename + '.fuzzy', 'w') as f_group:
        for k in keys:
            member_ids = fuzzy_match[k]
            all_members = []
            for pbid in member_ids:
                all_members += group_info[pbid]

            best_pbid = member_ids[0]
            if len(member_ids) > 1:
                # FL support is only a tie-breaker, so it is computed only when
                # there is competition -- and in the same unit for every member
                best_num_exons = len(d[best_pbid].ref_exons)
                best_size = get_fl_from_id(group_info[best_pbid])
                for pbid in member_ids[1:]:
                    _size = get_fl_from_id(group_info[pbid])
                    _num_exons = len(d[pbid].ref_exons)
                    if _num_exons > best_num_exons or \
                            (_num_exons == best_num_exons and _size > best_size):
                        best_pbid, best_size, best_num_exons = pbid, _size, _num_exons

            GFF.write_collapseGFF_format(f_gff, d[best_pbid])
            f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))

    return fuzzy_match
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon, internal_fuzzy_max_dist):
    """
    Group records with (fuzzily) matching junctions and write one per group.

    gff_filename --- collapsed GFF to read; result is written to gff_filename + '.fuzzy'
    group_filename --- lines of "<pbid>\\t<member>,<member>,..."; result is
                       written to group_filename + '.fuzzy'
    allow_extra_5exon --- if True, 'super'/'subset' junction matches can be merged too
    internal_fuzzy_max_dist --- tolerance handed to compare_junctions / can_merge

    The record kept for a fuzzy group is the one with the most exons
    (if tie, then most FL); its output group line carries the members of
    all records in the group.

    Returns fuzzy_match: dict of first-seen seqid of a group --> list of seqids in it.
    """
    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        total = 0
        for _id in members:
            fp_field = _id.split('/')[1]  # e.g. 'f1p0'
            total += int(fp_field.split('p')[0][1:])  # number after the leading 'f'
        return total

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        if not allow_extra_5exon:
            return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                acceptor_ok = abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist
                return acceptor_ok and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            acceptor_ok = abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist
            # NOTE(review): lower bound is exon n2-1 but upper bound is exon n2;
            # left unchanged -- confirm this is intended
            return acceptor_ok and \
                r1.ref_exons[n2 - 1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    def rank(pbid):
        # sort key for picking the representative: exon count, then FL support
        return (len(d[pbid].ref_exons), get_fl_from_id(group_info[pbid]))

    d = {}  # seqid --> record
    recs = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    fuzzy_match = defaultdict(list)
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        r.segments = r.ref_exons
        anchor = None
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                anchor = r2
                break
        if anchor is None:
            # nothing to merge with: r starts its own fuzzy group
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]
        else:
            fuzzy_match[anchor.seqid].append(r.seqid)

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = members.split(',')

    # numeric ordering on the '.'-separated parts after the first one;
    # sorted() with a list key behaves the same on Python 2 and Python 3
    keys = sorted(fuzzy_match, key=lambda x: [int(part) for part in x.split('.')[1:]])

    with open(gff_filename + '.fuzzy', 'w') as f_gff:
        with open(group_filename + '.fuzzy', 'w') as f_group:
            for k in keys:
                member_ids = fuzzy_match[k]
                all_members = [member for pbid in member_ids for member in group_info[pbid]]
                # pick the one that has the most exons (if tie, then most FL);
                # max() keeps the earliest member among equals. A lone member
                # needs no ranking, so its ids are never parsed for FL counts.
                if len(member_ids) == 1:
                    best_pbid = member_ids[0]
                else:
                    best_pbid = max(member_ids, key=rank)
                GFF.write_collapseGFF_format(f_gff, d[best_pbid])
                f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))

    return fuzzy_match