def categorize(transcripts):
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    # collect CDS nodes from coding transcripts (stranded and unstranded)
    cds_sense = set()
    cds_unstranded = set()
    test_transcripts = []
    for t in transcripts:
        if 'cds' in t.attrs:
            for n in split_exons(t, boundaries):
                cds_sense.add((t.strand, n[0], n[1]))
                cds_unstranded.add(n)
        else:
            test_transcripts.append(t)
    # label nodes of transcripts that overlap CDS nodes as 'cds'/'utr'
    node_dict = {}
    noncoding_transcripts = []
    utr_unstranded = set()
    for t in test_transcripts:
        mcat = t.attrs['category']
        tcat = t.attrs['transcript_category']
        has_cds = False
        nodes = []
        for n in split_exons(t, boundaries):
            nstrand = (t.strand, n[0], n[1])
            nodes.append(nstrand)
            if (t.strand == NO_STRAND) and (n in cds_unstranded):
                node_dict[nstrand] = (tcat, mcat, 'cds')
                has_cds = True
            elif nstrand in cds_sense:
                node_dict[nstrand] = (tcat, mcat, 'cds')
                has_cds = True
        if has_cds:
            # remaining nodes of CDS-overlapping transcripts are UTR
            for n in nodes:
                if n not in node_dict:
                    node_dict[n] = (tcat, mcat, 'utr')
                    utr_unstranded.add((n[1], n[2]))
        else:
            noncoding_transcripts.append(t)
    # label nodes of the remaining (noncoding) transcripts
    for t in noncoding_transcripts:
        mcat = t.attrs['category']
        tcat = t.attrs['transcript_category']
        for n in split_exons(t, boundaries):
            nstrand = (t.strand, n[0], n[1])
            if nstrand not in node_dict:
                # check antisense
                if n in cds_unstranded:
                    node_dict[nstrand] = (tcat, mcat, 'cds_antisense')
                elif n in utr_unstranded:
                    node_dict[nstrand] = (tcat, mcat, 'utr_antisense')
                else:
                    node_dict[nstrand] = (tcat, mcat, 'noncoding')
    for n, d in node_dict.iteritems():
        strand, start, end = n
        tcat, mcat, ccat = d
        yield start, end, strand, tcat, mcat, ccat
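# The functions in this module all rely on find_exon_boundaries() and
# split_exons(). The sketch below shows their assumed behavior only: collect
# every exon start/end position in the locus, then cut each transcript's
# exons at those positions so that overlapping transcripts yield identical
# (start, end) node intervals. This is an illustrative assumption (including
# the t.exons / exon.start / exon.end attributes), not the project's actual
# implementation.
import bisect

def _sketch_find_exon_boundaries(transcripts):
    # sorted set of all exon start/end positions in the locus
    boundaries = set()
    for t in transcripts:
        for exon in t.exons:
            boundaries.add(exon.start)
            boundaries.add(exon.end)
    return sorted(boundaries)

def _sketch_split_exons(t, boundaries):
    # yield (start, end) sub-intervals of each exon, cut at every boundary
    # position that falls strictly inside the exon
    for exon in t.exons:
        lo = bisect.bisect_right(boundaries, exon.start)
        hi = bisect.bisect_left(boundaries, exon.end)
        start = exon.start
        for pos in boundaries[lo:hi]:
            yield start, pos
            start = pos
        yield start, exon.end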
def create_undirected_transcript_graph(transcripts, add_node_func, **kwargs):
    '''
    add all transcripts to a single undirected graph
    '''
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    # initialize transcript graph as undirected at first
    G = nx.Graph()
    # add transcripts
    for t in transcripts:
        # split exons that cross boundaries to get the
        # nodes in the transcript path
        nodes = [Exon(start, end) for start, end in split_exons(t, boundaries)]
        # add nodes/edges to graph
        u = nodes[0]
        add_node_func(G, u, t, **kwargs)
        for v in nodes[1:]:
            add_node_func(G, v, t, **kwargs)
            G.add_edge(u, v)
            u = v
    return G
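# create_undirected_transcript_graph() leaves node attribute bookkeeping to
# the caller via add_node_func. The callback below is a hypothetical example
# only: the 'length'/'ids' attributes, the 'transcript_id' key, and the
# networkx 1.x-style G.node[] access are assumptions, not conventions the
# rest of the codebase necessarily uses.
def _sketch_add_node(G, n, t, id_attr='transcript_id'):
    # create the node on first sight, then accumulate per-transcript data
    if n not in G:
        G.add_node(n, length=(n.end - n.start), ids=set())
    G.node[n]['ids'].add(t.attrs[id_attr])

# usage: G = create_undirected_transcript_graph(transcripts, _sketch_add_node)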
def annotate_locus(transcripts, gtf_sample_attr):
    # store reference introns
    # (strand,start,end) -> ids (set)
    ref_intron_dict = collections.defaultdict(lambda: [])
    ref_node_dict = collections.defaultdict(lambda: ([], []))
    node_score_dict = collections.defaultdict(lambda: [0.0, 0.0])
    all_introns = set()
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    # add transcript to intron and graph data structures
    inp_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n][t.strand].append(t)
            # add to introns
            for start, end in t.iterintrons():
                ref_intron_dict[(t.strand, start, end)].append(t)
                all_introns.add((t.strand, start, end))
        else:
            if t.strand != NO_STRAND:
                score = float(t.attrs[GTFAttr.SCORE])
                for n in split_exons(t, boundaries):
                    node_score_dict[n][t.strand] += score
            inp_transcripts.append(t)
            # add to introns
            for start, end in t.iterintrons():
                all_introns.add((t.strand, start, end))
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for strand, start, end in all_introns:
        intron_tree.insert_interval(Interval(start, end, strand=strand))
    del all_introns
    # categorize transcripts
    strand_transcript_lists = [[], [], []]
    for t in inp_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = set(t.iterintrons())
        # try to resolve strand
        strand = t.strand
        if strand == NO_STRAND:
            strand = resolve_strand(nodes, node_score_dict, ref_node_dict)
        # define opposite strand
        if strand == NO_STRAND:
            opp_strand = NO_STRAND
        else:
            opp_strand = (strand + 1) % 2
        # get all reference transcripts that share introns
        intron_ref_dict = {}
        for start, end in introns:
            if (strand, start, end) in ref_intron_dict:
                refs = ref_intron_dict[(strand, start, end)]
                intron_ref_dict.update((ref.attrs[GTFAttr.TRANSCRIPT_ID], ref)
                                       for ref in refs)
        intron_refs = []
        for ref in intron_ref_dict.itervalues():
            intron_refs.append((ref, list(split_exons(ref, boundaries))))
        # get all reference transcripts that share coverage
        same_strand_ref_dict = {}
        opp_strand_ref_dict = {}
        for n in nodes:
            if n in ref_node_dict:
                strand_refs = ref_node_dict[n]
                same_strand_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref)
                    for ref in strand_refs[strand])
                opp_strand_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref)
                    for ref in strand_refs[opp_strand])
        same_strand_refs = []
        for ref in same_strand_ref_dict.itervalues():
            same_strand_refs.append((ref, list(split_exons(ref, boundaries))))
        opp_strand_refs = []
        for ref in opp_strand_ref_dict.itervalues():
            opp_strand_refs.append((ref, list(split_exons(ref, boundaries))))
        # categorize
        cinf = categorize_transcript(t, nodes, introns, intron_refs,
                                     same_strand_refs, opp_strand_refs,
                                     intron_tree, ignore_test=False)
        if cinf.is_test:
            # recategorize test transcripts
            cinf2 = categorize_transcript(t, nodes, introns, intron_refs,
                                          same_strand_refs, opp_strand_refs,
                                          intron_tree, ignore_test=True)
            cinf = cinf._replace(category=cinf2.category)
        # add annotation attributes
        best_ref_id = (cinf.ref.attrs[GTFAttr.TRANSCRIPT_ID]
                       if cinf.ref is not None else 'na')
        t.attrs[GTFAttr.CATEGORY] = cinf.category
        t.attrs[GTFAttr.TEST] = '1' if cinf.is_test else '0'
        t.attrs[GTFAttr.ANN_REF_ID] = best_ref_id
        t.attrs[GTFAttr.ANN_COV_RATIO] = cinf.ann_cov_ratio
        t.attrs[GTFAttr.ANN_INTRON_RATIO] = cinf.ann_intron_ratio
        # group transcripts by strand
        strand_transcript_lists[strand].append(t)
    # explicitly delete large data structures
    del ref_intron_dict
    del ref_node_dict
    del node_score_dict
    del intron_tree
    del inp_transcripts
    # annotate score and recurrence for transcripts
    for strand_transcripts in strand_transcript_lists:
        # find the intron domains of the transcripts
        boundaries = find_exon_boundaries(strand_transcripts)
        # gather node score/recurrence data
        new_data_func = lambda: {'ids': set(), 'score': 0.0, 'pct': 0.0}
        node_data = collections.defaultdict(new_data_func)
        for t in strand_transcripts:
            sample_id = t.attrs[gtf_sample_attr]
            score = float(t.attrs[GTFAttr.SCORE])
            pctrank = float(t.attrs[GTFAttr.PCTRANK])
            # split exons that cross boundaries to get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                nd = node_data[n]
                nd['ids'].add(sample_id)
                nd['score'] += score
                nd['pct'] += pctrank
        # calculate recurrence and score statistics
        for t in strand_transcripts:
            nodes = list(split_exons(t, boundaries))
            mean_score, mean_pctrank, mean_recur = \
                compute_recurrence_and_score(nodes, node_data)
            t.attrs[GTFAttr.MEAN_SCORE] = mean_score
            t.attrs[GTFAttr.MEAN_PCTRANK] = mean_pctrank
            t.attrs[GTFAttr.MEAN_RECURRENCE] = mean_recur
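# compute_recurrence_and_score() is not defined here; the sketch below shows
# one plausible (assumed) implementation: length-weighted means of the
# per-node score, percentile rank, and sample recurrence accumulated above.
# The real function may weight or normalize differently.
def _sketch_compute_recurrence_and_score(nodes, node_data):
    total_length = 0
    score_sum = 0.0
    pct_sum = 0.0
    recur_sum = 0.0
    for start, end in nodes:
        length = end - start
        nd = node_data[(start, end)]
        total_length += length
        score_sum += length * nd['score']
        pct_sum += length * nd['pct']
        recur_sum += length * len(nd['ids'])
    if total_length == 0:
        return 0.0, 0.0, 0.0
    return (score_sum / total_length,
            pct_sum / total_length,
            recur_sum / total_length)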
def compare_locus(transcripts):
    # store reference introns
    # (strand,start,end) -> ids (set)
    ref_intron_dict = collections.defaultdict(lambda: [])
    ref_node_dict = collections.defaultdict(lambda: [])
    ref_splicing_patterns = collections.defaultdict(lambda: [])
    ref_dict = {}
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    test_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # add to dict
            ref_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
            ref_dict[ref_id] = t
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n].append(t)
            # add to introns
            splicing_pattern = []
            for start, end in t.iterintrons():
                intron = (t.strand, start, end)
                ref_intron_dict[intron].append(t)
                splicing_pattern.append(intron)
            # add to splicing patterns
            if len(splicing_pattern) > 0:
                ref_splicing_patterns[tuple(splicing_pattern)].append(t)
        else:
            test_transcripts.append(t)
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for intron, refs in ref_intron_dict.iteritems():
        strand, start, end = intron
        intron_tree.insert_interval(Interval(start, end, strand=strand,
                                             value=refs))
    # categorize transcripts
    for t in test_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = []
        for start, end in t.iterintrons():
            introns.append((t.strand, start, end))
        splicing_pattern = tuple(introns)
        # keep list of all matching ref transcripts
        matches = collections.defaultdict(lambda: Match())
        # dict of reference transcripts -> category -> list of nodes
        for n in nodes:
            if n in ref_node_dict:
                # look for reference transcripts that share this node
                for ref in ref_node_dict[n]:
                    if cmp_strand(t.strand, ref.strand):
                        c = Category.SAME_STRAND
                    else:
                        c = Category.OPP_STRAND
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
            # look for reference introns that overlap this node
            for hit in intron_tree.find(*n):
                if cmp_strand(t.strand, hit.strand):
                    c = Category.INTRONIC_SAME_STRAND
                else:
                    c = Category.INTRONIC_OPP_STRAND
                for ref in hit.value:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
        # dict of introns -> list of reference transcripts
        for intron in introns:
            if intron in ref_intron_dict:
                for ref in ref_intron_dict[intron]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.introns.append(intron)
        # check splicing pattern matches
        if len(splicing_pattern) > 0:
            if splicing_pattern in ref_splicing_patterns:
                for ref in ref_splicing_patterns[splicing_pattern]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.splicing = True
        # go through the matches for this transcript and determine
        # the transcript category
        match_stats = []
        for ref_id, m in matches.iteritems():
            ref = ref_dict[ref_id]
            # calculate coverage
            same_strand_bp = sum((n[1] - n[0])
                                 for n in m.nodes[Category.SAME_STRAND])
            opp_strand_bp = sum((n[1] - n[0])
                                for n in m.nodes[Category.OPP_STRAND])
            # count shared introns
            num_shared_introns = len(m.introns)
            # decide category for this test/ref transcript pair
            if m.splicing or (num_shared_introns > 0) or (same_strand_bp > 0):
                c = Category.SAME_STRAND
            elif (opp_strand_bp > 0):
                c = Category.OPP_STRAND
            else:
                # count nodes of different types
                num_same_strand = len(m.nodes[Category.SAME_STRAND])
                num_opp_strand = len(m.nodes[Category.OPP_STRAND])
                num_intronic_same_strand = \
                    len(m.nodes[Category.INTRONIC_SAME_STRAND])
                num_intronic_opp_strand = \
                    len(m.nodes[Category.INTRONIC_OPP_STRAND])
                assert num_same_strand == 0
                assert num_opp_strand == 0
                num_intronic = (num_intronic_same_strand +
                                num_intronic_opp_strand)
                assert num_intronic > 0
                if (num_intronic == len(nodes)):
                    # completely intronic
                    if num_intronic_same_strand > 0:
                        c = Category.INTRONIC_SAME_STRAND
                    else:
                        c = Category.INTRONIC_OPP_STRAND
                else:
                    # interleaving means some nodes intronic and others intergenic
                    if num_intronic_same_strand > 0:
                        c = Category.INTERLEAVING_SAME_STRAND
                    else:
                        c = Category.INTERLEAVING_OPP_STRAND
            # create a match object
            ms = MatchStats.from_transcript(t, ref)
            ms.shared_same_strand_bp = same_strand_bp
            ms.shared_opp_strand_bp = opp_strand_bp
            ms.shared_introns = num_shared_introns
            ms.shared_splicing = m.splicing
            ms.category = Category.to_str(c)
            ms.distance = 0
            match_stats.append(ms)
        yield (t, match_stats)
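# compare_locus() accumulates evidence per reference transcript in Match
# objects. Match is not defined here; the sketch below is inferred from how
# it is used above (nodes grouped by category, shared introns, and a
# splicing-pattern flag) and is an assumption, not the actual class.
import collections

class _SketchMatch(object):
    def __init__(self):
        # test-transcript nodes shared with this reference, keyed by category
        self.nodes = collections.defaultdict(list)
        # introns shared with this reference
        self.introns = []
        # True if the full splicing pattern matches this reference
        self.splicing = False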
def compute(self, transcripts):
    intron_dict = collections.defaultdict(lambda: CompareData())
    node_dict = collections.defaultdict(lambda: CompareData())
    splicing_pattern_dict = collections.defaultdict(lambda: CompareData())
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    unstranded_transcripts = []
    for t in transcripts:
        if t.strand == NO_STRAND:
            unstranded_transcripts.append(t)
            continue
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        # split exons that cross boundaries and get the
        # nodes in the transcript path
        for n in split_exons(t, boundaries):
            n = (t.strand, n[0], n[1])
            if is_ref:
                node_dict[n].has_ref = True
            else:
                d = node_dict[n]
                d.has_test = True
                d.category = t.attrs['category']
        splicing_pattern = []
        for start, end in t.iterintrons():
            n = (t.strand, start, end)
            if is_ref:
                intron_dict[n].has_ref = True
            else:
                d = intron_dict[n]
                d.has_test = True
                d.category = t.attrs['category']
            splicing_pattern.append(n)
        splicing_pattern = tuple(splicing_pattern)
        if len(splicing_pattern) > 0:
            if is_ref:
                splicing_pattern_dict[splicing_pattern].has_ref = True
            else:
                d = splicing_pattern_dict[splicing_pattern]
                d.has_test = True
                d.category = t.attrs['category']
    # handle unstranded transcripts
    for t in unstranded_transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        for n in split_exons(t, boundaries):
            found_node = False
            for strand in (POS_STRAND, NEG_STRAND):
                sn = (strand, n[0], n[1])
                if sn in node_dict:
                    if is_ref:
                        node_dict[sn].has_ref = True
                    else:
                        d = node_dict[sn]
                        d.has_test = True
                        d.category = t.attrs['category']
                    found_node = True
            if not found_node:
                sn = (NO_STRAND, n[0], n[1])
                if is_ref:
                    node_dict[sn].has_ref = True
                else:
                    d = node_dict[sn]
                    d.has_test = True
                    d.category = t.attrs['category']
        introns = list(t.iterintrons())
        assert len(introns) == 0
    # compile statistics
    for d in intron_dict.itervalues():
        if d.has_ref and d.has_test:
            self.introns_both += 1
            self.introns_by_category[d.category] += 1
        elif d.has_ref:
            self.introns_ref_only += 1
        elif d.has_test:
            self.introns_test_only += 1
            self.introns_by_category[d.category] += 1
    for d in splicing_pattern_dict.itervalues():
        if d.has_ref and d.has_test:
            self.patterns_both += 1
            self.patterns_by_category[d.category] += 1
        elif d.has_ref:
            self.patterns_ref_only += 1
        elif d.has_test:
            self.patterns_test_only += 1
            self.patterns_by_category[d.category] += 1
    for n, d in node_dict.iteritems():
        strand, start, end = n
        length = end - start
        if d.has_ref and d.has_test:
            self.cov_both += length
            self.cov_by_category[d.category] += length
        elif d.has_ref:
            self.cov_ref_only += length
        elif d.has_test:
            self.cov_test_only += length
            self.cov_by_category[d.category] += length
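# CompareData records, for each intron, node, and splicing pattern, whether
# it was seen in the reference set, in the test set, and (for test features)
# its category. The sketch below is inferred from usage in compute() above
# and is an assumption, not the actual class definition.
class _SketchCompareData(object):
    def __init__(self):
        self.has_ref = False
        self.has_test = False
        self.category = None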