def categorize(transcripts):
    """Assign a coding category to every exon node of the input transcripts.

    Transcripts carrying a 'cds' attribute define the CDS regions; each
    exon node of the remaining transcripts is classified as 'cds', 'utr',
    'cds_antisense', 'utr_antisense', or 'noncoding'.

    Yields (start, end, strand, transcript_category, category,
    coding_category) tuples, one per classified node.
    """
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    # collect CDS nodes keyed both with and without strand so that
    # unstranded transcripts can still match CDS regions
    cds_sense = set()
    cds_unstranded = set()
    test_transcripts = []
    for t in transcripts:
        if 'cds' in t.attrs:
            for n in split_exons(t, boundaries):
                cds_sense.add((t.strand, n[0], n[1]))
                cds_unstranded.add(n)
        else:
            test_transcripts.append(t)
    node_dict = {}
    noncoding_transcripts = []
    utr_unstranded = set()
    for t in test_transcripts:
        mcat = t.attrs['category']
        tcat = t.attrs['transcript_category']
        has_cds = False
        nodes = []
        for n in split_exons(t, boundaries):
            nstrand = (t.strand, n[0], n[1])
            nodes.append(nstrand)
            # unstranded transcripts match CDS on either strand
            if ((t.strand == NO_STRAND) and (n in cds_unstranded)):
                node_dict[nstrand] = (tcat, mcat, 'cds')
                has_cds = True
            elif nstrand in cds_sense:
                node_dict[nstrand] = (tcat, mcat, 'cds')
                has_cds = True
        if has_cds:
            # any node of a CDS-bearing transcript not already marked
            # as CDS is UTR
            for n in nodes:
                if n not in node_dict:
                    # BUGFIX: was node_dict[nstrand], which repeatedly
                    # overwrote the final node of the preceding loop
                    # instead of labeling this UTR node
                    node_dict[n] = (tcat, mcat, 'utr')
                    utr_unstranded.add((n[1], n[2]))
        else:
            noncoding_transcripts.append(t)

    # classify wholly non-coding transcripts, detecting antisense
    # overlap with CDS/UTR regions on the unstranded keys
    for t in noncoding_transcripts:
        mcat = t.attrs['category']
        tcat = t.attrs['transcript_category']
        for n in split_exons(t, boundaries):
            nstrand = (t.strand, n[0], n[1])
            if nstrand not in node_dict:
                # check antisense
                if n in cds_unstranded:
                    node_dict[nstrand] = (tcat, mcat, 'cds_antisense')
                elif n in utr_unstranded:
                    node_dict[nstrand] = (tcat, mcat, 'utr_antisense')
                else:
                    node_dict[nstrand] = (tcat, mcat, 'noncoding')
    for n, d in node_dict.iteritems():
        strand, start, end = n
        tcat, mcat, ccat = d
        yield start, end, strand, tcat, mcat, ccat
def categorize(transcripts):
    """Assign a coding category to every exon node of the input transcripts.

    NOTE(review): this is a duplicate definition of `categorize` that
    shadows the earlier one in this module -- consider removing one copy.

    Transcripts carrying a 'cds' attribute define the CDS regions; each
    exon node of the remaining transcripts is classified as 'cds', 'utr',
    'cds_antisense', 'utr_antisense', or 'noncoding'.

    Yields (start, end, strand, transcript_category, category,
    coding_category) tuples, one per classified node.
    """
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    # collect CDS nodes keyed both with and without strand so that
    # unstranded transcripts can still match CDS regions
    cds_sense = set()
    cds_unstranded = set()
    test_transcripts = []
    for t in transcripts:
        if 'cds' in t.attrs:
            for n in split_exons(t, boundaries):
                cds_sense.add((t.strand, n[0], n[1]))
                cds_unstranded.add(n)
        else:
            test_transcripts.append(t)
    node_dict = {}
    noncoding_transcripts = []
    utr_unstranded = set()
    for t in test_transcripts:
        mcat = t.attrs['category']
        tcat = t.attrs['transcript_category']
        has_cds = False
        nodes = []
        for n in split_exons(t, boundaries):
            nstrand = (t.strand, n[0], n[1])
            nodes.append(nstrand)
            # unstranded transcripts match CDS on either strand
            if ((t.strand == NO_STRAND) and (n in cds_unstranded)):
                node_dict[nstrand] = (tcat, mcat, 'cds')
                has_cds = True
            elif nstrand in cds_sense:
                node_dict[nstrand] = (tcat, mcat, 'cds')
                has_cds = True
        if has_cds:
            # any node of a CDS-bearing transcript not already marked
            # as CDS is UTR
            for n in nodes:
                if n not in node_dict:
                    # BUGFIX: was node_dict[nstrand], which repeatedly
                    # overwrote the final node of the preceding loop
                    # instead of labeling this UTR node
                    node_dict[n] = (tcat, mcat, 'utr')
                    utr_unstranded.add((n[1], n[2]))
        else:
            noncoding_transcripts.append(t)

    # classify wholly non-coding transcripts, detecting antisense
    # overlap with CDS/UTR regions on the unstranded keys
    for t in noncoding_transcripts:
        mcat = t.attrs['category']
        tcat = t.attrs['transcript_category']
        for n in split_exons(t, boundaries):
            nstrand = (t.strand, n[0], n[1])
            if nstrand not in node_dict:
                # check antisense
                if n in cds_unstranded:
                    node_dict[nstrand] = (tcat, mcat, 'cds_antisense')
                elif n in utr_unstranded:
                    node_dict[nstrand] = (tcat, mcat, 'utr_antisense')
                else:
                    node_dict[nstrand] = (tcat, mcat, 'noncoding')
    for n, d in node_dict.iteritems():
        strand, start, end = n
        tcat, mcat, ccat = d
        yield start, end, strand, tcat, mcat, ccat
def create_undirected_transcript_graph(transcripts, add_node_func, **kwargs):
    '''
    add all transcripts to a single undirected graph

    Each transcript is split into exon nodes at the shared exon
    boundaries of the locus; consecutive nodes of a transcript are
    connected by undirected edges.  ``add_node_func(G, node, t, **kwargs)``
    is invoked once per node occurrence to attach node data.

    Returns the populated networkx Graph.
    '''
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    # initialize transcript graph as undirected at first
    G = nx.Graph()
    # add transcripts
    for t in transcripts:
        # split exons that cross boundaries and to get the
        # nodes in the transcript path
        nodes = [Exon(start, end) for start, end in split_exons(t, boundaries)]
        if not nodes:
            # robustness: previously nodes[0] raised IndexError when
            # split_exons yielded nothing for a transcript
            continue
        # add nodes/edges to graph
        u = nodes[0]
        add_node_func(G, u, t, **kwargs)
        for v in nodes[1:]:
            add_node_func(G, v, t, **kwargs)
            G.add_edge(u, v)
            u = v
    return G
def annotate_locus(transcripts, 
                   gtf_sample_attr): 
    """Annotate the non-reference transcripts of a single locus in place.

    Each test transcript receives category, test-flag, best-reference-id,
    and coverage/intron-ratio attributes; afterwards per-strand mean
    score, percentile rank, and recurrence attributes are computed.
    All results are written to ``t.attrs``; nothing is returned.

    transcripts -- iterable of reference and test transcripts; reference
        status is read from the GTFAttr.REF attribute
    gtf_sample_attr -- attrs key naming the sample a transcript came
        from (used to count recurrence across samples)
    """
    # store reference introns
    # (strand,start,end) -> ids (set) 
    ref_intron_dict = collections.defaultdict(lambda: [])
    # exon node -> pair of reference-transcript lists indexed by strand.
    # NOTE(review): the pair covers only two strands, so this assumes
    # reference transcripts always have a resolved strand -- confirm
    ref_node_dict = collections.defaultdict(lambda: ([],[]))
    # exon node -> accumulated test-transcript score per strand
    node_score_dict = collections.defaultdict(lambda: [0.0, 0.0])
    all_introns = set()
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    # add transcript to intron and graph data structures
    inp_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n][t.strand].append(t)
            # add to introns
            for start,end in t.iterintrons():
                ref_intron_dict[(t.strand, start, end)].append(t)
                all_introns.add((t.strand,start,end))
        else:
            # only stranded test transcripts contribute scores used
            # later to resolve unstranded transcripts
            if t.strand != NO_STRAND:
                score = float(t.attrs[GTFAttr.SCORE])
                for n in split_exons(t, boundaries):
                    node_score_dict[n][t.strand] += score
            inp_transcripts.append(t)
            # add to introns
            for start,end in t.iterintrons():
                all_introns.add((t.strand,start,end))
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for strand,start,end in all_introns:
        intron_tree.insert_interval(Interval(start,end,strand=strand))
    del all_introns
    # categorize transcripts
    # one bucket per strand value (presumably pos/neg/unstranded; the
    # list is indexed by the resolved strand below -- verify constants)
    strand_transcript_lists = [[], [], []]
    for t in inp_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = set(t.iterintrons())
        # try to resolve strand
        strand = t.strand
        if strand == NO_STRAND:
            strand = resolve_strand(nodes, node_score_dict, ref_node_dict)
        # define opposite strand
        if strand == NO_STRAND:
            opp_strand = NO_STRAND
        else:
            # flips 0 <-> 1
            opp_strand = (strand + 1) % 2
        # get all reference transcripts that share introns
        intron_ref_dict = {}
        for start,end in introns:
            if (strand, start, end) in ref_intron_dict:
                refs = ref_intron_dict[(strand, start, end)]
                intron_ref_dict.update((ref.attrs[GTFAttr.TRANSCRIPT_ID],ref) 
                                       for ref in refs)
        intron_refs = []
        for ref in intron_ref_dict.itervalues():
            intron_refs.append((ref,list(split_exons(ref, boundaries))))
        # get all reference transcripts that share coverage
        same_strand_ref_dict = {}
        opp_strand_ref_dict = {}
        for n in nodes:
            if n in ref_node_dict:
                strand_refs = ref_node_dict[n]
                same_strand_ref_dict.update((ref.attrs[GTFAttr.TRANSCRIPT_ID],ref) 
                                            for ref in strand_refs[strand])
                opp_strand_ref_dict.update((ref.attrs[GTFAttr.TRANSCRIPT_ID],ref) 
                                           for ref in strand_refs[opp_strand])
        same_strand_refs = []
        for ref in same_strand_ref_dict.itervalues():
            same_strand_refs.append((ref,list(split_exons(ref, boundaries))))
        opp_strand_refs = []
        for ref in opp_strand_ref_dict.itervalues():
            opp_strand_refs.append((ref,list(split_exons(ref, boundaries))))
        # categorize
        cinf = categorize_transcript(t, nodes, introns, 
                                     intron_refs,
                                     same_strand_refs,
                                     opp_strand_refs,
                                     intron_tree,
                                     ignore_test=False)
        if cinf.is_test:
            # recategorize test transcripts
            cinf2 = categorize_transcript(t, nodes, introns, 
                                          intron_refs,
                                          same_strand_refs,
                                          opp_strand_refs,
                                          intron_tree,
                                          ignore_test=True)
            # keep original stats but adopt the recategorized category
            cinf = cinf._replace(category=cinf2.category)
        # add annotation attributes
        best_ref_id = (cinf.ref.attrs[GTFAttr.TRANSCRIPT_ID] 
                       if cinf.ref is not None else 'na')
        t.attrs[GTFAttr.CATEGORY] = cinf.category
        t.attrs[GTFAttr.TEST] = '1' if cinf.is_test else '0'
        t.attrs[GTFAttr.ANN_REF_ID] = best_ref_id
        t.attrs[GTFAttr.ANN_COV_RATIO] = cinf.ann_cov_ratio
        t.attrs[GTFAttr.ANN_INTRON_RATIO] = cinf.ann_intron_ratio
        # group transcripts by strand
        strand_transcript_lists[strand].append(t)
    # explictly delete large data structures
    del ref_intron_dict
    del ref_node_dict
    del node_score_dict
    del intron_tree
    del inp_transcripts
    # annotate score and recurrence for transcripts
    for strand_transcripts in strand_transcript_lists:
        # find the intron domains of the transcripts
        boundaries = find_exon_boundaries(strand_transcripts)
        # gather node score/recurrence data
        new_data_func = lambda: {'ids': set(), 
                                 'score': 0.0, 
                                 'pct': 0.0}
        node_data = collections.defaultdict(new_data_func)
        for t in strand_transcripts:
            sample_id = t.attrs[gtf_sample_attr]
            score = float(t.attrs[GTFAttr.SCORE])
            pctrank = float(t.attrs[GTFAttr.PCTRANK])
            # split exons that cross boundaries and to get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                nd = node_data[n]
                nd['ids'].add(sample_id)
                nd['score'] += score
                nd['pct'] += pctrank
        # calculate recurrence and score statistics
        for t in strand_transcripts:
            nodes = list(split_exons(t, boundaries))
            mean_score, mean_pctrank, mean_recur = \
                compute_recurrence_and_score(nodes, node_data)
            t.attrs[GTFAttr.MEAN_SCORE] = mean_score
            t.attrs[GTFAttr.MEAN_PCTRANK] = mean_pctrank
            t.attrs[GTFAttr.MEAN_RECURRENCE] = mean_recur
def annotate_locus(transcripts, gtf_sample_attr):
    """Annotate the non-reference transcripts of a single locus in place.

    NOTE(review): duplicate (reformatted) definition of `annotate_locus`
    that shadows the earlier copy in this module.

    Each test transcript receives category, test-flag, best-reference-id,
    and coverage/intron-ratio attributes; afterwards per-strand mean
    score, percentile rank, and recurrence attributes are computed.
    All results are written to ``t.attrs``; nothing is returned.

    transcripts -- iterable of reference and test transcripts; reference
        status is read from the GTFAttr.REF attribute
    gtf_sample_attr -- attrs key naming the sample a transcript came
        from (used to count recurrence across samples)
    """
    # store reference introns
    # (strand,start,end) -> ids (set)
    ref_intron_dict = collections.defaultdict(lambda: [])
    # exon node -> pair of reference-transcript lists indexed by strand.
    # NOTE(review): assumes reference transcripts are stranded -- confirm
    ref_node_dict = collections.defaultdict(lambda: ([], []))
    # exon node -> accumulated test-transcript score per strand
    node_score_dict = collections.defaultdict(lambda: [0.0, 0.0])
    all_introns = set()
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    # add transcript to intron and graph data structures
    inp_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n][t.strand].append(t)
            # add to introns
            for start, end in t.iterintrons():
                ref_intron_dict[(t.strand, start, end)].append(t)
                all_introns.add((t.strand, start, end))
        else:
            # only stranded test transcripts contribute scores used
            # later to resolve unstranded transcripts
            if t.strand != NO_STRAND:
                score = float(t.attrs[GTFAttr.SCORE])
                for n in split_exons(t, boundaries):
                    node_score_dict[n][t.strand] += score
            inp_transcripts.append(t)
            # add to introns
            for start, end in t.iterintrons():
                all_introns.add((t.strand, start, end))
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for strand, start, end in all_introns:
        intron_tree.insert_interval(Interval(start, end, strand=strand))
    del all_introns
    # categorize transcripts
    # one bucket per strand value (presumably pos/neg/unstranded; the
    # list is indexed by the resolved strand below -- verify constants)
    strand_transcript_lists = [[], [], []]
    for t in inp_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = set(t.iterintrons())
        # try to resolve strand
        strand = t.strand
        if strand == NO_STRAND:
            strand = resolve_strand(nodes, node_score_dict, ref_node_dict)
        # define opposite strand
        if strand == NO_STRAND:
            opp_strand = NO_STRAND
        else:
            # flips 0 <-> 1
            opp_strand = (strand + 1) % 2
        # get all reference transcripts that share introns
        intron_ref_dict = {}
        for start, end in introns:
            if (strand, start, end) in ref_intron_dict:
                refs = ref_intron_dict[(strand, start, end)]
                intron_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref) for ref in refs)
        intron_refs = []
        for ref in intron_ref_dict.itervalues():
            intron_refs.append((ref, list(split_exons(ref, boundaries))))
        # get all reference transcripts that share coverage
        same_strand_ref_dict = {}
        opp_strand_ref_dict = {}
        for n in nodes:
            if n in ref_node_dict:
                strand_refs = ref_node_dict[n]
                same_strand_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref)
                    for ref in strand_refs[strand])
                opp_strand_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref)
                    for ref in strand_refs[opp_strand])
        same_strand_refs = []
        for ref in same_strand_ref_dict.itervalues():
            same_strand_refs.append((ref, list(split_exons(ref, boundaries))))
        opp_strand_refs = []
        for ref in opp_strand_ref_dict.itervalues():
            opp_strand_refs.append((ref, list(split_exons(ref, boundaries))))
        # categorize
        cinf = categorize_transcript(t,
                                     nodes,
                                     introns,
                                     intron_refs,
                                     same_strand_refs,
                                     opp_strand_refs,
                                     intron_tree,
                                     ignore_test=False)
        if cinf.is_test:
            # recategorize test transcripts
            cinf2 = categorize_transcript(t,
                                          nodes,
                                          introns,
                                          intron_refs,
                                          same_strand_refs,
                                          opp_strand_refs,
                                          intron_tree,
                                          ignore_test=True)
            # keep original stats but adopt the recategorized category
            cinf = cinf._replace(category=cinf2.category)
        # add annotation attributes
        best_ref_id = (cinf.ref.attrs[GTFAttr.TRANSCRIPT_ID]
                       if cinf.ref is not None else 'na')
        t.attrs[GTFAttr.CATEGORY] = cinf.category
        t.attrs[GTFAttr.TEST] = '1' if cinf.is_test else '0'
        t.attrs[GTFAttr.ANN_REF_ID] = best_ref_id
        t.attrs[GTFAttr.ANN_COV_RATIO] = cinf.ann_cov_ratio
        t.attrs[GTFAttr.ANN_INTRON_RATIO] = cinf.ann_intron_ratio
        # group transcripts by strand
        strand_transcript_lists[strand].append(t)
    # explictly delete large data structures
    del ref_intron_dict
    del ref_node_dict
    del node_score_dict
    del intron_tree
    del inp_transcripts
    # annotate score and recurrence for transcripts
    for strand_transcripts in strand_transcript_lists:
        # find the intron domains of the transcripts
        boundaries = find_exon_boundaries(strand_transcripts)
        # gather node score/recurrence data
        new_data_func = lambda: {'ids': set(), 'score': 0.0, 'pct': 0.0}
        node_data = collections.defaultdict(new_data_func)
        for t in strand_transcripts:
            sample_id = t.attrs[gtf_sample_attr]
            score = float(t.attrs[GTFAttr.SCORE])
            pctrank = float(t.attrs[GTFAttr.PCTRANK])
            # split exons that cross boundaries and to get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                nd = node_data[n]
                nd['ids'].add(sample_id)
                nd['score'] += score
                nd['pct'] += pctrank
        # calculate recurrence and score statistics
        for t in strand_transcripts:
            nodes = list(split_exons(t, boundaries))
            mean_score, mean_pctrank, mean_recur = \
                compute_recurrence_and_score(nodes, node_data)
            t.attrs[GTFAttr.MEAN_SCORE] = mean_score
            t.attrs[GTFAttr.MEAN_PCTRANK] = mean_pctrank
            t.attrs[GTFAttr.MEAN_RECURRENCE] = mean_recur
def compare_locus(transcripts):
    """Compare each test transcript in a locus against the reference set.

    Yields (test_transcript, match_stats) pairs, where match_stats is a
    list of MatchStats objects -- one per reference transcript sharing
    exonic coverage, an intron, a splicing pattern, or intronic overlap
    with the test transcript.
    """
    # store reference introns
    # (strand,start,end) -> ids (set)
    ref_intron_dict = collections.defaultdict(lambda: [])
    # exon node -> list of reference transcripts containing it
    ref_node_dict = collections.defaultdict(lambda: [])
    # tuple of introns -> list of refs with exactly that pattern
    ref_splicing_patterns = collections.defaultdict(lambda: [])
    ref_dict = {}
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    test_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # add to dict
            ref_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
            ref_dict[ref_id] = t
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n].append(t)
            # add to introns
            splicing_pattern = []
            for start, end in t.iterintrons():
                intron = (t.strand, start, end)
                ref_intron_dict[intron].append(t)
                splicing_pattern.append(intron)
            # add to splicing patterns
            if len(splicing_pattern) > 0:
                ref_splicing_patterns[tuple(splicing_pattern)].append(t)
        else:
            test_transcripts.append(t)
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for intron, refs in ref_intron_dict.iteritems():
        strand, start, end = intron
        intron_tree.insert_interval(
            Interval(start, end, strand=strand, value=refs))
    # categorize transcripts
    for t in test_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = []
        for start, end in t.iterintrons():
            introns.append((t.strand, start, end))
        splicing_pattern = tuple(introns)
        # keep list of all matching ref transcripts
        matches = collections.defaultdict(lambda: Match())
        # dict of reference transcripts -> category -> list of nodes
        for n in nodes:
            if n in ref_node_dict:
                # look for reference transcripts that share this node
                for ref in ref_node_dict[n]:
                    if cmp_strand(t.strand, ref.strand):
                        c = Category.SAME_STRAND
                    else:
                        c = Category.OPP_STRAND
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
            # look for reference introns that overlap this node
            # (nodes are (start, end) tuples, hence the *n unpacking)
            for hit in intron_tree.find(*n):
                if cmp_strand(t.strand, hit.strand):
                    c = Category.INTRONIC_SAME_STRAND
                else:
                    c = Category.INTRONIC_OPP_STRAND
                for ref in hit.value:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
        # dict of introns -> list of reference transcripts
        for intron in introns:
            if intron in ref_intron_dict:
                for ref in ref_intron_dict[intron]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.introns.append(intron)
        # check splicing pattern matches
        if len(splicing_pattern) > 0:
            if splicing_pattern in ref_splicing_patterns:
                for ref in ref_splicing_patterns[splicing_pattern]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.splicing = True
        # go through the matches for this transcript and determine
        # the transcript category
        match_stats = []
        for ref_id, m in matches.iteritems():
            ref = ref_dict[ref_id]
            # calculate coverage
            same_strand_bp = sum(
                (n[1] - n[0]) for n in m.nodes[Category.SAME_STRAND])
            opp_strand_bp = sum(
                (n[1] - n[0]) for n in m.nodes[Category.OPP_STRAND])
            # count shared introns
            num_shared_introns = len(m.introns)
            # decide category for this test/ref transcript pair
            if m.splicing or (num_shared_introns > 0) or (same_strand_bp > 0):
                c = Category.SAME_STRAND
            elif (opp_strand_bp > 0):
                c = Category.OPP_STRAND
            else:
                # count nodes of different types
                num_same_strand = len(m.nodes[Category.SAME_STRAND])
                num_opp_strand = len(m.nodes[Category.OPP_STRAND])
                num_intronic_same_strand = len(
                    m.nodes[Category.INTRONIC_SAME_STRAND])
                num_intronic_opp_strand = len(
                    m.nodes[Category.INTRONIC_OPP_STRAND])
                # a match with no exonic overlap must be purely intronic
                assert num_same_strand == 0
                assert num_opp_strand == 0
                num_intronic = (num_intronic_same_strand +
                                num_intronic_opp_strand)
                assert num_intronic > 0
                if (num_intronic == len(nodes)):
                    # completely intronic
                    if num_intronic_same_strand > 0:
                        c = Category.INTRONIC_SAME_STRAND
                    else:
                        c = Category.INTRONIC_OPP_STRAND
                else:
                    # interleaving means some nodes intronic and other intergenic
                    if num_intronic_same_strand > 0:
                        c = Category.INTERLEAVING_SAME_STRAND
                    else:
                        c = Category.INTERLEAVING_OPP_STRAND
            # create a match object
            ms = MatchStats.from_transcript(t, ref)
            ms.shared_same_strand_bp = same_strand_bp
            ms.shared_opp_strand_bp = opp_strand_bp
            ms.shared_introns = num_shared_introns
            ms.shared_splicing = m.splicing
            ms.category = Category.to_str(c)
            ms.distance = 0
            match_stats.append(ms)
        yield (t, match_stats)
 def compute(self, transcripts):
     """Accumulate ref/test overlap statistics for one locus onto self.

     Tallies shared and unique introns, splicing patterns, and exonic
     coverage (in bp) between reference and test transcripts, updating
     the self.introns_*, self.patterns_*, and self.cov_* counters and
     their per-category breakdowns in place.  Nothing is returned.
     """
     intron_dict = collections.defaultdict(lambda: CompareData())
     node_dict = collections.defaultdict(lambda: CompareData())
     splicing_pattern_dict = collections.defaultdict(lambda: CompareData())
     # find the intron domains of the transcripts
     boundaries = find_exon_boundaries(transcripts)
     # defer unstranded transcripts until stranded data is collected
     unstranded_transcripts = []
     for t in transcripts:
         if t.strand == NO_STRAND:
             unstranded_transcripts.append(t)
             continue
         # separate ref and nonref transcripts
         is_ref = bool(int(t.attrs[GTFAttr.REF]))
         # split exons that cross boundaries and get the
         # nodes in the transcript path
         for n in split_exons(t, boundaries):
             n = (t.strand, n[0], n[1])
             if is_ref:
                 node_dict[n].has_ref = True
             else:
                 d = node_dict[n]
                 d.has_test = True
                 d.category = t.attrs['category']
         splicing_pattern = []
         for start, end in t.iterintrons():
             n = (t.strand, start, end)
             if is_ref:
                 intron_dict[n].has_ref = True
             else:
                 d = intron_dict[n]
                 d.has_test = True
                 d.category = t.attrs['category']
             splicing_pattern.append(n)
         splicing_pattern = tuple(splicing_pattern)
         if len(splicing_pattern) > 0:
             if is_ref:
                 splicing_pattern_dict[splicing_pattern].has_ref = True
             else:
                 d = splicing_pattern_dict[splicing_pattern]
                 d.has_test = True
                 d.category = t.attrs['category']
     # handle unstranded transcripts
     for t in unstranded_transcripts:
         # separate ref and nonref transcripts
         is_ref = bool(int(t.attrs[GTFAttr.REF]))
         for n in split_exons(t, boundaries):
             found_node = False
             # credit the node to every strand where it was already seen
             for strand in (POS_STRAND, NEG_STRAND):
                 sn = (strand, n[0], n[1])
                 if sn in node_dict:
                     if is_ref:
                         node_dict[sn].has_ref = True
                     else:
                         d = node_dict[sn]
                         d.has_test = True
                         d.category = t.attrs['category']
                     found_node = True
             if not found_node:
                 # node unseen on either strand; record it as unstranded
                 sn = (NO_STRAND, n[0], n[1])
                 if is_ref:
                     node_dict[sn].has_ref = True
                 else:
                     d = node_dict[sn]
                     d.has_test = True
                     d.category = t.attrs['category']
         introns = list(t.iterintrons())
         # unstranded transcripts are expected to be unspliced
         assert len(introns) == 0
     # compile statistics
     for d in intron_dict.itervalues():
         if d.has_ref and d.has_test:
             self.introns_both += 1
             self.introns_by_category[d.category] += 1
         elif d.has_ref:
             self.introns_ref_only += 1
         elif d.has_test:
             self.introns_test_only += 1
             self.introns_by_category[d.category] += 1
     for d in splicing_pattern_dict.itervalues():
         if d.has_ref and d.has_test:
             self.patterns_both += 1
             self.patterns_by_category[d.category] += 1
         elif d.has_ref:
             self.patterns_ref_only += 1
         elif d.has_test:
             self.patterns_test_only += 1
             self.patterns_by_category[d.category] += 1
     for n, d in node_dict.iteritems():
         strand, start, end = n
         length = end - start
         if d.has_ref and d.has_test:
             self.cov_both += length
             self.cov_by_category[d.category] += length
         elif d.has_ref:
             self.cov_ref_only += length
         elif d.has_test:
             self.cov_test_only += length
             self.cov_by_category[d.category] += length
def compare_locus(transcripts):
    """Compare non-reference (test) transcripts to reference transcripts
    within a single locus.

    For each test transcript, finds every reference transcript that shares
    exonic nodes, introns, or intronic overlap with it, assigns a Category
    to each test/ref pair, and yields ``(test_transcript, match_stats)``
    where ``match_stats`` is a list of MatchStats objects (one per
    matching reference transcript).
    """
    # store reference introns
    # (strand,start,end) -> list of ref transcripts containing that intron
    ref_intron_dict = collections.defaultdict(lambda: [])
    # exon-domain node -> list of ref transcripts containing that node
    ref_node_dict = collections.defaultdict(lambda: [])
    # tuple of (strand,start,end) introns -> ref transcripts with that pattern
    ref_splicing_patterns = collections.defaultdict(lambda: [])
    # transcript_id -> reference transcript
    ref_dict = {}
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    test_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # add to dict
            ref_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
            ref_dict[ref_id] = t
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n].append(t)
            # add to introns
            splicing_pattern = []
            for start,end in t.iterintrons():
                intron = (t.strand, start, end)
                ref_intron_dict[intron].append(t)
                splicing_pattern.append(intron)
            # add to splicing patterns
            if len(splicing_pattern) > 0:
                ref_splicing_patterns[tuple(splicing_pattern)].append(t)
        else:
            test_transcripts.append(t)
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for intron, refs in ref_intron_dict.iteritems():
        strand, start, end = intron
        intron_tree.insert_interval(Interval(start,end,strand=strand,value=refs))
    # categorize transcripts
    for t in test_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = []
        for start,end in t.iterintrons():
            introns.append((t.strand,start,end))
        splicing_pattern = tuple(introns)
        # keep list of all matching ref transcripts
        # ref transcript_id -> Match accumulator for this test transcript
        matches = collections.defaultdict(lambda: Match())
        # dict of reference transcripts -> category -> list of nodes
        for n in nodes:
            if n in ref_node_dict:
                # look for reference transcripts that share this node
                for ref in ref_node_dict[n]:
                    if cmp_strand(t.strand, ref.strand):
                        c = Category.SAME_STRAND
                    else:
                        c = Category.OPP_STRAND
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
            # look for reference introns that overlap this node
            for hit in intron_tree.find(*n):
                if cmp_strand(t.strand, hit.strand):
                    c = Category.INTRONIC_SAME_STRAND
                else:
                    c = Category.INTRONIC_OPP_STRAND
                for ref in hit.value:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
        # dict of introns -> list of reference transcripts
        for intron in introns:
            if intron in ref_intron_dict:
                for ref in ref_intron_dict[intron]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.introns.append(intron)
        # check splicing pattern matches (entire intron chain identical)
        if len(splicing_pattern) > 0:
            if splicing_pattern in ref_splicing_patterns:
                for ref in ref_splicing_patterns[splicing_pattern]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.splicing = True
        # go through the matches for this transcript and determine
        # the transcript category
        match_stats = []
        for ref_id, m in matches.iteritems():
            ref = ref_dict[ref_id]
            # calculate coverage (shared bp with this ref, per strand)
            same_strand_bp = sum((n[1] - n[0]) for n in m.nodes[Category.SAME_STRAND])
            opp_strand_bp = sum((n[1] - n[0]) for n in m.nodes[Category.OPP_STRAND])
            # count shared introns
            num_shared_introns = len(m.introns)
            # decide category for this test/ref transcript pair:
            # any same-strand exonic/intron evidence outranks opp-strand
            if m.splicing or (num_shared_introns > 0) or (same_strand_bp > 0):
                c = Category.SAME_STRAND
            elif (opp_strand_bp > 0):
                c = Category.OPP_STRAND
            else:
                # no exonic overlap at all -> classify by intronic overlap
                # count nodes of different types
                num_same_strand = len(m.nodes[Category.SAME_STRAND])
                num_opp_strand = len(m.nodes[Category.OPP_STRAND])
                num_intronic_same_strand = len(m.nodes[Category.INTRONIC_SAME_STRAND])
                num_intronic_opp_strand = len(m.nodes[Category.INTRONIC_OPP_STRAND])
                assert num_same_strand == 0
                assert num_opp_strand == 0
                num_intronic = (num_intronic_same_strand +
                                num_intronic_opp_strand)
                assert num_intronic > 0
                if (num_intronic == len(nodes)):
                    # completely intronic
                    if num_intronic_same_strand > 0:
                        c = Category.INTRONIC_SAME_STRAND
                    else:
                        c = Category.INTRONIC_OPP_STRAND
                else:
                    # interleaving means some nodes intronic and other intergenic
                    if num_intronic_same_strand > 0:
                        c = Category.INTERLEAVING_SAME_STRAND
                    else:
                        c = Category.INTERLEAVING_OPP_STRAND
            # create a match stats object summarizing this test/ref pair
            ms = MatchStats.from_transcript(t, ref)
            ms.shared_same_strand_bp = same_strand_bp
            ms.shared_opp_strand_bp = opp_strand_bp
            ms.shared_introns = num_shared_introns
            ms.shared_splicing = m.splicing
            ms.category = Category.to_str(c)
            ms.distance = 0
            match_stats.append(ms)
        yield (t, match_stats)
 def compute(self, transcripts):
     """Accumulate reference/test agreement statistics for one locus.

     Builds per-intron, per-exon-domain-node, and per-splicing-pattern
     CompareData records for the given transcripts, then folds them into
     the running counters on self (introns_*, patterns_*, cov_* and their
     *_by_category companions).  Unstranded transcripts are processed
     last so their exon nodes can be credited to matching stranded nodes
     when those exist.
     """
     intron_dict = collections.defaultdict(lambda: CompareData())
     node_dict = collections.defaultdict(lambda: CompareData())
     splicing_pattern_dict = collections.defaultdict(lambda: CompareData())

     def _mark(d, is_ref, t):
         # Record which side observed this feature.  Only test
         # transcripts contribute a category label; ref transcripts are
         # never asked for a 'category' attribute (they may lack one).
         if is_ref:
             d.has_ref = True
         else:
             d.has_test = True
             d.category = t.attrs['category']

     # find the intron domains of the transcripts
     boundaries = find_exon_boundaries(transcripts)
     unstranded_transcripts = []
     for t in transcripts:
         if t.strand == NO_STRAND:
             # defer until all stranded nodes are known
             unstranded_transcripts.append(t)
             continue
         # separate ref and nonref transcripts
         is_ref = bool(int(t.attrs[GTFAttr.REF]))
         # split exons that cross boundaries and mark each node
         # in the transcript path
         for n in split_exons(t, boundaries):
             _mark(node_dict[(t.strand, n[0], n[1])], is_ref, t)
         # mark introns and collect the transcript's splicing pattern
         splicing_pattern = []
         for start, end in t.iterintrons():
             n = (t.strand, start, end)
             _mark(intron_dict[n], is_ref, t)
             splicing_pattern.append(n)
         if len(splicing_pattern) > 0:
             _mark(splicing_pattern_dict[tuple(splicing_pattern)],
                   is_ref, t)
     # handle unstranded transcripts: credit each exon node to any
     # existing stranded node with the same coordinates, falling back
     # to a NO_STRAND node when neither strand has a match
     for t in unstranded_transcripts:
         # separate ref and nonref transcripts
         is_ref = bool(int(t.attrs[GTFAttr.REF]))
         for n in split_exons(t, boundaries):
             found_node = False
             for strand in (POS_STRAND, NEG_STRAND):
                 sn = (strand, n[0], n[1])
                 if sn in node_dict:
                     _mark(node_dict[sn], is_ref, t)
                     found_node = True
             if not found_node:
                 _mark(node_dict[(NO_STRAND, n[0], n[1])], is_ref, t)
         # unstranded transcripts are expected to be single-exon
         assert len(list(t.iterintrons())) == 0
     # compile statistics
     for d in intron_dict.itervalues():
         if d.has_ref and d.has_test:
             self.introns_both += 1
             self.introns_by_category[d.category] += 1
         elif d.has_ref:
             self.introns_ref_only += 1
         elif d.has_test:
             self.introns_test_only += 1
             self.introns_by_category[d.category] += 1
     for d in splicing_pattern_dict.itervalues():
         if d.has_ref and d.has_test:
             self.patterns_both += 1
             self.patterns_by_category[d.category] += 1
         elif d.has_ref:
             self.patterns_ref_only += 1
         elif d.has_test:
             self.patterns_test_only += 1
             self.patterns_by_category[d.category] += 1
     # node coverage is weighted by node length in bp
     for n, d in node_dict.iteritems():
         strand, start, end = n
         length = end - start
         if d.has_ref and d.has_test:
             self.cov_both += length
             self.cov_by_category[d.category] += length
         elif d.has_ref:
             self.cov_ref_only += length
         elif d.has_test:
             self.cov_test_only += length
             self.cov_by_category[d.category] += length