def read_reference_gtf(ref_gtf_file):
    gene_map = {}
    for f in GTFFeature.parse(open(ref_gtf_file)):
        # get gene by id
        gene_id = f.attrs["gene_id"]
        if gene_id not in gene_map:
            g = Gene()
            g.gene_id = gene_id
            g.chrom = f.seqid
            g.strand = f.strand
            g.gene_start = f.start
            g.gene_end = f.end
            gene_map[gene_id] = g
        else:
            g = gene_map[gene_id]
        # update gene
        g.gene_start = min(g.gene_start, f.start)
        g.gene_end = max(g.gene_end, f.end)
        if f.feature_type == "exon":
            g.exons.add((f.start, f.end))
        elif f.feature_type == "CDS":
            g.is_coding = True
        if "gene_name" in f.attrs:
            g.gene_names.add(f.attrs["gene_name"])
        g.annotation_sources.add(f.source)
    logging.info("Sorting genes")
    genes = sorted(gene_map.values(),
                   key=operator.attrgetter('chrom', 'gene_start'))
    del gene_map
    # cluster loci
    logging.debug("Building interval index")
    locus_cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    locus_trees = collections.defaultdict(lambda: IntervalTree())
    for i, g in enumerate(genes):
        locus_cluster_trees[g.chrom].insert(g.gene_start, g.gene_end, i)
    for chrom, cluster_tree in locus_cluster_trees.iteritems():
        for locus_start, locus_end, indexes in cluster_tree.getregions():
            # cluster gene exons and add to interval tree
            exon_tree = IntervalTree()
            for i in indexes:
                g = genes[i]
                exon_cluster_tree = ClusterTree(0, 1)
                for start, end in g.exons:
                    exon_cluster_tree.insert(start, end, 1)
                # update exons
                exon_clusters = []
                for start, end, _ in exon_cluster_tree.getregions():
                    exon_clusters.append((start, end))
                g.exons = exon_clusters
                del exon_cluster_tree
                for start, end in g.exons:
                    exon_tree.insert_interval(Interval(start, end, value=g))
            # add to locus interval tree
            locus_trees[chrom].insert_interval(
                Interval(locus_start, locus_end, value=exon_tree))
    logging.debug("Done indexing reference GTF file")
    return locus_trees
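The pattern above is worth isolating: a ClusterTree merges overlapping intervals into loci, and an IntervalTree indexes the result for overlap queries. A minimal sketch of that idiom using bx-python's cluster and intersection modules, with made-up coordinates:

from bx.intervals.cluster import ClusterTree
from bx.intervals.intersection import Interval, IntervalTree

# ClusterTree(0, 1) merges intervals separated by <= 0 bp and reports
# clusters containing >= 1 interval
cluster_tree = ClusterTree(0, 1)
for i, (start, end) in enumerate([(100, 200), (150, 300), (500, 600)]):
    cluster_tree.insert(start, end, i)

# getregions() yields (cluster_start, cluster_end, [inserted ids])
locus_tree = IntervalTree()
for locus_start, locus_end, ids in cluster_tree.getregions():
    locus_tree.insert_interval(Interval(locus_start, locus_end, value=ids))

# find(start, end) returns the overlapping Interval objects
for hit in locus_tree.find(120, 180):
    print("%d-%d contains ids %s" % (hit.start, hit.end, hit.value))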
Example #2
def build_interval_tree_from_bed(bed_file):
    trees = collections.defaultdict(lambda: IntervalTree())
    for f in BEDFeature.parse(open(bed_file)):
        tree = trees[f.chrom]
        for start, end in f.exons:
            tree.insert_interval(
                Interval(start, end, strand=f.strand, value=f.name))
    return trees
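A possible query against the trees this builds; the file name and coordinates below are placeholders, not part of the example:

trees = build_interval_tree_from_bed("features.bed")
# report the name, strand, and span of every exon overlapping a window
for hit in trees["chr1"].find(1000, 2000):
    print("%s %s %d-%d" % (hit.value, hit.strand, hit.start, hit.end))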
Example #3
def build_locus_trees(gtf_file):
    transcripts = []
    locus_cluster_trees = collections.defaultdict(lambda: ClusterTree(0,1))
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            is_ref = bool(int(t.attrs[GTFAttr.REF]))
            if not is_ref:
                continue
            i = len(transcripts)
            transcripts.append(t)
            locus_cluster_trees[t.chrom].insert(t.start, t.end, i)
    # build interval trees of loci
    locus_trees = collections.defaultdict(lambda: IntervalTree())
    for chrom, cluster_tree in locus_cluster_trees.iteritems():
        for locus_start, locus_end, indexes in cluster_tree.getregions():
            locus_transcripts = [transcripts[i] for i in indexes]
            locus_trees[chrom].insert_interval(
                Interval(locus_start, locus_end, value=locus_transcripts))
    return locus_trees
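And a matching lookup sketch for the locus trees, again with placeholder inputs; each hit.value holds the list of reference transcripts for that locus:

locus_trees = build_locus_trees("ref.gtf")
for hit in locus_trees["chr1"].find(1000, 2000):
    for ref in hit.value:
        print(ref.attrs[GTFAttr.TRANSCRIPT_ID])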
Example #4
def trim_graph(G, strand, min_trim_length, trim_utr_fraction,
               trim_intron_fraction):
    # get 'chains' of contiguous non-intron nodes with edge degree of
    # one or less
    node_chain_map, chains = get_chains(G, introns=False)
    # set up dictionaries of predecessors and successors
    successor_dict = {}
    for n, nbrdict in G.adjacency_iter():
        successor_dict[n] = nbrdict.keys()
    predecessor_dict = {}
    G.reverse(copy=False)
    for n, nbrdict in G.adjacency_iter():
        predecessor_dict[n] = nbrdict.keys()
    G.reverse(copy=False)
    # set up intron data structures
    introns = {}
    intron_tree = IntervalTree()
    reverse = (strand == NEG_STRAND)
    for u, nbrdict in G.adjacency_iter():
        for v in nbrdict:
            if reverse:
                left, right = v, u
            else:
                left, right = u, v
            # skip contiguous nodes
            if left.end == right.start:
                continue
            # calculate score of the chains
            u_chain_nodes = chains[node_chain_map[u]]
            u_score = max(G.node[n][NODE_SCORE] for n in u_chain_nodes)
            v_chain_nodes = chains[node_chain_map[v]]
            v_score = max(G.node[n][NODE_SCORE] for n in v_chain_nodes)
            # store scores in intron data structures
            introns[(left.end, right.start)] = (u_score, v_score)
            intron_tree.insert_interval(
                Interval(left.end, right.start, value=(u_score, v_score)))
    # trim chains
    all_trim_nodes = set()
    for parent, nodes in chains.iteritems():
        if strand == NEG_STRAND:
            nodes.reverse()
        in_degree = len(predecessor_dict[nodes[0]])
        out_degree = len(successor_dict[nodes[-1]])
        trim_nodes = set()
        if ((in_degree == 1) and (out_degree == 1)
                and (parent.start, parent.end) in introns):
            # intron retention - a chain of nodes precisely matches an
            # intron, so we can potentially remove the entire chain
            pred_score, succ_score = introns[(parent.start, parent.end)]
            cutoff_score = trim_intron_fraction * max(pred_score, succ_score)
            trim_nodes.update(trim_intron(G, nodes, cutoff_score))
        else:
            # determine whether this node chain is intronic. intronic node
            # chains are trimmed more strictly due to intronic pre-mRNA
            found_intron = False
            max_pred_score = 0.0
            max_succ_score = 0.0
            for hit in intron_tree.find(parent.start, parent.end):
                # ignore contained introns
                if (hit.start > parent.start) and (hit.end < parent.end):
                    continue
                # set intron flag and keep track of highest coverage
                # overlapping intron to make trimming conservative
                found_intron = True
                pred_score, succ_score = hit.value
                if pred_score > max_pred_score:
                    max_pred_score = pred_score
                if succ_score > max_succ_score:
                    max_succ_score = succ_score
            if (in_degree == 0) and (out_degree == 0):
                if found_intron:
                    cutoff_score = trim_intron_fraction * max(
                        max_pred_score, max_succ_score)
                    trim_nodes.update(trim_intron(G, nodes, cutoff_score))
                trim_nodes.update(
                    trim_bidirectional(G, nodes, min_trim_length,
                                       trim_utr_fraction))
            elif in_degree == 0:
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_succ_score
                    trim_nodes.update(trim_intronic_utr(
                        G, nodes, cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes[::-1], min_trim_length,
                             trim_utr_fraction))
            elif out_degree == 0:
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_pred_score
                    trim_nodes.update(
                        trim_intronic_utr(G, nodes[::-1], cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes, min_trim_length, trim_utr_fraction))
        all_trim_nodes.update(trim_nodes)
    if len(all_trim_nodes) > 0:
        logging.debug("\t\t(%s) trimmed %d/%d nodes from graph" %
                      (strand_int_to_str(strand), len(all_trim_nodes), len(G)))
    return all_trim_nodes
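The trimming helpers (trim_intron, trim_utr, trim_bidirectional, trim_intronic_utr) are defined elsewhere in the package, but the cutoff arithmetic is visible above: with trim_intron_fraction = 0.25 and flanking chain scores of 80 and 100, the cutoff is 0.25 * 100 = 25. A hypothetical simplification of what an intron trimmer might do with that cutoff, not the package's actual implementation:

def trim_low_score_nodes(G, nodes, cutoff_score):
    # hypothetical sketch: report chain nodes whose score falls at or
    # below the cutoff computed by the caller, e.g.
    # cutoff_score = trim_intron_fraction * max(pred_score, succ_score)
    return set(n for n in nodes if G.node[n][NODE_SCORE] <= cutoff_score)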
def annotate_locus(transcripts, gtf_sample_attr):
    # store reference introns
    # (strand,start,end) -> list of reference transcripts
    ref_intron_dict = collections.defaultdict(lambda: [])
    ref_node_dict = collections.defaultdict(lambda: ([], []))
    node_score_dict = collections.defaultdict(lambda: [0.0, 0.0])
    all_introns = set()
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    # add transcript to intron and graph data structures
    inp_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n][t.strand].append(t)
            # add to introns
            for start, end in t.iterintrons():
                ref_intron_dict[(t.strand, start, end)].append(t)
                all_introns.add((t.strand, start, end))
        else:
            if t.strand != NO_STRAND:
                score = float(t.attrs[GTFAttr.SCORE])
                for n in split_exons(t, boundaries):
                    node_score_dict[n][t.strand] += score
            inp_transcripts.append(t)
            # add to introns
            for start, end in t.iterintrons():
                all_introns.add((t.strand, start, end))
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for strand, start, end in all_introns:
        intron_tree.insert_interval(Interval(start, end, strand=strand))
    del all_introns
    # categorize transcripts
    strand_transcript_lists = [[], [], []]
    for t in inp_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = set(t.iterintrons())
        # try to resolve strand
        strand = t.strand
        if strand == NO_STRAND:
            strand = resolve_strand(nodes, node_score_dict, ref_node_dict)
        # define opposite strand
        if strand == NO_STRAND:
            opp_strand = NO_STRAND
        else:
            opp_strand = (strand + 1) % 2
        # get all reference transcripts that share introns
        intron_ref_dict = {}
        for start, end in introns:
            if (strand, start, end) in ref_intron_dict:
                refs = ref_intron_dict[(strand, start, end)]
                intron_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref) for ref in refs)
        intron_refs = []
        for ref in intron_ref_dict.itervalues():
            intron_refs.append((ref, list(split_exons(ref, boundaries))))
        # get all reference transcripts that share coverage
        same_strand_ref_dict = {}
        opp_strand_ref_dict = {}
        for n in nodes:
            if n in ref_node_dict:
                strand_refs = ref_node_dict[n]
                same_strand_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref)
                    for ref in strand_refs[strand])
                opp_strand_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref)
                    for ref in strand_refs[opp_strand])
        same_strand_refs = []
        for ref in same_strand_ref_dict.itervalues():
            same_strand_refs.append((ref, list(split_exons(ref, boundaries))))
        opp_strand_refs = []
        for ref in opp_strand_ref_dict.itervalues():
            opp_strand_refs.append((ref, list(split_exons(ref, boundaries))))
        # categorize
        cinf = categorize_transcript(t,
                                     nodes,
                                     introns,
                                     intron_refs,
                                     same_strand_refs,
                                     opp_strand_refs,
                                     intron_tree,
                                     ignore_test=False)
        if cinf.is_test:
            # recategorize test transcripts
            cinf2 = categorize_transcript(t,
                                          nodes,
                                          introns,
                                          intron_refs,
                                          same_strand_refs,
                                          opp_strand_refs,
                                          intron_tree,
                                          ignore_test=True)
            cinf = cinf._replace(category=cinf2.category)
        # add annotation attributes
        best_ref_id = (cinf.ref.attrs[GTFAttr.TRANSCRIPT_ID]
                       if cinf.ref is not None else 'na')
        t.attrs[GTFAttr.CATEGORY] = cinf.category
        t.attrs[GTFAttr.TEST] = '1' if cinf.is_test else '0'
        t.attrs[GTFAttr.ANN_REF_ID] = best_ref_id
        t.attrs[GTFAttr.ANN_COV_RATIO] = cinf.ann_cov_ratio
        t.attrs[GTFAttr.ANN_INTRON_RATIO] = cinf.ann_intron_ratio
        # group transcripts by strand
        strand_transcript_lists[strand].append(t)
    # explicitly delete large data structures
    del ref_intron_dict
    del ref_node_dict
    del node_score_dict
    del intron_tree
    del inp_transcripts
    # annotate score and recurrence for transcripts
    for strand_transcripts in strand_transcript_lists:
        # find the intron domains of the transcripts
        boundaries = find_exon_boundaries(strand_transcripts)
        # gather node score/recurrence data
        new_data_func = lambda: {'ids': set(), 'score': 0.0, 'pct': 0.0}
        node_data = collections.defaultdict(new_data_func)
        for t in strand_transcripts:
            sample_id = t.attrs[gtf_sample_attr]
            score = float(t.attrs[GTFAttr.SCORE])
            pctrank = float(t.attrs[GTFAttr.PCTRANK])
            # split exons that cross boundaries to get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                nd = node_data[n]
                nd['ids'].add(sample_id)
                nd['score'] += score
                nd['pct'] += pctrank
        # calculate recurrence and score statistics
        for t in strand_transcripts:
            nodes = list(split_exons(t, boundaries))
            mean_score, mean_pctrank, mean_recur = \
                compute_recurrence_and_score(nodes, node_data)
            t.attrs[GTFAttr.MEAN_SCORE] = mean_score
            t.attrs[GTFAttr.MEAN_PCTRANK] = mean_pctrank
            t.attrs[GTFAttr.MEAN_RECURRENCE] = mean_recur
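compute_recurrence_and_score is not shown in this excerpt. A plausible length-weighted version, purely illustrative and not the package's definition:

def compute_recurrence_and_score(nodes, node_data):
    # hypothetical sketch: length-weighted means over a transcript's
    # (start, end) nodes of summed score, summed percentile rank, and
    # recurrence (the number of distinct sample ids covering each node)
    total_length = sum((end - start) for start, end in nodes)
    score_sum = pct_sum = recur_sum = 0.0
    for n in nodes:
        length = n[1] - n[0]
        nd = node_data[n]
        score_sum += length * nd['score']
        pct_sum += length * nd['pct']
        recur_sum += length * len(nd['ids'])
    return (score_sum / total_length,
            pct_sum / total_length,
            recur_sum / total_length)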
def compare_locus(transcripts):
    # store reference introns
    # (strand,start,end) -> list of reference transcripts
    ref_intron_dict = collections.defaultdict(lambda: [])
    ref_node_dict = collections.defaultdict(lambda: [])
    ref_splicing_patterns = collections.defaultdict(lambda: [])
    ref_dict = {}
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    test_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # add to dict
            ref_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
            ref_dict[ref_id] = t
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n].append(t)
            # add to introns
            splicing_pattern = []
            for start, end in t.iterintrons():
                intron = (t.strand, start, end)
                ref_intron_dict[intron].append(t)
                splicing_pattern.append(intron)
            # add to splicing patterns
            if len(splicing_pattern) > 0:
                ref_splicing_patterns[tuple(splicing_pattern)].append(t)
        else:
            test_transcripts.append(t)
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for intron, refs in ref_intron_dict.iteritems():
        strand, start, end = intron
        intron_tree.insert_interval(
            Interval(start, end, strand=strand, value=refs))
    # categorize transcripts
    for t in test_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = []
        for start, end in t.iterintrons():
            introns.append((t.strand, start, end))
        splicing_pattern = tuple(introns)
        # keep list of all matching ref transcripts
        matches = collections.defaultdict(lambda: Match())
        # dict of reference transcripts -> category -> list of nodes
        for n in nodes:
            if n in ref_node_dict:
                # look for reference transcripts that share this node
                for ref in ref_node_dict[n]:
                    if cmp_strand(t.strand, ref.strand):
                        c = Category.SAME_STRAND
                    else:
                        c = Category.OPP_STRAND
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
            # look for reference introns that overlap this node
            for hit in intron_tree.find(*n):
                if cmp_strand(t.strand, hit.strand):
                    c = Category.INTRONIC_SAME_STRAND
                else:
                    c = Category.INTRONIC_OPP_STRAND
                for ref in hit.value:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
        # dict of introns -> list of reference transcripts
        for intron in introns:
            if intron in ref_intron_dict:
                for ref in ref_intron_dict[intron]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.introns.append(intron)
        # check splicing pattern matches
        if len(splicing_pattern) > 0:
            if splicing_pattern in ref_splicing_patterns:
                for ref in ref_splicing_patterns[splicing_pattern]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.splicing = True
        # go through the matches for this transcript and determine
        # the transcript category
        match_stats = []
        for ref_id, m in matches.iteritems():
            ref = ref_dict[ref_id]
            # calculate coverage
            same_strand_bp = sum(
                (n[1] - n[0]) for n in m.nodes[Category.SAME_STRAND])
            opp_strand_bp = sum(
                (n[1] - n[0]) for n in m.nodes[Category.OPP_STRAND])
            # count shared introns
            num_shared_introns = len(m.introns)
            # decide category for this test/ref transcript pair
            if m.splicing or (num_shared_introns > 0) or (same_strand_bp > 0):
                c = Category.SAME_STRAND
            elif (opp_strand_bp > 0):
                c = Category.OPP_STRAND
            else:
                # count nodes of different types
                num_same_strand = len(m.nodes[Category.SAME_STRAND])
                num_opp_strand = len(m.nodes[Category.OPP_STRAND])
                num_intronic_same_strand = len(
                    m.nodes[Category.INTRONIC_SAME_STRAND])
                num_intronic_opp_strand = len(
                    m.nodes[Category.INTRONIC_OPP_STRAND])
                assert num_same_strand == 0
                assert num_opp_strand == 0
                num_intronic = (num_intronic_same_strand +
                                num_intronic_opp_strand)
                assert num_intronic > 0
                if (num_intronic == len(nodes)):
                    # completely intronic
                    if num_intronic_same_strand > 0:
                        c = Category.INTRONIC_SAME_STRAND
                    else:
                        c = Category.INTRONIC_OPP_STRAND
                else:
                    # interleaving means some nodes are intronic and others intergenic
                    if num_intronic_same_strand > 0:
                        c = Category.INTERLEAVING_SAME_STRAND
                    else:
                        c = Category.INTERLEAVING_OPP_STRAND
            # create a match object
            ms = MatchStats.from_transcript(t, ref)
            ms.shared_same_strand_bp = same_strand_bp
            ms.shared_opp_strand_bp = opp_strand_bp
            ms.shared_introns = num_shared_introns
            ms.shared_splicing = m.splicing
            ms.category = Category.to_str(c)
            ms.distance = 0
            match_stats.append(ms)
        yield (t, match_stats)
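compare_locus is a generator of (transcript, match_stats) pairs. A minimal driver sketch, assuming parse_gtf groups transcripts by locus as in build_locus_trees above; the path "assembly.gtf" is a placeholder:

for locus_transcripts in parse_gtf(open("assembly.gtf")):
    for t, match_stats in compare_locus(locus_transcripts):
        for ms in match_stats:
            # one line per test/reference pair with its category call
            print("%s\t%s\t%d\t%d" %
                  (t.attrs[GTFAttr.TRANSCRIPT_ID], ms.category,
                   ms.shared_introns, ms.shared_same_strand_bp))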