Ejemplo n.º 1
0
def test_empty_graph_bug():
    '''Regression check: no optimal path graph for 'empty_graph_bug.gtf'.'''
    _, locus = read_single_locus('empty_graph_bug.gtf')
    # build the splice graph from the positive-strand transfrags only
    pos_transfrags = locus.get_transfrags(Strand.POS)
    splice_graph = SpliceGraph.create(pos_transfrags)
    # create_optimal() returns a (graph, k) pair; only the graph is checked
    path_graph, _ = PathGraphFactory(splice_graph).create_optimal()
    assert path_graph is None
Ejemplo n.º 2
0
def test_path2():
    '''Smoke test: run path finding on every splice graph of the locus.'''
    _, locus = read_single_locus('noc2l_locus.gtf')
    for splice_graph in locus.create_splice_graphs():
        factory = PathGraphFactory(splice_graph)
        # create_optimal() returns a (graph, k) pair; k is not needed here
        path_graph, _ = factory.create_optimal()
        # only checks that the call completes; the result is not inspected
        find_paths(path_graph)
Ejemplo n.º 3
0
def test_path1():
    '''Smoke test: build a k=2 path graph for 'path1.gtf' and find paths.'''
    _, locus = read_single_locus('path1.gtf')
    # positive-strand transfrags only
    splice_graph = SpliceGraph.create(locus.get_transfrags(Strand.POS))
    factory = PathGraphFactory(splice_graph)
    path_graph = factory.create(2)
    # only checks that the call completes; the result is not inspected
    find_paths(path_graph)
Ejemplo n.º 4
0
def test_path_graph_factory():
    '''Check path graph sizes for k=1 and k=2 and the optimal k on 'path1.gtf'.'''
    transcripts, _ = read_single_locus('path1.gtf')
    factory = PathGraphFactory(SpliceGraph.create(transcripts.values()))

    # graph size for explicitly requested values of k
    graph_k1 = factory.create(k=1)
    assert len(graph_k1) == 5
    graph_k2 = factory.create(k=2)
    assert len(graph_k2) == 6

    # create_optimal() returns a (graph, k) pair; only k is checked
    _, best_k = factory.create_optimal()
    assert best_k == 2
Ejemplo n.º 5
0
def test_topological_sort():
    '''Check topological sorting on a hand-built graph and on locus graphs.'''
    # hand-built graph: every path runs from SOURCE to SINK
    graph = Graph()
    inner_nodes = (
        (10, 20, 30, 40),
        (10, 30, 40),
        (10,),
        (20,),
    )
    for nodes in inner_nodes:
        graph.add_path((graph.SOURCE,) + nodes + (graph.SINK,))
    order = graph.topological_sort()
    assert graph.is_topological_sort(order)

    # k=1 path graphs built from each splice graph of the locus
    _, locus = read_single_locus('noc2l_locus.gtf')
    for splice_graph in locus.create_splice_graphs():
        path_graph = PathGraphFactory(splice_graph).create(k=1)
        assert path_graph.is_topological_sort(path_graph.topological_sort())
        assert path_graph.is_topological_sort(
            path_graph.topological_sort_dfs())
Ejemplo n.º 6
0
def test_ccle55_cuff_noc2l():
    '''Change-point test on a locus from 55 CCLE samples assembled with Cufflinks'''
    # locate the splice graph of interest in the GTF
    _, locus = read_single_locus('noc2l_locus.gtf')
    sgraph = next(
        (g for g in locus.create_splice_graphs()
         if g.chrom == 'chr1' and g.start == 934942 and
         g.end == 976702 and g.strand == Strand.NEG),
        None)
    assert sgraph is not None

    # this node must be flagged as a stop before change points are applied
    stop_node_id = sgraph.get_node_id(Exon(934942, 944589))
    assert sgraph.G.is_stop[stop_node_id]

    # detect change points and apply each one without trimming
    for change_point in sgraph.detect_change_points(pval=0.1, fc_cutoff=0.8):
        sgraph.apply_change_point(change_point, trim=False)
    assert {964528, 957434, 959316}.issubset(sgraph.start_sites)
    assert {944278}.issubset(sgraph.stop_sites)

    # rebuild the graph, then compare start/stop nodes as intervals
    sgraph.recreate()
    start_ids, stop_ids = sgraph.get_start_stop_nodes()
    start_intervals = {sgraph.get_node_interval(i) for i in start_ids}
    stop_intervals = {sgraph.get_node_interval(i) for i in stop_ids}
    expected_starts = (
        Exon(959214, 959316),
        Exon(959316, 964528),
        Exon(957273, 957434),
    )
    for exon in expected_starts:
        assert exon in start_intervals
    assert Exon(944278, 944321) in stop_intervals

    # the single best path must begin and end at the expected exons
    pgraph, _ = PathGraphFactory(sgraph).create_optimal()
    paths = find_paths(pgraph, max_paths=1)
    assert len(paths) == 1
    best_path, _ = paths[0]
    exons = reconstruct_path(best_path, pgraph, sgraph)
    assert exons[0] == Exon(944321, 944800)
    assert exons[-1] == Exon(959214, 959316)
Ejemplo n.º 7
0
def test_ccle55_cuff_noc2l():
    '''Change-point test on a locus from 55 CCLE samples assembled with Cufflinks'''
    # find the target splice graph among those created from the GTF
    _, locus = read_single_locus('noc2l_locus.gtf')
    matches = (
        candidate for candidate in locus.create_splice_graphs()
        if candidate.chrom == 'chr1' and candidate.start == 934942
        and candidate.end == 976702 and candidate.strand == Strand.NEG
    )
    sgraph = next(matches, None)
    assert sgraph is not None

    # the node for this exon is expected to be a stop node up front
    node_id = sgraph.get_node_id(Exon(934942, 944589))
    assert sgraph.G.is_stop[node_id]

    # apply every detected change point, leaving the graph untrimmed
    change_points = sgraph.detect_change_points(pval=0.1, fc_cutoff=0.8)
    for change_point in change_points:
        sgraph.apply_change_point(change_point, trim=False)
    expected_start_sites = {964528, 957434, 959316}
    expected_stop_sites = {944278}
    assert expected_start_sites.issubset(sgraph.start_sites)
    assert expected_stop_sites.issubset(sgraph.stop_sites)

    # rebuild the graph and translate start/stop node ids to intervals
    sgraph.recreate()
    start_ids, stop_ids = sgraph.get_start_stop_nodes()
    starts = {sgraph.get_node_interval(node) for node in start_ids}
    stops = {sgraph.get_node_interval(node) for node in stop_ids}
    assert Exon(959214, 959316) in starts
    assert Exon(959316, 964528) in starts
    assert Exon(957273, 957434) in starts
    assert Exon(944278, 944321) in stops

    # the top-ranked path must run between the expected exons
    factory = PathGraphFactory(sgraph)
    pgraph, _ = factory.create_optimal()
    found = find_paths(pgraph, max_paths=1)
    assert len(found) == 1
    kmer_path, _ = found[0]
    node_path = reconstruct_path(kmer_path, pgraph, sgraph)
    assert node_path[0] == Exon(944321, 944800)
    assert node_path[-1] == Exon(959214, 959316)
Ejemplo n.º 8
0
def assemble_isoforms(sgraph, config):
    """Assemble isoforms from a splice graph, grouped by gene cluster.

    Builds the optimal path graph for ``sgraph``, smooths it, enumerates
    paths through it, clusters those paths and wraps every clustered path
    in an ``Isoform``.

    Args:
        sgraph: splice graph; its ``chrom``, ``start``, ``end`` and
            ``strand`` attributes are read for log messages, and it is
            passed to ``PathGraphFactory`` and ``reconstruct_path``.
        config: settings object; the attributes read are
            ``path_graph_kmax``, ``path_graph_loss_threshold``,
            ``path_graph_stats_fh``, ``path_frac``, ``max_paths``,
            ``isoform_frac`` and ``max_isoforms``.

    Returns:
        A list with one entry per cluster, each entry being a list of
        ``Isoform`` objects. An empty list is returned when the factory
        yields no graph or an empty graph.
    """
    # build the path graph from the transfrag paths
    pgf = PathGraphFactory(sgraph)
    K, k = pgf.create_optimal(kmax=config.path_graph_kmax,
                              loss_threshold=config.path_graph_loss_threshold,
                              stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []
    # smooth kmer graph
    K.apply_smoothing()

    genome_id_str = (
        '%s:%d-%d[%s]' %
        (sgraph.chrom, sgraph.start, sgraph.end, Strand.to_gtf(sgraph.strand)))
    # arguments are handed to logging instead of pre-formatting with '%',
    # so the message is only rendered when DEBUG output is enabled
    logging.debug('%s finding isoforms in k=%d graph (%d kmers) '
                  'source_expr=%f',
                  genome_id_str, k, len(K), K.exprs[K.SOURCE_ID])
    paths = [
        (reconstruct_path(kmer_path, K, sgraph), expr)
        for kmer_path, expr in find_paths(K, config.path_frac,
                                          config.max_paths)
    ]
    logging.debug('%s isoforms: %d', genome_id_str, len(paths))
    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('%s gene clusters: %d filtered transfrags: %d',
                  genome_id_str, len(clusters), len(filtered))
    gene_isoforms = []
    for cluster in clusters:
        isoforms = [
            Isoform(path=path,
                    expr=expr,
                    rel_frac=rel_frac,
                    abs_frac=abs_frac)
            for path, expr, rel_frac, abs_frac in cluster.iterpaths()
        ]
        # apply max isoforms limit (per cluster); a value <= 0 disables it
        if config.max_isoforms > 0:
            isoforms = isoforms[:config.max_isoforms]
        gene_isoforms.append(isoforms)
    return gene_isoforms
Ejemplo n.º 9
0
def assemble_isoforms(sgraph, config):
    """Assemble isoforms from a splice graph, grouped by gene cluster.

    Builds the optimal path graph for ``sgraph``, smooths it, enumerates
    kmer paths through it, converts them back to splice graph node paths,
    clusters those paths and wraps every clustered path in an ``Isoform``.

    Args:
        sgraph: splice graph; passed to ``PathGraphFactory``, used to
            turn node paths into exons via ``reconstruct_exons`` and
            rendered with ``%s`` in log messages.
        config: settings object; the attributes read are
            ``path_graph_kmax``, ``path_graph_stats_fh``, ``path_frac``,
            ``max_paths``, ``isoform_frac`` and ``max_isoforms``.

    Returns:
        A list with one entry per cluster, each entry being a list of
        ``Isoform`` objects. An empty list is returned when the factory
        yields no graph or an empty graph.
    """
    # build the path graph from the transfrag paths
    pgf = PathGraphFactory(sgraph)
    K, k = pgf.create_optimal(kmax=config.path_graph_kmax,
                              stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []
    # smooth kmer graph
    K.apply_smoothing()
    # find isoforms
    # arguments are handed to logging instead of pre-formatting with '%',
    # so sgraph is only converted to a string when DEBUG output is enabled
    logging.debug('%s finding isoforms in k=%d graph (%d kmers) '
                  'source_expr=%f',
                  sgraph, k, len(K), K.exprs[K.SOURCE_ID])
    # convert each path of kmers back to a path of splice graph nodes
    paths = [
        (K.reconstruct(path_kmers), expr)
        for path_kmers, expr in find_paths(K, config.path_frac,
                                           config.max_paths)
    ]
    logging.debug('%s isoforms: %d', sgraph, len(paths))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('%s gene clusters: %d filtered transfrags: %d',
                  sgraph, len(clusters), len(filtered))
    gene_isoforms = []
    for cluster in clusters:
        # convert from splice graph node ids to exons for every path
        isoforms = [
            Isoform(exons=sgraph.reconstruct_exons(path),
                    expr=expr,
                    rel_frac=rel_frac,
                    abs_frac=abs_frac)
            for path, expr, rel_frac, abs_frac in cluster.iterpaths()
        ]
        # apply max isoforms limit (per cluster); a value <= 0 disables it
        if config.max_isoforms > 0:
            isoforms = isoforms[:config.max_isoforms]
        gene_isoforms.append(isoforms)
    return gene_isoforms
Ejemplo n.º 10
0
def assemble_isoforms(sgraph, config):
    """Assemble isoforms from a splice graph, grouped by gene cluster.

    Steps: create the optimal path graph for ``sgraph``, smooth it, find
    kmer paths, map them back to splice graph node paths, cluster the
    paths, and build one ``Isoform`` per clustered path.

    Args:
        sgraph: splice graph; passed to ``PathGraphFactory``, used to
            turn node paths into exons via ``reconstruct_exons`` and
            rendered with ``%s`` in log messages.
        config: settings object; the attributes read are
            ``path_graph_kmax``, ``path_graph_stats_fh``, ``path_frac``,
            ``max_paths``, ``isoform_frac`` and ``max_isoforms``.

    Returns:
        A list with one entry per cluster, each entry being a list of
        ``Isoform`` objects. An empty list is returned when the factory
        yields no graph or an empty graph.
    """
    # build the path graph from the transfrag paths
    pgf = PathGraphFactory(sgraph)
    K, k = pgf.create_optimal(kmax=config.path_graph_kmax,
                              stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []
    # smooth kmer graph
    K.apply_smoothing()
    # find isoforms
    # logging receives the arguments separately (no eager '%' formatting),
    # so the message is only rendered when DEBUG output is enabled
    logging.debug('%s finding isoforms in k=%d graph (%d kmers) '
                  'source_expr=%f',
                  sgraph, k, len(K), K.exprs[K.SOURCE_ID])
    kmer_paths = find_paths(K, config.path_frac, config.max_paths)
    # convert each path of kmers back to a path of splice graph nodes
    paths = [(K.reconstruct(path_kmers), expr)
             for path_kmers, expr in kmer_paths]
    logging.debug('%s isoforms: %d', sgraph, len(paths))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('%s gene clusters: %d filtered transfrags: %d',
                  sgraph, len(clusters), len(filtered))
    gene_isoforms = []
    for cluster in clusters:
        # convert from splice graph node ids to exons for every path
        isoforms = [Isoform(exons=sgraph.reconstruct_exons(path),
                            expr=expr,
                            rel_frac=rel_frac,
                            abs_frac=abs_frac)
                    for path, expr, rel_frac, abs_frac
                    in cluster.iterpaths()]
        # apply max isoforms limit (per cluster); a value <= 0 disables it
        if config.max_isoforms > 0:
            isoforms = isoforms[:config.max_isoforms]
        gene_isoforms.append(isoforms)
    return gene_isoforms
Ejemplo n.º 11
0
def assemble_isoforms(sgraph, config):
    """Assemble isoforms from a splice graph, grouped by gene cluster.

    Steps: create the optimal path graph for ``sgraph``, smooth it, find
    kmer paths, reconstruct each one with ``reconstruct_path``, cluster
    the results, and build one ``Isoform`` per clustered path.

    Args:
        sgraph: splice graph; its ``chrom``, ``start``, ``end`` and
            ``strand`` attributes are read for log messages, and it is
            passed to ``PathGraphFactory`` and ``reconstruct_path``.
        config: settings object; the attributes read are
            ``path_graph_kmax``, ``path_graph_loss_threshold``,
            ``path_graph_stats_fh``, ``path_frac``, ``max_paths``,
            ``isoform_frac`` and ``max_isoforms``.

    Returns:
        A list with one entry per cluster, each entry being a list of
        ``Isoform`` objects. An empty list is returned when the factory
        yields no graph or an empty graph.
    """
    # build the path graph from the transfrag paths
    pgf = PathGraphFactory(sgraph)
    K, k = pgf.create_optimal(kmax=config.path_graph_kmax,
                              loss_threshold=config.path_graph_loss_threshold,
                              stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []
    # smooth kmer graph
    K.apply_smoothing()

    genome_id_str = ('%s:%d-%d[%s]' %
                     (sgraph.chrom, sgraph.start, sgraph.end,
                      Strand.to_gtf(sgraph.strand)))
    # logging receives the arguments separately (no eager '%' formatting),
    # so the message is only rendered when DEBUG output is enabled
    logging.debug('%s finding isoforms in k=%d graph (%d kmers) '
                  'source_expr=%f',
                  genome_id_str, k, len(K), K.exprs[K.SOURCE_ID])
    kmer_paths = find_paths(K, config.path_frac, config.max_paths)
    paths = [(reconstruct_path(kmer_path, K, sgraph), expr)
             for kmer_path, expr in kmer_paths]
    logging.debug('%s isoforms: %d', genome_id_str, len(paths))
    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('%s gene clusters: %d filtered transfrags: %d',
                  genome_id_str, len(clusters), len(filtered))
    gene_isoforms = []
    for cluster in clusters:
        isoforms = [Isoform(path=path, expr=expr, rel_frac=rel_frac,
                            abs_frac=abs_frac)
                    for path, expr, rel_frac, abs_frac
                    in cluster.iterpaths()]
        # apply max isoforms limit (per cluster); a value <= 0 disables it
        if config.max_isoforms > 0:
            isoforms = isoforms[:config.max_isoforms]
        gene_isoforms.append(isoforms)
    return gene_isoforms