Exemple #1
0
def test_path2():
    '''Smoke-test find_paths on each splice graph read from 'noc2l_locus.gtf'.

    For every splice graph of the locus, the path graph returned by
    PathGraphFactory.create_optimal() is passed to find_paths(). Nothing is
    asserted about the result; the test passes as long as no call raises.
    '''
    _, locus = read_single_locus('noc2l_locus.gtf')
    for splice_graph in locus.create_splice_graphs():
        # create_optimal() returns a (path graph, k) pair; k is unused here
        path_graph, _ = PathGraphFactory(splice_graph).create_optimal()
        find_paths(path_graph)
Exemple #2
0
def test_path1():
    '''Smoke-test find_paths on the k=2 path graph built from 'path1.gtf'.

    The positive-strand transfrags of the locus are turned into a splice
    graph, a path graph is created from it with k=2, and find_paths() is run
    on that graph. No assertion is made about the returned paths.
    '''
    _, locus = read_single_locus('path1.gtf')
    splice_graph = SpliceGraph.create(locus.get_transfrags(Strand.POS))
    path_graph = PathGraphFactory(splice_graph).create(2)
    find_paths(path_graph)
Exemple #3
0
def test_path2():
    '''Check find_paths against cpathfinder.find_paths on 'noc2l_locus.gtf'.

    For each splice graph of the locus, both functions are run on the same
    path graph with the 'expr' argument. They must return the same number of
    (path, expression) pairs, with equal paths and expression values that
    differ by less than 1e-5, compared position by position.
    '''
    _, locus = read_single_locus('noc2l_locus.gtf')
    for splice_graph in locus.create_splice_graphs():
        # second element of the returned pair (k) is not needed
        path_graph, _ = create_optimal_path_graph(splice_graph)
        results_a = find_paths(path_graph, 'expr')
        results_b = cpathfinder.find_paths(path_graph, 'expr')
        assert len(results_a) == len(results_b)
        for (path_a, expr_a), (path_b, expr_b) in zip(results_a, results_b):
            assert path_a == path_b
            assert abs(expr_a - expr_b) < 1e-5
def test_path2():
    '''Verify that the two find_paths implementations give matching output.

    Runs find_paths() and cpathfinder.find_paths() with 'expr' on the path
    graph of every splice graph in 'noc2l_locus.gtf' and requires equal
    result counts, identical paths, and expression values within 1e-5.
    '''
    _, locus = read_single_locus('noc2l_locus.gtf')
    for graph in locus.create_splice_graphs():
        kmer_graph, _ = create_optimal_path_graph(graph)
        first = find_paths(kmer_graph, 'expr')
        second = cpathfinder.find_paths(kmer_graph, 'expr')
        assert len(first) == len(second)
        for item_first, item_second in zip(first, second):
            # each item is a (path, expression) pair
            path_first, expr_first = item_first
            path_second, expr_second = item_second
            assert path_first == path_second
            assert abs(expr_first - expr_second) < 1e-5
Exemple #5
0
def test_ccle55_cuff_noc2l():
    '''Change-point regression test on the locus in 'noc2l_locus.gtf'.

    The original author describes the locus as coming from 55 CCLE samples
    assembled with Cufflinks. The test locates the chr1:934942-976702
    negative-strand splice graph, applies all detected change points,
    rebuilds the graph, and checks the expected start/stop sites, start/stop
    nodes, and the end points of the single best path.
    '''
    # pull the splice graph of interest out of the GTF
    _, locus = read_single_locus('noc2l_locus.gtf')
    sgraph = next(
        (g for g in locus.create_splice_graphs()
         if (g.chrom == 'chr1' and g.start == 934942 and
             g.end == 976702 and g.strand == Strand.NEG)),
        None)
    assert sgraph is not None

    # this node must be flagged as a stop before change points are applied
    stop_id = sgraph.get_node_id(Exon(934942, 944589))
    assert sgraph.G.is_stop[stop_id]

    # detect change points and apply each of them without trimming
    for change_point in sgraph.detect_change_points(pval=0.1, fc_cutoff=0.8):
        sgraph.apply_change_point(change_point, trim=False)
    expected_starts = {964528, 957434, 959316}
    expected_stops = {944278}
    assert expected_starts.issubset(sgraph.start_sites)
    assert expected_stops.issubset(sgraph.stop_sites)

    # rebuild graph and examine start/stop nodes
    sgraph.recreate()
    start_ids, stop_ids = sgraph.get_start_stop_nodes()
    # convert node ids to node intervals
    start_intervals = {sgraph.get_node_interval(i) for i in start_ids}
    stop_intervals = {sgraph.get_node_interval(i) for i in stop_ids}
    for exon in (Exon(959214, 959316),
                 Exon(959316, 964528),
                 Exon(957273, 957434)):
        assert exon in start_intervals
    assert Exon(944278, 944321) in stop_intervals

    # ensure the best path begins and ends at change-point boundaries
    pgraph, _ = PathGraphFactory(sgraph).create_optimal()
    best = find_paths(pgraph, max_paths=1)
    assert len(best) == 1
    kmer_path, _ = best[0]
    exon_path = reconstruct_path(kmer_path, pgraph, sgraph)
    assert exon_path[0] == Exon(944321, 944800)
    assert exon_path[-1] == Exon(959214, 959316)
Exemple #6
0
def test_ccle55_cuff_noc2l():
    '''Check change-point handling on the splice graph from 'noc2l_locus.gtf'.

    Per the original note, the locus was built from 55 CCLE samples assembled
    with Cufflinks. After applying the detected change points to the
    chr1:934942-976702 negative-strand graph, the test verifies start/stop
    sites, the start/stop nodes of the rebuilt graph, and the first and last
    exon of the top path.
    '''
    def _is_target(graph):
        # the one splice graph this test examines
        return (graph.chrom == 'chr1' and graph.start == 934942
                and graph.end == 976702 and graph.strand == Strand.NEG)

    # pull SpliceGraph out of GTF
    _, locus = read_single_locus('noc2l_locus.gtf')
    located = False
    for sgraph in locus.create_splice_graphs():
        if _is_target(sgraph):
            located = True
            break
    assert located

    # examine specific change points
    node_id = sgraph.get_node_id(Exon(934942, 944589))
    assert sgraph.G.is_stop[node_id]
    change_points = sgraph.detect_change_points(pval=0.1, fc_cutoff=0.8)
    for change_point in change_points:
        sgraph.apply_change_point(change_point, trim=False)
    assert {964528, 957434, 959316}.issubset(sgraph.start_sites)
    assert {944278}.issubset(sgraph.stop_sites)

    # rebuild graph, then map start/stop node ids to their intervals
    sgraph.recreate()
    start_ids, stop_ids = sgraph.get_start_stop_nodes()
    starts = {sgraph.get_node_interval(node) for node in start_ids}
    stops = {sgraph.get_node_interval(node) for node in stop_ids}
    assert Exon(959214, 959316) in starts
    assert Exon(959316, 964528) in starts
    assert Exon(957273, 957434) in starts
    assert Exon(944278, 944321) in stops

    # ensure best path uses change points
    factory = PathGraphFactory(sgraph)
    pgraph, _ = factory.create_optimal()
    top_paths = find_paths(pgraph, max_paths=1)
    assert len(top_paths) == 1
    top_path = reconstruct_path(top_paths[0][0], pgraph, sgraph)
    assert top_path[0] == Exon(944321, 944800)
    assert top_path[-1] == Exon(959214, 959316)
Exemple #7
0
def test_path1():
    '''Check find_paths against cpathfinder.find_paths on 'path1.gtf' (k=2).

    Builds a splice graph from the positive-strand transfrags, creates the
    path graph with k=2, and requires both functions (called with 'expr') to
    return the same number of (path, expression) pairs with equal paths and
    expression values that differ by less than 1e-8.
    '''
    _, locus = read_single_locus('path1.gtf')
    splice_graph = SpliceGraph.create(locus.get_transfrags(Strand.POS))
    path_graph = create_path_graph(splice_graph, 2)
    results_a = find_paths(path_graph, 'expr')
    results_b = cpathfinder.find_paths(path_graph, 'expr')
    assert len(results_a) == len(results_b)
    for (path_a, expr_a), (path_b, expr_b) in zip(results_a, results_b):
        assert path_a == path_b
        assert abs(expr_a - expr_b) < 1e-8
def test_path1():
    '''Verify both find_paths implementations agree on the 'path1.gtf' graph.

    The k=2 path graph of the positive-strand splice graph is searched with
    find_paths() and cpathfinder.find_paths(); results must match in count,
    in path, and in expression value to within 1e-8.
    '''
    _, locus = read_single_locus('path1.gtf')
    pos_transfrags = locus.get_transfrags(Strand.POS)
    graph = SpliceGraph.create(pos_transfrags)
    kmer_graph = create_path_graph(graph, 2)
    first = find_paths(kmer_graph, 'expr')
    second = cpathfinder.find_paths(kmer_graph, 'expr')
    assert len(first) == len(second)
    for item_first, item_second in zip(first, second):
        # each item is a (path, expression) pair
        path_first, expr_first = item_first
        path_second, expr_second = item_second
        assert path_first == path_second
        assert abs(expr_first - expr_second) < 1e-8
Exemple #9
0
def assemble_isoforms(sgraph, config):
    '''Assemble isoforms for one splice graph, grouped by gene cluster.

    Args:
        sgraph: splice graph to assemble; its chrom, start, end and strand
            attributes are used for log messages.
        config: settings object providing path_graph_kmax,
            path_graph_loss_threshold, path_graph_stats_fh, path_frac,
            max_paths, isoform_frac and max_isoforms.

    Returns:
        A list with one entry per cluster from Cluster.build(); each entry is
        a list of Isoform objects, cut to config.max_isoforms items when that
        value is positive. An empty list is returned when create_optimal()
        yields None or an empty path graph.
    '''
    # build the path graph from the splice graph
    factory = PathGraphFactory(sgraph)
    K, k = factory.create_optimal(
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []
    # smooth kmer graph
    K.apply_smoothing()

    # genomic label used as prefix of every log message below
    locus_label = '%s:%d-%d[%s]' % (
        sgraph.chrom, sgraph.start, sgraph.end, Strand.to_gtf(sgraph.strand))
    start_msg = ('%s finding isoforms in k=%d graph (%d kmers) '
                 'source_expr=%f' %
                 (locus_label, k, len(K), K.exprs[K.SOURCE_ID]))
    logging.debug(start_msg)

    # find kmer paths and convert each one with reconstruct_path
    paths = [
        (reconstruct_path(kmer_path, K, sgraph), expr)
        for kmer_path, expr in find_paths(K, config.path_frac,
                                          config.max_paths)
    ]
    logging.debug('%s isoforms: %d' % (locus_label, len(paths)))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    cluster_msg = ('%s gene clusters: %d filtered transfrags: %d' %
                   (locus_label, len(clusters), len(filtered)))
    logging.debug(cluster_msg)

    gene_isoforms = []
    for cluster in clusters:
        cluster_isoforms = [
            Isoform(path=path, expr=expr, rel_frac=rel_frac,
                    abs_frac=abs_frac)
            for path, expr, rel_frac, abs_frac in cluster.iterpaths()
        ]
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            cluster_isoforms = cluster_isoforms[:config.max_isoforms]
        gene_isoforms.append(cluster_isoforms)
    return gene_isoforms
Exemple #10
0
def assemble_isoforms(sgraph, config):
    '''Assemble isoforms for one splice graph, grouped by gene cluster.

    Args:
        sgraph: splice graph to assemble; also formatted with %s in the log
            messages and used to convert clustered paths to exons.
        config: settings object providing path_graph_kmax,
            path_graph_stats_fh, path_frac, max_paths, isoform_frac and
            max_isoforms.

    Returns:
        A list with one entry per cluster from Cluster.build(); each entry is
        a list of Isoform objects, cut to config.max_isoforms items when that
        value is positive. An empty list is returned when create_optimal()
        yields None or an empty path graph.
    '''
    # build the path graph from the splice graph
    factory = PathGraphFactory(sgraph)
    K, k = factory.create_optimal(
        kmax=config.path_graph_kmax,
        stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []
    # smooth kmer graph
    K.apply_smoothing()

    # find isoforms
    start_msg = ('%s finding isoforms in k=%d graph (%d kmers) '
                 'source_expr=%f' % (sgraph, k, len(K), K.exprs[K.SOURCE_ID]))
    logging.debug(start_msg)
    # K.reconstruct converts each kmer path before it is stored
    paths = [
        (K.reconstruct(path_kmers), expr)
        for path_kmers, expr in find_paths(K, config.path_frac,
                                           config.max_paths)
    ]
    logging.debug('%s isoforms: %d' % (sgraph, len(paths)))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    cluster_msg = ('%s gene clusters: %d filtered transfrags: %d' %
                   (sgraph, len(clusters), len(filtered)))
    logging.debug(cluster_msg)

    gene_isoforms = []
    for cluster in clusters:
        # sgraph.reconstruct_exons turns each clustered path into exons
        cluster_isoforms = [
            Isoform(exons=sgraph.reconstruct_exons(path),
                    expr=expr,
                    rel_frac=rel_frac,
                    abs_frac=abs_frac)
            for path, expr, rel_frac, abs_frac in cluster.iterpaths()
        ]
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            cluster_isoforms = cluster_isoforms[:config.max_isoforms]
        gene_isoforms.append(cluster_isoforms)
    return gene_isoforms
Exemple #11
0
def assemble_isoforms(sgraph, config):
    '''Find isoform paths in a splice graph and group them into clusters.

    Args:
        sgraph: splice graph to process; it is formatted with %s in debug
            messages and its reconstruct_exons() converts paths to exons.
        config: object exposing path_graph_kmax, path_graph_stats_fh,
            path_frac, max_paths, isoform_frac and max_isoforms.

    Returns:
        List of per-cluster lists of Isoform objects. Each inner list is
        limited to config.max_isoforms entries when that setting is greater
        than zero. Returns [] if the path graph is None or has length 0.
    '''
    path_graph_factory = PathGraphFactory(sgraph)
    K, k = path_graph_factory.create_optimal(
        kmax=config.path_graph_kmax, stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []
    # smooth kmer graph
    K.apply_smoothing()

    # find isoforms
    logging.debug(
        '%s finding isoforms in k=%d graph (%d kmers) '
        'source_expr=%f' % (sgraph, k, len(K), K.exprs[K.SOURCE_ID]))
    found = find_paths(K, config.path_frac, config.max_paths)
    # each kmer path is passed through K.reconstruct before clustering
    paths = [(K.reconstruct(kmers), expr) for kmers, expr in found]
    logging.debug('%s isoforms: %d' % (sgraph, len(paths)))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug(
        '%s gene clusters: %d filtered transfrags: %d' %
        (sgraph, len(clusters), len(filtered)))

    gene_isoforms = []
    for cluster in clusters:
        members = []
        for node_path, expr, rel_frac, abs_frac in cluster.iterpaths():
            members.append(Isoform(
                exons=sgraph.reconstruct_exons(node_path),
                expr=expr,
                rel_frac=rel_frac,
                abs_frac=abs_frac))
        # apply max isoforms limit (per cluster)
        gene_isoforms.append(
            members[:config.max_isoforms] if config.max_isoforms > 0
            else members)
    return gene_isoforms
Exemple #12
0
def assemble_isoforms(sgraph, config):
    '''Find isoform paths in a splice graph and group them into clusters.

    Args:
        sgraph: splice graph to process; chrom, start, end and strand are
            read to label debug messages.
        config: object exposing path_graph_kmax, path_graph_loss_threshold,
            path_graph_stats_fh, path_frac, max_paths, isoform_frac and
            max_isoforms.

    Returns:
        List of per-cluster lists of Isoform objects. Each inner list is
        limited to config.max_isoforms entries when that setting is greater
        than zero. Returns [] if the path graph is None or has length 0.
    '''
    path_graph_factory = PathGraphFactory(sgraph)
    K, k = path_graph_factory.create_optimal(
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []
    # smooth kmer graph
    K.apply_smoothing()

    # label identifying the graph's genomic interval in log output
    strand_str = Strand.to_gtf(sgraph.strand)
    genome_label = ('%s:%d-%d[%s]' %
                    (sgraph.chrom, sgraph.start, sgraph.end, strand_str))
    logging.debug(
        '%s finding isoforms in k=%d graph (%d kmers) '
        'source_expr=%f' %
        (genome_label, k, len(K), K.exprs[K.SOURCE_ID]))

    found = find_paths(K, config.path_frac, config.max_paths)
    # each kmer path is converted with reconstruct_path before clustering
    paths = [(reconstruct_path(kmers, K, sgraph), expr)
             for kmers, expr in found]
    logging.debug('%s isoforms: %d' % (genome_label, len(paths)))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug(
        '%s gene clusters: %d filtered transfrags: %d' %
        (genome_label, len(clusters), len(filtered)))

    gene_isoforms = []
    for cluster in clusters:
        members = []
        for node_path, expr, rel_frac, abs_frac in cluster.iterpaths():
            members.append(Isoform(path=node_path,
                                   expr=expr,
                                   rel_frac=rel_frac,
                                   abs_frac=abs_frac))
        # apply max isoforms limit (per cluster)
        gene_isoforms.append(
            members[:config.max_isoforms] if config.max_isoforms > 0
            else members)
    return gene_isoforms