def test_empty_graph_bug():
    '''
    The optimal path graph for the positive-strand transfrags of
    'empty_graph_bug.gtf' must come back as None.
    '''
    _, locus = read_single_locus('empty_graph_bug.gtf')
    splice_graph = SpliceGraph.create(locus.get_transfrags(Strand.POS))
    path_graph, _ = PathGraphFactory(splice_graph).create_optimal()
    assert path_graph is None
def test_path2():
    '''
    Smoke test: build the optimal path graph for every splice graph of
    'noc2l_locus.gtf' and call find_paths() on it. No result is checked.
    '''
    _, locus = read_single_locus('noc2l_locus.gtf')
    for splice_graph in locus.create_splice_graphs():
        factory = PathGraphFactory(splice_graph)
        path_graph, _ = factory.create_optimal()
        # only the call itself is exercised; the result is unused
        find_paths(path_graph)
def test_path1():
    '''
    Smoke test: build the k=2 path graph for the positive-strand transfrags
    of 'path1.gtf' and call find_paths() on it. No result is checked.
    '''
    _, locus = read_single_locus('path1.gtf')
    pos_transfrags = locus.get_transfrags(Strand.POS)
    factory = PathGraphFactory(SpliceGraph.create(pos_transfrags))
    path_graph = factory.create(2)
    # only the call itself is exercised; the result is unused
    find_paths(path_graph)
def test_path_graph_factory():
    '''
    Check the sizes of the k=1 and k=2 path graphs built from 'path1.gtf'
    and that create_optimal() reports k == 2.
    '''
    t_dict, _ = read_single_locus('path1.gtf')
    factory = PathGraphFactory(SpliceGraph.create(t_dict.values()))

    # (k, expected len() of the path graph created with that k)
    expected_sizes = ((1, 5), (2, 6))
    for kmer_size, expected_len in expected_sizes:
        assert len(factory.create(k=kmer_size)) == expected_len

    _, best_k = factory.create_optimal()
    assert best_k == 2
def test_topological_sort():
    '''
    topological_sort() must produce an ordering accepted by
    is_topological_sort(), both on a small hand-built Graph and on the k=1
    path graphs of 'noc2l_locus.gtf' (where topological_sort_dfs() is
    checked as well).
    '''
    # hand-built graph
    graph = Graph()
    node_paths = [
        (graph.SOURCE, 10, 20, 30, 40, graph.SINK),
        (graph.SOURCE, 10, 30, 40, graph.SINK),
        (graph.SOURCE, 10, graph.SINK),
        (graph.SOURCE, 20, graph.SINK),
    ]
    for node_path in node_paths:
        graph.add_path(node_path)
    ordering = graph.topological_sort()
    assert graph.is_topological_sort(ordering)

    # path graphs derived from a real locus
    _, locus = read_single_locus('noc2l_locus.gtf')
    for splice_graph in locus.create_splice_graphs():
        path_graph = PathGraphFactory(splice_graph).create(k=1)
        for sort_method in (path_graph.topological_sort,
                            path_graph.topological_sort_dfs):
            assert path_graph.is_topological_sort(sort_method())
def test_ccle55_cuff_noc2l():
    '''Locus built from 55 CCLE samples assembled with Cufflinks'''
    # pick the splice graph of interest out of the GTF
    _, locus = read_single_locus('noc2l_locus.gtf')
    candidates = (g for g in locus.create_splice_graphs()
                  if (g.chrom == 'chr1' and
                      g.start == 934942 and
                      g.end == 976702 and
                      g.strand == Strand.NEG))
    sgraph = next(candidates, None)
    assert sgraph is not None

    # the node for this exon must be flagged as a stop before any
    # change point is applied
    stop_node_id = sgraph.get_node_id(Exon(934942, 944589))
    assert sgraph.G.is_stop[stop_node_id]

    # detect and apply change points (no trimming)
    change_points = sgraph.detect_change_points(pval=0.1, fc_cutoff=0.8)
    for change_point in change_points:
        sgraph.apply_change_point(change_point, trim=False)
    expected_starts = {964528, 957434, 959316}
    expected_stops = {944278}
    assert expected_starts.issubset(sgraph.start_sites)
    assert expected_stops.issubset(sgraph.stop_sites)

    # rebuild the graph and look at its start/stop nodes as intervals
    sgraph.recreate()
    start_ids, stop_ids = sgraph.get_start_stop_nodes()
    start_intervals = {sgraph.get_node_interval(n_id) for n_id in start_ids}
    stop_intervals = {sgraph.get_node_interval(n_id) for n_id in stop_ids}
    for exon in (Exon(959214, 959316),
                 Exon(959316, 964528),
                 Exon(957273, 957434)):
        assert exon in start_intervals
    assert Exon(944278, 944321) in stop_intervals

    # the single best path must begin and end at the change points
    pgraph, _ = PathGraphFactory(sgraph).create_optimal()
    best_paths = find_paths(pgraph, max_paths=1)
    assert len(best_paths) == 1
    kmer_path, _ = best_paths[0]
    exon_path = reconstruct_path(kmer_path, pgraph, sgraph)
    assert exon_path[0] == Exon(944321, 944800)
    assert exon_path[-1] == Exon(959214, 959316)
def assemble_isoforms(sgraph, config):
    '''
    Assemble isoforms from a splice graph.

    Builds the optimal path graph for `sgraph` (controlled by
    config.path_graph_kmax, config.path_graph_loss_threshold and
    config.path_graph_stats_fh), smooths it, enumerates paths with
    find_paths() (config.path_frac, config.max_paths), and groups the
    reconstructed paths into clusters with Cluster.build()
    (config.isoform_frac).

    Returns a list with one entry per cluster; each entry is a list of
    Isoform objects, truncated to config.max_isoforms when that value is
    positive. Returns [] when no path graph (or an empty one) is produced.
    '''
    factory = PathGraphFactory(sgraph)
    K, k = factory.create_optimal(
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []

    # smooth kmer graph
    K.apply_smoothing()

    # label used as a prefix for the debug messages below
    label_fields = (sgraph.chrom, sgraph.start, sgraph.end,
                    Strand.to_gtf(sgraph.strand))
    genome_id_str = '%s:%d-%d[%s]' % label_fields
    logging.debug('%s finding isoforms in k=%d graph (%d kmers) '
                  'source_expr=%f' %
                  (genome_id_str, k, len(K), K.exprs[K.SOURCE_ID]))

    # turn each kmer path into a (reconstructed path, expression) pair
    paths = [
        (reconstruct_path(kmer_path, K, sgraph), expr)
        for kmer_path, expr in find_paths(K, config.path_frac,
                                          config.max_paths)
    ]
    logging.debug('%s isoforms: %d' % (genome_id_str, len(paths)))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('%s gene clusters: %d filtered transfrags: %d' %
                  (genome_id_str, len(clusters), len(filtered)))

    gene_isoforms = []
    for cluster in clusters:
        cluster_isoforms = [
            Isoform(path=path, expr=expr,
                    rel_frac=rel_frac, abs_frac=abs_frac)
            for path, expr, rel_frac, abs_frac in cluster.iterpaths()
        ]
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            cluster_isoforms = cluster_isoforms[:config.max_isoforms]
        gene_isoforms.append(cluster_isoforms)
    return gene_isoforms
def assemble_isoforms(sgraph, config):
    '''
    Assemble isoforms from a splice graph.

    Builds the optimal path graph for `sgraph` (config.path_graph_kmax,
    config.path_graph_stats_fh), smooths it, enumerates kmer paths with
    find_paths() (config.path_frac, config.max_paths), converts them back
    with K.reconstruct(), and clusters them with Cluster.build()
    (config.isoform_frac). Each clustered path is converted with
    sgraph.reconstruct_exons() before being wrapped in an Isoform.

    Returns a list with one entry per cluster; each entry is a list of
    Isoform objects, truncated to config.max_isoforms when that value is
    positive. Returns [] when no path graph (or an empty one) is produced.
    '''
    factory = PathGraphFactory(sgraph)
    K, k = factory.create_optimal(kmax=config.path_graph_kmax,
                                  stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []

    # smooth kmer graph
    K.apply_smoothing()

    # find isoforms
    logging.debug('%s finding isoforms in k=%d graph (%d kmers) '
                  'source_expr=%f' %
                  (sgraph, k, len(K), K.exprs[K.SOURCE_ID]))
    # each kmer path is converted back to a path of splice graph nodes
    paths = [
        (K.reconstruct(path_kmers), expr)
        for path_kmers, expr in find_paths(K, config.path_frac,
                                           config.max_paths)
    ]
    logging.debug('%s isoforms: %d' % (sgraph, len(paths)))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('%s gene clusters: %d filtered transfrags: %d' %
                  (sgraph, len(clusters), len(filtered)))

    gene_isoforms = []
    for cluster in clusters:
        # convert from splice graph node ids to exons while wrapping
        cluster_isoforms = [
            Isoform(exons=sgraph.reconstruct_exons(path), expr=expr,
                    rel_frac=rel_frac, abs_frac=abs_frac)
            for path, expr, rel_frac, abs_frac in cluster.iterpaths()
        ]
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            cluster_isoforms = cluster_isoforms[:config.max_isoforms]
        gene_isoforms.append(cluster_isoforms)
    return gene_isoforms
def assemble_isoforms(sgraph, config):
    '''
    Assemble isoforms from a splice graph.

    Steps: create the optimal path graph (config.path_graph_kmax,
    config.path_graph_stats_fh), smooth it, enumerate kmer paths
    (config.path_frac, config.max_paths), convert them with
    K.reconstruct(), cluster them (config.isoform_frac), and wrap every
    clustered path in an Isoform whose exons come from
    sgraph.reconstruct_exons().

    Returns a list of per-cluster Isoform lists, each cut to
    config.max_isoforms entries when that value is positive, or [] when
    the path graph is None or empty.
    '''
    K, k = PathGraphFactory(sgraph).create_optimal(
        kmax=config.path_graph_kmax,
        stats_fh=config.path_graph_stats_fh)
    if K is None:
        return []
    if len(K) == 0:
        return []

    # smooth kmer graph
    K.apply_smoothing()

    # find isoforms
    logging.debug('%s finding isoforms in k=%d graph (%d kmers) '
                  'source_expr=%f' %
                  (sgraph, k, len(K), K.exprs[K.SOURCE_ID]))
    paths = []
    for path_kmers, expr in find_paths(K, config.path_frac,
                                       config.max_paths):
        # kmers -> nodes in splice graph
        node_path = K.reconstruct(path_kmers)
        paths.append((node_path, expr))
    logging.debug('%s isoforms: %d' % (sgraph, len(paths)))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('%s gene clusters: %d filtered transfrags: %d' %
                  (sgraph, len(clusters), len(filtered)))

    def isoforms_of(cluster):
        # wrap every path of one cluster, then apply the per-cluster limit
        wrapped = []
        for path, expr, rel_frac, abs_frac in cluster.iterpaths():
            wrapped.append(Isoform(exons=sgraph.reconstruct_exons(path),
                                   expr=expr,
                                   rel_frac=rel_frac,
                                   abs_frac=abs_frac))
        if config.max_isoforms > 0:
            return wrapped[:config.max_isoforms]
        return wrapped

    return [isoforms_of(cluster) for cluster in clusters]
def assemble_isoforms(sgraph, config):
    '''
    Assemble isoforms from a splice graph.

    Steps: create the optimal path graph (config.path_graph_kmax,
    config.path_graph_loss_threshold, config.path_graph_stats_fh), smooth
    it, enumerate kmer paths (config.path_frac, config.max_paths), convert
    them with reconstruct_path(), cluster them (config.isoform_frac), and
    wrap every clustered path in an Isoform.

    Returns a list of per-cluster Isoform lists, each cut to
    config.max_isoforms entries when that value is positive, or [] when
    the path graph is None or empty.
    '''
    K, k = PathGraphFactory(sgraph).create_optimal(
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if K is None:
        return []
    if len(K) == 0:
        return []

    # smooth kmer graph
    K.apply_smoothing()

    # prefix identifying this splice graph in the debug output
    strand_str = Strand.to_gtf(sgraph.strand)
    genome_id_str = ('%s:%d-%d[%s]' %
                     (sgraph.chrom, sgraph.start, sgraph.end, strand_str))
    search_args = (genome_id_str, k, len(K), K.exprs[K.SOURCE_ID])
    logging.debug('%s finding isoforms in k=%d graph (%d kmers) '
                  'source_expr=%f' % search_args)

    paths = []
    kmer_paths = find_paths(K, config.path_frac, config.max_paths)
    for kmer_path, expr in kmer_paths:
        paths.append((reconstruct_path(kmer_path, K, sgraph), expr))
    logging.debug('%s isoforms: %d' % (genome_id_str, len(paths)))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    cluster_args = (genome_id_str, len(clusters), len(filtered))
    logging.debug('%s gene clusters: %d filtered transfrags: %d' %
                  cluster_args)

    gene_isoforms = []
    for cluster in clusters:
        per_cluster = []
        for path, expr, rel_frac, abs_frac in cluster.iterpaths():
            isoform = Isoform(path=path, expr=expr,
                              rel_frac=rel_frac, abs_frac=abs_frac)
            per_cluster.append(isoform)
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            per_cluster = per_cluster[:config.max_isoforms]
        gene_isoforms.append(per_cluster)
    return gene_isoforms