def assemble_gene(sgraph, locus_id_str, config):
    """Assemble isoforms for one splice graph and write every output record.

    Steps, in order:
      1. Write the splice graph node features to
         ``config.splice_graph_gtf_fh``.
      2. If ``config.change_point`` is set, detect change points, apply each
         one to the graph, write its features to the same handle, and
         recreate the graph when at least one change point was found.
      3. For every group of isoforms produced by ``assemble_isoforms``,
         assign gene/TSS ids, draw a transcript id from
         ``config.t_id_iter`` and write GTF features to
         ``config.assembly_gtf_fh`` and one BED line to
         ``config.assembly_bed_fh``.

    Output is emitted with ``fh.write(...)`` and ids are drawn with the
    built-in ``next()`` so the function behaves the same on Python 2.6+ and
    Python 3 (``print >>fh`` and ``iterator.next()`` are Python 2 only).
    NOTE: unlike ``print >>fh``, ``write`` does not fall back to stdout when
    a handle is ``None``; the handles must be real file-like objects.

    :param sgraph: splice graph to assemble (mutated when change points are
        applied)
    :param locus_id_str: locus identifier written into each GTF record
    :param config: run configuration holding thresholds, id iterators and
        the open output handles
    """
    genome_id_str = '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start, sgraph.end,
                                      Strand.to_gtf(sgraph.strand))
    logging.debug('%s locus: %s nodes: %d' %
                  (genome_id_str, locus_id_str, len(sgraph.G)))
    # output splice graph node data
    for f in sgraph.get_node_gtf():
        config.splice_graph_gtf_fh.write(str(f) + '\n')
    if config.change_point:
        # detect change points
        changepts = sgraph.detect_change_points(
            pval=config.change_point_pvalue,
            fc_cutoff=config.change_point_fold_change)
        logging.debug('%s locus %s change points: %d' %
                      (genome_id_str, locus_id_str, len(changepts)))
        for cp in changepts:
            sgraph.apply_change_point(cp, config.change_point_trim)
            # output splice graph change points
            for f in sgraph.get_change_point_gtf(cp):
                config.splice_graph_gtf_fh.write(str(f) + '\n')
        # must recreate splice graph after finding change points
        if len(changepts) > 0:
            sgraph.recreate()
    # run isoform path finding algorithm, filter and group into genes
    for gene_isoforms in assemble_isoforms(sgraph, config):
        # assign gene_id and tss_id
        assign_ids(gene_isoforms, sgraph.strand,
                   config.gene_id_iter, config.tss_id_iter)
        # write output
        for isoform in gene_isoforms:
            # assign transcript id (built-in next() works on py2 and py3)
            t_id = next(config.t_id_iter)
            # get strings for each id
            t_id_str = "TU%d" % t_id
            tss_id_str = "TSS%d" % (isoform.tss_id)
            gene_id_str = "G%d" % (isoform.gene_id)
            # write to GTF
            for f in get_gtf_features(chrom=sgraph.chrom,
                                      strand=sgraph.strand,
                                      exons=isoform.path,
                                      locus_id=locus_id_str,
                                      gene_id=gene_id_str,
                                      tss_id=tss_id_str,
                                      transcript_id=t_id_str,
                                      expr=isoform.expr,
                                      rel_frac=isoform.rel_frac,
                                      abs_frac=isoform.abs_frac):
                config.assembly_gtf_fh.write(str(f) + '\n')
            # write to BED
            name = "%s|%s(%.1f)" % (gene_id_str, t_id_str, isoform.expr)
            fields = write_bed(sgraph.chrom, name, sgraph.strand,
                               int(round(1000.0 * isoform.rel_frac)),
                               isoform.path)
            config.assembly_bed_fh.write('\t'.join(fields) + '\n')
def assemble_gene(sgraph, locus_id_str, config):
    """Assemble isoforms for one splice graph and write every output record.

    The function writes the splice graph node features to
    ``config.splice_graph_gtf_fh``; when ``config.change_point`` is set it
    detects change points, applies each to the graph, writes the change
    point features to the same handle and recreates the graph if any were
    found. It then iterates over the isoform groups returned by
    ``assemble_isoforms``, assigns gene/TSS ids, draws a transcript id from
    ``config.t_id_iter`` and writes GTF features to
    ``config.assembly_gtf_fh`` plus one BED line per isoform to
    ``config.assembly_bed_fh``.

    Records are emitted with ``fh.write(...)`` and ids are drawn with the
    built-in ``next()`` so the code runs unchanged on Python 2.6+ and
    Python 3 (``print >>fh`` and ``iterator.next()`` are Python 2 only).
    NOTE: unlike ``print >>fh``, ``write`` does not fall back to stdout when
    a handle is ``None``; the handles must be real file-like objects.

    :param sgraph: splice graph to assemble (mutated when change points are
        applied)
    :param locus_id_str: locus identifier written into each GTF record
    :param config: run configuration holding thresholds, id iterators and
        the open output handles
    """
    genome_id_str = (
        '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start, sgraph.end,
                          Strand.to_gtf(sgraph.strand)))
    logging.debug('%s locus: %s nodes: %d' %
                  (genome_id_str, locus_id_str, len(sgraph.G)))
    # output splice graph node data
    for f in sgraph.get_node_gtf():
        config.splice_graph_gtf_fh.write(str(f) + '\n')
    if config.change_point:
        # detect change points
        changepts = sgraph.detect_change_points(
            pval=config.change_point_pvalue,
            fc_cutoff=config.change_point_fold_change)
        logging.debug('%s locus %s change points: %d' %
                      (genome_id_str, locus_id_str, len(changepts)))
        for cp in changepts:
            sgraph.apply_change_point(cp, config.change_point_trim)
            # output splice graph change points
            for f in sgraph.get_change_point_gtf(cp):
                config.splice_graph_gtf_fh.write(str(f) + '\n')
        # must recreate splice graph after finding change points
        if len(changepts) > 0:
            sgraph.recreate()
    # run isoform path finding algorithm, filter and group into genes
    for gene_isoforms in assemble_isoforms(sgraph, config):
        # assign gene_id and tss_id
        assign_ids(gene_isoforms, sgraph.strand,
                   config.gene_id_iter, config.tss_id_iter)
        # write output
        for isoform in gene_isoforms:
            # assign transcript id (built-in next() works on py2 and py3)
            t_id = next(config.t_id_iter)
            # get strings for each id
            t_id_str = "TU%d" % t_id
            tss_id_str = "TSS%d" % (isoform.tss_id)
            gene_id_str = "G%d" % (isoform.gene_id)
            # write to GTF
            for f in get_gtf_features(chrom=sgraph.chrom,
                                      strand=sgraph.strand,
                                      exons=isoform.path,
                                      locus_id=locus_id_str,
                                      gene_id=gene_id_str,
                                      tss_id=tss_id_str,
                                      transcript_id=t_id_str,
                                      expr=isoform.expr,
                                      rel_frac=isoform.rel_frac,
                                      abs_frac=isoform.abs_frac):
                config.assembly_gtf_fh.write(str(f) + '\n')
            # write to BED
            name = "%s|%s(%.1f)" % (gene_id_str, t_id_str, isoform.expr)
            fields = write_bed(sgraph.chrom, name, sgraph.strand,
                               int(round(1000.0 * isoform.rel_frac)),
                               isoform.path)
            config.assembly_bed_fh.write('\t'.join(fields) + '\n')
def test_multi_strand2():
    """Splitting the Strand.POS splice graph of multi_strand2.gtf must yield
    sub-graphs keyed chr1:100-300[+] and chr1:400-600[+]."""
    # only the locus is needed; the first returned value is unused here
    _, locus = read_single_locus("multi_strand2.gtf")
    pos_graph = SpliceGraph.create(locus.get_transfrags(Strand.POS))
    # index every sub-graph by its "chrom:start-end[strand]" key
    sgdict = dict(
        ("%s:%d-%d[%s]" % (part.chrom, part.start, part.end,
                           Strand.to_gtf(part.strand)), part)
        for part in pos_graph.split())
    for expected_key in ("chr1:100-300[+]", "chr1:400-600[+]"):
        assert expected_key in sgdict
def test_multi_strand2():
    """The Strand.POS splice graph built from multi_strand2.gtf must split
    into sub-graphs keyed chr1:100-300[+] and chr1:400-600[+]."""
    # the first element returned by the reader is not used by this test
    locus = read_single_locus('multi_strand2.gtf')[1]
    sgpos = SpliceGraph.create(locus.get_transfrags(Strand.POS))

    def region_key(graph):
        # "chrom:start-end[strand]" label identifying one sub-graph
        return '%s:%d-%d[%s]' % (graph.chrom, graph.start, graph.end,
                                 Strand.to_gtf(graph.strand))

    sgdict = dict((region_key(sub), sub) for sub in sgpos.split())
    assert 'chr1:100-300[+]' in sgdict
    assert 'chr1:400-600[+]' in sgdict
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, expr, rel_frac, abs_frac):
    """Yield the GTF features describing one assembled transcript.

    The first feature yielded is a 'transcript' record spanning
    ``exons[0].start`` to ``exons[-1].end``; it carries the formatted
    ``expr``, ``rel_frac`` and ``abs_frac`` values in addition to the id
    attributes. It is followed by one 'exon' record per element of
    ``exons`` carrying only the id attributes. Every record has source
    'taco', phase '.', and a score of ``int(round(1000.0 * rel_frac))``.
    """
    span_start = exons[0].start
    span_end = exons[-1].end
    strand_label = Strand.to_gtf(strand)
    # id attributes shared by the transcript record and every exon record
    id_attrs = {'locus_id': locus_id,
                'gene_id': gene_id,
                'tss_id': tss_id,
                'transcript_id': transcript_id}

    def build(feature_type, start, end, own_attrs):
        # populate one GTF.Feature; own_attrs is extended with the shared ids
        feat = GTF.Feature()
        feat.seqid = chrom
        feat.source = 'taco'
        feat.feature = feature_type
        feat.start = start
        feat.end = end
        feat.score = int(round(1000.0 * rel_frac))
        feat.strand = strand_label
        feat.phase = '.'
        feat.attrs = own_attrs
        feat.attrs.update(id_attrs)
        return feat

    # transcript-level record carries the expression statistics
    yield build('transcript', span_start, span_end,
                {'expr': '%.3f' % expr,
                 'rel_frac': '%.5f' % rel_frac,
                 'abs_frac': '%.5f' % abs_frac})
    # exon records carry only the shared ids
    for exon in exons:
        yield build('exon', exon.start, exon.end, {})
def assemble_isoforms(sgraph, config):
    """Find isoform paths in a splice graph and group them by gene cluster.

    Builds a path graph with ``PathGraphFactory.create_optimal``, smooths
    it, enumerates paths with ``find_paths``, converts each one back with
    ``reconstruct_path`` and clusters the results with ``Cluster.build``.

    :returns: a list with one entry per cluster, each entry being a list of
        ``Isoform`` objects truncated to ``config.max_isoforms`` when that
        limit is positive; an empty list when no path graph was produced or
        the path graph is empty
    """
    # read in transfrag paths
    factory = PathGraphFactory(sgraph)
    path_graph, kmer_size = factory.create_optimal(
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    # nothing to assemble without a non-empty path graph
    if path_graph is None or len(path_graph) == 0:
        return []
    # smooth kmer graph
    path_graph.apply_smoothing()

    region = '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start, sgraph.end,
                               Strand.to_gtf(sgraph.strand))
    logging.debug('%s finding isoforms in k=%d graph (%d kmers) '
                  'source_expr=%f' %
                  (region, kmer_size, len(path_graph),
                   path_graph.exprs[path_graph.SOURCE_ID]))
    # convert each kmer path back to a path on the splice graph
    paths = [(reconstruct_path(kmer_path, path_graph, sgraph), path_expr)
             for kmer_path, path_expr in find_paths(path_graph,
                                                    config.path_frac,
                                                    config.max_paths)]
    logging.debug('%s isoforms: %d' % (region, len(paths)))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('%s gene clusters: %d filtered transfrags: %d' %
                  (region, len(clusters), len(filtered)))
    grouped = []
    for cluster in clusters:
        members = [Isoform(path=p, expr=e, rel_frac=rf, abs_frac=af)
                   for p, e, rf, af in cluster.iterpaths()]
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            members = members[:config.max_isoforms]
        grouped.append(members)
    return grouped
def assemble_isoforms(sgraph, config):
    """Enumerate isoforms of a splice graph, grouped into gene clusters.

    A path graph is created with ``PathGraphFactory.create_optimal`` and
    smoothed; ``find_paths`` then yields kmer paths which are mapped back
    with ``reconstruct_path`` and clustered by ``Cluster.build``.

    :returns: list of lists of ``Isoform`` (one inner list per cluster, cut
        to ``config.max_isoforms`` entries when that value is positive), or
        an empty list if the path graph is missing or empty
    """
    # read in transfrag paths
    kmer_graph, k = PathGraphFactory(sgraph).create_optimal(
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    # bail out early when there is no usable path graph
    if kmer_graph is None or len(kmer_graph) == 0:
        return []
    # smooth kmer graph
    kmer_graph.apply_smoothing()

    label = ('%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start, sgraph.end,
                               Strand.to_gtf(sgraph.strand)))
    logging.debug('%s finding isoforms in k=%d graph (%d kmers) '
                  'source_expr=%f' %
                  (label, k, len(kmer_graph),
                   kmer_graph.exprs[kmer_graph.SOURCE_ID]))
    # map every kmer path back onto the splice graph
    paths = [(reconstruct_path(kmer_path, kmer_graph, sgraph), expr)
             for kmer_path, expr in find_paths(kmer_graph,
                                               config.path_frac,
                                               config.max_paths)]
    logging.debug('%s isoforms: %d' % (label, len(paths)))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('%s gene clusters: %d filtered transfrags: %d' %
                  (label, len(clusters), len(filtered)))
    result = []
    for cluster in clusters:
        cluster_isoforms = [
            Isoform(path=path, expr=expr, rel_frac=rel_frac,
                    abs_frac=abs_frac)
            for path, expr, rel_frac, abs_frac in cluster.iterpaths()]
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            cluster_isoforms = cluster_isoforms[:config.max_isoforms]
        result.append(cluster_isoforms)
    return result
def write_bed(chrom, name, strand, score, exons):
    """Return the twelve BED fields for one transcript as a list.

    Despite the name, nothing is written here; the caller joins and emits
    the fields. ``exons`` must be indexable, the first exon must start
    before all others and the last exon must end after all others (checked
    with assertions). Columns 7 and 8 (thickStart/thickEnd in BED12) are
    both set to the transcript start, and column 9 is '0'.

    :returns: list of fields; all but ``chrom`` are converted to ``str``
    """
    assert all(exons[0].start < exon.start for exon in exons[1:])
    assert all(exons[-1].end > exon.end for exon in exons[:-1])
    tx_start = exons[0].start
    tx_end = exons[-1].end
    # block starts are relative to the transcript start
    rel_starts = [exon.start - tx_start for exon in exons]
    lengths = [exon.end - exon.start for exon in exons]
    # comma-separated lists keep a trailing comma
    size_column = ','.join(str(n) for n in lengths) + ','
    start_column = ','.join(str(n) for n in rel_starts) + ','
    # make bed fields
    return [chrom,
            str(tx_start),
            str(tx_end),
            str(name),
            str(score),
            Strand.to_gtf(strand),
            str(tx_start),
            str(tx_start),
            '0',
            str(len(exons)),
            size_column,
            start_column]
def write_bed(chrom, name, strand, score, exons):
    """Build (not write) the twelve BED fields describing one transcript.

    Assertions check that the first exon starts before every other exon
    and that the last exon ends after every other exon. Columns 7 and 8
    (thickStart/thickEnd in BED12) both carry the transcript start and
    column 9 is '0'.

    :returns: list of fields; everything except ``chrom`` is a ``str``
    """
    assert all(exons[0].start < other.start for other in exons[1:])
    assert all(exons[-1].end > other.end for other in exons[:-1])
    tx_start = exons[0].start
    tx_end = exons[-1].end

    def comma_list(values):
        # BED block columns are comma-separated with a trailing comma
        return ','.join(map(str, values)) + ','

    block_sizes = [e.end - e.start for e in exons]
    # block starts are offsets from the transcript start
    block_starts = [e.start - tx_start for e in exons]
    # make bed fields
    fields = [chrom, str(tx_start), str(tx_end), str(name), str(score),
              Strand.to_gtf(strand)]
    fields.extend([str(tx_start), str(tx_start), '0', str(len(exons))])
    fields.append(comma_list(block_sizes))
    fields.append(comma_list(block_starts))
    return fields
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, expr, rel_frac, abs_frac):
    """Generate GTF records for a transcript and each of its exons.

    Yields one 'transcript' feature covering ``exons[0].start`` through
    ``exons[-1].end`` whose attributes include formatted ``expr``,
    ``rel_frac`` and ``abs_frac`` plus the locus/gene/tss/transcript ids,
    then one 'exon' feature per exon with only the ids. All features use
    source 'taco', phase '.', and score ``int(round(1000.0 * rel_frac))``.
    """
    tx_start = exons[0].start
    tx_end = exons[-1].end
    gtf_strand = Strand.to_gtf(strand)
    # identifiers attached to every record
    common = {'locus_id': locus_id, 'gene_id': gene_id,
              'tss_id': tss_id, 'transcript_id': transcript_id}

    def make_record(kind, start, end, attrs):
        # fill in one GTF.Feature and merge the common ids into attrs
        rec = GTF.Feature()
        rec.seqid = chrom
        rec.source = 'taco'
        rec.feature = kind
        rec.start = start
        rec.end = end
        rec.score = int(round(1000.0 * rel_frac))
        rec.strand = gtf_strand
        rec.phase = '.'
        rec.attrs = attrs
        rec.attrs.update(common)
        return rec

    # the transcript record additionally reports expression statistics
    yield make_record('transcript', tx_start, tx_end,
                      {'expr': '%.3f' % expr,
                       'rel_frac': '%.5f' % rel_frac,
                       'abs_frac': '%.5f' % abs_frac})
    for e in exons:
        yield make_record('exon', e.start, e.end, {})