def get_node_gtf(self):
    '''
    generate one GTF 'node' feature for every node id found in self.G

    each yielded GTF.Feature spans the interval returned by
    'get_node_interval' and carries, as string attributes, the graph id,
    the min/max/mean of the node expression data, and the comma-joined
    reference start/stop sites selected by '_array_subset' for that
    interval
    '''
    graph_id = 'G_%s_%d_%d_%s' % (
        self.chrom, self.start, self.end, Strand.to_gtf(self.strand))
    for node_id in self.G:
        interval = self.get_node_interval(node_id)
        node_expr = self.get_expr_data(*interval)
        start_sites = _array_subset(self.ref_start_sites, *interval)
        stop_sites = _array_subset(self.ref_stop_sites, *interval)
        # one feature per node, spanning the node interval
        feature = GTF.Feature()
        feature.seqid = self.chrom
        feature.source = 'taco'
        feature.feature = 'node'
        feature.start = interval[0]
        feature.end = interval[1]
        feature.score = 0
        feature.strand = Strand.to_gtf(self.strand)
        feature.phase = '.'
        feature.attrs = {
            'graph_id': graph_id,
            'expr_min': str(node_expr.min()),
            'expr_max': str(node_expr.max()),
            'expr_mean': str(node_expr.mean()),
            'ref_starts': ','.join(str(s) for s in start_sites),
            'ref_stops': ','.join(str(s) for s in stop_sites)
        }
        yield feature
def get_change_point_gtf(self, cp):
    '''
    build the GTF features describing a single change point

    cp: change point object providing 'pos', 'start', 'end', 'sign',
        'pvalue' and 'foldchange'

    returns a list of two GTF.Feature objects: a 'changept' feature from
    cp.pos to cp.pos + 1 followed by a 'changeinterval' feature from
    cp.start to cp.end; both carry the same attributes
    '''
    graph_id = 'G_%s_%d_%d_%s' % (
        self.chrom, self.start, self.end, Strand.to_gtf(self.strand))
    # (feature type, start, end) of each record, in output order
    spans = (('changept', cp.pos, cp.pos + 1),
             ('changeinterval', cp.start, cp.end))
    features = []
    for feature_type, span_start, span_end in spans:
        feature = GTF.Feature()
        feature.seqid = self.chrom
        feature.source = 'taco'
        feature.feature = feature_type
        feature.start = span_start
        feature.end = span_end
        feature.score = 0
        feature.strand = Strand.to_gtf(self.strand)
        feature.phase = '.'
        # every feature gets its own attribute dict
        feature.attrs = {'graph_id': graph_id,
                         'sign': str(cp.sign),
                         'pvalue': str(cp.pvalue),
                         'foldchange': str(cp.foldchange)}
        features.append(feature)
    return features
def get_node_gtf(self):
    '''
    generate one GTF 'node' feature for every id produced by
    self.G.node_ids_iter()

    each yielded GTF.Feature spans the interval returned by
    'get_node_interval' and stores the graph id, the min/max/mean of the
    node expression data, and the comma-joined reference start/stop
    sites selected by '_array_subset' for that interval
    '''
    graph_id = 'G_%s_%d_%d_%s' % (
        self.chrom, self.start, self.end, Strand.to_gtf(self.strand))
    for node_id in self.G.node_ids_iter():
        interval = self.get_node_interval(node_id)
        node_expr = self.get_expr_data(*interval)
        start_sites = _array_subset(self.ref_start_sites, *interval)
        stop_sites = _array_subset(self.ref_stop_sites, *interval)
        # describe this node as a single GTF record
        node_feature = GTF.Feature()
        node_feature.seqid = self.chrom
        node_feature.source = 'taco'
        node_feature.feature = 'node'
        node_feature.start = interval[0]
        node_feature.end = interval[1]
        node_feature.score = 0
        node_feature.strand = Strand.to_gtf(self.strand)
        node_feature.phase = '.'
        node_feature.attrs = {
            'graph_id': graph_id,
            'expr_min': str(node_expr.min()),
            'expr_max': str(node_expr.max()),
            'expr_mean': str(node_expr.mean()),
            'ref_starts': ','.join(str(s) for s in start_sites),
            'ref_stops': ','.join(str(s) for s in stop_sites)
        }
        yield node_feature
def assemble_gene(sgraph, locus_id_str, config):
    '''
    assemble one splice graph and write all of its output records

    steps, in order:
      1. write every node feature of 'sgraph' to
         config.splice_graph_gtf_fh
      2. when config.change_point is set, detect change points, apply
         each one (writing its features to the same file) and recreate
         the graph if at least one change point was found
      3. run 'assemble_isoforms', assign gene/tss ids to each group of
         isoforms and write every isoform to config.assembly_gtf_fh
         (GTF) and config.assembly_bed_fh (BED)

    sgraph: splice graph to assemble
    locus_id_str: locus id passed to 'get_gtf_features'
    config: object holding run parameters, id counters and open output
            file handles
    '''
    logging.debug('%s:%d-%d[%s] nodes=%d' %
                  (sgraph.chrom, sgraph.start, sgraph.end,
                   Strand.to_gtf(sgraph.strand), len(sgraph.G)))
    # dump the splice graph nodes
    for node_feature in sgraph.get_node_gtf():
        print >> config.splice_graph_gtf_fh, str(node_feature)

    if config.change_point:
        change_points = sgraph.detect_change_points(
            pval=config.change_point_pvalue,
            fc_cutoff=config.change_point_fold_change)
        logging.debug('%s:%d-%d[%s] change points: %d' %
                      (sgraph.chrom, sgraph.start, sgraph.end,
                       Strand.to_gtf(sgraph.strand), len(change_points)))
        for change_point in change_points:
            sgraph.apply_change_point(change_point,
                                      config.change_point_trim)
            # record the change point next to the node features
            for cp_feature in sgraph.get_change_point_gtf(change_point):
                print >> config.splice_graph_gtf_fh, str(cp_feature)
        # the graph must be rebuilt once any change point was applied
        if len(change_points) > 0:
            sgraph.recreate()

    # path finding, filtering and grouping into genes
    for gene_isoforms in assemble_isoforms(sgraph, config):
        assign_ids(gene_isoforms, sgraph.strand,
                   config.gene_id_value_obj,
                   config.tss_id_value_obj)
        for isoform in gene_isoforms:
            # draw the next transcript id from the shared counter
            t_id = config.t_id_value_obj.next()
            transcript_label = "TU%d" % t_id
            tss_label = "TSS%d" % (isoform.tss_id)
            gene_label = "G%d" % (isoform.gene_id)
            # GTF output
            gtf_features = get_gtf_features(chrom=sgraph.chrom,
                                            strand=sgraph.strand,
                                            exons=isoform.path,
                                            locus_id=locus_id_str,
                                            gene_id=gene_label,
                                            tss_id=tss_label,
                                            transcript_id=transcript_label,
                                            expr=isoform.expr,
                                            rel_frac=isoform.rel_frac,
                                            abs_frac=isoform.abs_frac)
            for gtf_feature in gtf_features:
                print >> config.assembly_gtf_fh, str(gtf_feature)
            # BED output
            bed_name = "%s|%s(%.1f)" % (gene_label, transcript_label,
                                        isoform.expr)
            bed_score = int(round(1000.0 * isoform.rel_frac))
            bed_fields = write_bed(sgraph.chrom, bed_name, sgraph.strand,
                                   bed_score, isoform.path)
            print >> config.assembly_bed_fh, '\t'.join(bed_fields)
def assemble_gene(sgraph, locus_id_str, config):
    '''
    assemble one splice graph and write all of its output records

    the node features of 'sgraph' are written to
    config.splice_graph_gtf_fh first. when config.change_point is set,
    change points are detected, applied one by one (their features go to
    the same file) and the graph is recreated if any were found. finally
    the isoform groups returned by 'assemble_isoforms' receive gene/tss
    ids and every isoform is written to config.assembly_gtf_fh (GTF) and
    config.assembly_bed_fh (BED)

    sgraph: splice graph to assemble
    locus_id_str: locus id passed to 'get_gtf_features'
    config: object holding run parameters, id counters and open output
            file handles
    '''
    logging.debug('%s:%d-%d[%s] nodes=%d' %
                  (sgraph.chrom, sgraph.start, sgraph.end,
                   Strand.to_gtf(sgraph.strand), len(sgraph.G)))
    # splice graph node data
    for node_feature in sgraph.get_node_gtf():
        print >>config.splice_graph_gtf_fh, str(node_feature)

    if config.change_point:
        found = sgraph.detect_change_points(
            pval=config.change_point_pvalue,
            fc_cutoff=config.change_point_fold_change)
        logging.debug('%s:%d-%d[%s] change points: %d' %
                      (sgraph.chrom, sgraph.start, sgraph.end,
                       Strand.to_gtf(sgraph.strand), len(found)))
        for change_point in found:
            sgraph.apply_change_point(change_point,
                                      config.change_point_trim)
            # splice graph change point data
            for cp_feature in sgraph.get_change_point_gtf(change_point):
                print >>config.splice_graph_gtf_fh, str(cp_feature)
        # applying change points invalidates the current graph
        if len(found) > 0:
            sgraph.recreate()

    # find isoform paths, filter them and group them into genes
    for gene_isoforms in assemble_isoforms(sgraph, config):
        assign_ids(gene_isoforms, sgraph.strand,
                   config.gene_id_value_obj,
                   config.tss_id_value_obj)
        for isoform in gene_isoforms:
            # next transcript id from the shared counter
            t_id = config.t_id_value_obj.next()
            transcript_label = "TU%d" % t_id
            tss_label = "TSS%d" % (isoform.tss_id)
            gene_label = "G%d" % (isoform.gene_id)
            # write to GTF
            gtf_features = get_gtf_features(chrom=sgraph.chrom,
                                            strand=sgraph.strand,
                                            exons=isoform.path,
                                            locus_id=locus_id_str,
                                            gene_id=gene_label,
                                            tss_id=tss_label,
                                            transcript_id=transcript_label,
                                            expr=isoform.expr,
                                            rel_frac=isoform.rel_frac,
                                            abs_frac=isoform.abs_frac)
            for gtf_feature in gtf_features:
                print >>config.assembly_gtf_fh, str(gtf_feature)
            # write to BED
            bed_name = "%s|%s(%.1f)" % (gene_label, transcript_label,
                                        isoform.expr)
            bed_score = int(round(1000.0 * isoform.rel_frac))
            bed_fields = write_bed(sgraph.chrom, bed_name, sgraph.strand,
                                   bed_score, isoform.path)
            print >>config.assembly_bed_fh, '\t'.join(bed_fields)
def to_gtf(self):
    '''
    generate the GTF features for this object

    yields one 'transcript' feature from self.start to self.end holding
    the transcript id, sample id, expression and reference flag, then
    one 'exon' feature per entry of self.exons holding only the
    transcript id
    '''
    strand_str = Strand.to_gtf(self.strand)

    transcript = GTF.Feature()
    transcript.seqid = self.chrom
    transcript.source = 'taco'
    transcript.feature = 'transcript'
    transcript.start = self.start
    transcript.end = self.end
    transcript.score = 0.0
    transcript.strand = strand_str
    transcript.phase = '.'
    transcript.attrs = {GTF.Attr.TRANSCRIPT_ID: self._id,
                        GTF.Attr.SAMPLE_ID: self.sample_id,
                        GTF.Attr.EXPR: str(self.expr),
                        # reference flag is stored as '0' or '1'
                        GTF.Attr.REF: str(int(self.is_ref))}
    yield transcript

    for exon in self.exons:
        exon_feature = GTF.Feature()
        exon_feature.seqid = self.chrom
        exon_feature.source = 'taco'
        exon_feature.feature = 'exon'
        exon_feature.start = exon.start
        exon_feature.end = exon.end
        exon_feature.score = 0.0
        exon_feature.strand = strand_str
        exon_feature.phase = '.'
        exon_feature.attrs = {GTF.Attr.TRANSCRIPT_ID: self._id}
        yield exon_feature
def to_gtf(self):
    '''
    generate the GTF features for this object

    the first feature yielded is the 'transcript' record (self.start to
    self.end) with transcript id, sample id, expression and reference
    flag attributes; it is followed by an 'exon' record for each entry
    of self.exons with the transcript id as its only attribute
    '''
    strand_str = Strand.to_gtf(self.strand)

    tx_feature = GTF.Feature()
    tx_feature.seqid = self.chrom
    tx_feature.source = 'taco'
    tx_feature.feature = 'transcript'
    tx_feature.start = self.start
    tx_feature.end = self.end
    tx_feature.score = 0.0
    tx_feature.strand = strand_str
    tx_feature.phase = '.'
    tx_feature.attrs = {
        GTF.Attr.TRANSCRIPT_ID: self._id,
        GTF.Attr.SAMPLE_ID: self.sample_id,
        GTF.Attr.EXPR: str(self.expr),
        # reference flag is stored as '0' or '1'
        GTF.Attr.REF: str(int(self.is_ref))
    }
    yield tx_feature

    for exon in self.exons:
        exon_feature = GTF.Feature()
        exon_feature.seqid = self.chrom
        exon_feature.source = 'taco'
        exon_feature.feature = 'exon'
        exon_feature.start = exon.start
        exon_feature.end = exon.end
        exon_feature.score = 0.0
        exon_feature.strand = strand_str
        exon_feature.phase = '.'
        exon_feature.attrs = {GTF.Attr.TRANSCRIPT_ID: self._id}
        yield exon_feature
def create_optimal_path_graph(sgraph, kmax=0, loss_threshold=0.10,
                              stats_fh=None):
    '''
    create a path graph from the splice graph using paths of length 'k'

    candidate values of 'k' are scored by 'compute_kmers': a valid path
    graph scores its number of k-mers and an invalid one scores -k. the
    scoring function is handed to 'maximize_bisect' together with the
    bounds 1 and kmax, and the 'k' it returns is used to build the final
    graph, on which short transfrags are then rescued

    sgraph: splice graph
    kmax: when > 0 caps the longest-path upper bound on 'k' (for
          debugging/testing purposes)
    loss_threshold: currently unused; the expression loss cutoff is
                    disabled
    stats_fh: optional file handle receiving one tab-delimited line per
              evaluated 'k'

    returns tuple (path graph, k)
    '''
    # upper bound on k, optionally capped by the caller
    longest = find_longest_path(sgraph)
    kmax = min(kmax, longest) if kmax > 0 else longest
    sgraph_id_str = '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start,
                                      sgraph.end,
                                      Strand.to_gtf(sgraph.strand))
    tot_expr = sum(sgraph.get_node_expr_data(n).mean() for n in sgraph.G)

    def compute_kmers(k):
        path_graph = create_path_graph(sgraph, k)
        valid = path_graph.graph['valid']
        short_transfrags = path_graph.graph['short_transfrags']
        num_lost_kmers = path_graph.graph['num_lost_kmers']
        lost_expr = sum(sgraph.get_node_expr_data(n).mean()
                        for n in get_lost_nodes(sgraph, path_graph))
        if tot_expr == 0:
            lost_expr_frac = 0.0
        else:
            lost_expr_frac = lost_expr / tot_expr
        # the same values feed both the debug line and the stats file
        stats = [sgraph_id_str, k, kmax, len(sgraph.transfrags),
                 len(path_graph), len(short_transfrags), num_lost_kmers,
                 tot_expr, lost_expr, lost_expr_frac, int(valid)]
        logging.debug('%s k=%d kmax=%d t=%d kmers=%d short_transfrags=%d '
                      'lost_kmers=%d tot_expr=%.3f lost_expr=%.3f '
                      'lost_expr_frac=%.3f valid=%d' % tuple(stats))
        if stats_fh:
            print >> stats_fh, '\t'.join(map(str, stats))
        # invalid graphs get a negative score
        return len(path_graph) if valid else -k

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    logging.debug('Creating path graph k=%d num_kmers=%d' % (k, num_kmers))
    optimal_graph = create_path_graph(sgraph, k)
    logging.debug('Rescuing short transfrags')
    rescue_short_transfrags_saindex(optimal_graph)
    return optimal_graph, k
def from_gtf(f):
    '''
    convert a GTF.Feature object into a Transfrag created with exons=None

    the transcript id attribute is required; sample id defaults to None,
    expression to 0.0 and the reference flag to '0' when absent
    '''
    attrs = f.attrs
    chrom = f.seqid
    strand = Strand.from_gtf(f.strand)
    transcript_id = attrs[GTF.Attr.TRANSCRIPT_ID]
    sample_id = attrs.get(GTF.Attr.SAMPLE_ID, None)
    expr = float(attrs.get(GTF.Attr.EXPR, 0.0))
    # the reference flag is stored as an integer string
    is_ref = bool(int(attrs.get(GTF.Attr.REF, '0')))
    return Transfrag(chrom=chrom,
                     strand=strand,
                     _id=transcript_id,
                     sample_id=sample_id,
                     expr=expr,
                     is_ref=is_ref,
                     exons=None)
def get_stats(self, K, kmax=None, lost_short=0, lost_short_expr=0.0,
              is_opt=0):
    '''
    return the list of statistics describing path graph 'K'

    K: path graph providing 'k', 'short_transfrags', 'short_expr',
       'num_lost_kmers', 'graph_expr', 'valid' and a length
    kmax: upper bound on k to report; defaults to
          self.longest_path_length
    lost_short, lost_short_expr, is_opt: values copied into the output
                                         unchanged

    returns a list of 19 values: locus (chrom, start, end, strand)
    followed by the graph statistics
    '''
    if kmax is None:
        # NOTE(review): read without being called - confirm that
        # longest_path_length is an attribute here and not a method
        kmax = self.longest_path_length
    if self.total_expr == 0:
        expr_frac = 0.0
    else:
        expr_frac = K.graph_expr / self.total_expr
    # number of k-mers scaled by the fraction of expression retained
    opt = int(round(expr_frac * len(K)))
    locus = [self.chrom, self.start, self.end,
             Strand.to_gtf(self.strand)]
    return locus + [K.k, kmax, len(self.paths),
                    len(K.short_transfrags), K.short_expr,
                    lost_short, lost_short_expr,
                    len(K), K.num_lost_kmers,
                    self.total_expr, K.graph_expr, expr_frac,
                    int(K.valid), opt, is_opt]
def create_optimal(self, kmax=0, loss_threshold=0.10, stats_fh=None):
    '''
    create a graph where nodes are paths of length 'k'

    candidate values of 'k' are scored by 'compute_kmers': a valid graph
    scores its number of k-mers and an invalid one scores -k. the
    scoring function is handed to 'maximize_bisect' together with the
    bounds 1 and kmax; the graph is then built with the 'k' it returns
    and short transfrags are rescued

    kmax: when > 0 caps the longest-path upper bound on 'k' (for
          debugging/testing purposes)
    loss_threshold: currently unused; the expression loss cutoff is
                    disabled
    stats_fh: optional file handle receiving one tab-delimited line per
              evaluated 'k'

    returns tuple (graph, k), or (None, 0) when there are no paths
    '''
    if len(self.paths) == 0:
        return None, 0
    # upper bound on k, optionally capped by the caller
    longest = self.longest_path_length()
    kmax = min(kmax, longest) if kmax > 0 else longest
    id_str = '%s:%d-%d[%s]' % (self.chrom, self.start, self.end,
                               Strand.to_gtf(self.strand))

    def compute_kmers(k):
        graph = self.create(k)
        tot_expr = sum(graph.exprs[i] for i in graph.node_ids_iter())
        lost_expr = graph.lost_kmer_expr
        if tot_expr == 0:
            lost_expr_frac = 0.0
        else:
            lost_expr_frac = lost_expr / tot_expr
        if stats_fh:
            row = [
                self.chrom, self.start, self.end,
                Strand.to_gtf(self.strand), k, kmax, len(self.paths),
                graph.n, len(graph.short_transfrags),
                graph.num_lost_kmers, tot_expr, lost_expr,
                lost_expr_frac, int(graph.valid)
            ]
            print >> stats_fh, '\t'.join(map(str, row))
        # invalid graphs get a negative score
        return len(graph) if graph.valid else -k

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    logging.debug('%s creating path graph k=%d num_kmers=%d' %
                  (id_str, k, num_kmers))
    optimal = self.create(k)
    logging.debug('%s rescuing short transfrags kmers=%d' %
                  (id_str, len(optimal)))
    num_lost = self.rescue_short_transfrags(optimal,
                                            optimal.short_transfrags)
    logging.debug('%s lost %d of %d short transfrags' %
                  (id_str, num_lost, len(optimal.short_transfrags)))
    return optimal, k
def parse_gtf(gtf_iter, sample_id, gtf_expr_attr, is_ref):
    '''
    parse GTF lines into Transfrag objects

    gtf_iter: iterable of GTF lines; empty, blank and '#' lines are
              skipped
    sample_id: prefix of the new transcript ids ('<sample_id>.<n>')
    gtf_expr_attr: name of the attribute holding the expression value
                   (not read when 'is_ref' is true)
    is_ref: when true every transcript gets an expression of 0.0

    returns tuple (Transfrag objects in order of first appearance,
    total expression)

    raises GTFError on a duplicated transcript id, a missing expression
    attribute, or an exon that precedes its transcript feature
    '''
    transfrags = collections.OrderedDict()
    total_expr = 0.0
    next_index = 1
    for gtf_line in gtf_iter:
        if (not gtf_line or not gtf_line.strip() or
                gtf_line.startswith("#")):
            continue
        f = GTF.Feature.from_str(gtf_line)
        if f.feature == 'transcript':
            t_id = f.attrs[GTF.Attr.TRANSCRIPT_ID]
            if t_id in transfrags:
                raise GTFError("Transcript '%s' duplicate detected" % t_id)
            # transcripts are renamed after the sample
            new_t_id = "%s.%d" % (sample_id, next_index)
            next_index += 1
            if is_ref:
                expr = 0.0
            else:
                if gtf_expr_attr not in f.attrs:
                    raise GTFError("GTF expression attribute '%s' not found" %
                                   (gtf_expr_attr))
                expr = float(f.attrs[gtf_expr_attr])
                total_expr += expr
            # transfrags stay keyed by the original transcript id
            transfrags[t_id] = Transfrag(chrom=f.seqid,
                                         strand=Strand.from_gtf(f.strand),
                                         _id=new_t_id,
                                         expr=float(expr),
                                         is_ref=is_ref,
                                         exons=None)
        elif f.feature == 'exon':
            t_id = f.attrs[GTF.Attr.TRANSCRIPT_ID]
            if t_id not in transfrags:
                logging.error('Feature: "%s"' % str(f))
                raise GTFError("Transcript '%s' exon feature appeared in "
                               "gtf file prior to transcript feature" %
                               t_id)
            transfrags[t_id].exons.append(Exon(f.start, f.end))
    return transfrags.values(), total_expr
def create_optimal_path_graph(sgraph, kmax=0, loss_threshold=0.10,
                              stats_fh=None):
    '''
    create a path graph from the splice graph using paths of length 'k'

    each candidate 'k' is scored by 'compute_kmers' (number of k-mers
    for a valid path graph, -k for an invalid one). 'maximize_bisect'
    receives the scoring function with the bounds 1 and kmax, and the
    'k' it returns is used to build the final graph, on which short
    transfrags are then rescued

    sgraph: splice graph
    kmax: when > 0 caps the longest-path upper bound on 'k' (for
          debugging/testing purposes)
    loss_threshold: currently unused; the expression loss cutoff is
                    disabled
    stats_fh: optional file handle receiving one tab-delimited line per
              evaluated 'k'

    returns tuple (path graph, k)
    '''
    # upper bound on k, optionally capped by the caller
    longest = find_longest_path(sgraph)
    kmax = min(kmax, longest) if kmax > 0 else longest
    sgraph_id_str = '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start,
                                      sgraph.end,
                                      Strand.to_gtf(sgraph.strand))
    tot_expr = sum(sgraph.get_node_expr_data(n).mean() for n in sgraph.G)

    def compute_kmers(k):
        candidate = create_path_graph(sgraph, k)
        valid = candidate.graph['valid']
        short_transfrags = candidate.graph['short_transfrags']
        num_lost_kmers = candidate.graph['num_lost_kmers']
        lost_expr = sum(sgraph.get_node_expr_data(n).mean()
                        for n in get_lost_nodes(sgraph, candidate))
        if tot_expr == 0:
            lost_expr_frac = 0.0
        else:
            lost_expr_frac = lost_expr / tot_expr
        # the debug line and the stats file share the same values
        row = [sgraph_id_str, k, kmax, len(sgraph.transfrags),
               len(candidate), len(short_transfrags), num_lost_kmers,
               tot_expr, lost_expr, lost_expr_frac, int(valid)]
        logging.debug('%s k=%d kmax=%d t=%d kmers=%d short_transfrags=%d '
                      'lost_kmers=%d tot_expr=%.3f lost_expr=%.3f '
                      'lost_expr_frac=%.3f valid=%d' % tuple(row))
        if stats_fh:
            print >>stats_fh, '\t'.join(map(str, row))
        # invalid graphs get a negative score
        return len(candidate) if valid else -k

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    logging.debug('Creating path graph k=%d num_kmers=%d' % (k, num_kmers))
    best_graph = create_path_graph(sgraph, k)
    logging.debug('Rescuing short transfrags')
    rescue_short_transfrags_saindex(best_graph)
    return best_graph, k
def write_splice_bed(self, fh):
    '''
    write one BED line per distinct intron to 'fh'

    introns of the non-reference transfrags on both strands are keyed by
    (start, end, strand) and their expression summed. each line is named
    'JUNC', is extended one position past both intron boundaries and
    holds two blocks of size 1

    fh: file handle open for writing
    '''
    junction_expr = collections.defaultdict(float)
    for strand in (Strand.POS, Strand.NEG):
        for t in self.strand_transfrags[strand]:
            if not t.is_ref:
                for start, end in t.iterintrons():
                    junction_expr[(start, end, strand)] += t.expr
    for (start, end, strand), expr in junction_expr.iteritems():
        left = str(start - 1)
        right = str(end + 1)
        # offset of the second block from the extended start
        second_block_start = '0,%d' % (end + 1 - start)
        fields = [self.chrom, left, right, 'JUNC', str(expr),
                  Strand.to_gtf(strand), left, right, '255,0,0',
                  '2', '1,1', second_block_start]
        print >>fh, '\t'.join(fields)
def create_optimal(self, kmax=0, loss_threshold=0.10, stats_fh=None):
    '''
    create a graph where nodes are paths of length 'k'

    each candidate 'k' is scored by 'compute_kmers' (number of k-mers
    for a valid graph, -k for an invalid one). 'maximize_bisect'
    receives the scoring function with the bounds 1 and kmax; the graph
    is then built with the 'k' it returns and short transfrags are
    rescued

    kmax: when > 0 caps the longest-path upper bound on 'k' (for
          debugging/testing purposes)
    loss_threshold: currently unused; the expression loss cutoff is
                    disabled
    stats_fh: optional file handle receiving one tab-delimited line per
              evaluated 'k'

    returns tuple (graph, k), or (None, 0) when there are no paths
    '''
    if len(self.paths) == 0:
        return None, 0
    # upper bound on k, optionally capped by the caller
    longest = self.longest_path_length()
    kmax = min(kmax, longest) if kmax > 0 else longest
    id_str = '%s:%d-%d[%s]' % (self.chrom, self.start, self.end,
                               Strand.to_gtf(self.strand))

    def compute_kmers(k):
        candidate = self.create(k)
        tot_expr = sum(candidate.exprs[i]
                       for i in candidate.node_ids_iter())
        lost_expr = candidate.lost_kmer_expr
        if tot_expr == 0:
            lost_expr_frac = 0.0
        else:
            lost_expr_frac = lost_expr / tot_expr
        if stats_fh:
            row = [self.chrom, self.start, self.end,
                   Strand.to_gtf(self.strand), k, kmax, len(self.paths),
                   candidate.n, len(candidate.short_transfrags),
                   candidate.num_lost_kmers, tot_expr, lost_expr,
                   lost_expr_frac, int(candidate.valid)]
            print >>stats_fh, '\t'.join(map(str, row))
        # invalid graphs get a negative score
        return len(candidate) if candidate.valid else -k

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    logging.debug('%s creating path graph k=%d num_kmers=%d' %
                  (id_str, k, num_kmers))
    best = self.create(k)
    logging.debug('%s rescuing short transfrags kmers=%d' %
                  (id_str, len(best)))
    num_lost = self.rescue_short_transfrags(best, best.short_transfrags)
    logging.debug('%s lost %d of %d short transfrags' %
                  (id_str, num_lost, len(best.short_transfrags)))
    return best, k
def compute_kmers(k):
    '''
    score the graph built with paths of length 'k'

    relies on 'self', 'kmax' and 'stats_fh' from the enclosing scope.
    when 'stats_fh' is set, one tab-delimited statistics line is written
    to it

    returns the number of k-mers of a valid graph, or -k when the graph
    is not valid
    '''
    graph = self.create(k)
    tot_expr = sum(graph.exprs[i] for i in graph.node_ids_iter())
    lost_expr = graph.lost_kmer_expr
    if tot_expr == 0:
        lost_expr_frac = 0.0
    else:
        lost_expr_frac = lost_expr / tot_expr
    if stats_fh:
        row = [self.chrom, self.start, self.end,
               Strand.to_gtf(self.strand), k, kmax, len(self.paths),
               graph.n, len(graph.short_transfrags),
               graph.num_lost_kmers, tot_expr, lost_expr,
               lost_expr_frac, int(graph.valid)]
        print >>stats_fh, '\t'.join(map(str, row))
    # invalid graphs get a negative score
    return len(graph) if graph.valid else -k
def get_stats(self, K, kmax=None, lost_short=0, lost_short_expr=0.0,
              is_opt=0):
    '''
    return the list of statistics describing path graph 'K'

    K: path graph providing 'k', 'short_transfrags', 'short_expr',
       'num_lost_kmers', 'graph_expr', 'valid' and a length
    kmax: upper bound on k to report; defaults to
          self.longest_path_length
    lost_short, lost_short_expr, is_opt: values copied into the output
                                         unchanged

    returns a list of 19 values: chrom, start, end and strand come
    first, then the graph statistics
    '''
    if kmax is None:
        # NOTE(review): read without being called - confirm that
        # longest_path_length is an attribute here and not a method
        kmax = self.longest_path_length
    if self.total_expr == 0:
        expr_frac = 0.0
    else:
        expr_frac = K.graph_expr / self.total_expr
    # number of k-mers scaled by the fraction of expression retained
    opt = int(round(expr_frac * len(K)))
    stats = [self.chrom, self.start, self.end,
             Strand.to_gtf(self.strand)]
    stats.extend([K.k, kmax, len(self.paths)])
    stats.extend([len(K.short_transfrags), K.short_expr,
                  lost_short, lost_short_expr])
    stats.extend([len(K), K.num_lost_kmers,
                  self.total_expr, K.graph_expr, expr_frac])
    stats.extend([int(K.valid), opt, is_opt])
    return stats
def to_bed(self):
    '''
    return this object as a list of 12 strings laid out like a BED line

    the interval runs from the start of the first exon to the end of the
    last exon; the name is self._id, the score is self.expr, and each
    exon becomes one block with its start relative to the first exon
    '''
    tx_start = self.exons[0].start
    tx_end = self.exons[-1].end
    block_sizes = [str(exon.end - exon.start) for exon in self.exons]
    block_starts = [str(exon.start - tx_start) for exon in self.exons]
    return [self.chrom,
            str(tx_start),
            str(tx_end),
            self._id,
            str(self.expr),
            Strand.to_gtf(self.strand),
            '0',
            '0',
            '0',
            str(len(self.exons)),
            ','.join(block_sizes),
            ','.join(block_starts)]
def compute_kmers(k):
    '''
    score the graph built with paths of length 'k'

    uses 'self', 'kmax' and 'stats_fh' from the enclosing scope; a
    tab-delimited statistics line is written to 'stats_fh' when it is
    set

    returns the number of k-mers of a valid graph, or -k when the graph
    is not valid
    '''
    candidate = self.create(k)
    tot_expr = sum(candidate.exprs[i] for i in candidate.node_ids_iter())
    lost_expr = candidate.lost_kmer_expr
    if tot_expr == 0:
        lost_expr_frac = 0.0
    else:
        lost_expr_frac = lost_expr / tot_expr
    if stats_fh:
        row = [
            self.chrom, self.start, self.end,
            Strand.to_gtf(self.strand), k, kmax, len(self.paths),
            candidate.n, len(candidate.short_transfrags),
            candidate.num_lost_kmers, tot_expr, lost_expr,
            lost_expr_frac, int(candidate.valid)
        ]
        print >> stats_fh, '\t'.join(map(str, row))
    # invalid graphs get a negative score
    return len(candidate) if candidate.valid else -k
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, expr, rel_frac, abs_frac):
    '''
    generate the GTF features of one assembled isoform

    yields a 'transcript' feature from the start of the first exon to
    the end of the last exon, then one 'exon' feature per exon. every
    feature is scored round(1000 * rel_frac) and carries the locus,
    gene, tss and transcript ids; the transcript feature additionally
    carries the formatted 'expr', 'rel_frac' and 'abs_frac' values
    '''
    strand_str = Strand.to_gtf(strand)
    score = int(round(1000.0 * rel_frac))
    id_attrs = {
        'locus_id': locus_id,
        'gene_id': gene_id,
        'tss_id': tss_id,
        'transcript_id': transcript_id
    }

    transcript = GTF.Feature()
    transcript.seqid = chrom
    transcript.source = 'taco'
    transcript.feature = 'transcript'
    transcript.start = exons[0].start
    transcript.end = exons[-1].end
    transcript.score = score
    transcript.strand = strand_str
    transcript.phase = '.'
    transcript.attrs = {
        'expr': '%.3f' % expr,
        'rel_frac': '%.5f' % rel_frac,
        'abs_frac': '%.5f' % abs_frac
    }
    transcript.attrs.update(id_attrs)
    yield transcript

    for exon in exons:
        exon_feature = GTF.Feature()
        exon_feature.seqid = chrom
        exon_feature.source = 'taco'
        exon_feature.feature = 'exon'
        exon_feature.start = exon.start
        exon_feature.end = exon.end
        exon_feature.score = score
        exon_feature.strand = strand_str
        exon_feature.phase = '.'
        # each exon gets its own copy of the id attributes
        exon_feature.attrs = {}
        exon_feature.attrs.update(id_attrs)
        yield exon_feature
def from_bed(line):
    '''
    build a Transfrag from one tab-delimited BED line

    name (column 4) becomes the id, score (column 5) the expression and
    columns 10-12 (block count, sizes, starts) define the exons. the
    transfrag is flagged as reference when the part of its id before the
    first '.' equals Sample.REF_ID
    '''
    columns = line.strip().split('\t')
    chrom = columns[0]
    tx_start = int(columns[1])
    _id = columns[3]
    is_ref = (_id.split('.')[0] == Sample.REF_ID)
    expr = float(columns[4])
    strand = Strand.from_bed(columns[5])
    num_exons = int(columns[9])
    sizes = columns[10].split(',')
    offsets = columns[11].split(',')
    exons = []
    for index in xrange(num_exons):
        # block starts are relative to the transcript start
        exon_start = tx_start + int(offsets[index])
        exon_end = exon_start + int(sizes[index])
        exons.append(Exon(exon_start, exon_end))
    return Transfrag(chrom=chrom,
                     strand=strand,
                     _id=_id,
                     expr=expr,
                     is_ref=is_ref,
                     exons=exons)
def detect_change_points(self, *args, **kwargs):
    '''
    run change point detection on the expression data of every node

    *args, **kwargs: passed directly to 'run_changepoint'

    returns list of ChangePoint tuples whose 'pos', 'start' and 'end'
    have been shifted by the start of the node they were found in
    '''
    found = []
    for node_id in self.G:
        node = self.get_node_interval(node_id)
        node_expr = self.get_expr_data(node.start, node.end)
        for cp in run_changepoint(node_expr, *args, **kwargs):
            # positions come back relative to the start of the node
            offset = node.start
            cp = cp._replace(pos=offset + cp.pos,
                             start=offset + cp.start,
                             end=offset + cp.end)
            found.append(cp)
            message = ('\t%s:%d-%d[%s] node: %s-%s cp:%d(%d-%d) '
                       'p=%.3f fc=%.3f' %
                       (self.chrom, self.start, self.end,
                        Strand.to_gtf(self.strand), node.start, node.end,
                        cp.pos, cp.start, cp.end, cp.pvalue,
                        cp.foldchange))
            logging.debug(message)
    return found
def detect_change_points(self, *args, **kwargs):
    '''
    run change point detection on the expression data of every node
    produced by self.G.nodes_iter()

    *args, **kwargs: passed directly to 'run_changepoint'

    returns list of ChangePoint tuples whose 'pos', 'start' and 'end'
    have been shifted by the start of the node they were found in
    '''
    genome_id_str = '%s:%d-%d[%s]' % (self.chrom, self.start, self.end,
                                      Strand.to_gtf(self.strand))
    found = []
    for node in self.G.nodes_iter():
        node_expr = self.get_expr_data(*node)
        for cp in run_changepoint(node_expr, *args, **kwargs):
            # positions come back relative to the start of the node
            offset = node.start
            cp = cp._replace(pos=offset + cp.pos,
                             start=offset + cp.start,
                             end=offset + cp.end)
            found.append(cp)
            message = ('%s changepoint node=(%s-%s) '
                       'pos=%d interval=(%d-%d) p=%.3f fc=%.3f' %
                       (genome_id_str, node.start, node.end, cp.pos,
                        cp.start, cp.end, cp.pvalue, cp.foldchange))
            logging.debug(message)
    return found
def detect_change_points(self, *args, **kwargs):
    '''
    detect change points in the expression data of each node yielded by
    self.G.nodes_iter()

    *args, **kwargs: passed directly to 'run_changepoint'

    returns list of ChangePoint tuples; 'pos', 'start' and 'end' of each
    one are offset by the start of the node it was found in
    '''
    genome_id_str = '%s:%d-%d[%s]' % (self.chrom, self.start, self.end,
                                      Strand.to_gtf(self.strand))
    changepts = []
    for node in self.G.nodes_iter():
        node_expr = self.get_expr_data(*node)
        for raw_cp in run_changepoint(node_expr, *args, **kwargs):
            # shift node-relative positions by the node start
            shifted = raw_cp._replace(pos=node.start + raw_cp.pos,
                                      start=node.start + raw_cp.start,
                                      end=node.start + raw_cp.end)
            changepts.append(shifted)
            logging.debug('%s changepoint node=(%s-%s) '
                          'pos=%d interval=(%d-%d) p=%.3f fc=%.3f' %
                          (genome_id_str, node.start, node.end,
                           shifted.pos, shifted.start, shifted.end,
                           shifted.pvalue, shifted.foldchange))
    return changepts
def detect_change_points(self, *args, **kwargs):
    '''
    detect change points in the expression data of each node in self.G

    *args, **kwargs: passed directly to 'run_changepoint'

    returns list of ChangePoint tuples; 'pos', 'start' and 'end' of each
    one are offset by the start of the node it was found in
    '''
    changepts = []
    for node_id in self.G:
        node = self.get_node_interval(node_id)
        node_expr = self.get_expr_data(node.start, node.end)
        for raw_cp in run_changepoint(node_expr, *args, **kwargs):
            # shift node-relative positions by the node start
            shifted = raw_cp._replace(pos=node.start + raw_cp.pos,
                                      start=node.start + raw_cp.start,
                                      end=node.start + raw_cp.end)
            changepts.append(shifted)
            logging.debug('\t%s:%d-%d[%s] node: %s-%s cp:%d(%d-%d) '
                          'p=%.3f fc=%.3f' %
                          (self.chrom, self.start, self.end,
                           Strand.to_gtf(self.strand),
                           node.start, node.end,
                           shifted.pos, shifted.start, shifted.end,
                           shifted.pvalue, shifted.foldchange))
    return changepts
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, expr, rel_frac, abs_frac):
    '''
    generate the GTF features of one assembled isoform

    the 'transcript' feature is yielded first and runs from the start of
    the first exon to the end of the last exon; an 'exon' feature
    follows for every exon. all features are scored
    round(1000 * rel_frac) and hold the locus, gene, tss and transcript
    ids; only the transcript feature holds the formatted 'expr',
    'rel_frac' and 'abs_frac' values
    '''
    strand_str = Strand.to_gtf(strand)
    score = int(round(1000.0 * rel_frac))
    shared_attrs = {'locus_id': locus_id,
                    'gene_id': gene_id,
                    'tss_id': tss_id,
                    'transcript_id': transcript_id}

    tx_feature = GTF.Feature()
    tx_feature.seqid = chrom
    tx_feature.source = 'taco'
    tx_feature.feature = 'transcript'
    tx_feature.start = exons[0].start
    tx_feature.end = exons[-1].end
    tx_feature.score = score
    tx_feature.strand = strand_str
    tx_feature.phase = '.'
    tx_feature.attrs = {'expr': '%.3f' % expr,
                        'rel_frac': '%.5f' % rel_frac,
                        'abs_frac': '%.5f' % abs_frac}
    tx_feature.attrs.update(shared_attrs)
    yield tx_feature

    for exon in exons:
        exon_feature = GTF.Feature()
        exon_feature.seqid = chrom
        exon_feature.source = 'taco'
        exon_feature.feature = 'exon'
        exon_feature.start = exon.start
        exon_feature.end = exon.end
        exon_feature.score = score
        exon_feature.strand = strand_str
        exon_feature.phase = '.'
        # each exon gets its own copy of the shared attributes
        exon_feature.attrs = {}
        exon_feature.attrs.update(shared_attrs)
        yield exon_feature
def write_bed(chrom, name, strand, score, exons):
    '''
    return the 12 string fields of a BED line describing 'exons'

    the interval runs from the start of the first exon to the end of the
    last exon, both thick positions are set to the interval start, and
    the block size/start lists end with a trailing comma. nothing is
    written here; the caller joins and outputs the fields

    asserts that the first exon starts before, and the last exon ends
    after, every other exon
    '''
    assert all(exons[0].start < other.start for other in exons[1:])
    assert all(exons[-1].end > other.end for other in exons[:-1])
    tx_start = exons[0].start
    tx_end = exons[-1].end
    # block starts are relative to the interval start
    block_starts = [str(exon.start - tx_start) for exon in exons]
    block_sizes = [str(exon.end - exon.start) for exon in exons]
    return [chrom,
            str(tx_start),
            str(tx_end),
            str(name),
            str(score),
            Strand.to_gtf(strand),
            str(tx_start),
            str(tx_start),
            '0',
            str(len(exons)),
            ','.join(block_sizes) + ',',
            ','.join(block_starts) + ',']
def write_bed(chrom, name, strand, score, exons):
    '''
    return the 12 string fields of a BED line describing 'exons'

    the fields cover the start of the first exon to the end of the last
    exon; both thick positions equal the interval start and the block
    size/start lists carry a trailing comma. the fields are returned,
    not written

    asserts that the first exon starts before, and the last exon ends
    after, every other exon
    '''
    assert all(exons[0].start < other.start for other in exons[1:])
    assert all(exons[-1].end > other.end for other in exons[:-1])
    tx_start = exons[0].start
    tx_end = exons[-1].end
    sizes = []
    offsets = []
    for exon in exons:
        # block starts are relative to the interval start
        offsets.append(str(exon.start - tx_start))
        sizes.append(str(exon.end - exon.start))
    start_str = str(tx_start)
    fields = [chrom, start_str, str(tx_end)]
    fields += [str(name), str(score), Strand.to_gtf(strand)]
    fields += [start_str, start_str, '0']
    fields += [str(len(exons)),
               ','.join(sizes) + ',',
               ','.join(offsets) + ',']
    return fields
def __str__(self):
    '''return "PathGraphFactory chrom:start-end[strand]"'''
    strand_str = Strand.to_gtf(self.strand)
    location = (self.chrom, self.start, self.end, strand_str)
    return 'PathGraphFactory %s:%d-%d[%s]' % location
def __str__(self):
    '''
    return "SpliceGraph chrom:start-end[strand] transfrags: N" where N
    is the number of transfrags
    '''
    strand_str = Strand.to_gtf(self.strand)
    values = (self.chrom, self.start, self.end, strand_str,
              len(self.transfrags))
    return 'SpliceGraph %s:%d-%d[%s] transfrags: %d' % values
def __str__(self):
    '''
    return the graph location and its number of transfrags as
    "SpliceGraph chrom:start-end[strand] transfrags: N"
    '''
    template = 'SpliceGraph %s:%d-%d[%s] transfrags: %d'
    return template % (self.chrom, self.start, self.end,
                       Strand.to_gtf(self.strand),
                       len(self.transfrags))
def assemble_isoforms(sgraph, config):
    '''
    find isoform paths through the splice graph and group them into
    gene clusters

    sgraph: splice graph
    config: object holding the path graph, path finding and isoform
            filtering parameters plus the optional
            'assembly_loss_gtf_fh' output handle

    when config.assembly_loss_gtf_fh is not None, one 'lost_node' GTF
    feature is written to it for every node reported by
    'get_lost_nodes'

    returns a list with one entry per cluster, each entry being a list
    of Isoform objects; the list is empty when the path graph is None
    or has length 0
    '''
    # build the path graph from the splice graph
    K, k = create_optimal_path_graph(
        sgraph,
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []

    # report lost nodes
    if config.assembly_loss_gtf_fh is not None:
        graph_id = 'L_%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start,
                                       sgraph.end,
                                       Strand.to_gtf(sgraph.strand))
        for node_id in get_lost_nodes(sgraph, K):
            interval = sgraph.get_node_interval(node_id)
            node_expr = sgraph.get_node_expr_data(node_id)
            lost = GTF.Feature()
            lost.seqid = sgraph.chrom
            lost.source = 'taco'
            lost.feature = 'lost_node'
            lost.start = interval[0]
            lost.end = interval[1]
            lost.score = 0.0
            lost.strand = Strand.to_gtf(sgraph.strand)
            lost.phase = '.'
            lost.attrs = {'graph_id': graph_id,
                          'expr': str(node_expr.mean())}
            print >> config.assembly_loss_gtf_fh, str(lost)

    # smooth kmer graph
    smooth_graph(K)

    source_node = K.graph['source']
    source_expr = K.node[source_node][KMER_EXPR]
    logging.debug('%s:%d-%d[%s] finding paths in k=%d graph '
                  '(%d nodes) source_expr=%f' %
                  (sgraph.chrom, sgraph.start, sgraph.end,
                   Strand.to_gtf(sgraph.strand), k, len(K),
                   source_expr))
    id_kmer_map = K.graph['id_kmer_map']
    paths = []
    kmer_paths = find_paths(K, KMER_EXPR, config.path_frac,
                            config.max_paths)
    for kmer_path, expr in kmer_paths:
        path = reconstruct_path(kmer_path, id_kmer_map, sgraph)
        logging.debug("\texpr=%f length=%d" % (expr, len(path)))
        paths.append((path, expr))

    # build gene clusters
    clusters, filtered = Cluster.build(paths,
                                       min_frac=config.isoform_frac)
    logging.debug('\tclusters: %d filtered: %d' %
                  (len(clusters), len(filtered)))
    gene_isoforms = []
    for cluster in clusters:
        isoforms = [Isoform(path=path, expr=expr,
                            rel_frac=rel_frac, abs_frac=abs_frac)
                    for path, expr, rel_frac, abs_frac
                    in cluster.iterpaths()]
        # the isoform limit applies to each cluster separately
        if config.max_isoforms > 0:
            isoforms = isoforms[:config.max_isoforms]
        gene_isoforms.append(isoforms)
    return gene_isoforms
def assemble_isoforms(sgraph, config):
    '''
    find isoform paths through the splice graph and group them into
    gene clusters

    sgraph: splice graph
    config: object holding the path graph, path finding and isoform
            filtering parameters plus the optional
            'assembly_loss_gtf_fh' output handle

    a 'lost_node' GTF feature is written to config.assembly_loss_gtf_fh
    (when it is not None) for every node reported by 'get_lost_nodes'

    returns a list of clusters, each a list of Isoform objects; an
    empty list is returned when the path graph is None or has length 0
    '''
    # create a path graph from the splice graph
    K, k = create_optimal_path_graph(
        sgraph,
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []

    loss_fh = config.assembly_loss_gtf_fh
    if loss_fh is not None:
        # report the nodes lost when building the path graph
        graph_id = 'L_%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start,
                                       sgraph.end,
                                       Strand.to_gtf(sgraph.strand))
        for node_id in get_lost_nodes(sgraph, K):
            interval = sgraph.get_node_interval(node_id)
            node_expr = sgraph.get_node_expr_data(node_id)
            lost_feature = GTF.Feature()
            lost_feature.seqid = sgraph.chrom
            lost_feature.source = 'taco'
            lost_feature.feature = 'lost_node'
            lost_feature.start = interval[0]
            lost_feature.end = interval[1]
            lost_feature.score = 0.0
            lost_feature.strand = Strand.to_gtf(sgraph.strand)
            lost_feature.phase = '.'
            lost_feature.attrs = {'graph_id': graph_id,
                                  'expr': str(node_expr.mean())}
            print >>loss_fh, str(lost_feature)

    # smooth kmer graph
    smooth_graph(K)

    source_node = K.graph['source']
    source_expr = K.node[source_node][KMER_EXPR]
    logging.debug('%s:%d-%d[%s] finding paths in k=%d graph '
                  '(%d nodes) source_expr=%f' %
                  (sgraph.chrom, sgraph.start, sgraph.end,
                   Strand.to_gtf(sgraph.strand), k, len(K),
                   source_expr))
    id_kmer_map = K.graph['id_kmer_map']
    paths = []
    for kmer_path, expr in find_paths(K, KMER_EXPR, config.path_frac,
                                      config.max_paths):
        node_path = reconstruct_path(kmer_path, id_kmer_map, sgraph)
        logging.debug("\texpr=%f length=%d" % (expr, len(node_path)))
        paths.append((node_path, expr))

    # build gene clusters
    clusters, filtered = Cluster.build(paths,
                                       min_frac=config.isoform_frac)
    logging.debug('\tclusters: %d filtered: %d' %
                  (len(clusters), len(filtered)))
    gene_isoforms = []
    for cluster in clusters:
        cluster_isoforms = []
        for path, expr, rel_frac, abs_frac in cluster.iterpaths():
            isoform = Isoform(path=path, expr=expr,
                              rel_frac=rel_frac, abs_frac=abs_frac)
            cluster_isoforms.append(isoform)
        # the isoform limit applies to each cluster separately
        if config.max_isoforms > 0:
            cluster_isoforms = cluster_isoforms[:config.max_isoforms]
        gene_isoforms.append(cluster_isoforms)
    return gene_isoforms