def __init__(self):
    '''
    Initialise an empty splice graph.

    Every attribute starts out empty/unset; the locus coordinates,
    expression array, node boundaries and graph are assigned later by
    the code that builds the graph.
    '''
    # reference-guided assembly options (both off by default)
    self.guided_ends = self.guided_assembly = False
    # transfrags, kept in separate lists for non-reference / reference
    self.transfrags = []
    self.ref_transfrags = []
    # locus identity and extent, unknown until transfrags are added
    self.chrom = self.start = self.end = self.strand = None
    # interval trees (one for starts, one for ends)
    self.tree_starts = IntervalTree()
    self.tree_ends = IntervalTree()
    # start/stop sites taken from reference transfrags
    self.ref_start_sites = set()
    self.ref_stop_sites = set()
    # start/stop sites recorded on this graph
    self.start_sites = set()
    self.stop_sites = set()
    # derived data, filled in when the graph is built
    self.expr_data = self.node_bounds = self.G = None
def _add_transfrags(self, transfrags):
    '''
    Reset the per-locus state and load it from an iterable of transfrags.

    transfrags: iterable of objects exposing chrom, strand, start, end,
    is_ref and (for reference transfrags) txstart / txstop

    Reference transfrags are collected in self.ref_transfrags and
    contribute their txstart/txstop to the reference site lists; all
    others are inserted into the interval trees and collected in
    self.transfrags. The locus start/end become the min/max over every
    transfrag seen.

    Raises TacoError when transfrags disagree on chrom or strand.
    '''
    # wipe previous contents (state is written to self incrementally)
    self.transfrags = []
    self.ref_transfrags = []
    self.chrom = self.start = self.end = self.strand = None
    self.ref_start_sites = set()
    self.ref_stop_sites = set()
    self.tree_starts = IntervalTree()
    self.tree_ends = IntervalTree()

    for tfrag in transfrags:
        # the first transfrag fixes chrom/strand; the rest must match
        if self.chrom is None:
            self.chrom = tfrag.chrom
        elif self.chrom != tfrag.chrom:
            raise TacoError('chrom mismatch')
        if self.strand is None:
            self.strand = tfrag.strand
        elif self.strand != tfrag.strand:
            raise TacoError('strand mismatch')

        # grow the locus extent to cover this transfrag
        self.start = (tfrag.start if self.start is None
                      else min(tfrag.start, self.start))
        self.end = (tfrag.end if self.end is None
                    else max(tfrag.end, self.end))

        if not tfrag.is_ref:
            self._add_to_interval_trees(tfrag)
            self.transfrags.append(tfrag)
            continue
        self.ref_start_sites.add(tfrag.txstart)
        self.ref_stop_sites.add(tfrag.txstop)
        self.ref_transfrags.append(tfrag)

    # the reference site sets are replaced by sorted lists
    ordered_starts = list(self.ref_start_sites)
    ordered_starts.sort()
    self.ref_start_sites = ordered_starts
    ordered_stops = list(self.ref_stop_sites)
    ordered_stops.sort()
    self.ref_stop_sites = ordered_stops
class SpliceGraph(object):
    '''
    Splice graph of a single stranded locus, built from transfrags.

    The graph itself (self.G) is a networkx DiGraph whose nodes are
    integer ids; the genomic interval of each node is stored in the
    node data and in the 'node_id_map' / 'id_node_map' dicts kept in
    G.graph. Use SpliceGraph.create() to build an instance.
    '''

    def __init__(self):
        '''Create an empty graph; see create() for the usual entry point.'''
        self.guided_ends = False
        self.guided_assembly = False
        self.transfrags = []
        self.ref_transfrags = []
        self.chrom = None
        self.start = None
        self.end = None
        self.strand = None
        self.tree_starts = IntervalTree()
        self.tree_ends = IntervalTree()
        self.ref_start_sites = set()
        self.ref_stop_sites = set()
        self.start_sites = set()
        self.stop_sites = set()
        self.expr_data = None
        self.node_bounds = None
        self.G = None

    @staticmethod
    def create(transfrags, guided_ends=False, guided_assembly=False):
        '''
        Build a SpliceGraph from an iterable of transfrags.

        guided_ends: also use reference transfrag starts/ends as node
        boundaries and reference tx start/stop sites as start/stop marks
        guided_assembly: also add reference transfrags (and their splice
        sites) to the graph

        returns the new SpliceGraph
        '''
        sgraph = SpliceGraph()
        sgraph.guided_ends = guided_ends
        sgraph.guided_assembly = guided_assembly
        sgraph._add_transfrags(transfrags)
        sgraph.expr_data = sgraph._compute_expression()
        sgraph.node_bounds = sgraph._find_node_boundaries()
        sgraph.G = sgraph._create_splice_graph()
        sgraph._mark_start_stop_nodes()
        return sgraph

    def recreate(self):
        '''
        Rebuild all derived state (locus bounds, expression, node
        boundaries, graph, start/stop marks) from the transfrags
        currently held. self.start_sites / self.stop_sites are kept.
        '''
        # chain() captures the current list objects before
        # _add_transfrags rebinds the attributes to fresh lists
        self._add_transfrags(chain(self.transfrags, self.ref_transfrags))
        self.expr_data = self._compute_expression()
        self.node_bounds = self._find_node_boundaries()
        self.G = self._create_splice_graph()
        self._mark_start_stop_nodes()

    def _add_transfrags(self, transfrags):
        '''
        Reset per-locus state and load it from an iterable of transfrags.

        Raises TacoError when transfrags disagree on chrom or strand.
        '''
        self.transfrags = []
        self.ref_transfrags = []
        self.chrom = None
        self.start = None
        self.end = None
        self.strand = None
        self.ref_start_sites = set()
        self.ref_stop_sites = set()
        self.tree_starts = IntervalTree()
        self.tree_ends = IntervalTree()
        for t in transfrags:
            if self.chrom is None:
                self.chrom = t.chrom
            elif self.chrom != t.chrom:
                raise TacoError('chrom mismatch')
            if self.strand is None:
                self.strand = t.strand
            elif self.strand != t.strand:
                raise TacoError('strand mismatch')
            if self.start is None:
                self.start = t.start
            else:
                self.start = min(t.start, self.start)
            if self.end is None:
                self.end = t.end
            else:
                self.end = max(t.end, self.end)
            if t.is_ref:
                self.ref_start_sites.add(t.txstart)
                self.ref_stop_sites.add(t.txstop)
                self.ref_transfrags.append(t)
            else:
                self._add_to_interval_trees(t)
                self.transfrags.append(t)
        # reference sites are kept as sorted lists from here on
        self.ref_start_sites = sorted(self.ref_start_sites)
        self.ref_stop_sites = sorted(self.ref_stop_sites)

    def _compute_expression(self):
        '''
        returns array of length (end - start) holding, per position, the
        summed 'expr' of every non-reference transfrag exon covering it
        '''
        expr_data = np.zeros(self.end - self.start, dtype=FLOAT_DTYPE)
        for t in self.transfrags:
            for exon in t.exons:
                astart = exon.start - self.start
                aend = exon.end - self.start
                expr_data[astart:aend] += t.expr
        return expr_data

    def _find_node_boundaries(self):
        '''returns sorted array('i') of all node boundary positions'''
        node_bounds = set((self.start, self.end))
        node_bounds.update(self.start_sites)
        node_bounds.update(self.stop_sites)
        # nodes bounded by regions where expression changes to/from zero
        node_bounds.update(find_threshold_points(self.expr_data, self.start))
        # nodes bounded by introns
        for t in self.transfrags:
            node_bounds.update(t.itersplices())
        if self.guided_ends or self.guided_assembly:
            for t in self.ref_transfrags:
                if self.guided_ends:
                    node_bounds.update((t.start, t.end))
                if self.guided_assembly:
                    node_bounds.update(t.itersplices())
        return array('i', sorted(node_bounds))

    def get_path(self, t):
        '''
        returns tuple of node ids covered by transfrag 't'; the order is
        reversed when the graph is on the negative strand
        '''
        node_id_map = self.G.graph['node_id_map']
        nodes = [node_id_map[n] for n in split_transfrag(t, self.node_bounds)]
        if self.strand == Strand.NEG:
            nodes.reverse()
        return tuple(nodes)

    def _create_splice_graph2(self):
        '''returns SGraph object built from the transfrag paths'''
        G = SGraph()
        # iterate over this graph's transfrags (the previous code
        # referenced an undefined name here and raised NameError)
        for t in self.itertransfrags():
            nodes = list(split_transfrag(t, self.node_bounds))
            if self.strand == Strand.NEG:
                nodes.reverse()
            G.add_path(nodes, t.is_ref)
        return G

    def _create_splice_graph(self):
        '''returns networkx DiGraph object'''
        G = nx.DiGraph()
        node_bounds = self.node_bounds
        node_id_map = {}
        id_node_map = {}
        # NOTE(review): ids start at 2; presumably lower ids are reserved
        # elsewhere - confirm
        current_id = 2
        for t in self.itertransfrags():
            # split exons that cross boundaries and get the
            # nodes that made up the transfrag
            nodes = []
            for n in split_transfrag(t, node_bounds):
                n = Exon(*n)
                if n not in node_id_map:
                    n_id = current_id
                    current_id += 1
                    node_id_map[n] = n_id
                    id_node_map[n_id] = n
                else:
                    n_id = node_id_map[n]
                nodes.append(n_id)
            if t.strand == Strand.NEG:
                nodes.reverse()
            # add nodes/edges to graph
            u = nodes[0]
            SGNode.add(G, u, id_node_map[u], is_ref=t.is_ref)
            for v in nodes[1:]:
                SGNode.add(G, v, id_node_map[v], is_ref=t.is_ref)
                G.add_edge(u, v)
                u = v
        G.graph['node_id_map'] = node_id_map
        G.graph['id_node_map'] = id_node_map
        return G

    def _mark_start_stop_nodes(self):
        '''
        Flag start/stop nodes in self.G: nodes without incoming edges
        are starts, nodes without outgoing edges are stops, and the node
        adjacent to every recorded start/stop site (plus the reference
        sites when guided_ends is set) is flagged as well.
        '''
        G = self.G
        # get all leaf nodes
        for n, nd in G.nodes_iter(data=True):
            if G.in_degree(n) == 0:
                nd[SGNode.IS_START] = True
            if G.out_degree(n) == 0:
                nd[SGNode.IS_STOP] = True
        # mark change points
        change_points = set()
        change_points.update((SGNode.IS_START, x) for x in self.start_sites)
        change_points.update((SGNode.IS_STOP, x) for x in self.stop_sites)
        if self.guided_ends:
            change_points.update(
                (SGNode.IS_START, x) for x in self.ref_start_sites)
            change_points.update(
                (SGNode.IS_STOP, x) for x in self.ref_stop_sites)
        node_bounds = self.node_bounds
        node_id_map = G.graph['node_id_map']
        # which side of the position the flagged node lies on depends
        # only on strand and direction, so pick it once up front
        if self.strand == Strand.NEG:
            bisect_funcs = {SGNode.IS_START: bisect.bisect_left,
                            SGNode.IS_STOP: bisect.bisect_right}
        else:
            bisect_funcs = {SGNode.IS_START: bisect.bisect_right,
                            SGNode.IS_STOP: bisect.bisect_left}
        for direction, pos in change_points:
            i = bisect_funcs[direction](node_bounds, pos)
            n = (node_bounds[i - 1], node_bounds[i])
            if n in node_id_map:
                # 2/5/2016: observed case where trimming caused an interval
                # with zero expression, and this function then crashed when
                # attempting to mark the node as a start/stop. We no longer
                # attempt to mark nodes in zero expression regions.
                G.node[node_id_map[n]][direction] = True

    def _add_to_interval_trees(self, t):
        '''
        Index transfrag 't' by its first base in tree_starts and by its
        last base in tree_ends.
        '''
        istart = Interval(t.start, t.start + 1, value=t)
        iend = Interval(t.end - 1, t.end, value=t)
        self.tree_starts.insert_interval(istart)
        self.tree_ends.insert_interval(iend)

    def _trim_change_point(self, cp):
        '''
        Trim transfrags whose end (cp.sign < 0) or start (otherwise)
        falls inside the change point interval so that they end/start
        at cp.pos. Transfrag exons are modified in place.

        returns the number of transfrags that were trimmed
        '''
        # search for matches in change point interval
        num_trimmed = 0
        if cp.sign < 0:
            hits = self.tree_ends.find(cp.pos, cp.end)
            for hit in hits:
                t = hit.value
                # last exon start cannot overlap interval
                last_exon_start = t.exons[-1][0]
                if cp.pos <= last_exon_start <= cp.end:
                    continue
                # trim end left
                t.exons[-1] = Exon(last_exon_start, cp.pos)
                num_trimmed += 1
        else:
            hits = self.tree_starts.find(cp.start, cp.pos)
            for hit in hits:
                t = hit.value
                # first exon end cannot overlap interval
                first_exon_end = t.exons[0][1]
                if cp.start <= first_exon_end <= cp.pos:
                    continue
                # trim start right
                t.exons[0] = Exon(cp.pos, first_exon_end)
                num_trimmed += 1
        return num_trimmed

    def detect_change_points(self, *args, **kwargs):
        '''
        *args, **kwargs: passed directly to 'run_changepoint'

        returns list of ChangePoint tuples
        '''
        changepts = []
        for n_id in self.G:
            n = self.get_node_interval(n_id)
            expr_data = self.get_expr_data(n.start, n.end)
            for cp in run_changepoint(expr_data, *args, **kwargs):
                # add offset from start of node to change point positions
                cp = cp._replace(pos=n.start + cp.pos,
                                 start=n.start + cp.start,
                                 end=n.start + cp.end)
                changepts.append(cp)
                # lazy arguments: the message is only formatted when
                # debug logging is enabled
                logging.debug(
                    '\t%s:%d-%d[%s] node: %s-%s cp:%d(%d-%d) '
                    'p=%.3f fc=%.3f',
                    self.chrom, self.start, self.end,
                    Strand.to_gtf(self.strand), n.start, n.end,
                    cp.pos, cp.start, cp.end, cp.pvalue, cp.foldchange)
        return changepts

    def apply_change_point(self, cp, trim=True):
        '''
        Record change point 'cp' as a start or stop site, depending on
        its sign and the strand of the graph.

        trim: if true, will trim transfrags around change points
        '''
        if ((self.strand != Strand.NEG and cp.sign < 0) or
                (self.strand == Strand.NEG and cp.sign > 0)):
            self.stop_sites.add(cp.pos)
        else:
            self.start_sites.add(cp.pos)
        if trim:
            self._trim_change_point(cp)

    def get_change_point_gtf(self, cp):
        '''
        returns list of two GTF features describing 'cp': a 'changept'
        feature for the position and a 'changeinterval' feature for the
        surrounding interval
        '''
        graph_id = (
            'G_%s_%d_%d_%s'
            % (self.chrom, self.start, self.end, Strand.to_gtf(self.strand)))
        spans = (('changept', cp.pos, cp.pos + 1),
                 ('changeinterval', cp.start, cp.end))
        features = []
        for feature, start, end in spans:
            f = GTF.Feature()
            f.seqid = self.chrom
            f.source = 'taco'
            f.feature = feature
            f.start = start
            f.end = end
            f.score = 0
            f.strand = Strand.to_gtf(self.strand)
            f.phase = '.'
            # each feature gets its own attribute dict
            f.attrs = {
                'graph_id': graph_id,
                'sign': str(cp.sign),
                'pvalue': str(cp.pvalue),
                'foldchange': str(cp.foldchange)
            }
            features.append(f)
        return features

    def split(self):
        '''splits into weakly connected component subgraphs'''
        # get connected components of graph which represent independent genes
        # unconnected components are considered different genes
        Gsubs = list(nx.weakly_connected_component_subgraphs(self.G))
        if len(Gsubs) == 1:
            yield self
            return
        # map nodes to components
        node_subgraph_map = {}
        subgraph_transfrag_map = collections.defaultdict(list)
        for i, Gsub in enumerate(Gsubs):
            for n_id in Gsub:
                n = self.get_node_interval(n_id)
                node_subgraph_map[n] = i
        # assign transfrags to components; the first node of a transfrag
        # decides its component
        # NOTE(review): only transfrags yielded by itertransfrags() are
        # carried over into the new graphs
        for t in self.itertransfrags():
            for n in split_transfrag(t, self.node_bounds):
                subgraph_id = node_subgraph_map[n]
                subgraph_transfrag_map[subgraph_id].append(t)
                break
        # create new graphs using the separate components
        for subgraph_transfrags in subgraph_transfrag_map.values():
            yield SpliceGraph.create(subgraph_transfrags,
                                     guided_ends=self.guided_ends,
                                     guided_assembly=self.guided_assembly)

    def get_node_interval(self, n_id):
        '''returns the interval stored in the graph for node id n_id'''
        return self.G.node[n_id][SGNode.INTERVAL]

    def get_node_id(self, n):
        '''returns the node id of interval n'''
        return self.G.graph['node_id_map'][n]

    def get_node_expr_data(self, n_id):
        '''returns the slice of expression data covered by node n_id'''
        start, end = self.get_node_interval(n_id)
        return self.get_expr_data(start, end)

    def itertransfrags(self):
        '''
        returns iterator over the transfrags used to build the graph;
        reference transfrags are included only when guided_assembly is
        set
        '''
        if self.guided_assembly:
            return chain(self.transfrags, self.ref_transfrags)
        else:
            return iter(self.transfrags)

    def get_start_stop_nodes(self):
        '''returns (start_nodes, stop_nodes) sets of flagged node ids'''
        start_nodes = set()
        stop_nodes = set()
        for n, nd in self.G.nodes_iter(data=True):
            if nd[SGNode.IS_START]:
                start_nodes.add(n)
            if nd[SGNode.IS_STOP]:
                stop_nodes.add(n)
        return start_nodes, stop_nodes

    def get_expr_data(self, start=None, end=None):
        '''
        start, end: genomic coordinates; default to the locus bounds

        returns the slice of self.expr_data for start-end
        raises TacoError when the query extends outside the locus
        '''
        if start is None:
            start = self.start
        if end is None:
            end = self.end
        if ((start < self.start) or (end > self.end)):
            m = ('query %d-%d outside locus bounds %d-%d' %
                 (start, end, self.start, self.end))
            raise TacoError(m)
        astart = start - self.start
        aend = end - self.start
        return self.expr_data[astart:aend]

    def get_node_gtf(self):
        '''
        generator yielding one GTF 'node' feature per graph node, with
        min/max/mean expression and the reference start/stop sites
        reported by _array_subset for the node interval
        '''
        graph_id = (
            'G_%s_%d_%d_%s'
            % (self.chrom, self.start, self.end, Strand.to_gtf(self.strand)))
        # iterate through locus and return change point data
        for n_id in self.G:
            n = self.get_node_interval(n_id)
            expr_data = self.get_expr_data(*n)
            ref_starts = _array_subset(self.ref_start_sites, *n)
            ref_stops = _array_subset(self.ref_stop_sites, *n)
            # return gtf feature for each node
            f = GTF.Feature()
            f.seqid = self.chrom
            f.source = 'taco'
            f.feature = 'node'
            f.start = n[0]
            f.end = n[1]
            f.score = 0
            f.strand = Strand.to_gtf(self.strand)
            f.phase = '.'
            f.attrs = {
                'graph_id': graph_id,
                'expr_min': str(expr_data.min()),
                'expr_max': str(expr_data.max()),
                'expr_mean': str(expr_data.mean()),
                'ref_starts': ','.join(map(str, ref_starts)),
                'ref_stops': ','.join(map(str, ref_stops))
            }
            yield f
class SpliceGraph(object):
    '''
    Splice graph of a single stranded locus, built from transfrags.

    The graph itself (self.G) is an SGraph whose nodes are genomic
    intervals addressed through integer node ids. Use
    SpliceGraph.create() to build an instance.
    '''

    def __init__(self):
        '''Create an empty graph; see create() for the usual entry point.'''
        self.guided_ends = False
        self.guided_assembly = False
        self.transfrags = []
        self.ref_transfrags = []
        self.chrom = None
        self.start = None
        self.end = None
        self.strand = None
        self.tree_starts = IntervalTree()
        self.tree_ends = IntervalTree()
        self.ref_start_sites = set()
        self.ref_stop_sites = set()
        self.start_sites = set()
        self.stop_sites = set()
        self.expr_data = None
        self.node_bounds = None
        self.G = None

    def __str__(self):
        return ('SpliceGraph %s:%d-%d[%s] transfrags: %d' %
                (self.chrom, self.start, self.end,
                 Strand.to_gtf(self.strand), len(self.transfrags)))

    @staticmethod
    def create(transfrags, guided_ends=False, guided_assembly=False):
        '''
        Build a SpliceGraph from an iterable of transfrags.

        guided_ends: also use reference transfrag starts/ends as node
        boundaries and reference tx start/stop sites as start/stop marks
        guided_assembly: also add reference transfrags (and their splice
        sites) to the graph

        returns the new SpliceGraph
        '''
        sgraph = SpliceGraph()
        sgraph.guided_ends = guided_ends
        sgraph.guided_assembly = guided_assembly
        sgraph._add_transfrags(transfrags)
        sgraph.expr_data = sgraph._compute_expression()
        sgraph.node_bounds = sgraph._find_node_boundaries()
        sgraph.G = sgraph._create_splice_graph()
        sgraph._mark_start_stop_nodes()
        return sgraph

    def recreate(self):
        '''
        Rebuild all derived state (locus bounds, expression, node
        boundaries, graph, start/stop marks) from the transfrags
        currently held. self.start_sites / self.stop_sites are kept.
        '''
        # chain() captures the current list objects before
        # _add_transfrags rebinds the attributes to fresh lists
        self._add_transfrags(chain(self.transfrags, self.ref_transfrags))
        self.expr_data = self._compute_expression()
        self.node_bounds = self._find_node_boundaries()
        self.G = self._create_splice_graph()
        self._mark_start_stop_nodes()

    def _add_transfrags(self, transfrags):
        '''
        Reset per-locus state and load it from an iterable of transfrags.

        Raises TacoError when transfrags disagree on chrom or strand.
        '''
        self.transfrags = []
        self.ref_transfrags = []
        self.chrom = None
        self.start = None
        self.end = None
        self.strand = None
        self.ref_start_sites = set()
        self.ref_stop_sites = set()
        self.tree_starts = IntervalTree()
        self.tree_ends = IntervalTree()
        for t in transfrags:
            if self.chrom is None:
                self.chrom = t.chrom
            elif self.chrom != t.chrom:
                raise TacoError('chrom mismatch')
            if self.strand is None:
                self.strand = t.strand
            elif self.strand != t.strand:
                raise TacoError('strand mismatch')
            if self.start is None:
                self.start = t.start
            else:
                self.start = min(t.start, self.start)
            if self.end is None:
                self.end = t.end
            else:
                self.end = max(t.end, self.end)
            if t.is_ref:
                self.ref_start_sites.add(t.txstart)
                self.ref_stop_sites.add(t.txstop)
                self.ref_transfrags.append(t)
            else:
                self._add_to_interval_trees(t)
                self.transfrags.append(t)
        # reference sites are kept as sorted lists from here on
        self.ref_start_sites = sorted(self.ref_start_sites)
        self.ref_stop_sites = sorted(self.ref_stop_sites)

    def _compute_expression(self):
        '''
        returns array of length (end - start) holding, per position, the
        summed 'expr' of every non-reference transfrag exon covering it
        '''
        expr_data = np.zeros(self.end - self.start, dtype=FLOAT_DTYPE)
        for t in self.transfrags:
            for exon in t.exons:
                astart = exon.start - self.start
                aend = exon.end - self.start
                expr_data[astart:aend] += t.expr
        return expr_data

    def _find_node_boundaries(self):
        '''returns sorted array('i') of all node boundary positions'''
        node_bounds = set((self.start, self.end))
        node_bounds.update(self.start_sites)
        node_bounds.update(self.stop_sites)
        # nodes bounded by regions where expression changes to/from zero
        node_bounds.update(find_threshold_points(self.expr_data, self.start))
        # nodes bounded by introns
        for t in self.transfrags:
            node_bounds.update(t.itersplices())
        if self.guided_ends or self.guided_assembly:
            for t in self.ref_transfrags:
                if self.guided_ends:
                    node_bounds.update((t.start, t.end))
                if self.guided_assembly:
                    node_bounds.update(t.itersplices())
        return array('i', sorted(node_bounds))

    def get_path(self, t):
        '''
        returns tuple of node ids covered by transfrag 't'; the order is
        reversed when the graph is on the negative strand
        '''
        node_id_map = self.G.node_id_map
        nodes = [node_id_map[n] for n in split_transfrag(t, self.node_bounds)]
        if self.strand == Strand.NEG:
            nodes.reverse()
        return tuple(nodes)

    def _create_splice_graph(self):
        '''returns SGraph object built from the transfrag paths'''
        G = SGraph()
        for t in self.itertransfrags():
            nodes = [Exon(*n) for n in split_transfrag(t, self.node_bounds)]
            if self.strand == Strand.NEG:
                nodes.reverse()
            G.add_path(nodes, t.is_ref)
        # after graph is built, set node properties for start and stop nodes
        G.set_start_stop_nodes()
        return G

    def _mark_start_stop_nodes(self):
        '''
        Flag as start/stop the node adjacent to every recorded
        start/stop site (plus the reference sites when guided_ends is
        set). Positions that do not map to a node of the graph are
        skipped.
        '''
        G = self.G
        node_bounds = self.node_bounds
        # which side of the position the flagged node lies on depends
        # only on the strand, so pick the bisect functions once
        if self.strand == Strand.NEG:
            start_bisect = bisect.bisect_left
            stop_bisect = bisect.bisect_right
        else:
            start_bisect = bisect.bisect_right
            stop_bisect = bisect.bisect_left
        # mark start nodes
        changepts = set(self.start_sites)
        if self.guided_ends:
            changepts.update(self.ref_start_sites)
        for pos in changepts:
            i = start_bisect(node_bounds, pos)
            n = (node_bounds[i - 1], node_bounds[i])
            if G.has_node(n):
                n_id = G.get_node_id(n)
                G.is_start[n_id] = True
        # mark stop nodes
        changepts = set(self.stop_sites)
        if self.guided_ends:
            changepts.update(self.ref_stop_sites)
        for pos in changepts:
            i = stop_bisect(node_bounds, pos)
            n = (node_bounds[i - 1], node_bounds[i])
            if G.has_node(n):
                n_id = G.get_node_id(n)
                G.is_stop[n_id] = True

    def _add_to_interval_trees(self, t):
        '''
        Index transfrag 't' by its first base in tree_starts and by its
        last base in tree_ends.
        '''
        istart = Interval(t.start, t.start + 1, value=t)
        iend = Interval(t.end - 1, t.end, value=t)
        self.tree_starts.insert_interval(istart)
        self.tree_ends.insert_interval(iend)

    def _trim_change_point(self, cp):
        '''
        Trim transfrags whose end (cp.sign < 0) or start (otherwise)
        falls inside the change point interval so that they end/start
        at cp.pos. Transfrag exons are modified in place.

        returns the number of transfrags that were trimmed
        '''
        # search for matches in change point interval
        num_trimmed = 0
        if cp.sign < 0:
            hits = self.tree_ends.find(cp.pos, cp.end)
            for hit in hits:
                t = hit.value
                # last exon start cannot overlap interval
                last_exon_start = t.exons[-1][0]
                if cp.pos <= last_exon_start <= cp.end:
                    continue
                # trim end left
                t.exons[-1] = Exon(last_exon_start, cp.pos)
                num_trimmed += 1
        else:
            hits = self.tree_starts.find(cp.start, cp.pos)
            for hit in hits:
                t = hit.value
                # first exon end cannot overlap interval
                first_exon_end = t.exons[0][1]
                if cp.start <= first_exon_end <= cp.pos:
                    continue
                # trim start right
                t.exons[0] = Exon(cp.pos, first_exon_end)
                num_trimmed += 1
        return num_trimmed

    def detect_change_points(self, *args, **kwargs):
        '''
        *args, **kwargs: passed directly to 'run_changepoint'

        returns list of ChangePoint tuples
        '''
        changepts = []
        for n in self.G.nodes_iter():
            expr_data = self.get_expr_data(*n)
            for cp in run_changepoint(expr_data, *args, **kwargs):
                # add offset from start of node to change point positions
                cp = cp._replace(pos=n.start + cp.pos,
                                 start=n.start + cp.start,
                                 end=n.start + cp.end)
                changepts.append(cp)
                # lazy arguments: the message (including str(self)) is
                # only formatted when debug logging is enabled
                logging.debug('%s changepoint node=(%s-%s) '
                              'pos=%d interval=(%d-%d) p=%.3f fc=%.3f',
                              self, n.start, n.end, cp.pos, cp.start,
                              cp.end, cp.pvalue, cp.foldchange)
        return changepts

    def apply_change_point(self, cp, trim=True):
        '''
        Record change point 'cp' as a start or stop site, depending on
        its sign and the strand of the graph.

        trim: if true, will trim transfrags around change points
        '''
        if ((self.strand != Strand.NEG and cp.sign < 0) or
                (self.strand == Strand.NEG and cp.sign > 0)):
            self.stop_sites.add(cp.pos)
        else:
            self.start_sites.add(cp.pos)
        if trim:
            self._trim_change_point(cp)

    def get_change_point_gtf(self, cp):
        '''
        returns list of two GTF features describing 'cp': a 'changept'
        feature for the position and a 'changeinterval' feature for the
        surrounding interval
        '''
        graph_id = (
            'G_%s_%d_%d_%s'
            % (self.chrom, self.start, self.end, Strand.to_gtf(self.strand)))
        spans = (('changept', cp.pos, cp.pos + 1),
                 ('changeinterval', cp.start, cp.end))
        features = []
        for feature, start, end in spans:
            f = GTF.Feature()
            f.seqid = self.chrom
            f.source = 'taco'
            f.feature = feature
            f.start = start
            f.end = end
            f.score = 0
            f.strand = Strand.to_gtf(self.strand)
            f.phase = '.'
            # each feature gets its own attribute dict
            f.attrs = {
                'graph_id': graph_id,
                'sign': str(cp.sign),
                'pvalue': str(cp.pvalue),
                'foldchange': str(cp.foldchange)
            }
            features.append(f)
        return features

    def split(self):
        '''splits into weakly connected component subgraphs'''
        # get connected components of graph which represent independent genes
        # unconnected components are considered different genes
        num_components, components = self.G.weakly_connected_components()
        if num_components == 1:
            yield self
            return
        # map transfrags to components
        component_transfrag_map = collections.defaultdict(list)
        # assign transfrags to components; the first node of a transfrag
        # decides its component
        # NOTE(review): only transfrags yielded by itertransfrags() are
        # carried over into the new graphs
        for t in self.itertransfrags():
            for n in split_transfrag(t, self.node_bounds):
                component_id = components[self.G.get_node_id(n)]
                component_transfrag_map[component_id].append(t)
                break
        # create new graphs using the separate components
        for component_transfrags in component_transfrag_map.values():
            yield SpliceGraph.create(component_transfrags,
                                     guided_ends=self.guided_ends,
                                     guided_assembly=self.guided_assembly)

    def get_node_interval(self, n_id):
        '''returns the node (interval) stored in the graph for id n_id'''
        return self.G.get_node(n_id)

    def get_node_id(self, n):
        '''returns the node id of interval n'''
        return self.G.get_node_id(n)

    def get_node_expr_data(self, n_id):
        '''returns the slice of expression data covered by node n_id'''
        start, end = self.G.get_node(n_id)
        return self.get_expr_data(start, end)

    def itertransfrags(self):
        '''
        returns iterator over the transfrags used to build the graph;
        reference transfrags are included only when guided_assembly is
        set
        '''
        if self.guided_assembly:
            return chain(self.transfrags, self.ref_transfrags)
        else:
            return iter(self.transfrags)

    def get_start_stop_nodes(self):
        '''returns (start_nodes, stop_nodes) sets of flagged node ids'''
        start_nodes = set()
        stop_nodes = set()
        for i in self.G.node_ids_iter():
            if self.G.is_start[i]:
                start_nodes.add(i)
            if self.G.is_stop[i]:
                stop_nodes.add(i)
        return start_nodes, stop_nodes

    def reconstruct_exons(self, path):
        '''
        path: sequence of integer node ids in graph (strand) order

        returns list of Exon objects ordered by genomic coordinate, with
        runs of contiguous nodes merged into a single exon; an empty
        path gives an empty list. 'path' itself is not modified.
        '''
        # work on a copy: reversing the caller's sequence in place made
        # a second call on the same path return a different answer, and
        # failed outright for tuples such as those from get_path()
        node_ids = list(path)
        if not node_ids:
            return []
        # reverse negative stranded data so that all paths go from
        # small -> large genomic coords
        if self.strand == Strand.NEG:
            node_ids.reverse()
        # convert from integer node labels to genome (start, end) tuples
        intervals = [self.get_node_interval(nid) for nid in node_ids]
        # collapse contiguous nodes along path
        exons = []
        run_start = intervals[0].start
        run_end = intervals[0].end
        for v in intervals[1:]:
            if run_end != v.start:
                # gap between nodes: close the current run
                exons.append(Exon(run_start, run_end))
                run_start = v.start
            run_end = v.end
        # add last run
        exons.append(Exon(run_start, run_end))
        return exons

    def get_expr_data(self, start=None, end=None):
        '''
        start, end: genomic coordinates; default to the locus bounds

        returns the slice of self.expr_data for start-end
        raises TacoError when the query extends outside the locus
        '''
        if start is None:
            start = self.start
        if end is None:
            end = self.end
        if ((start < self.start) or (end > self.end)):
            m = ('query %d-%d outside locus bounds %d-%d' %
                 (start, end, self.start, self.end))
            raise TacoError(m)
        astart = start - self.start
        aend = end - self.start
        return self.expr_data[astart:aend]

    def get_node_gtf(self):
        '''
        generator yielding one GTF 'node' feature per graph node, with
        min/max/mean expression and the reference start/stop sites
        reported by _array_subset for the node interval
        '''
        graph_id = (
            'G_%s_%d_%d_%s'
            % (self.chrom, self.start, self.end, Strand.to_gtf(self.strand)))
        # iterate through locus and return change point data
        for n_id in self.G.node_ids_iter():
            n = self.get_node_interval(n_id)
            expr_data = self.get_expr_data(*n)
            ref_starts = _array_subset(self.ref_start_sites, *n)
            ref_stops = _array_subset(self.ref_stop_sites, *n)
            # return gtf feature for each node
            f = GTF.Feature()
            f.seqid = self.chrom
            f.source = 'taco'
            f.feature = 'node'
            f.start = n[0]
            f.end = n[1]
            f.score = 0
            f.strand = Strand.to_gtf(self.strand)
            f.phase = '.'
            f.attrs = {
                'graph_id': graph_id,
                'expr_min': str(expr_data.min()),
                'expr_max': str(expr_data.max()),
                'expr_mean': str(expr_data.mean()),
                'ref_starts': ','.join(map(str, ref_starts)),
                'ref_stops': ','.join(map(str, ref_stops))
            }
            yield f
class SpliceGraph(object):
    '''
    Splice graph of a single stranded locus, built from transfrags.

    The graph itself (self.G) is an SGraph whose nodes are genomic
    intervals addressed through integer node ids. Use
    SpliceGraph.create() to build an instance.
    '''

    def __init__(self):
        '''Create an empty graph; see create() for the usual entry point.'''
        self.guided_ends = False
        self.guided_assembly = False
        self.transfrags = []
        self.ref_transfrags = []
        self.chrom = None
        self.start = None
        self.end = None
        self.strand = None
        self.tree_starts = IntervalTree()
        self.tree_ends = IntervalTree()
        self.ref_start_sites = set()
        self.ref_stop_sites = set()
        self.start_sites = set()
        self.stop_sites = set()
        self.expr_data = None
        self.node_bounds = None
        self.G = None

    def __str__(self):
        return ('SpliceGraph %s:%d-%d[%s] transfrags: %d' %
                (self.chrom, self.start, self.end,
                 Strand.to_gtf(self.strand), len(self.transfrags)))

    @staticmethod
    def create(transfrags, guided_ends=False, guided_assembly=False):
        '''
        Build a SpliceGraph from an iterable of transfrags.

        guided_ends: also use reference transfrag starts/ends as node
        boundaries and reference tx start/stop sites as start/stop marks
        guided_assembly: also add reference transfrags (and their splice
        sites) to the graph

        returns the new SpliceGraph
        '''
        sgraph = SpliceGraph()
        sgraph.guided_ends = guided_ends
        sgraph.guided_assembly = guided_assembly
        sgraph._add_transfrags(transfrags)
        sgraph.expr_data = sgraph._compute_expression()
        sgraph.node_bounds = sgraph._find_node_boundaries()
        sgraph.G = sgraph._create_splice_graph()
        sgraph._mark_start_stop_nodes()
        return sgraph

    def recreate(self):
        '''
        Rebuild all derived state (locus bounds, expression, node
        boundaries, graph, start/stop marks) from the transfrags
        currently held. self.start_sites / self.stop_sites are kept.
        '''
        # chain() captures the current list objects before
        # _add_transfrags rebinds the attributes to fresh lists
        self._add_transfrags(chain(self.transfrags, self.ref_transfrags))
        self.expr_data = self._compute_expression()
        self.node_bounds = self._find_node_boundaries()
        self.G = self._create_splice_graph()
        self._mark_start_stop_nodes()

    def _add_transfrags(self, transfrags):
        '''
        Reset per-locus state and load it from an iterable of transfrags.

        Raises TacoError when transfrags disagree on chrom or strand.
        '''
        self.transfrags = []
        self.ref_transfrags = []
        self.chrom = None
        self.start = None
        self.end = None
        self.strand = None
        self.ref_start_sites = set()
        self.ref_stop_sites = set()
        self.tree_starts = IntervalTree()
        self.tree_ends = IntervalTree()
        for t in transfrags:
            if self.chrom is None:
                self.chrom = t.chrom
            elif self.chrom != t.chrom:
                raise TacoError('chrom mismatch')
            if self.strand is None:
                self.strand = t.strand
            elif self.strand != t.strand:
                raise TacoError('strand mismatch')
            if self.start is None:
                self.start = t.start
            else:
                self.start = min(t.start, self.start)
            if self.end is None:
                self.end = t.end
            else:
                self.end = max(t.end, self.end)
            if t.is_ref:
                self.ref_start_sites.add(t.txstart)
                self.ref_stop_sites.add(t.txstop)
                self.ref_transfrags.append(t)
            else:
                self._add_to_interval_trees(t)
                self.transfrags.append(t)
        # reference sites are kept as sorted lists from here on
        self.ref_start_sites = sorted(self.ref_start_sites)
        self.ref_stop_sites = sorted(self.ref_stop_sites)

    def _compute_expression(self):
        '''
        returns array of length (end - start) holding, per position, the
        summed 'expr' of every non-reference transfrag exon covering it
        '''
        expr_data = np.zeros(self.end - self.start, dtype=FLOAT_DTYPE)
        for t in self.transfrags:
            for exon in t.exons:
                astart = exon.start - self.start
                aend = exon.end - self.start
                expr_data[astart:aend] += t.expr
        return expr_data

    def _find_node_boundaries(self):
        '''returns sorted array('i') of all node boundary positions'''
        node_bounds = set((self.start, self.end))
        node_bounds.update(self.start_sites)
        node_bounds.update(self.stop_sites)
        # nodes bounded by regions where expression changes to/from zero
        node_bounds.update(find_threshold_points(self.expr_data, self.start))
        # nodes bounded by introns
        for t in self.transfrags:
            node_bounds.update(t.itersplices())
        if self.guided_ends or self.guided_assembly:
            for t in self.ref_transfrags:
                if self.guided_ends:
                    node_bounds.update((t.start, t.end))
                if self.guided_assembly:
                    node_bounds.update(t.itersplices())
        return array('i', sorted(node_bounds))

    def get_path(self, t):
        '''
        returns tuple of node ids covered by transfrag 't'; the order is
        reversed when the graph is on the negative strand
        '''
        node_id_map = self.G.node_id_map
        nodes = [node_id_map[n] for n in split_transfrag(t, self.node_bounds)]
        if self.strand == Strand.NEG:
            nodes.reverse()
        return tuple(nodes)

    def _create_splice_graph(self):
        '''returns SGraph object built from the transfrag paths'''
        G = SGraph()
        for t in self.itertransfrags():
            nodes = [Exon(*n) for n in split_transfrag(t, self.node_bounds)]
            if self.strand == Strand.NEG:
                nodes.reverse()
            G.add_path(nodes, t.is_ref)
        # after graph is built, set node properties for start and stop nodes
        G.set_start_stop_nodes()
        return G

    def _mark_start_stop_nodes(self):
        '''
        Flag as start/stop the node adjacent to every recorded
        start/stop site (plus the reference sites when guided_ends is
        set). Positions that do not map to a node of the graph are
        skipped.
        '''
        G = self.G
        node_bounds = self.node_bounds
        # which side of the position the flagged node lies on depends
        # only on the strand, so pick the bisect functions once
        if self.strand == Strand.NEG:
            start_bisect = bisect.bisect_left
            stop_bisect = bisect.bisect_right
        else:
            start_bisect = bisect.bisect_right
            stop_bisect = bisect.bisect_left
        # mark start nodes
        changepts = set(self.start_sites)
        if self.guided_ends:
            changepts.update(self.ref_start_sites)
        for pos in changepts:
            i = start_bisect(node_bounds, pos)
            n = (node_bounds[i-1], node_bounds[i])
            if G.has_node(n):
                n_id = G.get_node_id(n)
                G.is_start[n_id] = True
        # mark stop nodes
        changepts = set(self.stop_sites)
        if self.guided_ends:
            changepts.update(self.ref_stop_sites)
        for pos in changepts:
            i = stop_bisect(node_bounds, pos)
            n = (node_bounds[i-1], node_bounds[i])
            if G.has_node(n):
                n_id = G.get_node_id(n)
                G.is_stop[n_id] = True

    def _add_to_interval_trees(self, t):
        '''
        Index transfrag 't' by its first base in tree_starts and by its
        last base in tree_ends.
        '''
        istart = Interval(t.start, t.start+1, value=t)
        iend = Interval(t.end-1, t.end, value=t)
        self.tree_starts.insert_interval(istart)
        self.tree_ends.insert_interval(iend)

    def _trim_change_point(self, cp):
        '''
        Trim transfrags whose end (cp.sign < 0) or start (otherwise)
        falls inside the change point interval so that they end/start
        at cp.pos. Transfrag exons are modified in place.

        returns the number of transfrags that were trimmed
        '''
        # search for matches in change point interval
        num_trimmed = 0
        if cp.sign < 0:
            hits = self.tree_ends.find(cp.pos, cp.end)
            for hit in hits:
                t = hit.value
                # last exon start cannot overlap interval
                last_exon_start = t.exons[-1][0]
                if cp.pos <= last_exon_start <= cp.end:
                    continue
                # trim end left
                t.exons[-1] = Exon(last_exon_start, cp.pos)
                num_trimmed += 1
        else:
            hits = self.tree_starts.find(cp.start, cp.pos)
            for hit in hits:
                t = hit.value
                # first exon end cannot overlap interval
                first_exon_end = t.exons[0][1]
                if cp.start <= first_exon_end <= cp.pos:
                    continue
                # trim start right
                t.exons[0] = Exon(cp.pos, first_exon_end)
                num_trimmed += 1
        return num_trimmed

    def detect_change_points(self, *args, **kwargs):
        '''
        *args, **kwargs: passed directly to 'run_changepoint'

        returns list of ChangePoint tuples
        '''
        changepts = []
        for n in self.G.nodes_iter():
            expr_data = self.get_expr_data(*n)
            for cp in run_changepoint(expr_data, *args, **kwargs):
                # add offset from start of node to change point positions
                cp = cp._replace(pos=n.start + cp.pos,
                                 start=n.start + cp.start,
                                 end=n.start + cp.end)
                changepts.append(cp)
                # lazy arguments: the message (including str(self)) is
                # only formatted when debug logging is enabled
                logging.debug('%s changepoint node=(%s-%s) '
                              'pos=%d interval=(%d-%d) p=%.3f fc=%.3f',
                              self, n.start, n.end, cp.pos, cp.start,
                              cp.end, cp.pvalue, cp.foldchange)
        return changepts

    def apply_change_point(self, cp, trim=True):
        '''
        Record change point 'cp' as a start or stop site, depending on
        its sign and the strand of the graph.

        trim: if true, will trim transfrags around change points
        '''
        if ((self.strand != Strand.NEG and cp.sign < 0) or
                (self.strand == Strand.NEG and cp.sign > 0)):
            self.stop_sites.add(cp.pos)
        else:
            self.start_sites.add(cp.pos)
        if trim:
            self._trim_change_point(cp)

    def get_change_point_gtf(self, cp):
        '''
        returns list of two GTF features describing 'cp': a 'changept'
        feature for the position and a 'changeinterval' feature for the
        surrounding interval
        '''
        graph_id = ('G_%s_%d_%d_%s' %
                    (self.chrom, self.start, self.end,
                     Strand.to_gtf(self.strand)))
        spans = (('changept', cp.pos, cp.pos + 1),
                 ('changeinterval', cp.start, cp.end))
        features = []
        for feature, start, end in spans:
            f = GTF.Feature()
            f.seqid = self.chrom
            f.source = 'taco'
            f.feature = feature
            f.start = start
            f.end = end
            f.score = 0
            f.strand = Strand.to_gtf(self.strand)
            f.phase = '.'
            # each feature gets its own attribute dict
            f.attrs = {'graph_id': graph_id,
                       'sign': str(cp.sign),
                       'pvalue': str(cp.pvalue),
                       'foldchange': str(cp.foldchange)}
            features.append(f)
        return features

    def split(self):
        '''splits into weakly connected component subgraphs'''
        # get connected components of graph which represent independent genes
        # unconnected components are considered different genes
        num_components, components = self.G.weakly_connected_components()
        if num_components == 1:
            yield self
            return
        # map transfrags to components
        component_transfrag_map = collections.defaultdict(list)
        # assign transfrags to components; the first node of a transfrag
        # decides its component
        # NOTE(review): only transfrags yielded by itertransfrags() are
        # carried over into the new graphs
        for t in self.itertransfrags():
            for n in split_transfrag(t, self.node_bounds):
                component_id = components[self.G.get_node_id(n)]
                component_transfrag_map[component_id].append(t)
                break
        # create new graphs using the separate components
        for component_transfrags in component_transfrag_map.values():
            yield SpliceGraph.create(component_transfrags,
                                     guided_ends=self.guided_ends,
                                     guided_assembly=self.guided_assembly)

    def get_node_interval(self, n_id):
        '''returns the node (interval) stored in the graph for id n_id'''
        return self.G.get_node(n_id)

    def get_node_id(self, n):
        '''returns the node id of interval n'''
        return self.G.get_node_id(n)

    def get_node_expr_data(self, n_id):
        '''returns the slice of expression data covered by node n_id'''
        start, end = self.G.get_node(n_id)
        return self.get_expr_data(start, end)

    def itertransfrags(self):
        '''
        returns iterator over the transfrags used to build the graph;
        reference transfrags are included only when guided_assembly is
        set
        '''
        if self.guided_assembly:
            return chain(self.transfrags, self.ref_transfrags)
        else:
            return iter(self.transfrags)

    def get_start_stop_nodes(self):
        '''returns (start_nodes, stop_nodes) sets of flagged node ids'''
        start_nodes = set()
        stop_nodes = set()
        for i in self.G.node_ids_iter():
            if self.G.is_start[i]:
                start_nodes.add(i)
            if self.G.is_stop[i]:
                stop_nodes.add(i)
        return start_nodes, stop_nodes

    def reconstruct_exons(self, path):
        '''
        path: sequence of integer node ids in graph (strand) order

        returns list of Exon objects ordered by genomic coordinate, with
        runs of contiguous nodes merged into a single exon; an empty
        path gives an empty list. 'path' itself is not modified.
        '''
        # work on a copy: reversing the caller's sequence in place made
        # a second call on the same path return a different answer, and
        # failed outright for tuples such as those from get_path()
        node_ids = list(path)
        if not node_ids:
            return []
        # reverse negative stranded data so that all paths go from
        # small -> large genomic coords
        if self.strand == Strand.NEG:
            node_ids.reverse()
        # convert from integer node labels to genome (start, end) tuples
        intervals = [self.get_node_interval(nid) for nid in node_ids]
        # collapse contiguous nodes along path
        exons = []
        run_start = intervals[0].start
        run_end = intervals[0].end
        for v in intervals[1:]:
            if run_end != v.start:
                # gap between nodes: close the current run
                exons.append(Exon(run_start, run_end))
                run_start = v.start
            run_end = v.end
        # add last run
        exons.append(Exon(run_start, run_end))
        return exons

    def get_expr_data(self, start=None, end=None):
        '''
        start, end: genomic coordinates; default to the locus bounds

        returns the slice of self.expr_data for start-end
        raises TacoError when the query extends outside the locus
        '''
        if start is None:
            start = self.start
        if end is None:
            end = self.end
        if ((start < self.start) or (end > self.end)):
            m = ('query %d-%d outside locus bounds %d-%d' %
                 (start, end, self.start, self.end))
            raise TacoError(m)
        astart = start - self.start
        aend = end - self.start
        return self.expr_data[astart:aend]

    def get_node_gtf(self):
        '''
        generator yielding one GTF 'node' feature per graph node, with
        min/max/mean expression and the reference start/stop sites
        reported by _array_subset for the node interval
        '''
        graph_id = ('G_%s_%d_%d_%s' %
                    (self.chrom, self.start, self.end,
                     Strand.to_gtf(self.strand)))
        # iterate through locus and return change point data
        for n_id in self.G.node_ids_iter():
            n = self.get_node_interval(n_id)
            expr_data = self.get_expr_data(*n)
            ref_starts = _array_subset(self.ref_start_sites, *n)
            ref_stops = _array_subset(self.ref_stop_sites, *n)
            # return gtf feature for each node
            f = GTF.Feature()
            f.seqid = self.chrom
            f.source = 'taco'
            f.feature = 'node'
            f.start = n[0]
            f.end = n[1]
            f.score = 0
            f.strand = Strand.to_gtf(self.strand)
            f.phase = '.'
            f.attrs = {'graph_id': graph_id,
                       'expr_min': str(expr_data.min()),
                       'expr_max': str(expr_data.max()),
                       'expr_mean': str(expr_data.mean()),
                       'ref_starts': ','.join(map(str, ref_starts)),
                       'ref_stops': ','.join(map(str, ref_stops))}
            yield f
class SpliceGraph(object):
    '''
    Splice graph built from the transfrags of a single locus.

    All transfrags must share one chromosome and one strand. The graph
    nodes are genomic intervals delimited by 'node_bounds'; edges connect
    consecutive nodes along each transfrag, ordered 5'->3' (node order is
    reversed for negative-strand data).

    Use SpliceGraph.create() to build a populated instance; __init__ only
    sets up empty state.
    '''

    def __init__(self):
        self.guided_ends = False
        self.guided_assembly = False
        self.transfrags = []
        self.ref_transfrags = []
        self.chrom = None
        self.start = None
        self.end = None
        self.strand = None
        self.tree_starts = IntervalTree()
        self.tree_ends = IntervalTree()
        self.ref_start_sites = set()
        self.ref_stop_sites = set()
        # change point positions registered via apply_change_point()
        self.start_sites = set()
        self.stop_sites = set()
        self.expr_data = None
        self.node_bounds = None
        self.G = None

    @staticmethod
    def create(transfrags, guided_ends=False, guided_assembly=False):
        '''
        Build a fully initialized splice graph.

        transfrags: iterable of transfrag objects (reference and
                    non-reference may be mixed; they are separated using
                    their 'is_ref' attribute)
        guided_ends: if true, reference transcript ends contribute node
                     boundaries and start/stop marks
        guided_assembly: if true, reference transfrags contribute splice
                         boundaries and are added to the graph

        returns SpliceGraph
        raises TacoError if transfrags disagree on chrom or strand
        '''
        self = SpliceGraph()
        self.guided_ends = guided_ends
        self.guided_assembly = guided_assembly
        self._add_transfrags(transfrags)
        self.expr_data = self._compute_expression()
        self.node_bounds = self._find_node_boundaries()
        self.G = self._create_splice_graph()
        self._mark_start_stop_nodes()
        return self

    def recreate(self):
        '''
        Rebuild all derived state (locus bounds, interval trees,
        expression, node boundaries, graph) from the current transfrags.
        start_sites / stop_sites are left as they are.
        '''
        # chain() captures the current list objects before
        # _add_transfrags rebinds the attributes to fresh lists
        self._add_transfrags(chain(self.transfrags, self.ref_transfrags))
        self.expr_data = self._compute_expression()
        self.node_bounds = self._find_node_boundaries()
        self.G = self._create_splice_graph()
        self._mark_start_stop_nodes()

    def _add_transfrags(self, transfrags):
        '''
        Reset locus state and load 'transfrags'.

        Computes chrom, strand and the start/end extent of the locus,
        separates reference from non-reference transfrags, and indexes
        the non-reference ones in the start/end interval trees.

        raises TacoError on chrom or strand mismatch
        '''
        self.transfrags = []
        self.ref_transfrags = []
        self.chrom = None
        self.start = None
        self.end = None
        self.strand = None
        self.ref_start_sites = set()
        self.ref_stop_sites = set()
        self.tree_starts = IntervalTree()
        self.tree_ends = IntervalTree()

        for t in transfrags:
            if self.chrom is None:
                self.chrom = t.chrom
            elif self.chrom != t.chrom:
                raise TacoError('chrom mismatch')
            if self.strand is None:
                self.strand = t.strand
            elif self.strand != t.strand:
                raise TacoError('strand mismatch')
            if self.start is None:
                self.start = t.start
            else:
                self.start = min(t.start, self.start)
            if self.end is None:
                self.end = t.end
            else:
                self.end = max(t.end, self.end)
            if t.is_ref:
                self.ref_start_sites.add(t.txstart)
                self.ref_stop_sites.add(t.txstop)
                self.ref_transfrags.append(t)
            else:
                self._add_to_interval_trees(t)
                self.transfrags.append(t)

        # stored as sorted lists from here on
        self.ref_start_sites = sorted(self.ref_start_sites)
        self.ref_stop_sites = sorted(self.ref_stop_sites)

    def _compute_expression(self):
        '''
        returns numpy array with one entry per position of the locus,
        holding the summed 'expr' of the non-reference transfrags whose
        exons cover that position
        '''
        expr_data = np.zeros(self.end - self.start, dtype=FLOAT_DTYPE)
        for t in self.transfrags:
            for exon in t.exons:
                astart = exon.start - self.start
                aend = exon.end - self.start
                expr_data[astart:aend] += t.expr
        return expr_data

    def _find_node_boundaries(self):
        '''
        returns sorted array('i') of genomic positions that delimit the
        graph nodes
        '''
        node_bounds = set((self.start, self.end))
        node_bounds.update(self.start_sites)
        node_bounds.update(self.stop_sites)
        # nodes bounded by regions where expression changes to/from zero
        node_bounds.update(find_threshold_points(self.expr_data, self.start))
        # nodes bounded by introns
        for t in self.transfrags:
            node_bounds.update(t.itersplices())
        if self.guided_ends or self.guided_assembly:
            for t in self.ref_transfrags:
                if self.guided_ends:
                    node_bounds.update((t.start, t.end))
                if self.guided_assembly:
                    node_bounds.update(t.itersplices())
        return array('i', sorted(node_bounds))

    def get_path(self, t):
        '''
        returns tuple of node ids traversed by transfrag 't'; the order
        is reversed for negative-strand data

        raises KeyError if a piece of 't' is not a node of the graph
        '''
        node_id_map = self.G.graph['node_id_map']
        nodes = [node_id_map[n] for n in split_transfrag(t, self.node_bounds)]
        if self.strand == Strand.NEG:
            nodes.reverse()
        return tuple(nodes)

    def _create_splice_graph2(self):
        '''
        Alternative graph construction based on SGraph.

        NOTE(review): not called by create()/recreate(), and SGraph is
        not defined in the visible part of this file - confirm this
        method is still needed.
        '''
        G = SGraph()
        # the original iterated 'sgraph.itertransfrags()', but no
        # 'sgraph' exists in this scope; the transfrags of this graph
        # are the intended source
        for t in self.itertransfrags():
            nodes = list(split_transfrag(t, self.node_bounds))
            if self.strand == Strand.NEG:
                nodes.reverse()
            G.add_path(nodes, t.is_ref)
        return G

    def _create_splice_graph(self):
        '''returns networkx DiGraph object'''
        G = nx.DiGraph()
        node_bounds = self.node_bounds
        node_id_map = {}
        id_node_map = {}
        # integer node ids are handed out starting at 2
        current_id = 2
        for t in self.itertransfrags():
            # split exons that cross boundaries and get the
            # nodes that made up the transfrag
            nodes = []
            for n in split_transfrag(t, node_bounds):
                n = Exon(*n)
                if n not in node_id_map:
                    n_id = current_id
                    current_id += 1
                    node_id_map[n] = n_id
                    id_node_map[n_id] = n
                else:
                    n_id = node_id_map[n]
                nodes.append(n_id)
            if t.strand == Strand.NEG:
                nodes.reverse()
            # add nodes/edges to graph
            u = nodes[0]
            SGNode.add(G, u, id_node_map[u], is_ref=t.is_ref)
            for i in xrange(1, len(nodes)):
                v = nodes[i]
                SGNode.add(G, v, id_node_map[v], is_ref=t.is_ref)
                G.add_edge(u, v)
                u = v
        G.graph['node_id_map'] = node_id_map
        G.graph['id_node_map'] = id_node_map
        return G

    def _mark_start_stop_nodes(self):
        '''
        Flag start/stop nodes in the graph: nodes without predecessors
        are starts, nodes without successors are stops, and the nodes
        adjacent to each registered start/stop site (plus the reference
        start/stop sites when 'guided_ends' is set) are flagged too.
        '''
        G = self.G
        # get all leaf nodes
        for n, nd in G.nodes_iter(data=True):
            if G.in_degree(n) == 0:
                nd[SGNode.IS_START] = True
            if G.out_degree(n) == 0:
                nd[SGNode.IS_STOP] = True
        # mark change points
        change_points = set()
        change_points.update((SGNode.IS_START, x) for x in self.start_sites)
        change_points.update((SGNode.IS_STOP, x) for x in self.stop_sites)
        if self.guided_ends:
            change_points.update((SGNode.IS_START, x)
                                 for x in self.ref_start_sites)
            change_points.update((SGNode.IS_STOP, x)
                                 for x in self.ref_stop_sites)
        node_bounds = self.node_bounds
        strand = self.strand
        node_id_map = G.graph['node_id_map']
        for direction, pos in change_points:
            # choose the node to the right of 'pos' (bisect_right) or to
            # the left of it (bisect_left) depending on site type/strand
            if ((direction == SGNode.IS_STOP and strand == Strand.NEG) or
                    (direction == SGNode.IS_START and strand != Strand.NEG)):
                bisect_func = bisect.bisect_right
            else:
                bisect_func = bisect.bisect_left
            i = bisect_func(node_bounds, pos)
            if i <= 0 or i >= len(node_bounds):
                # 'pos' lies on the outermost boundary, so there is no
                # node interval on the requested side; skip instead of
                # raising IndexError / wrapping around to index -1
                continue
            n = (node_bounds[i - 1], node_bounds[i])
            if n in node_id_map:
                # 2/5/2016: observed case where trimming caused an interval
                # with zero expression, and this function then crashed when
                # attempting to mark the node as a start/stop. We no longer
                # attempt to mark nodes in zero expression regions.
                G.node[node_id_map[n]][direction] = True

    def _add_to_interval_trees(self, t):
        '''
        Index the first and the last base of transfrag 't' in
        tree_starts and tree_ends respectively.
        '''
        istart = Interval(t.start, t.start + 1, value=t)
        iend = Interval(t.end - 1, t.end, value=t)
        self.tree_starts.insert_interval(istart)
        self.tree_ends.insert_interval(iend)

    def _trim_change_point(self, cp):
        '''
        Trim transfrags whose start/end falls inside the interval of
        change point 'cp' so that they begin/end at cp.pos.

        For cp.sign < 0 the last exon of transfrags ending within
        (cp.pos, cp.end) is shortened to end at cp.pos; otherwise the
        first exon of transfrags starting within (cp.start, cp.pos) is
        shortened to start at cp.pos. Transfrags whose affected exon
        boundary itself lies inside the interval are left unchanged.

        returns number of transfrags trimmed
        '''
        # search for matches in change point interval
        num_trimmed = 0
        if cp.sign < 0:
            hits = self.tree_ends.find(cp.pos, cp.end)
            for hit in hits:
                t = hit.value
                # last exon start cannot overlap interval
                last_exon_start = t.exons[-1][0]
                if cp.pos <= last_exon_start <= cp.end:
                    continue
                # trim end left
                t.exons[-1] = Exon(last_exon_start, cp.pos)
                num_trimmed += 1
        else:
            hits = self.tree_starts.find(cp.start, cp.pos)
            for hit in hits:
                t = hit.value
                # first exon end cannot overlap interval
                first_exon_end = t.exons[0][1]
                if cp.start <= first_exon_end <= cp.pos:
                    continue
                # trim start right
                t.exons[0] = Exon(cp.pos, first_exon_end)
                num_trimmed += 1
        # the count was previously computed but never handed back
        return num_trimmed

    def detect_change_points(self, *args, **kwargs):
        '''
        *args, **kwargs: passed directly to 'run_changepoint'

        returns list of ChangePoint tuples
        '''
        changepts = []
        for n_id in self.G:
            n = self.get_node_interval(n_id)
            expr_data = self.get_expr_data(n.start, n.end)
            for cp in run_changepoint(expr_data, *args, **kwargs):
                # add offset from start of node to change point positions
                cp = cp._replace(pos=n.start + cp.pos,
                                 start=n.start + cp.start,
                                 end=n.start + cp.end)
                changepts.append(cp)
                logging.debug('\t%s:%d-%d[%s] node: %s-%s cp:%d(%d-%d) '
                              'p=%.3f fc=%.3f',
                              self.chrom, self.start, self.end,
                              Strand.to_gtf(self.strand), n.start, n.end,
                              cp.pos, cp.start, cp.end,
                              cp.pvalue, cp.foldchange)
        return changepts

    def apply_change_point(self, cp, trim=True):
        '''
        Register change point 'cp' as a start or stop site, chosen from
        its sign and the strand of the locus.

        trim: if true, will trim transfrags around change points
        '''
        if ((self.strand != Strand.NEG and cp.sign < 0) or
                (self.strand == Strand.NEG and cp.sign > 0)):
            self.stop_sites.add(cp.pos)
        else:
            self.start_sites.add(cp.pos)
        if trim:
            self._trim_change_point(cp)

    def _graph_id(self):
        '''returns the identifier string used for this locus in GTF output'''
        return ('G_%s_%d_%d_%s' %
                (self.chrom, self.start, self.end,
                 Strand.to_gtf(self.strand)))

    def get_change_point_gtf(self, cp):
        '''
        returns list of two GTF features describing change point 'cp':
        a 'changept' feature spanning cp.pos to cp.pos + 1 and a
        'changeinterval' feature spanning cp.start to cp.end
        '''
        graph_id = self._graph_id()
        spans = (('changept', cp.pos, cp.pos + 1),
                 ('changeinterval', cp.start, cp.end))
        features = []
        for feature_type, fstart, fend in spans:
            f = GTF.Feature()
            f.seqid = self.chrom
            f.source = 'taco'
            f.feature = feature_type
            f.start = fstart
            f.end = fend
            f.score = 0
            f.strand = Strand.to_gtf(self.strand)
            f.phase = '.'
            # each feature gets its own attribute dict
            f.attrs = {'graph_id': graph_id,
                       'sign': str(cp.sign),
                       'pvalue': str(cp.pvalue),
                       'foldchange': str(cp.foldchange)}
            features.append(f)
        return features

    def split(self):
        '''splits into weakly connected component subgraphs'''
        # get connected components of graph which represent independent genes
        # unconnected components are considered different genes
        Gsubs = list(nx.weakly_connected_component_subgraphs(self.G))
        if len(Gsubs) == 1:
            yield self
            return
        # map nodes to components
        node_subgraph_map = {}
        subgraph_transfrag_map = collections.defaultdict(list)
        for i, Gsub in enumerate(Gsubs):
            for n_id in Gsub:
                n = self.get_node_interval(n_id)
                node_subgraph_map[n] = i
        # assign transfrags to components
        for t in self.itertransfrags():
            # the first node of a transfrag determines its component
            for n in split_transfrag(t, self.node_bounds):
                subgraph_id = node_subgraph_map[n]
                subgraph_transfrag_map[subgraph_id].append(t)
                break
        # create new graphs using the separate components
        for subgraph_transfrags in subgraph_transfrag_map.itervalues():
            yield SpliceGraph.create(subgraph_transfrags,
                                     guided_ends=self.guided_ends,
                                     guided_assembly=self.guided_assembly)

    def get_node_interval(self, n_id):
        '''returns the genomic interval stored on node 'n_id' '''
        return self.G.node[n_id][SGNode.INTERVAL]

    def get_node_id(self, n):
        '''returns the integer node id assigned to interval 'n' '''
        return self.G.graph['node_id_map'][n]

    def get_node_expr_data(self, n_id):
        '''returns the slice of expression data covered by node 'n_id' '''
        start, end = self.get_node_interval(n_id)
        return self.get_expr_data(start, end)

    def itertransfrags(self):
        '''
        returns iterator over the transfrags that make up the graph;
        reference transfrags are included only under guided assembly
        '''
        if self.guided_assembly:
            return chain(self.transfrags, self.ref_transfrags)
        else:
            return iter(self.transfrags)

    def get_start_stop_nodes(self):
        '''returns tuple (set of start node ids, set of stop node ids)'''
        start_nodes = set()
        stop_nodes = set()
        for n, nd in self.G.nodes_iter(data=True):
            if nd[SGNode.IS_START]:
                start_nodes.add(n)
            if nd[SGNode.IS_STOP]:
                stop_nodes.add(n)
        return start_nodes, stop_nodes

    def get_expr_data(self, start=None, end=None):
        '''
        start, end: genomic coordinates of the query; default to the
                    bounds of the locus

        returns slice of the expression array for start-end
        raises TacoError if the query extends beyond the locus
        '''
        if start is None:
            start = self.start
        if end is None:
            end = self.end
        if ((start < self.start) or (end > self.end)):
            m = ('query %d-%d outside locus bounds %d-%d' %
                 (start, end, self.start, self.end))
            raise TacoError(m)
        # convert genomic coordinates to array offsets
        astart = start - self.start
        aend = end - self.start
        return self.expr_data[astart:aend]

    def get_node_gtf(self):
        '''
        yields one GTF 'node' feature per graph node, annotated with
        min/max/mean expression and the reference start/stop sites
        selected for the node interval by '_array_subset'
        '''
        graph_id = self._graph_id()
        # iterate through locus and return change point data
        for n_id in self.G:
            n = self.get_node_interval(n_id)
            expr_data = self.get_expr_data(*n)
            ref_starts = _array_subset(self.ref_start_sites, *n)
            ref_stops = _array_subset(self.ref_stop_sites, *n)
            # return gtf feature for each node
            f = GTF.Feature()
            f.seqid = self.chrom
            f.source = 'taco'
            f.feature = 'node'
            f.start = n[0]
            f.end = n[1]
            f.score = 0
            f.strand = Strand.to_gtf(self.strand)
            f.phase = '.'
            f.attrs = {'graph_id': graph_id,
                       'expr_min': str(expr_data.min()),
                       'expr_max': str(expr_data.max()),
                       'expr_mean': str(expr_data.mean()),
                       'ref_starts': ','.join(map(str, ref_starts)),
                       'ref_stops': ','.join(map(str, ref_stops))}
            yield f