def _trim_change_point(self, cp):
    """Trim boundary exons of the hits found around a change point.

    When ``cp.sign`` is negative, ``self.tree_ends`` is queried with
    ``find(cp.pos, cp.end)`` and the last exon of every hit is rewritten
    so that it ends at ``cp.pos``.  Otherwise ``self.tree_starts`` is
    queried with ``find(cp.start, cp.pos)`` and the first exon of every hit
    is rewritten so that it starts at ``cp.pos``.

    A hit is skipped when the opposite edge of its boundary exon lies inside
    the queried interval, because trimming would then leave an exon whose
    start is not before its end.

    Args:
        cp: change point exposing ``sign``, ``pos``, ``start`` and ``end``.

    Returns:
        The number of hits whose boundary exon was trimmed.
    """
    num_trimmed = 0
    if cp.sign < 0:
        for hit in self.tree_ends.find(cp.pos, cp.end):
            t = hit.value
            # last exon start cannot overlap interval
            last_exon_start = t.exons[-1][0]
            if cp.pos <= last_exon_start <= cp.end:
                continue
            # trim end left
            t.exons[-1] = Exon(last_exon_start, cp.pos)
            num_trimmed += 1
    else:
        for hit in self.tree_starts.find(cp.start, cp.pos):
            t = hit.value
            # first exon end cannot overlap interval
            first_exon_end = t.exons[0][1]
            if cp.start <= first_exon_end <= cp.pos:
                continue
            # trim start right
            t.exons[0] = Exon(cp.pos, first_exon_end)
            num_trimmed += 1
    # the counter used to be computed and then dropped; report it to the caller
    return num_trimmed
def get_transcripts(stream):
    """Collect transcripts as collections of CDS/exons.

    Every item of ``stream`` must be a ``str`` or ``bytes`` line (bytes are
    decoded with the default codec).  Comment lines starting with ``#`` and
    blank lines are skipped.  Only features of type ``exon`` are used; they
    are grouped by the ``transcript_id "..."`` value found in the attribute
    column.

    Args:
        stream: iterable of annotation lines.

    Returns:
        dict mapping transcript id -> ``mRNA`` record whose ``exons`` list
        holds one ``Exon`` per exon line, in input order.  ``start`` and
        ``end`` of the record are left as ``None``.

    Raises:
        InputError: if an item of ``stream`` is neither ``str`` nor ``bytes``.
        ValueError: if an exon line carries no recognisable transcript id.
    """
    # raw string: "\." in a plain literal is an invalid escape sequence
    transcript_id_re = re.compile(r'transcript_id "([a-zA-Z0-9\.]+)";')
    transcripts = {}
    for raw in stream:
        if isinstance(raw, str):
            line = raw
        elif isinstance(raw, bytes):
            line = raw.decode()
        else:
            raise InputError('Unsupported stream data')
        if line.startswith('#') or not line.strip():
            continue
        f = parse(line)
        if f.feature != 'exon':
            continue
        m = transcript_id_re.search(f.attribute)
        if not m:
            raise ValueError('Transcript id could not be found')
        index = m.group(1)
        if index not in transcripts:
            # extent is unknown at this point, only the exons are collected
            transcripts[index] = mRNA(seqid=f.seqid, start=None, end=None,
                                      strand=f.strand, exons=[],
                                      attributes='')
        transcripts[index].exons.append(
            Exon(seqid=f.seqid, start=f.start, end=f.end, strand=f.strand))
    return transcripts
def _create_splice_graph(self):
    '''
    returns networkx DiGraph object

    Every transfrag from self.itertransfrags() is split at self.node_bounds
    into Exon nodes.  Each distinct Exon gets an integer id, which is the
    node label in the graph; consecutive nodes of a transfrag are joined by
    a directed edge.  Negative strand transfrags are added in reverse order.

    The two lookup tables are stored on the graph:
      G.graph['node_id_map']  Exon -> integer id
      G.graph['id_node_map']  integer id -> Exon
    '''
    G = nx.DiGraph()
    node_bounds = self.node_bounds
    node_id_map = {}
    id_node_map = {}
    # ids are handed out from 2 upwards
    # NOTE(review): presumably 0 and 1 are reserved for special nodes - confirm
    current_id = 2
    for t in self.itertransfrags():
        # split exons that cross boundaries and get the
        # nodes that made up the transfrag
        nodes = []
        for n in split_transfrag(t, node_bounds):
            n = Exon(*n)
            if n not in node_id_map:
                n_id = current_id
                current_id += 1
                node_id_map[n] = n_id
                id_node_map[n_id] = n
            else:
                n_id = node_id_map[n]
            nodes.append(n_id)
        if t.strand == Strand.NEG:
            nodes.reverse()
        # add nodes/edges to graph
        u = nodes[0]
        SGNode.add(G, u, id_node_map[u], is_ref=t.is_ref)
        # walk the remaining nodes directly instead of indexing with xrange
        for v in nodes[1:]:
            SGNode.add(G, v, id_node_map[v], is_ref=t.is_ref)
            G.add_edge(u, v)
            u = v
    G.graph['node_id_map'] = node_id_map
    G.graph['id_node_map'] = id_node_map
    return G
def parse_gtf(gtf_lines, ignore_ref=True):
    '''
    Build transfrags from GTF lines.

    gtf_lines: iterable of GTF text lines; every line must carry a
               transcript id attribute
    ignore_ref: when true, features flagged as reference (REF attribute
                parsing to a non-zero integer) are skipped

    returns OrderedDict
    key is transcript_id
    value is Transfrag

    raises GTFError on a duplicated transcript feature, or on an exon
    feature that appears before its transcript feature
    '''
    transfrags = collections.OrderedDict()
    for raw_line in gtf_lines:
        feat = GTF.Feature.from_str(raw_line)
        tid = feat.attrs[GTF.Attr.TRANSCRIPT_ID]
        ref_flag = bool(int(feat.attrs.get(GTF.Attr.REF, '0')))
        if ref_flag and ignore_ref:
            continue
        if feat.feature == 'transcript':
            if tid in transfrags:
                raise GTFError("Transcript '%s' duplicate detected" % tid)
            transfrags[tid] = Transfrag.from_gtf(feat)
        elif feat.feature == 'exon':
            if tid not in transfrags:
                logging.error('Feature: "%s"' % str(feat))
                raise GTFError("Transcript '%s' exon feature appeared in "
                               "gtf file prior to transcript feature" % tid)
            transfrags[tid].exons.append(Exon(feat.start, feat.end))
    return transfrags
def _create_splice_graph(self):
    """Build an SGraph holding one node path per transfrag.

    Each transfrag from ``self.itertransfrags()`` is split at
    ``self.node_bounds`` into ``Exon`` nodes.  When ``self.strand`` is
    negative the node order is reversed before the path is added, together
    with the transfrag's ``is_ref`` flag.

    Returns:
        The populated ``SGraph``, after ``set_start_stop_nodes()`` has run.
    """
    graph = SGraph()
    for transfrag in self.itertransfrags():
        path = []
        for bounds in split_transfrag(transfrag, self.node_bounds):
            path.append(Exon(*bounds))
        if self.strand == Strand.NEG:
            path = path[::-1]
        graph.add_path(path, transfrag.is_ref)
    # start/stop node properties can only be set once every path is present
    graph.set_start_stop_nodes()
    return graph
def reconstruct_exons(self, path):
    """Convert a path of node ids into a list of merged exons.

    Node ids are mapped to intervals with ``self.get_node_interval``.
    Neighbouring intervals that touch (``end`` of one equals ``start`` of
    the next) are collapsed into a single ``Exon``.

    Args:
        path: sequence of node ids.  It is not modified; earlier versions
            reversed the caller's list in place on the negative strand.

    Returns:
        list of ``Exon`` ordered from small to large genomic coordinates;
        an empty list when ``path`` is empty.
    """
    if not path:
        return []
    # reverse negative stranded data so that all paths go from
    # small -> large genomic coords (on a copy, never on the caller's list)
    if self.strand == Strand.NEG:
        node_ids = list(reversed(path))
    else:
        node_ids = list(path)
    # convert from integer node labels to genome (start, end) tuples
    intervals = [self.get_node_interval(nid) for nid in node_ids]
    # collapse contiguous nodes along path
    newpath = []
    first = last = intervals[0]
    for v in intervals[1:]:
        if last.end != v.start:
            # gap found: emit the finished run and start a new one
            newpath.append(Exon(first.start, last.end))
            first = v
        last = v
    # add last chain
    newpath.append(Exon(first.start, last.end))
    return newpath
def reconstruct_path(kmer_path, id_kmer_map, sgraph):
    """Turn a path of k-mer ids into a list of merged exons.

    The node sequence is rebuilt from all nodes of the k-mer at
    ``kmer_path[1]`` followed by the last node of each k-mer in
    ``kmer_path[2:-1]``.  The first and last entries of ``kmer_path`` are
    not used.
    NOTE(review): presumably those are artificial source/sink entries -
    confirm against the caller.

    Node ids are reversed on the negative strand, mapped to intervals via
    ``sgraph.get_node_interval`` and touching intervals are collapsed.

    Returns:
        list of ``Exon`` ordered from small to large genomic coordinates.
    """
    node_ids = list(id_kmer_map[kmer_path[1]])
    for kmer_id in kmer_path[2:-1]:
        node_ids.append(id_kmer_map[kmer_id][-1])
    # negative strand paths are stored large -> small; flip them
    if sgraph.strand == Strand.NEG:
        node_ids.reverse()
    intervals = [sgraph.get_node_interval(node_id) for node_id in node_ids]
    # merge runs of intervals where one ends exactly where the next begins
    merged = []
    run_first = run_last = intervals[0]
    for interval in intervals[1:]:
        if run_last.end != interval.start:
            merged.append(Exon(run_first.start, run_last.end))
            run_first = interval
        run_last = interval
    merged.append(Exon(run_first.start, run_last.end))
    return merged
def parse_gtf(gtf_iter, sample_id, gtf_expr_attr, is_ref):
    '''
    Parse GTF lines into Transfrag objects and sum their expression.

    gtf_iter: iterable of GTF text lines; empty, whitespace-only and
              '#' comment lines are skipped
    sample_id: prefix for the new transcript ids ("<sample_id>.<n>",
               n counting transcript features from 1)
    gtf_expr_attr: name of the transcript attribute holding expression
    is_ref: when true expression is fixed at 0.0 and gtf_expr_attr is
            not looked up

    returns tuple (list of Transfrag objects, total expression)
    The list is in order of first appearance of each transcript.

    raises GTFError on a duplicated transcript feature, a missing
    expression attribute, or an exon feature that appears before its
    transcript feature
    '''
    t_dict = collections.OrderedDict()
    total_expr = 0.0
    cur_t_id = 1
    for gtf_line in gtf_iter:
        if (not gtf_line or not gtf_line.strip() or
                gtf_line.startswith("#")):
            continue
        f = GTF.Feature.from_str(gtf_line)
        if f.feature == 'transcript':
            t_id = f.attrs[GTF.Attr.TRANSCRIPT_ID]
            if t_id in t_dict:
                raise GTFError("Transcript '%s' duplicate detected" % t_id)
            # rename transcript id
            new_t_id = "%s.%d" % (sample_id, cur_t_id)
            cur_t_id += 1
            # parse expression
            if is_ref:
                expr = 0.0
            else:
                if gtf_expr_attr not in f.attrs:
                    raise GTFError("GTF expression attribute '%s' not found" %
                                   (gtf_expr_attr))
                expr = float(f.attrs[gtf_expr_attr])
                total_expr += expr
            # create transfrag
            # NOTE(review): exons=None relies on Transfrag turning None into
            # an empty list, since exons are appended below - confirm
            t = Transfrag(chrom=f.seqid,
                          strand=Strand.from_gtf(f.strand),
                          _id=new_t_id,
                          expr=expr,
                          is_ref=is_ref,
                          exons=None)
            # keyed by the original id so exon lines can find their transcript
            t_dict[t_id] = t
        elif f.feature == 'exon':
            t_id = f.attrs[GTF.Attr.TRANSCRIPT_ID]
            if t_id not in t_dict:
                logging.error('Feature: "%s"', f)
                raise GTFError("Transcript '%s' exon feature appeared in "
                               "gtf file prior to transcript feature" %
                               t_id)
            t = t_dict[t_id]
            t.exons.append(Exon(f.start, f.end))
    # materialise: on Python 3 values() is a view, not the documented list
    return list(t_dict.values()), total_expr
def get_chains(G, introns=True):
    """
    group nodes into chains

    Every node starts out as a chain of its own.  For each edge (u, v)
    accepted by the collapse predicate, the chains containing u and v are
    merged; the merged chain is keyed by a new Exon spanning the smallest
    start and the largest end of the two previous chain keys.

    G: graph whose nodes expose 'start' and 'end' attributes
    introns: when true the predicate is can_collapse, otherwise
             can_collapse_contiguous

    returns a dict mapping node -> chain, as well as a
    dict mapping chains to nodes (each a list of nodes sorted by start)
    """
    can_collapse_func = can_collapse if introns else can_collapse_contiguous
    # networkx 1.x offers nodes_iter()/edges_iter(); they were removed in
    # 2.0, where nodes()/edges() are the way to iterate
    iter_nodes = getattr(G, 'nodes_iter', None) or G.nodes
    iter_edges = getattr(G, 'edges_iter', None) or G.edges
    node_chain_map = {}
    chains = {}
    # initialize each node to be in a "chain" by itself
    for n in iter_nodes():
        node_chain_map[n] = n
        chains[n] = {n}
    for u, v in iter_edges():
        if not can_collapse_func(G, u, v):
            continue
        # get chains containing these nodes
        u_new = node_chain_map[u]
        u_chain = chains.pop(u_new)
        v_new = node_chain_map[v]
        v_chain = chains.pop(v_new)
        # merge chains
        merged_chain = u_chain.union(v_chain)
        merged_node = Exon(min(u_new.start, v_new.start),
                           max(u_new.end, v_new.end))
        # point all nodes in chain to new parent
        for n in merged_chain:
            node_chain_map[n] = merged_node
        chains[merged_node] = merged_chain
    # sort chain nodes by genome position and store as list
    for parent in chains:
        chains[parent] = sorted(chains[parent],
                                key=operator.attrgetter('start'))
    return node_chain_map, chains
def from_bed(line):
    """Create a Transfrag from one tab-separated BED line.

    Columns read (0-based): 0 chrom, 1 transcript start, 3 id, 4 expression,
    5 strand, 9 number of exons, 10 comma-separated block sizes,
    11 comma-separated block starts relative to the transcript start.
    Only the first ``num_exons`` entries of the two block lists are read, so
    a trailing comma in those columns is harmless.

    The transfrag is flagged as reference when the part of its id before the
    first '.' equals ``Sample.REF_ID``.

    Raises:
        IndexError: if the line has fewer columns or block entries than
            required.
        ValueError: if a numeric column cannot be converted.
    """
    fields = line.strip().split('\t')
    chrom = fields[0]
    tx_start = int(fields[1])
    _id = fields[3]
    is_ref = (_id.split('.')[0] == Sample.REF_ID)
    expr = float(fields[4])
    strand = Strand.from_bed(fields[5])
    num_exons = int(fields[9])
    block_sizes = fields[10].split(',')
    block_starts = fields[11].split(',')
    exons = []
    # range instead of the Python-2-only xrange
    for i in range(num_exons):
        start = tx_start + int(block_starts[i])
        end = start + int(block_sizes[i])
        exons.append(Exon(start, end))
    return Transfrag(chrom=chrom, strand=strand, _id=_id, expr=expr,
                     is_ref=is_ref, exons=exons)
def parse_gtf(gtf_lines):
    '''
    Build transfrags from GTF lines.

    Only 'transcript' and 'exon' features are used; the transcript id is
    looked up for those features only.

    returns OrderedDict mapping transcript_id -> Transfrag, in order of
    first appearance of each transcript feature

    raises GTFError on a duplicated transcript feature, or on an exon
    feature that appears before its transcript feature
    '''
    transfrags = collections.OrderedDict()
    for raw_line in gtf_lines:
        feat = GTF.Feature.from_str(raw_line)
        if feat.feature == 'transcript':
            tid = feat.attrs[GTF.Attr.TRANSCRIPT_ID]
            if tid in transfrags:
                raise GTFError("Transcript '%s' duplicate detected" % tid)
            transfrags[tid] = Transfrag.from_gtf(feat)
            continue
        if feat.feature == 'exon':
            tid = feat.attrs[GTF.Attr.TRANSCRIPT_ID]
            if tid not in transfrags:
                logging.error('Feature: "%s"' % str(feat))
                raise GTFError("Transcript '%s' exon feature appeared in "
                               "gtf file prior to transcript feature" % tid)
            transfrags[tid].exons.append(Exon(feat.start, feat.end))
    return transfrags
def get_mrnas(stream, source=None):
    """Collect mRNA extents with exons.

    Every item of ``stream`` must be a ``str`` or ``bytes`` line (bytes are
    decoded with the default codec).  Comment lines starting with ``#`` and
    blank lines are skipped.  Features are handled by type:

    * ``mRNA``: creates the record for ``ID=...`` with its extent, strand and
      ``key=value`` attributes.  If a placeholder record (``start`` is
      ``None``) already exists under that id, its exons are carried over.
    * ``exon``: appended to the record named by ``Parent=...``.  Exons
      without a parent produce a printed warning; exons whose parent is not
      known are silently ignored.
    * ``five_prime_UTR``: stored as an exon of a placeholder record keyed by
      the feature's own ``ID=id<digits>;``.
    * ``intron``: a hack for introns without exons; two fake flanking exons
      are stored in a placeholder record keyed by the intron id.

    Args:
        stream: iterable of annotation lines.
        source: when given, only features whose ``source`` equals it are
            used.

    Returns:
        dict mapping id -> ``mRNA`` record.

    Raises:
        InputError: if an item of ``stream`` is neither ``str`` nor ``bytes``.
        ValueError: if a required id cannot be found in the attributes.
        RuntimeError: if an id is defined twice.
    """
    mrnas = {}
    for raw in stream:
        if isinstance(raw, str):
            line = raw
        elif isinstance(raw, bytes):
            line = raw.decode()
        else:
            raise InputError('Unsupported stream data')
        if line.startswith('#') or not line.strip():
            continue
        f = parse(line)
        if source is not None and f.source != source:
            continue

        if f.feature == 'mRNA':
            m = re.search(r'ID=(\w[\w:]*)', f.attribute)
            if not m:
                raise ValueError('mRNA id could not be found')
            index = m.group(1)
            exons = []
            if index in mrnas:
                if mrnas[index].start is not None:
                    raise RuntimeError(
                        'mRNA with the same id is already present')
                # placeholder seen earlier: keep the exons it collected
                exons = mrnas[index].exons
            attributes = {}
            for a in f.attribute.rstrip().split(';'):
                r = a.split('=')
                if len(r) == 2:
                    attributes[r[0]] = r[1]
            mrnas[index] = mRNA(seqid=f.seqid, start=f.start, end=f.end,
                                strand=f.strand, exons=exons,
                                attributes=attributes)
        elif f.feature == 'exon':
            m = re.search(r'Parent=(\w[\w:]*)', f.attribute)
            if not m:
                # there seem to be exons not assigned to mRNAs
                print('WARNING: Exon without parent: {0}'.format(line))
                continue
            index = m.group(1)
            if index not in mrnas:
                # parent is not an mRNA collected here; ignore the exon
                continue
            mrnas[index].exons.append(
                Exon(seqid=f.seqid, start=f.start, end=f.end,
                     strand=f.strand))
        elif f.feature == 'five_prime_UTR':
            # these things appear in RNA-seq, but are not mRNA
            m = re.search(r'ID=(id\d\d*);', f.attribute)
            if not m:
                raise ValueError("5'UTR id could not be found")
            index = m.group(1)
            if index not in mrnas:
                mrnas[index] = mRNA(seqid=f.seqid, start=None, end=None,
                                    strand=None, exons=[], attributes={})
            mrnas[index].exons.append(
                Exon(seqid=f.seqid, start=f.start, end=f.end,
                     strand=f.strand))
        elif f.feature == 'intron':
            # this is a hack
            # there are introns with no exons in the gff file, we create fake
            # exons for easier processing
            m = re.search(r'ID=(id\d\d*);', f.attribute)
            if not m:
                raise ValueError('Intron id could not be found')
            index = m.group(1)
            if index in mrnas:
                raise RuntimeError('mRNA element already present')
            exon1 = Exon(seqid=f.seqid, start=f.start - 2, end=f.start - 1,
                         strand=f.strand)
            exon2 = Exon(seqid=f.seqid, start=f.end + 1, end=f.end + 2,
                         strand=f.strand)
            mrnas[index] = mRNA(seqid=f.seqid, start=None, end=None,
                                strand=None, exons=[exon1, exon2],
                                attributes={})
    return mrnas