def test_trim_intron_retention(self):
    """Check which nodes trim_graph reports at rising intron thresholds.

    The positive-strand graph built from "trim_intron_retention1.gtf" is
    passed to trim_graph once per threshold, in increasing order, and the
    returned node set is compared against the expected set.
    """
    transcripts = read_first_locus("trim_intron_retention1.gtf",
                                   score_attr="FPKM")
    GG = get_transcript_graphs(transcripts)
    G, tmap = GG[POS_STRAND]
    # (trim_intron_fraction, expected trimmed nodes)
    cases = [
        (0.01, set()),
        (0.11, set([Exon(500, 1500)])),
        (0.21, set([Exon(500, 1500), Exon(2000, 9000)])),
        (1.0, set([Exon(500, 1500), Exon(2000, 9000)])),
    ]
    for fraction, correct in cases:
        trim_nodes = trim_graph(G, POS_STRAND,
                                min_trim_length=0,
                                trim_utr_fraction=0.0,
                                trim_intron_fraction=fraction)
        # assertEqual (rather than assertTrue(a == b)) so that a failure
        # reports the offending values; the message names the threshold
        # because the loop hides which case was running.
        self.assertEqual(
            trim_nodes, correct,
            "trim_intron_fraction=%s: got %r, expected %r" %
            (fraction, trim_nodes, correct))
def genome_interval_to_exons(start, end, t):
    """Cut the exons of transcript `t` down to the interval start..end.

    Returns a list of exons: a single Exon(start, end) when both
    coordinates resolve to the same exon of `t`, otherwise a clipped
    first exon, the untouched exons in between, and a clipped last exon.

    Asserts that the interval is non-empty and lies within
    [t.start, t.end].
    """
    assert start >= t.start
    assert end <= t.end
    assert start < end
    exons = t.exons
    # locate the first exon whose end is not left of `start`
    first = 0
    while exons[first].end < start:
        first += 1
    # keep scanning from there for the exon whose end is not left of `end`
    last = first
    while exons[last].end < end:
        last += 1
    if first == last:
        return [Exon(start, end)]
    # NOTE(review): presumably both coordinates are expected to fall inside
    # exons; a `start` equal to an exon end yields a zero-length first
    # piece here -- confirm against callers.
    pieces = [Exon(start, exons[first].end)]
    for idx in range(first + 1, last):
        pieces.append(exons[idx])
    pieces.append(Exon(exons[last].start, end))
    return pieces
def from_table(line):
    """Build an ORFInfo from one tab-delimited table row.

    Columns read: 0 transcript id, 1 gene id, 2 ORF id, 3 frame (int),
    5 chrom, 6 start (int), 7 end (int), 8 strand, 9 comma-separated exon
    starts, 10 comma-separated exon ends, and ORFInfo.SEQ_COL_NUM for the
    sequence.  Column 4 is not read here.
    """
    cols = line.strip().split('\t')
    orf = ORFInfo()
    orf.transcript_id = cols[0]
    orf.gene_id = cols[1]
    orf.orf_id = cols[2]
    orf.frame = int(cols[3])
    orf.chrom = cols[5]
    orf.start = int(cols[6])
    orf.end = int(cols[7])
    orf.strand = cols[8]
    # exon starts and ends are stored as two parallel comma-separated lists
    starts = [int(value) for value in cols[9].split(',')]
    ends = [int(value) for value in cols[10].split(',')]
    orf.exons = [Exon(lo, hi) for lo, hi in zip(starts, ends)]
    orf.seq = cols[ORFInfo.SEQ_COL_NUM]
    return orf
def create_undirected_transcript_graph(transcripts, add_node_func, **kwargs):
    '''
    Merge every transcript into one undirected graph.

    Each transcript's exons are split at the shared exon boundaries; the
    resulting pieces become nodes (registered through
    add_node_func(G, node, transcript, **kwargs)) and consecutive pieces
    of the same transcript are joined by an edge.
    '''
    # boundaries are computed once over the whole transcript set
    boundaries = find_exon_boundaries(transcripts)
    graph = nx.Graph()
    for t in transcripts:
        # the transcript's path through the graph, one node per split exon
        path = [Exon(lo, hi) for lo, hi in split_exons(t, boundaries)]
        prev = path[0]
        add_node_func(graph, prev, t, **kwargs)
        for idx in range(1, len(path)):
            cur = path[idx]
            add_node_func(graph, cur, t, **kwargs)
            graph.add_edge(prev, cur)
            prev = cur
    return graph
def create_directed_graph(strand, transcripts):
    '''
    Build a directed graph from the transcripts of one strand.

    Nodes are Exon pieces obtained by splitting each transcript at the
    shared exon boundaries.  Every node carries NODE_LENGTH (end - start)
    and NODE_SCORE, the sum of the scores of the transcripts passing
    through it.  For NEG_STRAND the node order of each transcript is
    reversed before edges are added.  The boundaries are stored in
    G.graph['boundaries'].
    '''
    def accumulate(graph, exon, score):
        """create the node on first sight, then add `score` to it"""
        if exon not in graph:
            graph.add_node(exon, attr_dict={
                NODE_LENGTH: (exon.end - exon.start),
                NODE_SCORE: 0.0
            })
        graph.node[exon][NODE_SCORE] += score

    G = nx.DiGraph()
    boundaries = find_exon_boundaries(transcripts)
    for t in transcripts:
        # nodes making up this transcript, in path order
        path = [Exon(lo, hi) for lo, hi in split_exons(t, boundaries)]
        if strand == NEG_STRAND:
            path.reverse()
        prev = path[0]
        accumulate(G, prev, t.score)
        for cur in path[1:]:
            accumulate(G, cur, t.score)
            G.add_edge(prev, cur)
            prev = cur
    # keep the boundaries with the graph as a graph-level attribute
    G.graph['boundaries'] = boundaries
    return G
def create_directed_graph(strand, transcripts):
    '''
    Build a directed graph from the transcripts of one strand.

    Nodes are Exon pieces obtained by splitting each transcript at the
    shared exon boundaries.  Every node carries TRANSCRIPT_IDS (the set of
    transcript ids passing through it), NODE_LENGTH (end - start) and
    NODE_SCORE (the summed transcript scores).  For NEG_STRAND the node
    order of each transcript is reversed before edges are added.
    '''
    def accumulate(graph, exon, transcript_id, score):
        """create the node on first sight, then record the id and score"""
        if exon not in graph:
            graph.add_node(exon, attr_dict={
                TRANSCRIPT_IDS: set(),
                NODE_LENGTH: (exon.end - exon.start),
                NODE_SCORE: 0.0
            })
        attrs = graph.node[exon]
        attrs[TRANSCRIPT_IDS].add(transcript_id)
        attrs[NODE_SCORE] += score

    boundaries = find_exon_boundaries(transcripts)
    G = nx.DiGraph()
    for t in transcripts:
        t_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
        # nodes making up this transcript, in path order
        path = [Exon(lo, hi) for lo, hi in split_exons(t, boundaries)]
        if strand == NEG_STRAND:
            path.reverse()
        prev = path[0]
        accumulate(G, prev, t_id, t.score)
        for cur in path[1:]:
            accumulate(G, cur, t_id, t.score)
            G.add_edge(prev, cur)
            prev = cur
    return G
def test_trim_intron_bidir(self):
    """Check trim_graph node sets on both strands of the same locus.

    The transcripts from "trim_intron_bidir1.gtf" are first tested on the
    positive strand, then every transcript is flipped to the negative
    strand and the same thresholds must yield the same node sets.
    """
    transcripts = read_first_locus("trim_intron_bidir1.gtf",
                                   score_attr="FPKM")
    # (trim_intron_fraction, expected trimmed nodes); the expectations are
    # identical for both strands
    cases = [
        (0.001, set()),
        (0.025, set([Exon(1900, 2000), Exon(1000, 1100)])),
        (0.2, set([Exon(1900, 2000), Exon(1100, 1200),
                   Exon(1800, 1900), Exon(1000, 1100)])),
        (0.25, set([Exon(1900, 2000), Exon(1100, 1200),
                    Exon(1200, 1300), Exon(1700, 1800),
                    Exon(1800, 1900), Exon(1000, 1100)])),
    ]

    def check_strand(strand):
        """rebuild the graphs and run every threshold on `strand`"""
        GG = get_transcript_graphs(transcripts)
        G, tmap = GG[strand]
        for fraction, correct in cases:
            trim_nodes = trim_graph(G, strand,
                                    min_trim_length=0,
                                    trim_utr_fraction=0.0,
                                    trim_intron_fraction=fraction)
            # assertEqual (rather than assertTrue(a == b)) so a failure
            # reports the values; the message names strand and threshold.
            self.assertEqual(
                trim_nodes, correct,
                "strand=%r trim_intron_fraction=%s: got %r, expected %r" %
                (strand, fraction, trim_nodes, correct))

    check_strand(POS_STRAND)
    # flip sign of transcripts and try again
    for t in transcripts:
        t.strand = NEG_STRAND
    check_strand(NEG_STRAND)
def test_assembler1(self):
    """Assemble the "assemble1.gtf" locus at kmax=2 and kmax=3.

    Transcript scores are assigned through the transcript map, the graph
    is pruned with all trimming disabled, and the assembled paths and
    scores are compared, in order, against the expected results.
    """
    # setup correct transcripts
    PATH_ABCDE = tuple([
        Exon(0, 100), Exon(200, 300), Exon(400, 500),
        Exon(600, 700), Exon(800, 900)
    ])
    PATH_ACE = tuple([Exon(0, 100), Exon(400, 500), Exon(800, 900)])
    PATH_ABCE = tuple(
        [Exon(0, 100), Exon(200, 300), Exon(400, 500), Exon(800, 900)])
    PATH_ACDE = tuple(
        [Exon(0, 100), Exon(400, 500), Exon(600, 700), Exon(800, 900)])
    # read transcripts
    transcripts = read_first_locus("assemble1.gtf", score_attr="score")
    GG = get_transcript_graphs(transcripts)
    G, tmap = GG[POS_STRAND]

    def assemble(scores, kmax):
        """set transcript scores, prune, and assemble the first subgraph"""
        for t_id, score in scores:
            tmap[t_id].score = score
        GS = list(
            prune_transcript_graph(G, POS_STRAND, tmap,
                                   min_trim_length=0,
                                   trim_utr_fraction=0,
                                   trim_intron_fraction=0))
        Gsub, strand, partial_paths = GS[0]
        return list(
            assemble_transcript_graph(Gsub, strand, partial_paths,
                                      user_kmax=kmax,
                                      ksensitivity=0,
                                      fraction_major_path=0,
                                      max_paths=1000))

    def check(results, expected):
        """compare assembled results with (path, score) pairs, in order"""
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(len(results), len(expected))
        for result, (path, score) in zip(results, expected):
            self.assertEqual(tuple(result.path), path)
            self.assertAlmostEqual(result.score, score, places=3)

    results = assemble(
        [("ABCDE", 2.0), ("ACE", 1.0), ("ABCE", 1.0), ("ACDE", 1.0)],
        kmax=2)
    check(results, [(PATH_ABCDE, 3.0), (PATH_ACE, 2.0)])

    results = assemble(
        [("ABCDE", 4.0), ("ACE", 3.0), ("ABCE", 2.0), ("ACDE", 1.0)],
        kmax=3)
    check(results, [(PATH_ABCDE, 4.0), (PATH_ACE, 3.0),
                    (PATH_ABCE, 2.0), (PATH_ACDE, 1.0)])
def create_transcript_graphs(chrom, transcripts,
                             min_trim_length=0,
                             trim_utr_fraction=0.0,
                             trim_intron_fraction=0.0,
                             create_bedgraph=False,
                             bedgraph_filehs=None):
    '''
    Build one TranscriptGraph per weakly connected component per strand.

    For each strand the transcripts are turned into a directed graph, the
    nodes reported by trim_graph are removed, the graph is collapsed, and
    every weakly connected component becomes a TranscriptGraph.  Each
    TranscriptGraph gets a `partial_paths` list of (node tuple, summed
    score) pairs derived from the transcripts passing through it.

    When create_bedgraph is true, one tab-separated line
    (chrom, start, end, node score) per node is written to
    bedgraph_filehs[strand] before trimming; nodes with a negative start
    are skipped.

    Returns the list of TranscriptGraph objects for all strands.
    '''
    # partition transcripts by strand and resolve unstranded transcripts
    logging.debug("\tResolving unstranded transcripts")
    # only the per-strand lists are used below
    strand_transcript_lists, _strand_ref_transcripts = \
        partition_transcripts_by_strand(transcripts)
    # create strand-specific graphs using redistributed score
    logging.debug("\tCreating transcript graphs")
    transcript_graphs = []
    for strand, transcript_list in enumerate(strand_transcript_lists):
        G = create_directed_graph(strand, transcript_list)
        # bedgraph output reflects the graph before any trimming
        if create_bedgraph:
            for node in sorted(G.nodes()):
                if node.start < 0:
                    continue
                row = (chrom, node.start, node.end, G.node[node][NODE_SCORE])
                bedgraph_filehs[strand].write(
                    '\t'.join(map(str, row)) + '\n')
        # trim utrs and intron retentions
        trim_nodes = trim_graph(G, strand, min_trim_length,
                                trim_utr_fraction, trim_intron_fraction)
        G.remove_nodes_from(trim_nodes)
        # collapse consecutive nodes in graph
        H, node_chain_map = collapse_strand_specific_graph(G, introns=True)
        # each weakly connected component is handled as its own graph
        strand_graphs = []
        node_subgraph_map = {}
        components = nx.weakly_connected_component_subgraphs(H)
        for index, Gsub in enumerate(components):
            for node in Gsub:
                node_subgraph_map[node] = index
            tg = TranscriptGraph(chrom, strand, Gsub)
            tg.partial_paths = collections.defaultdict(float)
            strand_graphs.append(tg)
        descending = (strand == NEG_STRAND)
        # populate transcript graphs with partial paths
        for t in transcript_list:
            # bin the transcript's surviving (untrimmed) nodes, mapped to
            # their collapsed nodes, by the subgraph they ended up in
            # TODO: intronic transcripts may be split into multiple
            # pieces, should we allow this?
            nodes_by_subgraph = collections.defaultdict(set)
            for interval in split_exons(t, G.graph['boundaries']):
                exon = Exon(*interval)
                if exon in trim_nodes:
                    continue
                collapsed = node_chain_map[exon]
                nodes_by_subgraph[node_subgraph_map[collapsed]].add(collapsed)
            # add transcript node/score pairs to subgraphs
            for subgraph_id, members in nodes_by_subgraph.items():
                ordered = sorted(members,
                                 key=operator.attrgetter('start'),
                                 reverse=descending)
                target = strand_graphs[subgraph_id]
                target.partial_paths[tuple(ordered)] += t.score
        transcript_graphs.extend(strand_graphs)
    # replace each score dictionary with a list of (path, score) pairs
    for tg in transcript_graphs:
        tg.partial_paths = list(tg.partial_paths.items())
    return transcript_graphs
def test_assembler1(self):
    """Assemble the "assemble1.gtf" locus at kmax=2 and kmax=3.

    Transcript scores are assigned directly on the transcripts, the
    graphs are rebuilt with create_transcript_graphs (all trimming
    disabled, no bedgraph output), and the paths and scores assembled
    from the first graph are compared, in order, against the expected
    results.
    """
    # setup correct transcripts
    PATH_ABCDE = tuple([
        Exon(0, 100), Exon(200, 300), Exon(400, 500),
        Exon(600, 700), Exon(800, 900)
    ])
    PATH_ACE = tuple([Exon(0, 100), Exon(400, 500), Exon(800, 900)])
    PATH_ABCE = tuple(
        [Exon(0, 100), Exon(200, 300), Exon(400, 500), Exon(800, 900)])
    PATH_ACDE = tuple(
        [Exon(0, 100), Exon(400, 500), Exon(600, 700), Exon(800, 900)])
    # read transcripts
    transcripts = read_first_locus("assemble1.gtf", score_attr="score")
    tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)

    def assemble(scores, kmax):
        """set transcript scores, rebuild graphs, assemble the first one"""
        for t_id, score in scores:
            tdict[t_id].score = score
        GS = create_transcript_graphs('chr1', transcripts,
                                      create_bedgraph=False,
                                      bedgraph_filehs=None,
                                      min_trim_length=0,
                                      trim_utr_fraction=0.0,
                                      trim_intron_fraction=0.0)
        first = GS[0]
        return list(
            assemble_transcript_graph(first.Gsub, first.strand,
                                      first.partial_paths,
                                      user_kmax=kmax,
                                      ksensitivity=0,
                                      fraction_major_path=0,
                                      max_paths=1000))

    def check(results, expected):
        """compare assembled results with (path, score) pairs, in order"""
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(len(results), len(expected))
        for result, (path, score) in zip(results, expected):
            self.assertEqual(tuple(result.path), path)
            self.assertAlmostEqual(result.score, score, places=3)

    # assemble with kmax=2
    results = assemble(
        [("ABCDE", 2.0), ("ACE", 1.0), ("ABCE", 1.0), ("ACDE", 1.0)],
        kmax=2)
    check(results, [(PATH_ABCDE, 3.0), (PATH_ACE, 2.0)])

    # change transcript scores and assemble with kmax=3
    results = assemble(
        [("ABCDE", 4.0), ("ACE", 3.0), ("ABCE", 2.0), ("ACDE", 1.0)],
        kmax=3)
    check(results, [(PATH_ABCDE, 4.0), (PATH_ACE, 3.0),
                    (PATH_ABCE, 2.0), (PATH_ACDE, 1.0)])