def test_assembler1(self):
    """Assemble the ``assemble1.gtf`` locus with kmax=2 and kmax=3.

    Transcript scores are assigned through ``tmap`` after the transcript
    graph has been built. The graph is then pruned and assembled, and the
    resulting paths and scores are compared with the expected values.
    """
    # setup correct transcripts
    PATH_ABCDE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                  Exon(600, 700), Exon(800, 900))
    PATH_ACE = (Exon(0, 100), Exon(400, 500), Exon(800, 900))
    PATH_ABCE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                 Exon(800, 900))
    PATH_ACDE = (Exon(0, 100), Exon(400, 500), Exon(600, 700),
                 Exon(800, 900))
    # read transcripts
    transcripts = read_first_locus("assemble1.gtf", score_attr="score")
    G, tmap = get_transcript_graphs(transcripts)[POS_STRAND]

    def assemble(kmax):
        # prune with all trimming disabled, then assemble the first subgraph
        pruned = list(prune_transcript_graph(G, POS_STRAND, tmap,
                                             min_trim_length=0,
                                             trim_utr_fraction=0,
                                             trim_intron_fraction=0))
        Gsub, strand, partial_paths = pruned[0]
        return list(assemble_transcript_graph(Gsub, strand, partial_paths,
                                              user_kmax=kmax,
                                              ksensitivity=0,
                                              fraction_major_path=0,
                                              max_paths=1000))

    # first scenario: kmax=2
    tmap["ABCDE"].score = 2.0
    tmap["ACE"].score = 1.0
    tmap["ABCE"].score = 1.0
    tmap["ACDE"].score = 1.0
    results = assemble(2)
    self.assertEqual(len(results), 2)
    self.assertEqual(tuple(results[0].path), PATH_ABCDE)
    self.assertAlmostEqual(results[0].score, 3.0, places=3)
    self.assertEqual(tuple(results[1].path), PATH_ACE)
    self.assertAlmostEqual(results[1].score, 2.0, places=3)

    # second scenario: kmax=3 with distinct scores
    tmap["ABCDE"].score = 4.0
    tmap["ACE"].score = 3.0
    tmap["ABCE"].score = 2.0
    tmap["ACDE"].score = 1.0
    results = assemble(3)
    self.assertEqual(len(results), 4)
    self.assertEqual(tuple(results[0].path), PATH_ABCDE)
    self.assertAlmostEqual(results[0].score, 4.0, places=3)
    self.assertEqual(tuple(results[1].path), PATH_ACE)
    self.assertAlmostEqual(results[1].score, 3.0, places=3)
    self.assertEqual(tuple(results[2].path), PATH_ABCE)
    self.assertAlmostEqual(results[2].score, 2.0, places=3)
    self.assertEqual(tuple(results[3].path), PATH_ACDE)
    self.assertAlmostEqual(results[3].score, 1.0, places=3)
def test_test_transcripts(self):
    """Annotate ``annotate_test1.gtf`` and verify per-transcript attributes.

    For each transcript the ``GTFAttr.CATEGORY``, ``GTFAttr.ANN_REF_ID``
    and ``GTFAttr.TEST`` attributes written by ``annotate_locus`` are
    compared with the expected values.
    """
    transcripts = read_first_locus("annotate_test1.gtf")
    t_dict = {t.attrs['transcript_id']: t for t in transcripts}
    annotate_locus(transcripts, gtf_sample_attr="sample_id")
    # (transcript_id, CATEGORY, ANN_REF_ID, TEST)
    expected = [
        ('AA', 1, "B", "0"),
        ('BB', 0, "B", "1"),
        ('CC', 1, "C", "1"),
        ('DD', 3, "C", "1"),
        ('EE', 5, "C", "1"),
        ('FF', 6, "C", "1"),
    ]
    for transcript_id, category, ref_id, test_value in expected:
        t = t_dict[transcript_id]
        # assertEqual reports both values on failure, unlike assertTrue(a == b)
        self.assertEqual(t.attrs[GTFAttr.CATEGORY], category)
        self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], ref_id)
        self.assertEqual(t.attrs[GTFAttr.TEST], test_value)
def test_test_transcripts(self):
    """Check category, reference id and test attribute after annotation.

    Reads the first locus of ``annotate_test1.gtf``, runs
    ``annotate_locus`` on it, and asserts the annotated attributes of
    transcripts AA through FF.
    """
    transcripts = read_first_locus("annotate_test1.gtf")
    t_dict = dict((t.attrs["transcript_id"], t) for t in transcripts)
    annotate_locus(transcripts, gtf_sample_attr="sample_id")

    t = t_dict["AA"]
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 1)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], "B")
    self.assertEqual(t.attrs[GTFAttr.TEST], "0")

    t = t_dict["BB"]
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 0)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], "B")
    self.assertEqual(t.attrs[GTFAttr.TEST], "1")

    t = t_dict["CC"]
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 1)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], "C")
    self.assertEqual(t.attrs[GTFAttr.TEST], "1")

    t = t_dict["DD"]
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 3)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], "C")
    self.assertEqual(t.attrs[GTFAttr.TEST], "1")

    t = t_dict["EE"]
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 5)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], "C")
    self.assertEqual(t.attrs[GTFAttr.TEST], "1")

    t = t_dict["FF"]
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 6)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], "C")
    self.assertEqual(t.attrs[GTFAttr.TEST], "1")
def test_trim_intron_retention(self):
    """Check ``trim_graph`` output at increasing ``trim_intron_fraction``.

    Uses the positive-strand graph of ``trim_intron_retention1.gtf`` with
    UTR trimming disabled (``trim_utr_fraction=0.0``).
    """
    transcripts = read_first_locus("trim_intron_retention1.gtf",
                                   score_attr="FPKM")
    # only the graph is needed; the transcript map is ignored
    G, _ = get_transcript_graphs(transcripts)[POS_STRAND]
    # (trim_intron_fraction, expected trimmed nodes)
    cases = [
        (0.01, set()),
        (0.11, set([Exon(500, 1500)])),
        (0.21, set([Exon(500, 1500), Exon(2000, 9000)])),
        (1.0, set([Exon(500, 1500), Exon(2000, 9000)])),
    ]
    # trim at different thresholds
    for intron_fraction, correct in cases:
        trim_nodes = trim_graph(G, POS_STRAND,
                                min_trim_length=0,
                                trim_utr_fraction=0.0,
                                trim_intron_fraction=intron_fraction)
        self.assertEqual(trim_nodes, correct)
def test_trim_intron_retention(self):
    """Verify the nodes trimmed from ``trim_intron_retention1.gtf``.

    ``trim_graph`` is called four times on the positive-strand graph with
    ``trim_intron_fraction`` set to 0.01, 0.11, 0.21 and 1.0; the
    returned node set is compared with the expected set each time.
    """
    transcripts = read_first_locus("trim_intron_retention1.gtf",
                                   score_attr="FPKM")
    GG = get_transcript_graphs(transcripts)
    G = GG[POS_STRAND][0]

    # trim at different thresholds
    trim_nodes = trim_graph(G, POS_STRAND, min_trim_length=0,
                            trim_utr_fraction=0.0,
                            trim_intron_fraction=0.01)
    self.assertEqual(trim_nodes, set())

    trim_nodes = trim_graph(G, POS_STRAND, min_trim_length=0,
                            trim_utr_fraction=0.0,
                            trim_intron_fraction=0.11)
    self.assertEqual(trim_nodes, set([Exon(500, 1500)]))

    trim_nodes = trim_graph(G, POS_STRAND, min_trim_length=0,
                            trim_utr_fraction=0.0,
                            trim_intron_fraction=0.21)
    self.assertEqual(trim_nodes, set([Exon(500, 1500), Exon(2000, 9000)]))

    trim_nodes = trim_graph(G, POS_STRAND, min_trim_length=0,
                            trim_utr_fraction=0.0,
                            trim_intron_fraction=1.0)
    self.assertEqual(trim_nodes, set([Exon(500, 1500), Exon(2000, 9000)]))
def test_resolve_strand_scores1(self):
    """Transcripts AA and BB must keep ``NO_STRAND`` after partitioning.

    Runs ``partition_transcripts_by_strand`` on the first locus of
    ``resolve_strand_scores1.gtf``.
    """
    transcripts = read_first_locus("resolve_strand_scores1.gtf",
                                   score_attr="FPKM")
    tdict = {t.attrs['transcript_id']: t for t in transcripts}
    partition_transcripts_by_strand(transcripts)
    # check strands
    for transcript_id in ("AA", "BB"):
        self.assertEqual(tdict[transcript_id].strand, NO_STRAND)
def test_resolve_strand_scores1(self):
    """Check that partitioning leaves AA and BB with ``NO_STRAND``.

    The input is the first locus of ``resolve_strand_scores1.gtf`` read
    with ``score_attr="FPKM"``.
    """
    transcripts = read_first_locus("resolve_strand_scores1.gtf",
                                   score_attr="FPKM")
    tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)
    partition_transcripts_by_strand(transcripts)
    # check strands
    self.assertEqual(tdict["AA"].strand, NO_STRAND)
    self.assertEqual(tdict["BB"].strand, NO_STRAND)
def test_resolve_strand_scores3(self):
    """Check strand assignment on ``resolve_strand_scores3.gtf``.

    First pass: after partitioning, CC and EE are expected on the
    positive strand and DD on the negative strand. Second pass: the
    locus is re-read, DD's score is raised by 1.0, and EE is then
    expected on the negative strand.
    """
    def load():
        # fresh transcripts plus an id -> transcript lookup
        loaded = read_first_locus("resolve_strand_scores3.gtf",
                                  score_attr="FPKM")
        return loaded, {t.attrs['transcript_id']: t for t in loaded}

    transcripts, tdict = load()
    partition_transcripts_by_strand(transcripts)
    # check strands
    self.assertEqual(tdict['CC'].strand, POS_STRAND)
    self.assertEqual(tdict['DD'].strand, NEG_STRAND)
    self.assertEqual(tdict['EE'].strand, POS_STRAND)

    # increase score on negative strand
    transcripts, tdict = load()
    tdict['DD'].score += 1.0
    partition_transcripts_by_strand(transcripts)
    # check strands
    self.assertEqual(tdict['EE'].strand, NEG_STRAND)
def test_trim_bidir(self):
    """Check ``trim_graph`` UTR trimming on ``trim_bidir1.gtf`` for both strands.

    The same three ``trim_utr_fraction`` thresholds and expected node
    sets are checked first on the positive-strand graph, then again after
    every transcript's strand has been set to ``NEG_STRAND``.
    """
    transcripts = read_first_locus("trim_bidir1.gtf", score_attr="FPKM")
    # (trim_utr_fraction, expected trimmed nodes)
    cases = [
        (0.015, set([Exon(0, 100), Exon(900, 1000)])),
        (0.11, set([Exon(0, 100), Exon(900, 1000),
                    Exon(100, 200), Exon(800, 900)])),
        (0.26, set([Exon(0, 100), Exon(900, 1000),
                    Exon(100, 200), Exon(800, 900),
                    Exon(200, 300), Exon(700, 800)])),
    ]

    def check_strand(strand):
        # rebuild the graphs so the current transcript strands are used
        G, _ = get_transcript_graphs(transcripts)[strand]
        # trim at three different thresholds
        for utr_fraction, correct in cases:
            trim_nodes = trim_graph(G, strand,
                                    min_trim_length=0,
                                    trim_utr_fraction=utr_fraction,
                                    trim_intron_fraction=0.0)
            self.assertEqual(trim_nodes, correct)

    check_strand(POS_STRAND)
    # flip sign of transcripts and try again
    for t in transcripts:
        t.strand = NEG_STRAND
    check_strand(NEG_STRAND)
def test_resolve_strand_scores3(self):
    """Verify strand resolution before and after raising DD's score.

    Uses the first locus of ``resolve_strand_scores3.gtf``. The locus is
    read twice so the second partitioning starts from fresh transcripts.
    """
    transcripts = read_first_locus("resolve_strand_scores3.gtf",
                                   score_attr="FPKM")
    tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)
    partition_transcripts_by_strand(transcripts)
    # check strands
    expected_strands = (('CC', POS_STRAND),
                        ('DD', NEG_STRAND),
                        ('EE', POS_STRAND))
    for transcript_id, strand in expected_strands:
        self.assertEqual(tdict[transcript_id].strand, strand)

    # increase score on negative strand
    transcripts = read_first_locus("resolve_strand_scores3.gtf",
                                   score_attr="FPKM")
    tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)
    tdict['DD'].score += 1.0
    partition_transcripts_by_strand(transcripts)
    # check strands
    self.assertEqual(tdict['EE'].strand, NEG_STRAND)
def test_find_best_match(self):
    """Check the reference match chosen for T1 and T2.

    After ``annotate_locus`` runs on ``annotate_best_match1.gtf``, each
    transcript's category, matched reference id, intron ratio and
    coverage ratio are compared with expected values (ratios to two
    decimal places).
    """
    transcripts = read_first_locus("annotate_best_match1.gtf")
    t_dict = {t.attrs['transcript_id']: t for t in transcripts}
    annotate_locus(transcripts, gtf_sample_attr="sample_id")
    # (transcript_id, CATEGORY, ANN_REF_ID, ANN_INTRON_RATIO, ANN_COV_RATIO)
    expected = [
        ('T1', 0, 'D', 1.0, 0.9375),
        ('T2', 0, 'B', 0.25, 0.6744),
    ]
    for transcript_id, category, ref_id, intron_ratio, cov_ratio in expected:
        t = t_dict[transcript_id]
        self.assertEqual(t.attrs[GTFAttr.CATEGORY], category)
        self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], ref_id)
        self.assertAlmostEqual(t.attrs[GTFAttr.ANN_INTRON_RATIO],
                               intron_ratio, 2)
        self.assertAlmostEqual(t.attrs[GTFAttr.ANN_COV_RATIO],
                               cov_ratio, 2)
def test_find_best_match(self):
    """Verify annotation attributes of T1 and T2 in ``annotate_best_match1.gtf``.

    T1 is expected to match reference "D" and T2 reference "B"; both are
    expected in category 0.
    """
    transcripts = read_first_locus("annotate_best_match1.gtf")
    t_dict = dict((t.attrs["transcript_id"], t) for t in transcripts)
    annotate_locus(transcripts, gtf_sample_attr="sample_id")

    t = t_dict["T1"]
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 0)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], "D")
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_INTRON_RATIO], 1.0, 2)
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_COV_RATIO], 0.9375, 2)

    t = t_dict["T2"]
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 0)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], "B")
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_INTRON_RATIO], 0.25, 2)
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_COV_RATIO], 0.6744, 2)
def test_intergenic(self):
    """Check annotation of transcripts T1 and F in ``annotate_intergenic1.gtf``.

    Both transcripts are expected to carry the same values: category 6,
    reference id "na", zero intron and coverage ratios, mean recurrence
    2.0 and mean score 20.0.
    """
    transcripts = read_first_locus("annotate_intergenic1.gtf")
    t_dict = {t.attrs["transcript_id"]: t for t in transcripts}
    annotate_locus(transcripts, gtf_sample_attr="sample_id")
    for transcript_id in ("T1", "F"):
        t = t_dict[transcript_id]
        self.assertEqual(t.attrs[GTFAttr.CATEGORY], 6)
        self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], "na")
        self.assertAlmostEqual(t.attrs[GTFAttr.ANN_INTRON_RATIO], 0.0, 2)
        self.assertAlmostEqual(t.attrs[GTFAttr.ANN_COV_RATIO], 0.0, 2)
        self.assertAlmostEqual(t.attrs[GTFAttr.MEAN_RECURRENCE], 2.0, 2)
        self.assertAlmostEqual(t.attrs[GTFAttr.MEAN_SCORE], 20.0, 2)
def test_intergenic(self):
    """Verify the attributes ``annotate_locus`` assigns to T1 and F.

    Input is the first locus of ``annotate_intergenic1.gtf``; float
    attributes are compared to two decimal places.
    """
    transcripts = read_first_locus("annotate_intergenic1.gtf")
    t_dict = dict((t.attrs['transcript_id'], t) for t in transcripts)
    annotate_locus(transcripts, gtf_sample_attr="sample_id")

    t = t_dict['T1']
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 6)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], 'na')
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_INTRON_RATIO], 0.0, 2)
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_COV_RATIO], 0.0, 2)
    self.assertAlmostEqual(t.attrs[GTFAttr.MEAN_RECURRENCE], 2.0, 2)
    self.assertAlmostEqual(t.attrs[GTFAttr.MEAN_SCORE], 20.0, 2)

    t = t_dict['F']
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 6)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], 'na')
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_INTRON_RATIO], 0.0, 2)
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_COV_RATIO], 0.0, 2)
    self.assertAlmostEqual(t.attrs[GTFAttr.MEAN_RECURRENCE], 2.0, 2)
    self.assertAlmostEqual(t.attrs[GTFAttr.MEAN_SCORE], 20.0, 2)
def test_resolve_strand_ref(self):
    """Check the strands assigned to T1 through T5 in ``resolve_strand_ref1.gtf``.

    ``partition_transcripts_by_strand`` is run on the locus and each
    transcript's ``strand`` attribute is compared with the expected one.
    """
    transcripts = read_first_locus("resolve_strand_ref1.gtf",
                                   score_attr="FPKM")
    tdict = {t.attrs['transcript_id']: t for t in transcripts}
    partition_transcripts_by_strand(transcripts)
    # check resolved strands
    expected_strands = [
        ("T1", POS_STRAND),
        ("T2", NEG_STRAND),
        # equal overlap on both strands, default to positive
        ("T3", POS_STRAND),
        # more positive strand overlap
        ("T4", POS_STRAND),
        # more negative strand overlap
        ("T5", NEG_STRAND),
    ]
    for transcript_id, strand in expected_strands:
        self.assertEqual(tdict[transcript_id].strand, strand)
def test_resolve_strand_ref(self):
    """Verify strand resolution for the ``resolve_strand_ref1.gtf`` locus.

    After partitioning, T1, T3 and T4 are expected on the positive
    strand, and T2 and T5 on the negative strand.
    """
    transcripts = read_first_locus("resolve_strand_ref1.gtf",
                                   score_attr="FPKM")
    tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)
    partition_transcripts_by_strand(transcripts)
    # check resolved strands
    self.assertEqual(tdict["T1"].strand, POS_STRAND)
    self.assertEqual(tdict["T2"].strand, NEG_STRAND)
    # equal overlap on both strands, default to positive
    self.assertEqual(tdict["T3"].strand, POS_STRAND)
    # more positive strand overlap
    self.assertEqual(tdict["T4"].strand, POS_STRAND)
    # more negative strand overlap
    self.assertEqual(tdict["T5"].strand, NEG_STRAND)
def test_categories(self):
    """Check the categories ``annotate_locus`` assigns in ``annotate_category1.gtf``.

    T2 to T6 are checked for category only. T7 to T10 are also checked
    for the matched reference id and the intron and coverage ratios
    where the original test asserted them.
    """
    transcripts = read_first_locus("annotate_category1.gtf")
    t_dict = {t.attrs["transcript_id"]: t for t in transcripts}
    annotate_locus(transcripts, gtf_sample_attr="sample_id")

    def category_of(transcript_id):
        return t_dict[transcript_id].attrs[GTFAttr.CATEGORY]

    # intronic same strand
    self.assertEqual(category_of("T2"), 2)
    # intronic opposite strand
    self.assertEqual(category_of("T3"), 3)
    # intronic ambiguous
    self.assertEqual(category_of("T6"), 4)
    # interleaving
    self.assertEqual(category_of("T4"), 5)
    # interleaving
    # NOTE(review): T5 expects the same value as the "intronic opposite
    # strand" case above; the "interleaving" label may be stale - confirm.
    self.assertEqual(category_of("T5"), 3)

    # opp strand overlap (no introns)
    t = t_dict["T7"]
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 1)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], "T1")
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_COV_RATIO], 0.2, 2)

    # (comment, transcript_id, ANN_INTRON_RATIO, ANN_COV_RATIO); all of
    # these expect category 0 and reference "T1"
    same_strand_cases = [
        ("same strand overlap (no introns)", "T8", 0.0, 0.10),
        ("same strand overlap (with introns)", "T9", 0.4, 2.0 / 3),
        ("another same strand overlap (with introns)", "T10", 0.5, 0.6),
    ]
    for _, transcript_id, intron_ratio, cov_ratio in same_strand_cases:
        t = t_dict[transcript_id]
        self.assertEqual(t.attrs[GTFAttr.CATEGORY], 0)
        self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], "T1")
        self.assertAlmostEqual(t.attrs[GTFAttr.ANN_INTRON_RATIO],
                               intron_ratio, 2)
        self.assertAlmostEqual(t.attrs[GTFAttr.ANN_COV_RATIO],
                               cov_ratio, 2)
def test_categories(self):
    """Verify annotation categories and match statistics for T2 through T10.

    Reads the first locus of ``annotate_category1.gtf``, annotates it
    with ``annotate_locus`` and asserts the resulting attributes.
    """
    transcripts = read_first_locus("annotate_category1.gtf")
    t_dict = dict((t.attrs['transcript_id'], t) for t in transcripts)
    annotate_locus(transcripts, gtf_sample_attr="sample_id")

    # intronic same strand
    self.assertEqual(t_dict['T2'].attrs[GTFAttr.CATEGORY], 2)
    # intronic opposite strand
    self.assertEqual(t_dict['T3'].attrs[GTFAttr.CATEGORY], 3)
    # intronic ambiguous
    self.assertEqual(t_dict['T6'].attrs[GTFAttr.CATEGORY], 4)
    # interleaving
    self.assertEqual(t_dict['T4'].attrs[GTFAttr.CATEGORY], 5)
    # interleaving
    self.assertEqual(t_dict['T5'].attrs[GTFAttr.CATEGORY], 3)

    # opp strand overlap (no introns)
    t = t_dict['T7']
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 1)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], 'T1')
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_COV_RATIO], 0.2, 2)

    # same strand overlap (no introns)
    t = t_dict['T8']
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 0)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], 'T1')
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_INTRON_RATIO], 0.0, 2)
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_COV_RATIO], 0.10, 2)

    # same strand overlap (with introns)
    t = t_dict['T9']
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 0)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], 'T1')
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_INTRON_RATIO], 0.4, 2)
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_COV_RATIO], 2. / 3, 2)

    # another same strand overlap (with introns)
    t = t_dict['T10']
    self.assertEqual(t.attrs[GTFAttr.CATEGORY], 0)
    self.assertEqual(t.attrs[GTFAttr.ANN_REF_ID], 'T1')
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_INTRON_RATIO], 0.5, 2)
    self.assertAlmostEqual(t.attrs[GTFAttr.ANN_COV_RATIO], 0.6, 2)
def test_assembler1(self):
    """Assemble ``assemble1.gtf`` via ``create_transcript_graphs`` at kmax=2 and 3.

    Scores are set directly on the transcripts, the graphs are rebuilt
    with all trimming disabled, and the assembled paths and scores of
    the first graph are compared with the expected values.
    """
    # setup correct transcripts
    PATH_ABCDE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                  Exon(600, 700), Exon(800, 900))
    PATH_ACE = (Exon(0, 100), Exon(400, 500), Exon(800, 900))
    PATH_ABCE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                 Exon(800, 900))
    PATH_ACDE = (Exon(0, 100), Exon(400, 500), Exon(600, 700),
                 Exon(800, 900))
    # read transcripts
    transcripts = read_first_locus("assemble1.gtf", score_attr="score")
    tdict = {t.attrs['transcript_id']: t for t in transcripts}

    def build_and_assemble(kmax):
        # create graphs from the current transcript scores
        GS = create_transcript_graphs('chr1', transcripts,
                                      create_bedgraph=False,
                                      bedgraph_filehs=None,
                                      min_trim_length=0,
                                      trim_utr_fraction=0.0,
                                      trim_intron_fraction=0.0)
        first = GS[0]
        return list(assemble_transcript_graph(first.Gsub, first.strand,
                                              first.partial_paths,
                                              user_kmax=kmax,
                                              ksensitivity=0,
                                              fraction_major_path=0,
                                              max_paths=1000))

    # set transcript scores
    tdict["ABCDE"].score = 2.0
    tdict["ACE"].score = 1.0
    tdict["ABCE"].score = 1.0
    tdict["ACDE"].score = 1.0
    # assemble with kmax=2
    results = build_and_assemble(2)
    self.assertEqual(len(results), 2)
    self.assertEqual(tuple(results[0].path), PATH_ABCDE)
    self.assertAlmostEqual(results[0].score, 3.0, places=3)
    self.assertEqual(tuple(results[1].path), PATH_ACE)
    self.assertAlmostEqual(results[1].score, 2.0, places=3)

    # change transcript scores
    tdict["ABCDE"].score = 4.0
    tdict["ACE"].score = 3.0
    tdict["ABCE"].score = 2.0
    tdict["ACDE"].score = 1.0
    # assemble with kmax=3
    results = build_and_assemble(3)
    self.assertEqual(len(results), 4)
    self.assertEqual(tuple(results[0].path), PATH_ABCDE)
    self.assertAlmostEqual(results[0].score, 4.0, places=3)
    self.assertEqual(tuple(results[1].path), PATH_ACE)
    self.assertAlmostEqual(results[1].score, 3.0, places=3)
    self.assertEqual(tuple(results[2].path), PATH_ABCE)
    self.assertAlmostEqual(results[2].score, 2.0, places=3)
    self.assertEqual(tuple(results[3].path), PATH_ACDE)
    self.assertAlmostEqual(results[3].score, 1.0, places=3)
def test_trim_intronic_utr(self):
    """Check ``trim_graph`` intron trimming on ``trim_intron_utr1.gtf`` for both strands.

    Five ``trim_intron_fraction`` thresholds are checked on the
    positive-strand graph, then again after every transcript's strand is
    set to ``NEG_STRAND``; the expected node sets are the same for both.
    """
    transcripts = read_first_locus("trim_intron_utr1.gtf",
                                   score_attr="FPKM")
    # (trim_intron_fraction, expected trimmed nodes)
    cases = [
        (0.001, set()),
        (0.011, set([Exon(1000, 1100)])),
        (0.055, set([Exon(1000, 1100), Exon(1100, 1200)])),
        (0.15, set([Exon(1000, 1100), Exon(1100, 1200),
                    Exon(1200, 1300)])),
        (1.0, set([Exon(1000, 1100), Exon(1100, 1200),
                   Exon(1200, 1300), Exon(1300, 1500)])),
    ]

    def check_strand(strand):
        # rebuild the graphs so the current transcript strands are used
        G, _ = get_transcript_graphs(transcripts)[strand]
        # trim at different thresholds
        for intron_fraction, correct in cases:
            trim_nodes = trim_graph(G, strand,
                                    min_trim_length=0,
                                    trim_utr_fraction=0.0,
                                    trim_intron_fraction=intron_fraction)
            self.assertEqual(trim_nodes, correct)

    check_strand(POS_STRAND)
    # flip sign of transcripts and try again
    for t in transcripts:
        t.strand = NEG_STRAND
    check_strand(NEG_STRAND)
def test_trim_intron_bidir(self):
    """Check ``trim_graph`` intron trimming on ``trim_intron_bidir1.gtf`` for both strands.

    Four ``trim_intron_fraction`` thresholds are applied with UTR
    trimming disabled. The checks run on the positive-strand graph and
    are repeated after all transcripts are reassigned to ``NEG_STRAND``.
    """
    transcripts = read_first_locus("trim_intron_bidir1.gtf",
                                   score_attr="FPKM")
    # (trim_intron_fraction, expected trimmed nodes)
    thresholds = [
        (0.001, set()),
        (0.025, set([Exon(1900, 2000), Exon(1000, 1100)])),
        (0.2, set([Exon(1900, 2000), Exon(1100, 1200),
                   Exon(1800, 1900), Exon(1000, 1100)])),
        (0.25, set([Exon(1900, 2000), Exon(1100, 1200),
                    Exon(1200, 1300), Exon(1700, 1800),
                    Exon(1800, 1900), Exon(1000, 1100)])),
    ]
    for strand in (POS_STRAND, NEG_STRAND):
        if strand == NEG_STRAND:
            # flip sign of transcripts and try again
            for t in transcripts:
                t.strand = NEG_STRAND
        GG = get_transcript_graphs(transcripts)
        G = GG[strand][0]
        # trim at different thresholds
        for intron_fraction, correct in thresholds:
            trim_nodes = trim_graph(G, strand,
                                    min_trim_length=0,
                                    trim_utr_fraction=0.0,
                                    trim_intron_fraction=intron_fraction)
            self.assertEqual(trim_nodes, correct)
def test_assembler1(self):
    """Check assembled paths and scores for ``assemble1.gtf``.

    Two scenarios are run, each with its own transcript scores and
    ``user_kmax``. For each, the graphs are rebuilt with
    ``create_transcript_graphs`` and the first graph is assembled.
    """
    # setup correct transcripts
    PATH_ABCDE = tuple([Exon(0, 100), Exon(200, 300), Exon(400, 500),
                        Exon(600, 700), Exon(800, 900)])
    PATH_ACE = tuple([Exon(0, 100), Exon(400, 500), Exon(800, 900)])
    PATH_ABCE = tuple([Exon(0, 100), Exon(200, 300), Exon(400, 500),
                       Exon(800, 900)])
    PATH_ACDE = tuple([Exon(0, 100), Exon(400, 500), Exon(600, 700),
                       Exon(800, 900)])
    # read transcripts
    transcripts = read_first_locus("assemble1.gtf", score_attr="score")
    tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)

    # (kmax, transcript scores, expected (path, score) results in order)
    scenarios = [
        (2,
         [("ABCDE", 2.0), ("ACE", 1.0), ("ABCE", 1.0), ("ACDE", 1.0)],
         [(PATH_ABCDE, 3.0), (PATH_ACE, 2.0)]),
        (3,
         [("ABCDE", 4.0), ("ACE", 3.0), ("ABCE", 2.0), ("ACDE", 1.0)],
         [(PATH_ABCDE, 4.0), (PATH_ACE, 3.0),
          (PATH_ABCE, 2.0), (PATH_ACDE, 1.0)]),
    ]
    for kmax, scores, expected in scenarios:
        # set transcript scores
        for transcript_id, score in scores:
            tdict[transcript_id].score = score
        # create graphs
        GS = create_transcript_graphs('chr1', transcripts,
                                      create_bedgraph=False,
                                      bedgraph_filehs=None,
                                      min_trim_length=0,
                                      trim_utr_fraction=0.0,
                                      trim_intron_fraction=0.0)
        Gsub = GS[0].Gsub
        strand = GS[0].strand
        partial_paths = GS[0].partial_paths
        # assemble
        results = list(
            assemble_transcript_graph(Gsub, strand, partial_paths,
                                      user_kmax=kmax,
                                      ksensitivity=0,
                                      fraction_major_path=0,
                                      max_paths=1000))
        self.assertEqual(len(results), len(expected))
        for result, (path, score) in zip(results, expected):
            self.assertEqual(tuple(result.path), path)
            self.assertAlmostEqual(result.score, score, places=3)
def test_assembler1(self):
    """Check assembled paths and scores for ``assemble1.gtf`` via the prune pipeline.

    The positive-strand graph is built once. For each scenario the
    scores are set through ``tmap``, the graph is pruned with trimming
    disabled, and the first pruned subgraph is assembled.
    """
    # setup correct transcripts
    PATH_ABCDE = tuple([Exon(0, 100), Exon(200, 300), Exon(400, 500),
                        Exon(600, 700), Exon(800, 900)])
    PATH_ACE = tuple([Exon(0, 100), Exon(400, 500), Exon(800, 900)])
    PATH_ABCE = tuple([Exon(0, 100), Exon(200, 300), Exon(400, 500),
                       Exon(800, 900)])
    PATH_ACDE = tuple([Exon(0, 100), Exon(400, 500), Exon(600, 700),
                       Exon(800, 900)])
    # read transcripts
    transcripts = read_first_locus("assemble1.gtf", score_attr="score")
    GG = get_transcript_graphs(transcripts)
    G, tmap = GG[POS_STRAND]

    # (kmax, transcript scores, expected (path, score) results in order)
    scenarios = [
        (2,
         [("ABCDE", 2.0), ("ACE", 1.0), ("ABCE", 1.0), ("ACDE", 1.0)],
         [(PATH_ABCDE, 3.0), (PATH_ACE, 2.0)]),
        (3,
         [("ABCDE", 4.0), ("ACE", 3.0), ("ABCE", 2.0), ("ACDE", 1.0)],
         [(PATH_ABCDE, 4.0), (PATH_ACE, 3.0),
          (PATH_ABCE, 2.0), (PATH_ACDE, 1.0)]),
    ]
    for kmax, scores, expected in scenarios:
        # set transcript scores
        for transcript_id, score in scores:
            tmap[transcript_id].score = score
        # assemble
        GS = list(
            prune_transcript_graph(G, POS_STRAND, tmap,
                                   min_trim_length=0,
                                   trim_utr_fraction=0,
                                   trim_intron_fraction=0))
        Gsub, strand, partial_paths = GS[0]
        results = list(
            assemble_transcript_graph(Gsub, strand, partial_paths,
                                      user_kmax=kmax,
                                      ksensitivity=0,
                                      fraction_major_path=0,
                                      max_paths=1000))
        self.assertEqual(len(results), len(expected))
        for result, (path, score) in zip(results, expected):
            self.assertEqual(tuple(result.path), path)
            self.assertAlmostEqual(result.score, score, places=3)