Beispiel #1
0
 def test_assembler1(self):
     """Assemble the first locus of ``assemble1.gtf`` under two score sets.

     With ``user_kmax=2`` and scores 2/1/1/1 the test expects two paths
     (ABCDE scored 3.0 and ACE scored 2.0).  With ``user_kmax=3`` and
     scores 4/3/2/1 it expects all four paths back with their own scores.
     """
     # expected paths, named after the exons (A-E) they visit
     A, B, C, D, E = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                      Exon(600, 700), Exon(800, 900))
     PATH_ABCDE = (A, B, C, D, E)
     PATH_ACE = (A, C, E)
     PATH_ABCE = (A, B, C, E)
     PATH_ACDE = (A, C, D, E)
     # read transcripts and build the positive-strand graph
     transcripts = read_first_locus("assemble1.gtf", score_attr="score")
     G, tmap = get_transcript_graphs(transcripts)[POS_STRAND]

     def assemble(scores, kmax):
         # overwrite the transcript scores, prune with every trimming
         # parameter set to zero, then assemble the first pruned subgraph
         for transcript_id, score in scores:
             tmap[transcript_id].score = score
         pruned = list(prune_transcript_graph(G, POS_STRAND, tmap,
                                              min_trim_length=0,
                                              trim_utr_fraction=0,
                                              trim_intron_fraction=0))
         Gsub, strand, partial_paths = pruned[0]
         return list(assemble_transcript_graph(Gsub, strand, partial_paths,
                                               user_kmax=kmax,
                                               ksensitivity=0,
                                               fraction_major_path=0,
                                               max_paths=1000))

     def check(results, expected):
         # results must match the expected (path, score) pairs in order
         self.assertEqual(len(results), len(expected))
         for result, (path, score) in zip(results, expected):
             self.assertEqual(tuple(result.path), path)
             self.assertAlmostEqual(result.score, score, places=3)

     # first scoring scheme, kmax = 2
     results = assemble([("ABCDE", 2.0), ("ACE", 1.0),
                         ("ABCE", 1.0), ("ACDE", 1.0)], kmax=2)
     check(results, [(PATH_ABCDE, 3.0), (PATH_ACE, 2.0)])
     # second scoring scheme, kmax = 3
     results = assemble([("ABCDE", 4.0), ("ACE", 3.0),
                         ("ABCE", 2.0), ("ACDE", 1.0)], kmax=3)
     check(results, [(PATH_ABCDE, 4.0), (PATH_ACE, 3.0),
                     (PATH_ABCE, 2.0), (PATH_ACDE, 1.0)])
 def test_test_transcripts(self):
     """Check category, reference id and test flag set by annotate_locus.

     Reads the first locus of ``annotate_test1.gtf``, annotates it and
     compares three attributes of transcripts AA..FF with expected values.
     """
     transcripts = read_first_locus("annotate_test1.gtf")
     t_dict = dict((t.attrs['transcript_id'], t) for t in transcripts)
     annotate_locus(transcripts, gtf_sample_attr="sample_id")
     # (transcript_id, CATEGORY, ANN_REF_ID, TEST)
     expected = [('AA', 1, "B", "0"),
                 ('BB', 0, "B", "1"),
                 ('CC', 1, "C", "1"),
                 ('DD', 3, "C", "1"),
                 ('EE', 5, "C", "1"),
                 ('FF', 6, "C", "1")]
     for transcript_id, category, ref_id, test_flag in expected:
         attrs = t_dict[transcript_id].attrs
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(attrs[GTFAttr.CATEGORY], category)
         self.assertEqual(attrs[GTFAttr.ANN_REF_ID], ref_id)
         self.assertEqual(attrs[GTFAttr.TEST], test_flag)
 def test_test_transcripts(self):
     """Check category, reference id and test flag set by annotate_locus.

     Reads the first locus of ``annotate_test1.gtf``, annotates it and
     compares three attributes of transcripts AA..FF with expected values.
     """
     transcripts = read_first_locus("annotate_test1.gtf")
     t_dict = dict((t.attrs["transcript_id"], t) for t in transcripts)
     annotate_locus(transcripts, gtf_sample_attr="sample_id")
     # (transcript_id, CATEGORY, ANN_REF_ID, TEST)
     expected = [
         ("AA", 1, "B", "0"),
         ("BB", 0, "B", "1"),
         ("CC", 1, "C", "1"),
         ("DD", 3, "C", "1"),
         ("EE", 5, "C", "1"),
         ("FF", 6, "C", "1"),
     ]
     for transcript_id, category, ref_id, test_flag in expected:
         attrs = t_dict[transcript_id].attrs
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(attrs[GTFAttr.CATEGORY], category)
         self.assertEqual(attrs[GTFAttr.ANN_REF_ID], ref_id)
         self.assertEqual(attrs[GTFAttr.TEST], test_flag)
Beispiel #4
0
 def test_trim_intron_retention(self):
     """Check trim_graph output as trim_intron_fraction increases.

     Uses the positive-strand graph of ``trim_intron_retention1.gtf`` with
     ``min_trim_length=0`` and ``trim_utr_fraction=0.0`` throughout.
     """
     transcripts = read_first_locus("trim_intron_retention1.gtf", score_attr="FPKM")
     G, _ = get_transcript_graphs(transcripts)[POS_STRAND]
     # (trim_intron_fraction, nodes expected to be trimmed)
     cases = [(0.01, set()),
              (0.11, set([Exon(500, 1500)])),
              (0.21, set([Exon(500, 1500), Exon(2000, 9000)])),
              (1.0, set([Exon(500, 1500), Exon(2000, 9000)]))]
     for fraction, correct in cases:
         trim_nodes = trim_graph(G, POS_STRAND,
                                 min_trim_length=0,
                                 trim_utr_fraction=0.0,
                                 trim_intron_fraction=fraction)
         # assertEqual shows the differing nodes when the sets do not match
         self.assertEqual(trim_nodes, correct)
Beispiel #5
0
 def test_trim_intron_retention(self):
     """Check trim_graph output as trim_intron_fraction increases.

     Uses the positive-strand graph of ``trim_intron_retention1.gtf`` with
     ``min_trim_length=0`` and ``trim_utr_fraction=0.0`` throughout.
     """
     transcripts = read_first_locus("trim_intron_retention1.gtf",
                                    score_attr="FPKM")
     G, _ = get_transcript_graphs(transcripts)[POS_STRAND]
     # (trim_intron_fraction, nodes expected to be trimmed)
     cases = [
         (0.01, set()),
         (0.11, set([Exon(500, 1500)])),
         (0.21, set([Exon(500, 1500), Exon(2000, 9000)])),
         (1.0, set([Exon(500, 1500), Exon(2000, 9000)])),
     ]
     for fraction, correct in cases:
         trim_nodes = trim_graph(G,
                                 POS_STRAND,
                                 min_trim_length=0,
                                 trim_utr_fraction=0.0,
                                 trim_intron_fraction=fraction)
         # assertEqual shows the differing nodes when the sets do not match
         self.assertEqual(trim_nodes, correct)
 def test_resolve_strand_scores1(self):
     """Expect AA and BB to remain NO_STRAND after strand partitioning.

     Transcripts come from the first locus of ``resolve_strand_scores1.gtf``
     with scores read from the ``FPKM`` attribute.
     """
     transcripts = read_first_locus("resolve_strand_scores1.gtf", score_attr="FPKM")
     tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)
     partition_transcripts_by_strand(transcripts)
     # check strands; assertEqual reports the actual strand on failure
     for transcript_id in ("AA", "BB"):
         self.assertEqual(tdict[transcript_id].strand, NO_STRAND)
Beispiel #7
0
 def test_resolve_strand_scores1(self):
     """Expect AA and BB to remain NO_STRAND after strand partitioning.

     Transcripts come from the first locus of ``resolve_strand_scores1.gtf``
     with scores read from the ``FPKM`` attribute.
     """
     transcripts = read_first_locus("resolve_strand_scores1.gtf",
                                    score_attr="FPKM")
     tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)
     partition_transcripts_by_strand(transcripts)
     # check strands; assertEqual reports the actual strand on failure
     for transcript_id in ("AA", "BB"):
         self.assertEqual(tdict[transcript_id].strand, NO_STRAND)
 def test_resolve_strand_scores3(self):
     """Check strand assignment before and after raising DD's score.

     With the scores from ``resolve_strand_scores3.gtf`` the test expects
     CC and EE on the positive strand and DD on the negative strand.  After
     reloading and adding 1.0 to DD's score, EE is expected on the negative
     strand.
     """
     transcripts = read_first_locus("resolve_strand_scores3.gtf", score_attr="FPKM")
     tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)
     partition_transcripts_by_strand(transcripts)
     # check strands
     self.assertEqual(tdict['CC'].strand, POS_STRAND)
     self.assertEqual(tdict['DD'].strand, NEG_STRAND)
     self.assertEqual(tdict['EE'].strand, POS_STRAND)
     # increase score on negative strand (on a freshly loaded locus)
     transcripts = read_first_locus("resolve_strand_scores3.gtf", score_attr="FPKM")
     tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)
     tdict['DD'].score += 1.0
     partition_transcripts_by_strand(transcripts)
     # check strands
     self.assertEqual(tdict['EE'].strand, NEG_STRAND)
Beispiel #9
0
 def test_trim_bidir(self):
     """Check trim_graph output as trim_utr_fraction increases, per strand.

     The same expectations are checked on the positive-strand graph of
     ``trim_bidir1.gtf`` and again after every transcript has been
     reassigned to the negative strand.  ``min_trim_length`` is 0 and
     ``trim_intron_fraction`` is 0.0 throughout.
     """
     transcripts = read_first_locus("trim_bidir1.gtf", score_attr="FPKM")
     # (trim_utr_fraction, nodes expected to be trimmed)
     cases = [(0.015, set([Exon(0, 100), Exon(900, 1000)])),
              (0.11, set([Exon(0, 100), Exon(900, 1000),
                          Exon(100, 200), Exon(800, 900)])),
              (0.26, set([Exon(0, 100), Exon(900, 1000),
                          Exon(100, 200), Exon(800, 900),
                          Exon(200, 300), Exon(700, 800)]))]

     def check_strand(strand):
         # rebuild the graphs and trim at three different thresholds
         G, _ = get_transcript_graphs(transcripts)[strand]
         for fraction, correct in cases:
             trim_nodes = trim_graph(G, strand,
                                     min_trim_length=0,
                                     trim_utr_fraction=fraction,
                                     trim_intron_fraction=0.0)
             # assertEqual shows the differing nodes on failure
             self.assertEqual(trim_nodes, correct)

     check_strand(POS_STRAND)
     # flip sign of transcripts and try again
     for t in transcripts:
         t.strand = NEG_STRAND
     check_strand(NEG_STRAND)
Beispiel #10
0
 def test_resolve_strand_scores3(self):
     """Check strand assignment before and after raising DD's score.

     With the scores from ``resolve_strand_scores3.gtf`` the test expects
     CC and EE on the positive strand and DD on the negative strand.  After
     reloading and adding 1.0 to DD's score, EE is expected on the negative
     strand.
     """
     transcripts = read_first_locus("resolve_strand_scores3.gtf",
                                    score_attr="FPKM")
     tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)
     partition_transcripts_by_strand(transcripts)
     # check strands
     initial = [('CC', POS_STRAND), ('DD', NEG_STRAND), ('EE', POS_STRAND)]
     for transcript_id, strand in initial:
         self.assertEqual(tdict[transcript_id].strand, strand)
     # increase score on negative strand (on a freshly loaded locus)
     transcripts = read_first_locus("resolve_strand_scores3.gtf",
                                    score_attr="FPKM")
     tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)
     tdict['DD'].score += 1.0
     partition_transcripts_by_strand(transcripts)
     # check strands
     self.assertEqual(tdict['EE'].strand, NEG_STRAND)
Beispiel #11
0
 def test_find_best_match(self):
     """Check the reference chosen by annotate_locus for T1 and T2.

     Reads the first locus of ``annotate_best_match1.gtf`` and compares
     category, reference id, intron ratio and coverage ratio (the ratios to
     two decimal places).
     """
     transcripts = read_first_locus("annotate_best_match1.gtf")
     t_dict = dict((t.attrs['transcript_id'], t) for t in transcripts)
     annotate_locus(transcripts, gtf_sample_attr="sample_id")
     # (transcript_id, CATEGORY, ANN_REF_ID, ANN_INTRON_RATIO, ANN_COV_RATIO)
     expected = [('T1', 0, 'D', 1.0, 0.9375),
                 ('T2', 0, 'B', 0.25, 0.6744)]
     for transcript_id, category, ref_id, intron_ratio, cov_ratio in expected:
         attrs = t_dict[transcript_id].attrs
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(attrs[GTFAttr.CATEGORY], category)
         self.assertEqual(attrs[GTFAttr.ANN_REF_ID], ref_id)
         self.assertAlmostEqual(attrs[GTFAttr.ANN_INTRON_RATIO], intron_ratio, 2)
         self.assertAlmostEqual(attrs[GTFAttr.ANN_COV_RATIO], cov_ratio, 2)
 def test_find_best_match(self):
     """Check the reference chosen by annotate_locus for T1 and T2.

     Reads the first locus of ``annotate_best_match1.gtf`` and compares
     category, reference id, intron ratio and coverage ratio (the ratios to
     two decimal places).
     """
     transcripts = read_first_locus("annotate_best_match1.gtf")
     t_dict = dict((t.attrs["transcript_id"], t) for t in transcripts)
     annotate_locus(transcripts, gtf_sample_attr="sample_id")
     # (transcript_id, CATEGORY, ANN_REF_ID, ANN_INTRON_RATIO, ANN_COV_RATIO)
     expected = [
         ("T1", 0, "D", 1.0, 0.9375),
         ("T2", 0, "B", 0.25, 0.6744),
     ]
     for transcript_id, category, ref_id, intron_ratio, cov_ratio in expected:
         attrs = t_dict[transcript_id].attrs
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(attrs[GTFAttr.CATEGORY], category)
         self.assertEqual(attrs[GTFAttr.ANN_REF_ID], ref_id)
         self.assertAlmostEqual(attrs[GTFAttr.ANN_INTRON_RATIO], intron_ratio, 2)
         self.assertAlmostEqual(attrs[GTFAttr.ANN_COV_RATIO], cov_ratio, 2)
 def test_intergenic(self):
     """Check the annotation of transcripts T1 and F.

     After annotate_locus runs on the first locus of
     ``annotate_intergenic1.gtf``, both transcripts are expected to carry
     category 6, reference id "na", zero intron and coverage ratios, a mean
     recurrence of 2.0 and a mean score of 20.0.
     """
     transcripts = read_first_locus("annotate_intergenic1.gtf")
     t_dict = dict((t.attrs["transcript_id"], t) for t in transcripts)
     annotate_locus(transcripts, gtf_sample_attr="sample_id")
     # both transcripts share exactly the same expectations
     for transcript_id in ("T1", "F"):
         attrs = t_dict[transcript_id].attrs
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(attrs[GTFAttr.CATEGORY], 6)
         self.assertEqual(attrs[GTFAttr.ANN_REF_ID], "na")
         self.assertAlmostEqual(attrs[GTFAttr.ANN_INTRON_RATIO], 0.0, 2)
         self.assertAlmostEqual(attrs[GTFAttr.ANN_COV_RATIO], 0.0, 2)
         self.assertAlmostEqual(attrs[GTFAttr.MEAN_RECURRENCE], 2.0, 2)
         self.assertAlmostEqual(attrs[GTFAttr.MEAN_SCORE], 20.0, 2)
Beispiel #14
0
 def test_intergenic(self):
     """Check the annotation of transcripts T1 and F.

     After annotate_locus runs on the first locus of
     ``annotate_intergenic1.gtf``, both transcripts are expected to carry
     category 6, reference id 'na', zero intron and coverage ratios, a mean
     recurrence of 2.0 and a mean score of 20.0.
     """
     transcripts = read_first_locus("annotate_intergenic1.gtf")
     t_dict = dict((t.attrs['transcript_id'], t) for t in transcripts)
     annotate_locus(transcripts, gtf_sample_attr="sample_id")
     # both transcripts share exactly the same expectations
     for transcript_id in ('T1', 'F'):
         attrs = t_dict[transcript_id].attrs
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(attrs[GTFAttr.CATEGORY], 6)
         self.assertEqual(attrs[GTFAttr.ANN_REF_ID], 'na')
         self.assertAlmostEqual(attrs[GTFAttr.ANN_INTRON_RATIO], 0.0, 2)
         self.assertAlmostEqual(attrs[GTFAttr.ANN_COV_RATIO], 0.0, 2)
         self.assertAlmostEqual(attrs[GTFAttr.MEAN_RECURRENCE], 2.0, 2)
         self.assertAlmostEqual(attrs[GTFAttr.MEAN_SCORE], 20.0, 2)
 def test_resolve_strand_ref(self):
     """Check the strands of T1..T5 after partition_transcripts_by_strand.

     Transcripts come from the first locus of ``resolve_strand_ref1.gtf``
     with scores read from the ``FPKM`` attribute.
     """
     transcripts = read_first_locus("resolve_strand_ref1.gtf", score_attr="FPKM")
     tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)
     partition_transcripts_by_strand(transcripts)
     # check resolved strands
     expected = [
         ("T1", POS_STRAND),
         ("T2", NEG_STRAND),
         # equal overlap on both strands, default to positive
         ("T3", POS_STRAND),
         # more positive strand overlap
         ("T4", POS_STRAND),
         # more negative strand overlap
         ("T5", NEG_STRAND),
     ]
     for transcript_id, strand in expected:
         # assertEqual reports the actual strand on failure
         self.assertEqual(tdict[transcript_id].strand, strand)
Beispiel #16
0
 def test_resolve_strand_ref(self):
     """Check the strands of T1..T5 after partition_transcripts_by_strand.

     Transcripts come from the first locus of ``resolve_strand_ref1.gtf``
     with scores read from the ``FPKM`` attribute.
     """
     transcripts = read_first_locus("resolve_strand_ref1.gtf",
                                    score_attr="FPKM")
     tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)
     partition_transcripts_by_strand(transcripts)
     # check resolved strands
     expected = [
         ("T1", POS_STRAND),
         ("T2", NEG_STRAND),
         # equal overlap on both strands, default to positive
         ("T3", POS_STRAND),
         # more positive strand overlap
         ("T4", POS_STRAND),
         # more negative strand overlap
         ("T5", NEG_STRAND),
     ]
     for transcript_id, strand in expected:
         # assertEqual reports the actual strand on failure
         self.assertEqual(tdict[transcript_id].strand, strand)
 def test_categories(self):
     """Check categories and match statistics assigned by annotate_locus.

     Reads the first locus of ``annotate_category1.gtf``.  T2-T6 are checked
     for their category only; T7-T10 are also checked for reference id and
     ratio attributes (ratios compared to two decimal places).
     """
     transcripts = read_first_locus("annotate_category1.gtf")
     t_dict = dict((t.attrs["transcript_id"], t) for t in transcripts)
     annotate_locus(transcripts, gtf_sample_attr="sample_id")
     category_only = [
         ("T2", 2),  # intronic same strand
         ("T3", 3),  # intronic opposite strand
         ("T6", 4),  # intronic ambiguous
         ("T4", 5),  # interleaving
         ("T5", 3),  # interleaving
     ]
     for transcript_id, category in category_only:
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(t_dict[transcript_id].attrs[GTFAttr.CATEGORY],
                          category)
     # opp strand overlap (no introns)
     attrs = t_dict["T7"].attrs
     self.assertEqual(attrs[GTFAttr.CATEGORY], 1)
     self.assertEqual(attrs[GTFAttr.ANN_REF_ID], "T1")
     self.assertAlmostEqual(attrs[GTFAttr.ANN_COV_RATIO], 0.2, 2)
     # same-strand overlaps: no introns (T8), with introns (T9, T10)
     same_strand = [
         ("T8", 0.0, 0.10),
         ("T9", 0.4, 2.0 / 3),
         ("T10", 0.5, 0.6),
     ]
     for transcript_id, intron_ratio, cov_ratio in same_strand:
         attrs = t_dict[transcript_id].attrs
         self.assertEqual(attrs[GTFAttr.CATEGORY], 0)
         self.assertEqual(attrs[GTFAttr.ANN_REF_ID], "T1")
         self.assertAlmostEqual(attrs[GTFAttr.ANN_INTRON_RATIO], intron_ratio, 2)
         self.assertAlmostEqual(attrs[GTFAttr.ANN_COV_RATIO], cov_ratio, 2)
Beispiel #18
0
 def test_categories(self):
     """Check categories and match statistics assigned by annotate_locus.

     Reads the first locus of ``annotate_category1.gtf``.  T2-T6 are checked
     for their category only; T7-T10 are also checked for reference id and
     ratio attributes (ratios compared to two decimal places).
     """
     transcripts = read_first_locus("annotate_category1.gtf")
     t_dict = dict((t.attrs['transcript_id'], t) for t in transcripts)
     annotate_locus(transcripts, gtf_sample_attr="sample_id")
     category_only = [
         ('T2', 2),  # intronic same strand
         ('T3', 3),  # intronic opposite strand
         ('T6', 4),  # intronic ambiguous
         ('T4', 5),  # interleaving
         ('T5', 3),  # interleaving
     ]
     for transcript_id, category in category_only:
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(t_dict[transcript_id].attrs[GTFAttr.CATEGORY],
                          category)
     # opp strand overlap (no introns)
     attrs = t_dict['T7'].attrs
     self.assertEqual(attrs[GTFAttr.CATEGORY], 1)
     self.assertEqual(attrs[GTFAttr.ANN_REF_ID], 'T1')
     self.assertAlmostEqual(attrs[GTFAttr.ANN_COV_RATIO], 0.2, 2)
     # same-strand overlaps: no introns (T8), with introns (T9, T10)
     same_strand = [
         ('T8', 0.0, 0.10),
         ('T9', 0.4, 2. / 3),
         ('T10', 0.5, 0.6),
     ]
     for transcript_id, intron_ratio, cov_ratio in same_strand:
         attrs = t_dict[transcript_id].attrs
         self.assertEqual(attrs[GTFAttr.CATEGORY], 0)
         self.assertEqual(attrs[GTFAttr.ANN_REF_ID], 'T1')
         self.assertAlmostEqual(attrs[GTFAttr.ANN_INTRON_RATIO], intron_ratio, 2)
         self.assertAlmostEqual(attrs[GTFAttr.ANN_COV_RATIO], cov_ratio, 2)
Beispiel #19
0
 def test_assembler1(self):
     """Assemble the first locus of ``assemble1.gtf`` under two score sets.

     Graphs are built with create_transcript_graphs using zero for every
     trimming parameter.  With ``user_kmax=2`` and scores 2/1/1/1 the test
     expects two paths (ABCDE scored 3.0 and ACE scored 2.0).  With
     ``user_kmax=3`` and scores 4/3/2/1 it expects all four paths back with
     their own scores.
     """
     # expected paths, named after the exons (A-E) they visit
     A, B, C, D, E = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                      Exon(600, 700), Exon(800, 900))
     PATH_ABCDE = (A, B, C, D, E)
     PATH_ACE = (A, C, E)
     PATH_ABCE = (A, B, C, E)
     PATH_ACDE = (A, C, D, E)
     # read transcripts
     transcripts = read_first_locus("assemble1.gtf", score_attr="score")
     tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)

     def assemble(scores, kmax):
         # set transcript scores, rebuild the graphs, assemble the first one
         for transcript_id, score in scores:
             tdict[transcript_id].score = score
         graphs = create_transcript_graphs('chr1', transcripts,
                                           create_bedgraph=False,
                                           bedgraph_filehs=None,
                                           min_trim_length=0,
                                           trim_utr_fraction=0.0,
                                           trim_intron_fraction=0.0)
         first = graphs[0]
         return list(assemble_transcript_graph(first.Gsub, first.strand,
                                               first.partial_paths,
                                               user_kmax=kmax,
                                               ksensitivity=0,
                                               fraction_major_path=0,
                                               max_paths=1000))

     def check(results, expected):
         # results must match the expected (path, score) pairs in order
         self.assertEqual(len(results), len(expected))
         for result, (path, score) in zip(results, expected):
             self.assertEqual(tuple(result.path), path)
             self.assertAlmostEqual(result.score, score, places=3)

     # assemble with kmax=2
     results = assemble([("ABCDE", 2.0), ("ACE", 1.0),
                         ("ABCE", 1.0), ("ACDE", 1.0)], kmax=2)
     check(results, [(PATH_ABCDE, 3.0), (PATH_ACE, 2.0)])
     # change transcript scores and assemble with kmax=3
     results = assemble([("ABCDE", 4.0), ("ACE", 3.0),
                         ("ABCE", 2.0), ("ACDE", 1.0)], kmax=3)
     check(results, [(PATH_ABCDE, 4.0), (PATH_ACE, 3.0),
                     (PATH_ABCE, 2.0), (PATH_ACDE, 1.0)])
Beispiel #20
0
 def test_trim_intronic_utr(self):
     """Check trim_graph output as trim_intron_fraction increases, per strand.

     The same expectations are checked on the positive-strand graph of
     ``trim_intron_utr1.gtf`` and again after every transcript has been
     reassigned to the negative strand.  ``min_trim_length`` is 0 and
     ``trim_utr_fraction`` is 0.0 throughout.
     """
     transcripts = read_first_locus("trim_intron_utr1.gtf", score_attr="FPKM")
     # (trim_intron_fraction, nodes expected to be trimmed)
     cases = [(0.001, set()),
              (0.011, set([Exon(1000, 1100)])),
              (0.055, set([Exon(1000, 1100), Exon(1100, 1200)])),
              (0.15, set([Exon(1000, 1100), Exon(1100, 1200),
                          Exon(1200, 1300)])),
              (1.0, set([Exon(1000, 1100), Exon(1100, 1200),
                         Exon(1200, 1300), Exon(1300, 1500)]))]

     def check_strand(strand):
         # rebuild the graphs and trim at different thresholds
         G, _ = get_transcript_graphs(transcripts)[strand]
         for fraction, correct in cases:
             trim_nodes = trim_graph(G, strand,
                                     min_trim_length=0,
                                     trim_utr_fraction=0.0,
                                     trim_intron_fraction=fraction)
             # assertEqual shows the differing nodes on failure
             self.assertEqual(trim_nodes, correct)

     check_strand(POS_STRAND)
     # flip sign of transcripts and try again
     for t in transcripts:
         t.strand = NEG_STRAND
     check_strand(NEG_STRAND)
Beispiel #21
0
 def test_trim_intron_bidir(self):
     """Check trim_graph output as trim_intron_fraction increases, per strand.

     The same expectations are checked on the positive-strand graph of
     ``trim_intron_bidir1.gtf`` and again after every transcript has been
     reassigned to the negative strand.  ``min_trim_length`` is 0 and
     ``trim_utr_fraction`` is 0.0 throughout.
     """
     transcripts = read_first_locus("trim_intron_bidir1.gtf",
                                    score_attr="FPKM")
     # (trim_intron_fraction, nodes expected to be trimmed)
     cases = [
         (0.001, set()),
         (0.025, set([Exon(1900, 2000), Exon(1000, 1100)])),
         (0.2, set([
             Exon(1900, 2000),
             Exon(1100, 1200),
             Exon(1800, 1900),
             Exon(1000, 1100)
         ])),
         (0.25, set([
             Exon(1900, 2000),
             Exon(1100, 1200),
             Exon(1200, 1300),
             Exon(1700, 1800),
             Exon(1800, 1900),
             Exon(1000, 1100)
         ])),
     ]

     def check_strand(strand):
         # rebuild the graphs and trim at different thresholds
         G, _ = get_transcript_graphs(transcripts)[strand]
         for fraction, correct in cases:
             trim_nodes = trim_graph(G,
                                     strand,
                                     min_trim_length=0,
                                     trim_utr_fraction=0.0,
                                     trim_intron_fraction=fraction)
             # assertEqual shows the differing nodes on failure
             self.assertEqual(trim_nodes, correct)

     check_strand(POS_STRAND)
     # flip sign of transcripts and try again
     for t in transcripts:
         t.strand = NEG_STRAND
     check_strand(NEG_STRAND)
Beispiel #22
0
 def test_assembler1(self):
     """Check assembled paths and scores for the "assemble1.gtf" locus.

     The transcripts ABCDE, ACE, ABCE and ACDE are scored and assembled
     twice.  With ``user_kmax=2`` two results are expected (ABCDE at 3.0
     and ACE at 2.0).  After rescoring, ``user_kmax=3`` is expected to
     yield all four paths, each with the score assigned to its transcript.
     """
     # expected exon paths
     PATH_ABCDE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                   Exon(600, 700), Exon(800, 900))
     PATH_ACE = (Exon(0, 100), Exon(400, 500), Exon(800, 900))
     PATH_ABCE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                  Exon(800, 900))
     PATH_ACDE = (Exon(0, 100), Exon(400, 500), Exon(600, 700),
                  Exon(800, 900))
     # read transcripts and index them by transcript id
     transcripts = read_first_locus("assemble1.gtf", score_attr="score")
     tdict = {t.attrs['transcript_id']: t for t in transcripts}

     def assemble(scores, kmax):
         # Apply the scores first: the graphs are rebuilt from the
         # transcripts on every call, so they must see the new values.
         for transcript_id, score in scores:
             tdict[transcript_id].score = score
         # every trim parameter is set to zero for this test
         GS = create_transcript_graphs('chr1',
                                       transcripts,
                                       create_bedgraph=False,
                                       bedgraph_filehs=None,
                                       min_trim_length=0,
                                       trim_utr_fraction=0.0,
                                       trim_intron_fraction=0.0)
         # only the first graph is assembled
         first = GS[0]
         return list(
             assemble_transcript_graph(first.Gsub,
                                       first.strand,
                                       first.partial_paths,
                                       user_kmax=kmax,
                                       ksensitivity=0,
                                       fraction_major_path=0,
                                       max_paths=1000))

     def check(results, expected):
         # assertEqual replaces the assertEquals alias, which was
         # deprecated and then removed from unittest in Python 3.12.
         self.assertEqual(len(results), len(expected))
         for result, (path, score) in zip(results, expected):
             self.assertEqual(tuple(result.path), path)
             self.assertAlmostEqual(result.score, score, places=3)

     # assemble with kmax=2
     results = assemble([("ABCDE", 2.0), ("ACE", 1.0),
                         ("ABCE", 1.0), ("ACDE", 1.0)], kmax=2)
     check(results, [(PATH_ABCDE, 3.0), (PATH_ACE, 2.0)])
     # change transcript scores and assemble with kmax=3
     results = assemble([("ABCDE", 4.0), ("ACE", 3.0),
                         ("ABCE", 2.0), ("ACDE", 1.0)], kmax=3)
     check(results, [(PATH_ABCDE, 4.0), (PATH_ACE, 3.0),
                     (PATH_ABCE, 2.0), (PATH_ACDE, 1.0)])
 def test_assembler1(self):
     """Check assembly of the positive-strand graph of "assemble1.gtf".

     The transcript graph is built once via ``get_transcript_graphs``;
     scores are then set on the transcript map and the graph is pruned
     and assembled twice.  ``user_kmax=2`` is expected to give two paths
     (ABCDE at 3.0, ACE at 2.0); after rescoring, ``user_kmax=3`` is
     expected to give all four paths with their own transcript scores.
     """
     # setup correct transcripts
     PATH_ABCDE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                   Exon(600, 700), Exon(800, 900))
     PATH_ACE = (Exon(0, 100), Exon(400, 500), Exon(800, 900))
     PATH_ABCE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                  Exon(800, 900))
     PATH_ACDE = (Exon(0, 100), Exon(400, 500), Exon(600, 700),
                  Exon(800, 900))
     # read transcripts and take the positive-strand graph
     transcripts = read_first_locus("assemble1.gtf", score_attr="score")
     GG = get_transcript_graphs(transcripts)
     G, tmap = GG[POS_STRAND]

     def run(kmax, **scores):
         # set transcript scores before pruning the graph
         for transcript_id, score in scores.items():
             tmap[transcript_id].score = score
         # all trim parameters are zero for this test
         GS = list(
             prune_transcript_graph(G,
                                    POS_STRAND,
                                    tmap,
                                    min_trim_length=0,
                                    trim_utr_fraction=0,
                                    trim_intron_fraction=0))
         # only the first pruned subgraph is assembled
         Gsub, strand, partial_paths = GS[0]
         return list(
             assemble_transcript_graph(Gsub,
                                       strand,
                                       partial_paths,
                                       user_kmax=kmax,
                                       ksensitivity=0,
                                       fraction_major_path=0,
                                       max_paths=1000))

     def verify(results, *expected):
         # assertEqual is used instead of the assertEquals alias, which
         # was deprecated and removed from unittest in Python 3.12.
         self.assertEqual(len(results), len(expected))
         for index, (path, score) in enumerate(expected):
             self.assertEqual(tuple(results[index].path), path)
             self.assertAlmostEqual(results[index].score, score, places=3)

     # first pass: kmax = 2
     results = run(2, ABCDE=2.0, ACE=1.0, ABCE=1.0, ACDE=1.0)
     verify(results, (PATH_ABCDE, 3.0), (PATH_ACE, 2.0))
     # second pass: new scores, kmax = 3
     results = run(3, ABCDE=4.0, ACE=3.0, ABCE=2.0, ACDE=1.0)
     verify(results, (PATH_ABCDE, 4.0), (PATH_ACE, 3.0),
            (PATH_ABCE, 2.0), (PATH_ACDE, 1.0))