Ejemplo n.º 1
0
 def test_assembler1(self):
     """Assemble the first locus of assemble1.gtf with kmax=2, then kmax=3.

     Each round sets the transcript scores, prunes the POS_STRAND graph,
     runs the assembler and checks the number, order, exon paths and
     scores of the returned results.
     """
     # expected exon paths (each name lists the exons the path contains)
     PATH_ABCDE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                   Exon(600, 700), Exon(800, 900))
     PATH_ACE = (Exon(0, 100), Exon(400, 500), Exon(800, 900))
     PATH_ABCE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                  Exon(800, 900))
     PATH_ACDE = (Exon(0, 100), Exon(400, 500), Exon(600, 700),
                  Exon(800, 900))
     # read transcripts and build the graph for POS_STRAND
     transcripts = read_first_locus("assemble1.gtf", score_attr="score")
     GG = get_transcript_graphs(transcripts)
     G, tmap = GG[POS_STRAND]

     # ---- round 1: kmax = 2 ----
     tmap["ABCDE"].score = 2.0
     tmap["ACE"].score = 1.0
     tmap["ABCE"].score = 1.0
     tmap["ACDE"].score = 1.0
     kmax = 2
     # all trimming parameters are zero for this test
     GS = list(prune_transcript_graph(G, POS_STRAND, tmap,
                                      min_trim_length=0,
                                      trim_utr_fraction=0,
                                      trim_intron_fraction=0))
     Gsub, strand, partial_paths = GS[0]
     results = list(assemble_transcript_graph(Gsub, strand, partial_paths,
                                              user_kmax=kmax,
                                              ksensitivity=0,
                                              fraction_major_path=0,
                                              max_paths=1000))
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(len(results), 2)
     self.assertEqual(tuple(results[0].path), PATH_ABCDE)
     self.assertAlmostEqual(results[0].score, 3.0, places=3)
     self.assertEqual(tuple(results[1].path), PATH_ACE)
     self.assertAlmostEqual(results[1].score, 2.0, places=3)

     # ---- round 2: kmax = 3 with new scores ----
     tmap["ABCDE"].score = 4.0
     tmap["ACE"].score = 3.0
     tmap["ABCE"].score = 2.0
     tmap["ACDE"].score = 1.0
     kmax = 3
     GS = list(prune_transcript_graph(G, POS_STRAND, tmap,
                                      min_trim_length=0,
                                      trim_utr_fraction=0,
                                      trim_intron_fraction=0))
     Gsub, strand, partial_paths = GS[0]
     results = list(assemble_transcript_graph(Gsub, strand, partial_paths,
                                              user_kmax=kmax,
                                              ksensitivity=0,
                                              fraction_major_path=0,
                                              max_paths=1000))
     # every input transcript is expected back with its own score
     self.assertEqual(len(results), 4)
     self.assertEqual(tuple(results[0].path), PATH_ABCDE)
     self.assertAlmostEqual(results[0].score, 4.0, places=3)
     self.assertEqual(tuple(results[1].path), PATH_ACE)
     self.assertAlmostEqual(results[1].score, 3.0, places=3)
     self.assertEqual(tuple(results[2].path), PATH_ABCE)
     self.assertAlmostEqual(results[2].score, 2.0, places=3)
     self.assertEqual(tuple(results[3].path), PATH_ACDE)
     self.assertAlmostEqual(results[3].score, 1.0, places=3)
def assemble_gene(
    locus_chrom,
    locus_id_str,
    gene_id_value_obj,
    tss_id_value_obj,
    t_id_value_obj,
    G,
    strand,
    partial_paths,
    config,
    gtf_fileh,
    bed_fileh,
):
    """Assemble transcripts from one graph and write them as GTF and/or BED.

    Runs ``assemble_transcript_graph`` on ``G``, annotates the resulting
    paths with gene and TSS ids, groups them by gene id and, for every
    path, writes its GTF features to ``gtf_fileh`` (when
    ``config.create_gtf`` is set) and one BED line to ``bed_fileh`` (when
    ``config.create_bed`` is set).

    Args:
        locus_chrom: chromosome name passed through to the GTF/BED writers.
        locus_id_str: locus id string stored on each GTF feature.
        gene_id_value_obj: passed to ``annotate_gene_and_tss_ids``.
        tss_id_value_obj: passed to ``annotate_gene_and_tss_ids``.
        t_id_value_obj: object whose ``next()`` method yields the next
            transcript id (formatted with ``%d``).
        G: transcript graph handed to the assembler.
        strand: strand handed to the assembler and the writers.
        partial_paths: partial paths handed to the assembler.
        config: object providing ``kmax``, ``ksensitivity``,
            ``fraction_major_isoform``, ``max_paths``, ``create_gtf`` and
            ``create_bed``.
        gtf_fileh: writable file object for GTF output.
        bed_fileh: writable file object for BED output.
    """
    # run assembly algorithm
    path_info_list = assemble_transcript_graph(
        G,
        strand,
        partial_paths,
        config.kmax,
        config.ksensitivity,
        config.fraction_major_isoform,
        config.max_paths,
    )
    # lazy %-args: the message is only formatted if DEBUG is enabled
    logging.debug("\tAssembled %d transcript(s)", len(path_info_list))
    # determine gene ids and tss ids
    annotate_gene_and_tss_ids(path_info_list, strand, gene_id_value_obj, tss_id_value_obj)
    # bin transcripts by gene id
    gene_path_info_dict = collections.defaultdict(list)
    for p in path_info_list:
        gene_path_info_dict[p.gene_id].append(p)
    # .values() instead of .itervalues() so this also runs on Python 3
    for gene_path_info_list in gene_path_info_dict.values():
        # highest scoring path is assumed to be first in the list; the
        # 1e-8 floor avoids division by zero and float() avoids integer
        # (truncating) division on Python 2 when scores are ints
        highest_score = float(max(1e-8, gene_path_info_list[0].score))
        # create GTF features for each transcript path
        for p in gene_path_info_list:
            # assign transcript id (explicit .next() call kept: the object
            # is not necessarily a Python iterator)
            t_id = t_id_value_obj.next()
            # get strings for each id
            t_id_str = "TU%d" % t_id
            tss_id_str = "TSS%d" % (p.tss_id)
            gene_id_str = "G%d" % (p.gene_id)
            # isoform fraction relative to the gene's first (top) path
            frac = p.score / highest_score
            # write to GTF
            if config.create_gtf:
                for f in get_gtf_features(
                    locus_chrom,
                    strand,
                    p.path,
                    locus_id=locus_id_str,
                    gene_id=gene_id_str,
                    tss_id=tss_id_str,
                    transcript_id=t_id_str,
                    score=p.score,
                    frac=frac,
                ):
                    # .write() replaces the Python-2-only ``print >>`` form
                    gtf_fileh.write(str(f) + "\n")
            # write to BED
            if config.create_bed:
                name = "%s|%s(%.1f)" % (gene_id_str, t_id_str, p.score)
                # BED score column is the isoform fraction scaled to 0..1000
                fields = write_bed(locus_chrom, name, strand, int(round(1000.0 * frac)), p.path)
                bed_fileh.write("\t".join(fields) + "\n")
def assemble_gene(locus_chrom, locus_id_str,
                  gene_id_value_obj, tss_id_value_obj, t_id_value_obj,
                  G, strand, partial_paths,
                  config, gtf_fileh, bed_fileh):
    """Assemble one transcript graph and emit the result as GTF and/or BED.

    The assembled paths are annotated with gene and TSS ids, grouped by
    gene id, and each path is written to ``gtf_fileh`` if
    ``config.create_gtf`` is true and to ``bed_fileh`` if
    ``config.create_bed`` is true. Each path's isoform fraction is its
    score divided by the score of the first path of its gene.

    ``config`` must provide ``kmax``, ``ksensitivity``,
    ``fraction_major_isoform``, ``max_paths``, ``create_gtf`` and
    ``create_bed``; ``t_id_value_obj`` must provide a ``next()`` method
    returning the next integer transcript id. Nothing is returned.
    """
    # run assembly algorithm
    path_info_list = assemble_transcript_graph(G, strand, partial_paths,
                                               config.kmax,
                                               config.ksensitivity,
                                               config.fraction_major_isoform,
                                               config.max_paths)
    # let logging do the formatting (skipped when DEBUG is disabled)
    logging.debug("\tAssembled %d transcript(s)", len(path_info_list))
    # determine gene ids and tss ids
    annotate_gene_and_tss_ids(path_info_list, strand,
                              gene_id_value_obj,
                              tss_id_value_obj)
    # bin transcripts by gene id
    paths_by_gene = collections.defaultdict(list)
    for p in path_info_list:
        paths_by_gene[p.gene_id].append(p)
    # values() works on both Python 2 and 3 (itervalues() is Python 2 only)
    for gene_paths in paths_by_gene.values():
        # the first path of a gene is taken as its highest scoring one;
        # floor at 1e-8 to avoid dividing by zero, and force a float so
        # integer scores do not trigger truncating division on Python 2
        highest_score = float(max(1e-8, gene_paths[0].score))
        for p in gene_paths:
            # assign transcript id; .next() is called explicitly because
            # the id object may not be a real iterator
            t_id = t_id_value_obj.next()
            # get strings for each id
            t_id_str = "TU%d" % t_id
            tss_id_str = "TSS%d" % (p.tss_id)
            gene_id_str = "G%d" % (p.gene_id)
            # compute isoform fraction
            frac = p.score / highest_score
            # write to GTF
            if config.create_gtf:
                for f in get_gtf_features(locus_chrom, strand, p.path,
                                          locus_id=locus_id_str,
                                          gene_id=gene_id_str,
                                          tss_id=tss_id_str,
                                          transcript_id=t_id_str,
                                          score=p.score,
                                          frac=frac):
                    # portable replacement for ``print >>gtf_fileh``
                    gtf_fileh.write(str(f) + "\n")
            # write to BED
            if config.create_bed:
                name = "%s|%s(%.1f)" % (gene_id_str, t_id_str, p.score)
                # fraction is scaled to an integer in 0..1000 for BED
                fields = write_bed(locus_chrom, name, strand,
                                   int(round(1000.0 * frac)), p.path)
                bed_fileh.write('\t'.join(fields) + "\n")
Ejemplo n.º 4
0
 def test_assembler1(self):
     """Check assembler output for assemble1.gtf at kmax=2 and kmax=3.

     For each kmax the transcript scores are set, the POS_STRAND graph is
     pruned and assembled, and the results are compared, in order, with
     a table of expected (path, score) pairs.
     """
     # expected exon paths (each name lists the exons the path contains)
     PATH_ABCDE = (
         Exon(0, 100),
         Exon(200, 300),
         Exon(400, 500),
         Exon(600, 700),
         Exon(800, 900),
     )
     PATH_ACE = (Exon(0, 100), Exon(400, 500), Exon(800, 900))
     PATH_ABCE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                  Exon(800, 900))
     PATH_ACDE = (Exon(0, 100), Exon(400, 500), Exon(600, 700),
                  Exon(800, 900))
     # read transcripts
     transcripts = read_first_locus("assemble1.gtf", score_attr="score")
     GG = get_transcript_graphs(transcripts)
     G, tmap = GG[POS_STRAND]

     # ---- kmax = 2 ----
     tmap["ABCDE"].score = 2.0
     tmap["ACE"].score = 1.0
     tmap["ABCE"].score = 1.0
     tmap["ACDE"].score = 1.0
     kmax = 2
     GS = list(
         prune_transcript_graph(G,
                                POS_STRAND,
                                tmap,
                                min_trim_length=0,
                                trim_utr_fraction=0,
                                trim_intron_fraction=0))
     Gsub, strand, partial_paths = GS[0]
     results = list(
         assemble_transcript_graph(Gsub,
                                   strand,
                                   partial_paths,
                                   user_kmax=kmax,
                                   ksensitivity=0,
                                   fraction_major_path=0,
                                   max_paths=1000))
     expected = [(PATH_ABCDE, 3.0), (PATH_ACE, 2.0)]
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(len(results), 2)
     for result, (path, score) in zip(results, expected):
         self.assertEqual(tuple(result.path), path)
         self.assertAlmostEqual(result.score, score, places=3)

     # ---- kmax = 3, new scores ----
     tmap["ABCDE"].score = 4.0
     tmap["ACE"].score = 3.0
     tmap["ABCE"].score = 2.0
     tmap["ACDE"].score = 1.0
     kmax = 3
     GS = list(
         prune_transcript_graph(G,
                                POS_STRAND,
                                tmap,
                                min_trim_length=0,
                                trim_utr_fraction=0,
                                trim_intron_fraction=0))
     Gsub, strand, partial_paths = GS[0]
     results = list(
         assemble_transcript_graph(Gsub,
                                   strand,
                                   partial_paths,
                                   user_kmax=kmax,
                                   ksensitivity=0,
                                   fraction_major_path=0,
                                   max_paths=1000))
     expected = [(PATH_ABCDE, 4.0), (PATH_ACE, 3.0), (PATH_ABCE, 2.0),
                 (PATH_ACDE, 1.0)]
     self.assertEqual(len(results), 4)
     for result, (path, score) in zip(results, expected):
         self.assertEqual(tuple(result.path), path)
         self.assertAlmostEqual(result.score, score, places=3)
Ejemplo n.º 5
0
 def test_assembler1(self):
     """Assemble the first locus of assemble1.gtf with kmax=2, then kmax=3.

     Transcript scores are set on the transcript objects, the graphs are
     created with ``create_transcript_graphs`` (all trim parameters zero),
     and the first graph is assembled. The number, order, exon paths and
     scores of the results are then checked.
     """
     # expected exon paths (each name lists the exons the path contains)
     PATH_ABCDE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                   Exon(600, 700), Exon(800, 900))
     PATH_ACE = (Exon(0, 100), Exon(400, 500), Exon(800, 900))
     PATH_ABCE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                  Exon(800, 900))
     PATH_ACDE = (Exon(0, 100), Exon(400, 500), Exon(600, 700),
                  Exon(800, 900))
     # read transcripts and index them by transcript id
     transcripts = read_first_locus("assemble1.gtf", score_attr="score")
     tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)

     # ---- round 1: kmax = 2 ----
     tdict["ABCDE"].score = 2.0
     tdict["ACE"].score = 1.0
     tdict["ABCE"].score = 1.0
     tdict["ACDE"].score = 1.0
     # create graphs
     GS = create_transcript_graphs('chr1', transcripts,
                                   create_bedgraph=False,
                                   bedgraph_filehs=None,
                                   min_trim_length=0,
                                   trim_utr_fraction=0.0,
                                   trim_intron_fraction=0.0)
     first = GS[0]
     Gsub, strand, partial_paths = first.Gsub, first.strand, first.partial_paths
     results = list(assemble_transcript_graph(Gsub, strand, partial_paths,
                                              user_kmax=2,
                                              ksensitivity=0,
                                              fraction_major_path=0,
                                              max_paths=1000))
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(len(results), 2)
     self.assertEqual(tuple(results[0].path), PATH_ABCDE)
     self.assertAlmostEqual(results[0].score, 3.0, places=3)
     self.assertEqual(tuple(results[1].path), PATH_ACE)
     self.assertAlmostEqual(results[1].score, 2.0, places=3)

     # ---- round 2: kmax = 3 with new scores ----
     tdict["ABCDE"].score = 4.0
     tdict["ACE"].score = 3.0
     tdict["ABCE"].score = 2.0
     tdict["ACDE"].score = 1.0
     # graphs are created again from the re-scored transcripts
     GS = create_transcript_graphs('chr1', transcripts,
                                   create_bedgraph=False,
                                   bedgraph_filehs=None,
                                   min_trim_length=0,
                                   trim_utr_fraction=0.0,
                                   trim_intron_fraction=0.0)
     first = GS[0]
     Gsub, strand, partial_paths = first.Gsub, first.strand, first.partial_paths
     results = list(assemble_transcript_graph(Gsub, strand, partial_paths,
                                              user_kmax=3,
                                              ksensitivity=0,
                                              fraction_major_path=0,
                                              max_paths=1000))
     self.assertEqual(len(results), 4)
     self.assertEqual(tuple(results[0].path), PATH_ABCDE)
     self.assertAlmostEqual(results[0].score, 4.0, places=3)
     self.assertEqual(tuple(results[1].path), PATH_ACE)
     self.assertAlmostEqual(results[1].score, 3.0, places=3)
     self.assertEqual(tuple(results[2].path), PATH_ABCE)
     self.assertAlmostEqual(results[2].score, 2.0, places=3)
     self.assertEqual(tuple(results[3].path), PATH_ACDE)
     self.assertAlmostEqual(results[3].score, 1.0, places=3)
Ejemplo n.º 6
0
 def test_assembler1(self):
     """Check assembler output for assemble1.gtf at kmax=2 and kmax=3.

     For each kmax the transcript scores are set, the graphs are created
     from the transcripts, the first graph is assembled, and the results
     are compared, in order, with expected (path, score) pairs.
     """
     # expected exon paths (each name lists the exons the path contains)
     PATH_ABCDE = (
         Exon(0, 100),
         Exon(200, 300),
         Exon(400, 500),
         Exon(600, 700),
         Exon(800, 900),
     )
     PATH_ACE = (Exon(0, 100), Exon(400, 500), Exon(800, 900))
     PATH_ABCE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                  Exon(800, 900))
     PATH_ACDE = (Exon(0, 100), Exon(400, 500), Exon(600, 700),
                  Exon(800, 900))
     # read transcripts and index them by transcript id
     transcripts = read_first_locus("assemble1.gtf", score_attr="score")
     tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)

     def assemble(kmax):
         # create graphs from the transcripts (with their current scores)
         # and assemble the first one with the given kmax
         GS = create_transcript_graphs('chr1',
                                       transcripts,
                                       create_bedgraph=False,
                                       bedgraph_filehs=None,
                                       min_trim_length=0,
                                       trim_utr_fraction=0.0,
                                       trim_intron_fraction=0.0)
         first = GS[0]
         return list(
             assemble_transcript_graph(first.Gsub,
                                       first.strand,
                                       first.partial_paths,
                                       user_kmax=kmax,
                                       ksensitivity=0,
                                       fraction_major_path=0,
                                       max_paths=1000))

     def check(results, expected):
         # compare results with the expected (path, score) pairs in order;
         # assertEqual replaces the deprecated assertEquals alias
         self.assertEqual(len(results), len(expected))
         for result, (path, score) in zip(results, expected):
             self.assertEqual(tuple(result.path), path)
             self.assertAlmostEqual(result.score, score, places=3)

     # ---- kmax = 2 ----
     tdict["ABCDE"].score = 2.0
     tdict["ACE"].score = 1.0
     tdict["ABCE"].score = 1.0
     tdict["ACDE"].score = 1.0
     check(assemble(2), [(PATH_ABCDE, 3.0), (PATH_ACE, 2.0)])

     # ---- kmax = 3, new scores ----
     tdict["ABCDE"].score = 4.0
     tdict["ACE"].score = 3.0
     tdict["ABCE"].score = 2.0
     tdict["ACDE"].score = 1.0
     check(assemble(3), [(PATH_ABCDE, 4.0), (PATH_ACE, 3.0),
                         (PATH_ABCE, 2.0), (PATH_ACDE, 1.0)])