Ejemplo n.º 1
0
 def test_assembler1(self):
     """Check assembled paths and scores for the ``assemble1.gtf`` locus.

     The positive-strand graph is pruned and assembled twice with
     different transcript scores: once with ``kmax = 2`` (two results
     expected) and once with ``kmax = 3`` (four results expected).
     """
     # expected exon paths, named after the exons (A..E) they contain
     PATH_ABCDE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                   Exon(600, 700), Exon(800, 900))
     PATH_ACE = (Exon(0, 100), Exon(400, 500), Exon(800, 900))
     PATH_ABCE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                  Exon(800, 900))
     PATH_ACDE = (Exon(0, 100), Exon(400, 500), Exon(600, 700),
                  Exon(800, 900))
     # read transcripts and keep the positive-strand graph
     transcripts = read_first_locus("assemble1.gtf", score_attr="score")
     GG = get_transcript_graphs(transcripts)
     G, tmap = GG[POS_STRAND]

     # --- round 1: kmax = 2 -------------------------------------------
     tmap["ABCDE"].score = 2.0
     tmap["ACE"].score = 1.0
     tmap["ABCE"].score = 1.0
     tmap["ACDE"].score = 1.0
     kmax = 2
     # all trimming parameters are zero for this test
     GS = list(prune_transcript_graph(G, POS_STRAND, tmap,
                                      min_trim_length=0,
                                      trim_utr_fraction=0,
                                      trim_intron_fraction=0))
     # only the first pruned subgraph is examined
     Gsub, strand, partial_paths = GS[0]
     results = list(assemble_transcript_graph(Gsub, strand, partial_paths,
                                              user_kmax=kmax,
                                              ksensitivity=0,
                                              fraction_major_path=0,
                                              max_paths=1000))
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(len(results), 2)
     self.assertEqual(tuple(results[0].path), PATH_ABCDE)
     self.assertAlmostEqual(results[0].score, 3.0, places=3)
     self.assertEqual(tuple(results[1].path), PATH_ACE)
     self.assertAlmostEqual(results[1].score, 2.0, places=3)

     # --- round 2: kmax = 3 -------------------------------------------
     tmap["ABCDE"].score = 4.0
     tmap["ACE"].score = 3.0
     tmap["ABCE"].score = 2.0
     tmap["ACDE"].score = 1.0
     kmax = 3
     GS = list(prune_transcript_graph(G, POS_STRAND, tmap,
                                      min_trim_length=0,
                                      trim_utr_fraction=0,
                                      trim_intron_fraction=0))
     Gsub, strand, partial_paths = GS[0]
     results = list(assemble_transcript_graph(Gsub, strand, partial_paths,
                                              user_kmax=kmax,
                                              ksensitivity=0,
                                              fraction_major_path=0,
                                              max_paths=1000))
     # each result is expected to carry exactly its transcript's score
     self.assertEqual(len(results), 4)
     self.assertEqual(tuple(results[0].path), PATH_ABCDE)
     self.assertAlmostEqual(results[0].score, 4.0, places=3)
     self.assertEqual(tuple(results[1].path), PATH_ACE)
     self.assertAlmostEqual(results[1].score, 3.0, places=3)
     self.assertEqual(tuple(results[2].path), PATH_ABCE)
     self.assertAlmostEqual(results[2].score, 2.0, places=3)
     self.assertEqual(tuple(results[3].path), PATH_ACDE)
     self.assertAlmostEqual(results[3].score, 1.0, places=3)
def assemble_locus(
    transcripts,
    locus_id_value_obj,
    gene_id_value_obj,
    tss_id_value_obj,
    t_id_value_obj,
    config,
    gtf_fileh,
    bed_fileh,
    bedgraph_filehs,
):
    """Filter, graph, prune and assemble the transcripts of one locus.

    Args:
        transcripts: non-empty sequence of transcripts; the first element
            supplies the locus chromosome and start, and every element's
            ``end`` is consulted for the locus end.
        locus_id_value_obj: object whose ``next()`` returns the integer used
            to build the ``"L<n>"`` locus id.
        gene_id_value_obj, tss_id_value_obj, t_id_value_obj: passed through
            unchanged to ``assemble_gene``.
        config: settings object; reads ``min_transcript_length``, ``guided``,
            ``create_bedgraph``, ``min_trim_length``, ``trim_utr_fraction``
            and ``trim_intron_fraction`` here and is also forwarded to
            ``assemble_gene``.
        gtf_fileh, bed_fileh: output handles forwarded to ``assemble_gene``.
        bedgraph_filehs: container indexed by strand holding the handles
            that receive bedgraph lines when ``config.create_bedgraph`` is
            true.
    """

    def get_bedgraph_lines(chrom, G):
        # Yield (chrom, start, end, score) for each node in sorted order.
        for n in sorted(G.nodes()):
            # nodes with a negative start are skipped
            # (presumably placeholder nodes -- TODO confirm)
            if n.start < 0:
                continue
            yield (chrom, n.start, n.end, G.node[n][NODE_SCORE])

    # gather properties of locus (taken before filtering)
    locus_chrom = transcripts[0].chrom
    locus_start = transcripts[0].start
    locus_end = max(tx.end for tx in transcripts)
    # pass arguments to logging so formatting only happens when DEBUG is on
    logging.debug(
        "[LOCUS] %s:%d-%d %d transcripts",
        locus_chrom, locus_start, locus_end, len(transcripts),
    )
    locus_id_str = "L%d" % (locus_id_value_obj.next())
    # filter transcripts
    logging.debug("\tFiltering transcripts")
    transcripts = filter_transcripts(transcripts, config.min_transcript_length, config.guided)
    # build transcript graphs
    for G, strand, strand_transcripts in create_transcript_graphs(transcripts):
        # output bedgraph
        if config.create_bedgraph:
            for fields in get_bedgraph_lines(locus_chrom, G):
                # Python 2 print-chevron: writes one tab-separated line
                print >>bedgraph_filehs[strand], "\t".join(map(str, fields))
        # process transcript graphs; the strand yielded for each subgraph
        # gets its own name so it no longer rebinds the outer loop variable
        for Gsub, sub_strand, partial_paths in prune_transcript_graph(
            G, strand, strand_transcripts, config.min_trim_length, config.trim_utr_fraction, config.trim_intron_fraction
        ):
            logging.debug(
                "Subgraph %s:%d-%d(%s) %d nodes %d paths",
                locus_chrom, locus_start, locus_end,
                strand_int_to_str(sub_strand), len(Gsub), len(partial_paths),
            )
            # assemble subgraph
            assemble_gene(
                locus_chrom,
                locus_id_str,
                gene_id_value_obj,
                tss_id_value_obj,
                t_id_value_obj,
                Gsub,
                sub_strand,
                partial_paths,
                config,
                gtf_fileh,
                bed_fileh,
            )
def assemble_locus(transcripts,
                   locus_id_value_obj,
                   gene_id_value_obj,
                   tss_id_value_obj,
                   t_id_value_obj,
                   config,
                   gtf_fileh,
                   bed_fileh,
                   bedgraph_filehs):
    """Run the assembly pipeline over a single locus.

    The locus coordinates are read from the unfiltered ``transcripts``
    (chromosome and start from the first element, end as the maximum
    ``end``), a locus id ``"L<n>"`` is drawn from
    ``locus_id_value_obj.next()``, and the transcripts are then filtered,
    turned into per-strand graphs, pruned into subgraphs and handed to
    ``assemble_gene`` one subgraph at a time.

    ``gene_id_value_obj``, ``tss_id_value_obj``, ``t_id_value_obj``,
    ``config``, ``gtf_fileh`` and ``bed_fileh`` are forwarded to
    ``assemble_gene``.  When ``config.create_bedgraph`` is true, one
    bedgraph line per graph node is written to ``bedgraph_filehs[strand]``.
    """
    def get_bedgraph_lines(chrom, G):
        # one (chrom, start, end, score) tuple per node, in sorted order;
        # nodes whose start is negative are left out
        for n in sorted(G.nodes()):
            if n.start < 0:
                continue
            fields = (chrom, n.start, n.end, G.node[n][NODE_SCORE])
            yield fields
    # gather properties of locus
    locus_chrom = transcripts[0].chrom
    locus_start = transcripts[0].start
    locus_end = max(tx.end for tx in transcripts)
    # arguments are handed to logging so the message is only formatted
    # when DEBUG output is actually enabled
    logging.debug("[LOCUS] %s:%d-%d %d transcripts",
                  locus_chrom, locus_start, locus_end,
                  len(transcripts))
    locus_id_str = "L%d" % (locus_id_value_obj.next())
    # filter transcripts
    logging.debug("\tFiltering transcripts")
    transcripts = filter_transcripts(transcripts,
                                     config.min_transcript_length,
                                     config.guided)
    # build transcript graphs
    for G, strand, strand_transcripts in \
        create_transcript_graphs(transcripts):
        # output bedgraph
        if config.create_bedgraph:
            for fields in get_bedgraph_lines(locus_chrom, G):
                # Python 2 print-chevron statement (file is Python 2)
                print >>bedgraph_filehs[strand], '\t'.join(map(str,fields))
        # process transcript graphs
        pruned = prune_transcript_graph(G, strand, strand_transcripts,
                                        config.min_trim_length,
                                        config.trim_utr_fraction,
                                        config.trim_intron_fraction)
        # use a distinct name for the subgraph strand instead of
        # rebinding the outer loop's ``strand``
        for Gsub, sub_strand, partial_paths in pruned:
            logging.debug("Subgraph %s:%d-%d(%s) %d nodes %d paths",
                          locus_chrom, locus_start, locus_end,
                          strand_int_to_str(sub_strand), len(Gsub),
                          len(partial_paths))
            # assemble subgraph
            assemble_gene(locus_chrom, locus_id_str,
                          gene_id_value_obj,
                          tss_id_value_obj,
                          t_id_value_obj,
                          Gsub, sub_strand, partial_paths,
                          config,
                          gtf_fileh,
                          bed_fileh)
Ejemplo n.º 4
0
 def test_assembler1(self):
     """Assemble the ``assemble1.gtf`` locus under two score/kmax settings.

     For each setting the positive-strand graph is pruned with all
     trimming disabled, the first subgraph is assembled, and the
     resulting paths and scores are compared, in order, against the
     expected values.
     """
     # expected exon paths
     PATH_ABCDE = (
         Exon(0, 100),
         Exon(200, 300),
         Exon(400, 500),
         Exon(600, 700),
         Exon(800, 900),
     )
     PATH_ACE = (Exon(0, 100), Exon(400, 500), Exon(800, 900))
     PATH_ABCE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                  Exon(800, 900))
     PATH_ACDE = (Exon(0, 100), Exon(400, 500), Exon(600, 700),
                  Exon(800, 900))
     # read transcripts
     transcripts = read_first_locus("assemble1.gtf", score_attr="score")
     GG = get_transcript_graphs(transcripts)
     G, tmap = GG[POS_STRAND]

     def run_assembly(kmax):
         # Prune G with the current tmap scores and assemble the first
         # subgraph using the given kmax.
         GS = list(
             prune_transcript_graph(G,
                                    POS_STRAND,
                                    tmap,
                                    min_trim_length=0,
                                    trim_utr_fraction=0,
                                    trim_intron_fraction=0))
         Gsub, strand, partial_paths = GS[0]
         return list(
             assemble_transcript_graph(Gsub,
                                       strand,
                                       partial_paths,
                                       user_kmax=kmax,
                                       ksensitivity=0,
                                       fraction_major_path=0,
                                       max_paths=1000))

     def check(results, expected):
         # Compare result count first, then each (path, score) in order.
         # assertEqual replaces the deprecated assertEquals alias.
         self.assertEqual(len(results), len(expected))
         for result, (path, score) in zip(results, expected):
             self.assertEqual(tuple(result.path), path)
             self.assertAlmostEqual(result.score, score, places=3)

     # first setting: kmax = 2
     tmap["ABCDE"].score = 2.0
     tmap["ACE"].score = 1.0
     tmap["ABCE"].score = 1.0
     tmap["ACDE"].score = 1.0
     check(run_assembly(2), [(PATH_ABCDE, 3.0), (PATH_ACE, 2.0)])

     # second setting: kmax = 3
     tmap["ABCDE"].score = 4.0
     tmap["ACE"].score = 3.0
     tmap["ABCE"].score = 2.0
     tmap["ACDE"].score = 1.0
     check(run_assembly(3), [
         (PATH_ABCDE, 4.0),
         (PATH_ACE, 3.0),
         (PATH_ABCE, 2.0),
         (PATH_ACDE, 1.0),
     ])