Esempio n. 1
0
def main(argv=None):
    """Merge two GTF files, intersect them and optionally draw a Venn diagram.

    Parses command line options in sys.argv, unless *argv* is given.

    Each input (``--gtf-a`` / ``--gtf-b``) is streamed through
    ``gtf2gtf.py --merge-transcripts`` and written gzip-compressed to
    ``<prefix>.merged.gtf.gz``, where ``<prefix>`` is the input name without
    its ``.gtf`` / ``.gtf.gz`` suffix.  The two merged files are intersected
    with ``intersectBed`` into
    ``<prefix_a>_vs_<prefix_b>.intersection.gtf.gz``.  Unless ``--no-venn``
    is set, the entries of the three files are counted and handed to the R
    function ``venn.diagram2`` (presumably defined in ``venn_diagram.R``,
    which is sourced from the scripts directory), writing
    ``<prefix_a>_vs_<prefix_b>.overlap.png``.

    Raises:
        ValueError: if either gtf file or the scripts directory was not
            supplied, or if an input does not end in ``.gtf`` or ``.gtf.gz``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--gtf-a", dest="gtf_a", type="string",
                      help="supply a gtf file - will compress uncompressed files")
    parser.add_option("-b", "--gtf-b", dest="gtf_b", type="string",
                      help="supply a second gtf file - will compress uncompressed files")
    parser.add_option("-s", "--scripts-dir", dest="scripts_dir", type="string",
                      help="supply a location for accessory scripts")
    parser.add_option("--no-venn", dest="no_venn", action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Fail early with a clear message instead of an AttributeError on None
    # or a shell command that contains the literal text "None".
    if options.gtf_a is None or options.gtf_b is None:
        raise ValueError("two gtf files are required: supply both --gtf-a and --gtf-b")
    if options.scripts_dir is None:
        raise ValueError("--scripts-dir is required to locate the accessory scripts")

    # recognised input suffixes and the command that streams such a file
    readers = ((".gtf.gz", "zcat"), (".gtf", "cat"))

    prefices = []
    merged_files = []
    E.info("merging gtf files")
    for gtf in (options.gtf_a, options.gtf_b):
        for suffix, reader in readers:
            if gtf.endswith(suffix):
                break
        else:
            raise ValueError("cannot perform merge on %s: is not a gtf file" % gtf)

        prefix = P.snip(gtf, suffix)
        outfile = prefix + ".merged.gtf.gz"
        prefices.append(prefix)
        merged_files.append(outfile)
        # NOTE: P.run() is called without arguments, so it presumably reads
        # the local variable named `statement` from this frame - keep the
        # name (confirm against the Pipeline module).
        statement = '''%s %s | python %s/gtf2gtf.py --merge-transcripts --log=%s.log | gzip > %s''' % (
            reader, gtf, options.scripts_dir, outfile, outfile)
        P.run()

    # Inputs were processed in the order (a, b), so unpack by position.
    # Matching prefixes back to file names by substring search breaks when
    # one prefix is contained in the other file's name.
    prefix_a, prefix_b = prefices
    gtf_a, gtf_b = merged_files

    E.info("intersecting gtf files")
    # intersect the resulting merged files; the %(name)s placeholders refer
    # to the local variables gtf_a, gtf_b, scriptsdir and intersection_out
    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join([prefix_a, prefix_b]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa
                 | python %(scriptsdir)s/gtf2gtf.py --merge-transcripts --log=log | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..." % (options.gtf_a, options.gtf_b))

        def count_entries(filename):
            """Return the number of entries GTF.iterator yields for *filename*."""
            infile = IOTools.openFile(filename)
            try:
                return sum(1 for _ in GTF.iterator(infile))
            finally:
                infile.close()

        # do the counts for each file; each message is a single
        # pre-formatted string so the output is identical under the
        # Python 2 print statement and the Python 3 print function
        E.info("counting entries in %s" % gtf_a)
        count_gtf_merged_a = count_entries(gtf_a)
        print("counts for gtf-a:  %s" % count_gtf_merged_a)

        E.info("counting entries in %s" % gtf_b)
        count_gtf_merged_b = count_entries(gtf_b)
        print("counts for gtf-b:  %s" % count_gtf_merged_b)

        E.info("counting entries in %s" % intersection_out)
        count_intersection = count_entries(intersection_out)
        print("counts for intersection:  %s" % count_intersection)

        # The diagram only needs set sizes, so build two artificial sets of
        # labels: set B is "0".."count_b - 1"; set A shares the first
        # count_intersection labels with B and is padded with random float
        # labels (which cannot equal an integer label) up to count_a.
        E.info("assembling count lists")
        members_b = [str(i) for i in range(count_gtf_merged_b)]
        members_a = [str(i) for i in range(count_intersection)]
        members_a += [str(random.random())
                      for _ in range(count_intersection, count_gtf_merged_a)]

        R_source = os.path.join(os.path.abspath(options.scripts_dir), "venn_diagram.R")
        R.source(R_source)

        # replace "." and "-" in the prefixes used as labels and file name
        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)
        venn_out = prefix_a + "_vs_" + prefix_b + ".overlap.png"
        E.info("drawing venn diagram to %s" % venn_out)

        # the R argument names contain dots, hence the ** dictionary
        R["venn.diagram2"](
            R.list(A=members_a, B=members_b),
            venn_out,
            **{'cat.cex': 1.5,
               'main.fontfamily': "Arial",
               'cat.pos': FloatVector((0, 0)),
               'cat.fontfamily': "Arial",
               'main.cex': 1.8,
               'height': 1000,
               'width': 1000,
               'cex': 2,
               'fontfamily': "Arial",
               'lwd': R.c(1, 1),
               'fill': R.c(R.rgb(0, 0, 0.5, 0.5), R.rgb(0.5, 0, 0, 0.5)),
               'category.names': R.c(prefix_a, prefix_b),
               'margin': R.c(0.1, 0.1, 0.1, 0.1),
               })

    # write footer and output benchmark information.
    E.Stop()
Esempio n. 2
0
def draw_r(regions, points, **kwargs):
    """Draw regions and points in an R graphics window and wait for it to close.

    The device is split into three screens: screen 1 holds the main plot,
    screens 3 and 4 hold a colour legend and a per-tile count legend.

    Args:
        regions: object providing ``box()``, ``name()``, ``polys()`` and
            ``color(name, default=...)``; each polygon provides ``area``,
            ``name()`` and ``boundary[0].coords``.
        points: iterable of items where ``item[0]`` has ``x`` / ``y``
            attributes and ``item[1]`` is a tile id; must also provide
            ``Color(tile_id)`` returning values accepted by R's ``rgb``.
        **kwargs: if ``png`` is present and truthy, the plot is also printed
            to that file with ``dev.print``.

    Relies on the module-level names ``time`` and ``IDs``.
    """
    # initialize the environment
    from rpy2.interactive import process_revents
    from rpy2.robjects import r
    from rpy2.robjects.packages import importr

    def to_r_colour(rgb):
        # point colours are scaled against 256 (polygon fills below use 255)
        return r.rgb(*rgb, maxColorValue=256)

    def to_r_vector(seq):
        return r.c(*seq)

    na_value = r("NA")[0]
    guide_offset = 40
    graphics = importr("graphics")
    grDevices = importr("grDevices")
    process_revents.start()
    graphics.par(bg="white")
    # two rows; the second row is split again into two columns
    graphics.split_screen(r.c(2, 1))
    graphics.split_screen(r.c(1, 2), screen=2)
    graphics.screen(1)

    # prepare the regions for plotting
    # presumably (upper-left, lower-right) corners - confirm against box()
    corner_a, corner_b = regions.box()
    extent_x = corner_b[0]
    extent_y = corner_b[1]
    x_limits = r.c(corner_a[0], extent_x)
    y_limits = r.c(extent_y, corner_a[1])

    # create the main plot window (empty; contents are added below)
    graphics.plot(
        r.c(), r.c(),
        main=regions.name(), type="p", pch="+",
        xlim=x_limits, ylim=y_limits, xlab="", ylab="",
        xaxp=r.c(0, extent_x, extent_x/200),
        yaxp=r.c(0, extent_y, extent_y/200),
        bg="white")

    # plot the polygons, largest area first
    largest_first = sorted(regions.polys(), key=lambda p: p.area, reverse=True)
    for polygon in largest_first:
        outline_xs, outline_ys = zip(*polygon.boundary[0].coords)
        base_colour = regions.color(polygon.name(), default=na_value)
        red, green, blue = r.col2rgb(base_colour)
        fill = r.rgb(red, green, blue, alpha=128, maxColorValue=255)
        graphics.polygon(to_r_vector(outline_xs), to_r_vector(outline_ys),
                         col=fill)

    # plot the grid: two dashed guides inset from the x bounds, then a
    # light grid every 200 units
    graphics.abline(v=r.c(guide_offset, extent_x-guide_offset), lty=2)
    graphics.abline(h=r.seq(0, extent_y, 200), col="lightgray", lty=2)
    graphics.abline(v=r.seq(0, extent_x, 200), col="lightgray", lty=2)

    # plot the points
    triples = [(pt[0].x, pt[0].y, pt[1]) for pt in points]
    point_xs, point_ys, names = zip(*triples)
    point_colours = [to_r_colour(points.Color(name)) for name in names]
    graphics.points(to_r_vector(point_xs), to_r_vector(point_ys),
                    xlab="", ylab="", pch="+", col=to_r_vector(point_colours))

    # save as a png
    png_target = kwargs.get('png')
    if png_target:
        grDevices.dev_print(grDevices.png, file=png_target, width=extent_x,
                            height=extent_y)

    # derive legend contents: colors, counts, names
    tid_counts = {}
    uniq_tids = []   # tile ids in order of first appearance
    for tid in names:
        if tid not in tid_counts:
            tid_counts[tid] = 0
            uniq_tids.append(tid)
        tid_counts[tid] += 1
    uniq_colours = [to_r_colour(points.Color(tid)) for tid in uniq_tids]
    uniq_names = ["%d\t%s" % (tid, IDs.TileID[tid]) for tid in uniq_tids]
    name_counts = ["%d\t%s: %d" % (tid, IDs.TileID[tid], count)
                   for tid, count in tid_counts.items()]

    # display the colors legend
    legend_args = {"y_intersp": 0.7, "cex": 0.7}
    graphics.screen(3)
    graphics.legend("center", title="Tile Colors",
                    legend=to_r_vector(uniq_names),
                    col=to_r_vector(uniq_colours), pch="+", pt_cex=1,
                    **legend_args)

    # display the counts legend
    graphics.screen(4)
    graphics.legend("center", title="Tile Counts",
                    legend=to_r_vector(name_counts), **legend_args)

    # sleep until the window is closed
    while grDevices.dev_list() != r("NULL"):
        time.sleep(0.1)
Esempio n. 3
0
def main(argv=None):
    """Merge two GTF files, intersect them and optionally draw a Venn diagram.

    Parses command line options in sys.argv, unless *argv* is given.

    Each input (``--first-gtf-file`` / ``--second-gtf-file``) is streamed
    through ``gtf2gtf.py --method=merge-transcripts`` and written
    gzip-compressed to ``<prefix>.merged.gtf.gz``, where ``<prefix>`` is the
    input name without its ``.gtf`` / ``.gtf.gz`` suffix.  The two merged
    files are intersected with ``intersectBed`` into
    ``<prefix_a>_vs_<prefix_b>.intersection.gtf.gz``.  Unless ``--no-venn``
    is set, the entries of the three files are counted and handed to the R
    function ``venn.diagram2`` (presumably defined in ``venn_diagram.R``,
    which is sourced from the scripts directory), writing
    ``<prefix_a>_vs_<prefix_b>.overlap.png``.

    Raises:
        ValueError: if either gtf file or the scripts directory was not
            supplied, or if an input does not end in ``.gtf`` or ``.gtf.gz``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a",
        "--first-gtf-file",
        dest="gtf_a",
        type="string",
        help="supply a gtf file - will compress uncompressed files")
    parser.add_option(
        "-b",
        "--second-gtf-file",
        dest="gtf_b",
        type="string",
        help="supply a second gtf file - will compress uncompressed files")
    parser.add_option("-s",
                      "--scripts-dir",
                      dest="scripts_dir",
                      type="string",
                      help="supply a location for accessory scripts")
    parser.add_option("--no-venn",
                      dest="no_venn",
                      action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Fail early with a clear message instead of an AttributeError on None
    # or a shell command that contains the literal text "None".
    if options.gtf_a is None or options.gtf_b is None:
        raise ValueError(
            "two gtf files are required: supply both --first-gtf-file "
            "and --second-gtf-file")
    if options.scripts_dir is None:
        raise ValueError(
            "--scripts-dir is required to locate the accessory scripts")

    # recognised input suffixes and the command that streams such a file
    readers = ((".gtf.gz", "zcat"), (".gtf", "cat"))

    prefices = []
    merged_files = []
    E.info("merging gtf files")
    for gtf in (options.gtf_a, options.gtf_b):
        for suffix, reader in readers:
            if gtf.endswith(suffix):
                break
        else:
            raise ValueError("cannot perform merge on %s: is not a gtf file" %
                             gtf)

        prefix = IOTools.snip(gtf, suffix)
        outfile = prefix + ".merged.gtf.gz"
        prefices.append(prefix)
        merged_files.append(outfile)
        statement = '''%s %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip > %s''' % (
            reader, gtf, options.scripts_dir, outfile, outfile)
        # NOTE(review): compressed and uncompressed inputs go through the
        # same runner so they cannot diverge; confirm P.execute is the
        # intended one.
        P.execute(statement)

    # Inputs were processed in the order (a, b), so unpack by position.
    # Matching prefixes back to file names by substring search breaks when
    # one prefix is contained in the other file's name.
    prefix_a, prefix_b = prefices
    gtf_a, gtf_b = merged_files

    E.info("intersecting gtf files")
    # intersect the resulting merged files
    # NOTE: P.run() is called without arguments, so it presumably reads the
    # local variable `statement` and fills the %(name)s placeholders from
    # the locals gtf_a, gtf_b, scriptsdir and intersection_out - keep these
    # names (confirm against the Pipeline module).
    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join([prefix_a, prefix_b
                                    ]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa
                 | python %(scriptsdir)s/gtf2gtf.py --method=merge-transcripts --log=log | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..." %
               (options.gtf_a, options.gtf_b))

        def count_entries(filename):
            """Return the number of entries GTF.iterator yields for *filename*."""
            infile = IOTools.openFile(filename)
            try:
                return sum(1 for _ in GTF.iterator(infile))
            finally:
                infile.close()

        # do the counts for each file
        E.info("counting entries in %s" % gtf_a)
        count_gtf_merged_a = count_entries(gtf_a)
        print("counts for gtf-a: ", count_gtf_merged_a)

        E.info("counting entries in %s" % gtf_b)
        count_gtf_merged_b = count_entries(gtf_b)
        print("counts for gtf-b: ", count_gtf_merged_b)

        E.info("counting entries in %s" % intersection_out)
        count_intersection = count_entries(intersection_out)
        print("counts for intersection: ", count_intersection)

        # The diagram only needs set sizes, so build two artificial sets of
        # labels: set B is "0".."count_b - 1"; set A shares the first
        # count_intersection labels with B and is padded with random float
        # labels (which cannot equal an integer label) up to count_a.
        E.info("assembling count lists")
        members_b = [str(i) for i in range(count_gtf_merged_b)]
        members_a = [str(i) for i in range(count_intersection)]
        members_a += [
            str(random.random())
            for _ in range(count_intersection, count_gtf_merged_a)
        ]

        R_source = os.path.join(os.path.abspath(options.scripts_dir),
                                "venn_diagram.R")
        R.source(R_source)

        # replace "." and "-" in the prefixes used as labels and file name
        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)
        venn_out = prefix_a + "_vs_" + prefix_b + ".overlap.png"
        E.info("drawing venn diagram to %s" % venn_out)

        # the R argument names contain dots, hence the ** dictionary
        R["venn.diagram2"](
            R.list(A=members_a, B=members_b),
            venn_out,
            **{
                'cat.cex': 1.5,
                'main.fontfamily': "Arial",
                'cat.pos': FloatVector((0, 0)),
                'cat.fontfamily': "Arial",
                'main.cex': 1.8,
                'height': 1000,
                'width': 1000,
                'cex': 2,
                'fontfamily': "Arial",
                'lwd': R.c(1, 1),
                'fill': R.c(R.rgb(0, 0, 0.5, 0.5), R.rgb(0.5, 0, 0, 0.5)),
                'category.names': R.c(prefix_a, prefix_b),
                'margin': R.c(0.1, 0.1, 0.1, 0.1),
            })

    # write footer and output benchmark information.
    E.Stop()