예제 #1
0
def main(argv=None):
    """Merge two GTF files, intersect them and optionally draw a venn diagram.

    Parses command line options in sys.argv, unless *argv* is given.

    Each input GTF (``-a`` / ``-b``) is piped through
    ``gtf2gtf.py --method=merge-transcripts`` (looked up in the scripts
    directory given with ``-s``) and written to ``<prefix>.merged.gtf.gz``,
    where ``<prefix>`` is the input name without ``.gtf`` / ``.gtf.gz``.
    The two merged files are then intersected with ``intersectBed`` into
    ``<prefix_a>_vs_<prefix_b>.intersection.gtf.gz``.

    Unless ``--no-venn`` is set, the entries of the two merged files and of
    the intersection are counted and handed to the R function
    ``venn.diagram2`` (sourced from ``venn_diagram.R`` in the scripts
    directory) together with the output name
    ``<prefix_a>_vs_<prefix_b>.overlap.png``.

    Raises:
        ValueError: if one of the GTF files or the scripts directory was not
            supplied, or if an input does not end in ``.gtf`` or ``.gtf.gz``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a",
        "--first-gtf-file",
        dest="gtf_a",
        type="string",
        help="supply a gtf file - will compress uncompressed files")
    parser.add_option(
        "-b",
        "--second-gtf-file",
        dest="gtf_b",
        type="string",
        help="supply a second gtf file - will compress uncompressed files")
    parser.add_option("-s",
                      "--scripts-dir",
                      dest="scripts_dir",
                      type="string",
                      help="supply a location for accessory scripts")
    parser.add_option("--no-venn",
                      dest="no_venn",
                      action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # Fail early with a readable message instead of an AttributeError on
    # ``None.endswith`` or a shell command that points at "None/gtf2gtf.py".
    if options.gtf_a is None or options.gtf_b is None:
        raise ValueError(
            "two gtf files are required: use --first-gtf-file and "
            "--second-gtf-file")
    if options.scripts_dir is None:
        raise ValueError(
            "the location of the accessory scripts is required: "
            "use --scripts-dir")

    gtf_files = [options.gtf_a, options.gtf_b]

    # Both lists are filled in input order: index 0 belongs to gtf_a and
    # index 1 to gtf_b.
    merged_files = []
    prefices = []
    E.info("merging gtf files")
    for gtf in gtf_files:
        # NOTE(review): the compressed branch runs its statement through
        # P.execute and the uncompressed branch through E.execute. This
        # looks unintentional, but both calls are kept as they were because
        # neither function is visible from here - confirm and unify.
        if gtf.endswith(".gtf.gz"):
            prefix = IOTools.snip(gtf, ".gtf.gz")
            outfile = prefix + ".merged.gtf.gz"
            statement = '''zcat %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            P.execute(statement)
        elif gtf.endswith(".gtf"):
            prefix = IOTools.snip(gtf, ".gtf")
            outfile = prefix + ".merged.gtf.gz"
            statement = '''cat %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip  > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            E.execute(statement)
        else:
            raise ValueError("cannot perform merge on %s: is not a gtf file" %
                             gtf)
        prefices.append(prefix)
        merged_files.append(outfile)

    # Pair prefixes and merged files with their inputs by position. Matching
    # them by substring search breaks when one prefix is contained in the
    # other input's name (or when both inputs are the same file): the second
    # pair is then never assigned.
    prefix_a, prefix_b = prefices
    gtf_a, gtf_b = merged_files

    E.info("intersecting gtf files")
    # intersect the resulting merged files

    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up ``statement`` and the values for the %(...)s placeholders
    # (gtf_a, gtf_b, scriptsdir, intersection_out) from this function's local
    # variables - confirm. Do not rename or remove these locals.
    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join([prefix_a, prefix_b
                                    ]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa
                 | python %(scriptsdir)s/gtf2gtf.py --method=merge-transcripts --log=log | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..." %
               (options.gtf_a, options.gtf_b))

        # count the entries of both merged files and of their intersection;
        # each file is closed again as soon as it has been counted
        counts = {}
        for label, filename in (("gtf-a", gtf_a),
                                ("gtf-b", gtf_b),
                                ("intersection", intersection_out)):
            E.info("counting entries in %s" % filename)
            with IOTools.open_file(filename) as infile:
                counts[label] = sum(1 for _ in GTF.iterator(infile))
            print("counts for %s: " % label, counts[label])

        count_gtf_merged_a = counts["gtf-a"]
        count_gtf_merged_b = counts["gtf-b"]
        count_intersection = counts["intersection"]

        # The venn function is given two lists of labels rather than counts.
        # Set B is labelled "0" .. "count_b - 1". Set A reuses the first
        # ``count_intersection`` of these labels, which forms the overlap,
        # and is padded with stringified random floats standing in for the
        # entries of A that are not in the intersection.
        E.info("assembling count lists")
        labels_b = [str(i) for i in range(count_gtf_merged_b)]
        labels_a = [str(i) for i in range(count_intersection)] + [
            str(random.random())
            for _ in range(count_intersection, count_gtf_merged_a)
        ]

        R_source = os.path.join(os.path.abspath(options.scripts_dir),
                                "venn_diagram.R")
        R.source(R_source)

        # "." and "-" are replaced before the prefixes are used as names on
        # the R side and in the output file name
        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)

        overlap_png = prefix_a + "_vs_" + prefix_b + ".overlap.png"
        E.info("drawing venn diagram to %s" % overlap_png)

        # keyword names containing "." are not valid Python identifiers and
        # therefore have to be passed through a dictionary
        plot_options = {
            'cat.cex': 1.5,
            'main.fontfamily': "Arial",
            'cat.pos': FloatVector((0, 0)),
            'cat.fontfamily': "Arial",
            'main.cex': 1.8,
            'height': 1000,
            'width': 1000,
            'cex': 2,
            'fontfamily': "Arial",
            'lwd': R.c(1, 1),
            'fill': R.c(R.rgb(0, 0, 0.5, 0.5), R.rgb(0.5, 0, 0, 0.5)),
            'category.names': R.c(prefix_a, prefix_b),
            'margin': R.c(0.1, 0.1, 0.1, 0.1),
        }
        R["venn.diagram2"](R.list(A=labels_a, B=labels_b), overlap_png,
                           **plot_options)

    # write footer and output benchmark information.
    E.stop()
예제 #2
0
def main(argv=None):
    """Simulate random reads with SimSeq and merge them into a BAM file.

    Parses command line options in sys.argv, unless *argv* is given.

    Two shell statements are built and passed to ``E.execute``:

    1. SimSeq (jar and error profiles at the hard-coded
       ``/ifs/apps/bio/simseq-72ce499`` location) is run against the genome
       given with ``-g``, using the read length, insert size mean / standard
       deviation and read number taken from the options. Output goes to
       ``simseq.sam`` and ``simseq.log``.
    2. samtools converts ``simseq.sam`` to BAM, sorts it and the BAM given
       with ``-b``, merges both sorted files into the file given with ``-o``
       and indexes the result.

    Raises:
        ValueError: if the genome file, the BAM file or the output file was
            not supplied.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g",
        "--genome-file",
        dest="genome_file",
        type="string",
        help="filename with Samtools indexed genome [default=%default].")
    parser.add_option(
        "-b",
        "--bam-file",
        dest="bam_file",
        type="string",
        help="filename of bam to add reads to [default=%default].")
    parser.add_option("-i",
                      "--insertsize-mean",
                      dest="isize",
                      type="string",
                      help="Insert size [default=%default].")
    parser.add_option(
        "-s",
        "--insertsize-std",
        dest="isd",
        type="string",
        help="Insert size standard deviation [default=%default].")
    parser.add_option("-r",
                      "--num-reads",
                      dest="nreads",
                      type="string",
                      help="Number of random reads to add [default=%default].")
    parser.add_option("-l",
                      "--read-length",
                      dest="readlength",
                      type="string",
                      help="length of reads to generate [default=%default].")
    parser.add_option("-o",
                      "--output-section",
                      dest="output_file",
                      type="string",
                      help="output filename  [default=%default].")

    parser.set_defaults(
        genome_file=None,
        bam_file=None,
        isize=250,
        isd=20,
        nreads=10000,
        readlength=50,
        output_file=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # These three options default to None; without them the statements below
    # would reference paths such as "None.fai" or fail with a TypeError.
    required = (("--genome-file", options.genome_file),
                ("--bam-file", options.bam_file),
                ("--output-section", options.output_file))
    missing = [name for name, value in required if value is None]
    if missing:
        raise ValueError("missing required option(s): %s" %
                         ", ".join(missing))

    # Name used for the sorted copy of the input BAM: the file's base name
    # with a trailing ".bam" removed (only if it is actually there).
    track = os.path.basename(options.bam_file)
    if track.endswith(".bam"):
        track = track[:-len(".bam")]

    # values substituted into the %(...)s placeholders of both statements
    params = {
        "readlen": options.readlength,
        "isize": options.isize,
        "isd": options.isd,
        "nreads": options.nreads,
        "genome": options.genome_file,
        "bam": options.bam_file,
        "out": options.output_file,
        "track": track,
    }

    # Generate random reads. Every line ends in a backslash so that Python
    # joins the whole command into a single line; without it the arguments
    # would be separated from the "java" call by a newline.
    statement = '''
    java -jar -Xmx2048m /ifs/apps/bio/simseq-72ce499/SimSeq.jar \
    -1 %(readlen)s -2 %(readlen)s \
    --error  /ifs/apps/bio/simseq-72ce499/examples/hiseq_mito_default_bwa_mapping_mq10_1.txt \
    --error2 /ifs/apps/bio/simseq-72ce499/examples/hiseq_mito_default_bwa_mapping_mq10_2.txt \
    --insert_size %(isize)s \
    --insert_stdev %(isd)s \
    --read_number %(nreads)s \
    --read_prefix simseq_ \
    --reference %(genome)s \
    --duplicate_probability 0.0 \
    --out simseq.sam > simseq.log; '''
    # interpolate exactly once: a second pass would choke on any "%" that
    # came in through an option value
    E.execute(statement % params)

    # Convert the simulated reads to BAM, sort both BAM files, merge them
    # into the output file and index it.
    statement = '''samtools view -bS -t %(genome)s.fai -o simseq.bam simseq.sam; 
                   samtools sort simseq.bam simseq.srt;
                   samtools sort %(bam)s %(track)s.srt;
                   samtools merge %(out)s %(track)s.srt.bam simseq.srt.bam;
                   samtools index %(out)s;'''
    E.execute(statement % params)

    # write footer and output benchmark information.
    E.stop()