def _count_gtf_entries(filename):
    """Return the number of entries GTF.iterator yields for *filename*."""
    infile = IOTools.open_file(filename)
    try:
        return sum(1 for _ in GTF.iterator(infile))
    finally:
        # release the handle as soon as the count is known
        infile.close()


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The script runs three steps:

    1. each of the two GTF files is piped through ``gtf2gtf.py
       --method=merge-transcripts`` and written to
       ``<prefix>.merged.gtf.gz``,
    2. the two merged files are intersected with ``intersectBed`` and the
       result is written to ``<prefix_a>_vs_<prefix_b>.intersection.gtf.gz``,
    3. unless ``--no-venn`` is set, the entries of the three files are
       counted and a venn diagram is drawn through the R function
       ``venn.diagram2`` sourced from ``venn_diagram.R`` in the scripts
       directory.

    Raises
    ------
    ValueError
        If one of the GTF files or the scripts directory was not supplied,
        or if a GTF file name ends in neither ``.gtf`` nor ``.gtf.gz``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a", "--first-gtf-file", dest="gtf_a", type="string",
        help="supply a gtf file - will compress uncompressed files")
    parser.add_option(
        "-b", "--second-gtf-file", dest="gtf_b", type="string",
        help="supply a second gtf file - will compress uncompressed files")
    parser.add_option(
        "-s", "--scripts-dir", dest="scripts_dir", type="string",
        help="supply a location for accessory scripts")
    parser.add_option(
        "--no-venn", dest="no_venn", action="store_true",
        help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # None of these options has a default; fail with a clear message instead
    # of an AttributeError on None or a shell command containing "None".
    if options.gtf_a is None or options.gtf_b is None:
        raise ValueError(
            "two gtf files are required: "
            "use --first-gtf-file and --second-gtf-file")
    if options.scripts_dir is None:
        raise ValueError(
            "the location of the accessory scripts is required: "
            "use --scripts-dir")

    gtf_files = [options.gtf_a, options.gtf_b]
    merged_files = []
    prefices = []

    E.info("merging gtf files")
    # NOTE(review): the compressed branch runs its statement through
    # P.execute and the uncompressed branch through E.execute. This is kept
    # exactly as found - confirm whether both are meant to use the same
    # helper.
    for gtf in gtf_files:
        if gtf.endswith(".gtf.gz"):
            prefix = IOTools.snip(gtf, ".gtf.gz")
            outfile = prefix + ".merged.gtf.gz"
            prefices.append(prefix)
            merged_files.append(outfile)
            statement = '''zcat %s
            | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log
            | gzip > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            P.execute(statement)
        elif gtf.endswith(".gtf"):
            prefix = IOTools.snip(gtf, ".gtf")
            outfile = prefix + ".merged.gtf.gz"
            prefices.append(prefix)
            merged_files.append(outfile)
            statement = '''cat %s
            | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log
            | gzip > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            E.execute(statement)
        else:
            raise ValueError(
                "cannot perform merge on %s: is not a gtf file" % gtf)

    # Both lists were filled in the order (gtf_a, gtf_b), so assign by
    # position. Matching prefixes against the file names by substring breaks
    # when one prefix occurs inside the other file name (e.g. "x" in
    # "xy.gtf"), which left gtf_b/prefix_b undefined.
    prefix_a, prefix_b = prefices
    gtf_a, gtf_b = merged_files

    E.info("intersecting gtf files")
    # intersect the resulting merged files
    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join(
        [prefix_a, prefix_b]) + ".intersection.gtf.gz"
    # P.run() is called without arguments; presumably it picks up `statement`
    # and the %(name)s values from the local variables of this function, so
    # the local names used here must not be renamed - TODO confirm.
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa
    | python %(scriptsdir)s/gtf2gtf.py --method=merge-transcripts --log=log
    | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..." %
               (options.gtf_a, options.gtf_b))

        # do the counts for each file
        E.info("counting entries in %s" % gtf_a)
        count_gtf_merged_a = _count_gtf_entries(gtf_a)
        print("counts for gtf-a: ", count_gtf_merged_a)

        E.info("counting entries in %s" % gtf_b)
        count_gtf_merged_b = _count_gtf_entries(gtf_b)
        print("counts for gtf-b: ", count_gtf_merged_b)

        E.info("counting entries in %s" % intersection_out)
        count_intersection = _count_gtf_entries(intersection_out)
        print("counts for intersection: ", count_intersection)

        # The venn function is given two lists of labels rather than counts.
        # Set B is simply "0" .. "count_b - 1". Set A re-uses the first
        # `count_intersection` of those labels (the overlap) and is padded
        # with stringified random floats, which stand for the entries of A
        # that are not in the intersection.
        E.info("assembling count lists")
        labels_b = [str(i) for i in range(count_gtf_merged_b)]
        labels_a = [str(i) for i in range(count_intersection)]
        labels_a.extend(
            str(random.random())
            for _ in range(count_intersection, count_gtf_merged_a))

        R_source = os.path.join(
            os.path.abspath(options.scripts_dir), "venn_diagram.R")
        R.source(R_source)

        # dots and dashes are replaced before the prefixes are handed to R
        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)

        venn_file = prefix_a + "_vs_" + prefix_b + ".overlap.png"
        E.info("drawing venn diagram to %s" % venn_file)

        R["venn.diagram2"](
            R.list(A=labels_a, B=labels_b),
            venn_file,
            **{
                'cat.cex': 1.5,
                'main.fontfamily': "Arial",
                'cat.pos': FloatVector((0, 0)),
                'cat.fontfamily': "Arial",
                'main.cex': 1.8,
                'height': 1000,
                'width': 1000,
                'cex': 2,
                'fontfamily': "Arial",
                'lwd': R.c(1, 1),
                'fill': R.c(R.rgb(0, 0, 0.5, 0.5), R.rgb(0.5, 0, 0, 0.5)),
                'category.names': R.c(prefix_a, prefix_b),
                'margin': R.c(0.1, 0.1, 0.1, 0.1)
            })

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Builds and executes two shell statements: the first runs SimSeq to write
    simulated reads to ``simseq.sam``; the second converts them to BAM, sorts
    both the simulated reads and the input BAM file, merges the two sorted
    files into the output file and indexes it.

    Raises
    ------
    ValueError
        If the genome file, the BAM file or the output file was not supplied.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with Samtools indexed genome [default=%default].")
    parser.add_option(
        "-b", "--bam-file", dest="bam_file", type="string",
        help="filename of bam to add reads to [default=%default].")
    parser.add_option(
        "-i", "--insertsize-mean", dest="isize", type="string",
        help="Insert size [default=%default].")
    parser.add_option(
        "-s", "--insertsize-std", dest="isd", type="string",
        help="Insert size standard deviation [default=%default].")
    parser.add_option(
        "-r", "--num-reads", dest="nreads", type="string",
        help="Number of random reads to add [default=%default].")
    parser.add_option(
        "-l", "--read-length", dest="readlength", type="string",
        help="length of reads to generate [default=%default].")
    parser.add_option(
        "-o", "--output-section", dest="output_file", type="string",
        help="output filename [default=%default].")

    parser.set_defaults(
        genome_file=None,
        bam_file=None,
        isize=250,
        isd=20,
        nreads=10000,
        readlength=50,
        output_file=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # These three options default to None. Without them the script would
    # either crash on os.path.basename(None) or run commands on files that
    # are literally called "None", so stop with a clear message instead.
    required = (
        ("--genome-file", options.genome_file),
        ("--bam-file", options.bam_file),
        ("--output-section", options.output_file),
    )
    missing = [flag for flag, value in required if value is None]
    if missing:
        raise ValueError(
            "missing required option(s): %s" % ", ".join(missing))

    # Generate random reads and add to bam
    # Only strip a real ".bam" suffix; any other name is used unchanged
    # rather than losing its last four characters.
    track = os.path.basename(options.bam_file)
    if track.endswith(".bam"):
        track = track[:-len(".bam")]

    readlen = options.readlength
    isize = options.isize
    isd = options.isd
    nreads = options.nreads
    genome = options.genome_file
    bam = options.bam_file
    out = options.output_file

    # The backslash-newline pairs are removed by Python, so the shell
    # receives the java call as a single line. The statement is interpolated
    # exactly once: formatting the already-formatted text a second time
    # would fail on any literal "%" in a substituted value.
    statement = '''
    java -jar -Xmx2048m /ifs/apps/bio/simseq-72ce499/SimSeq.jar -1 %(readlen)s -2 %(readlen)s \
    --error /ifs/apps/bio/simseq-72ce499/examples/hiseq_mito_default_bwa_mapping_mq10_1.txt \
    --error2 /ifs/apps/bio/simseq-72ce499/examples/hiseq_mito_default_bwa_mapping_mq10_2.txt \
    --insert_size %(isize)s \
    --insert_stdev %(isd)s \
    --read_number %(nreads)s \
    --read_prefix simseq_ \
    --reference %(genome)s \
    --duplicate_probability 0.0 \
    --out simseq.sam > simseq.log;
    ''' % locals()
    E.execute(statement)

    # convert the simulated reads to BAM, sort both inputs, then merge them
    # into the output file and index it
    statement = '''samtools view -bS -t %(genome)s.fai -o simseq.bam simseq.sam;
    samtools sort simseq.bam simseq.srt;
    samtools sort %(bam)s %(track)s.srt;
    samtools merge %(out)s %(track)s.srt.bam simseq.srt.bam;
    samtools index %(out)s;''' % locals()
    E.execute(statement)

    # write footer and output benchmark information.
    E.stop()