def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--bam-file", dest="bam_file", type="string",
                      help="supply input bam file name")

    parser.add_option("-g", "--gtf-file", dest="gtf_file", type="string",
                      help="supply input gtf file name")

    parser.add_option("-o", "--outfile", dest="outfile", type="string",
                      help="supply output file name")

    parser.add_option(
        "-G", "--reference-gtf-file", dest="reference_gtf", type="string",
        help="supply reference gtf for context of reads not contributing "
        "to transcripts")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ######################################################
    # for all alignments
    ######################################################

    # open outfile and prepare headers
    outf = IOTools.openFile(options.outfile, "w")
    outf.write("\t".join(["total alignments",
                          "alignments in transcripts",
                          "percent alignments in transcripts",
                          "total spliced alignments",
                          "spliced alignments in transcripts",
                          "percent spliced alignments in transcripts"]) + "\n")

    # calculate coverage over transcript file - NB split reads contribute
    # twice to the transcript
    # use BedTool object
    pybedbamfile = pybedtools.BedTool(options.bam_file)

    # count alignments
    E.info("counting total number of alignments and spliced alignments")
    total_alignments = 0
    spliced_alignments = 0

    for alignment in pybedbamfile:
        cigar = alignment[5]
        if cigar.find("N") != -1:  # N signifies split read
            total_alignments += 1
            spliced_alignments += 1
        else:
            total_alignments += 1

    # merge the gtf file to avoid double counting of exons in different
    # transcripts - converts to a bed file
    gtffile = pybedtools.BedTool(options.gtf_file).merge()

    E.info("computing coverage of alignments in %s over intervals in %s" %
           (options.bam_file, options.gtf_file))
    cover = pybedbamfile.coverage(gtffile)

    # make sure that the exons aren't being counted twice - shouldn't be
    # because of merge
    E.info("counting reads contributing to transcripts")
    c = 0
    for entry in cover:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    # sum the coverage across exons from all transcripts
    coverage_in_transcripts = c

    ######################################################
    # for spliced alignments
    ######################################################

    # count total number of spliced alignments
    # requires that the CIGAR string 'N' is present
    # uses pysam to write out a bam file of the spliced reads only
    allreads = pysam.Samfile(options.bam_file)
    spliced_bamname = IOTools.snip(options.bam_file, ".bam") + \
        "_spliced_reads.bam"

    # open file for outputting spliced alignments
    splicedreads = pysam.Samfile(spliced_bamname, "wb", template=allreads)

    # cigar string in pysam for spliced alignment is (3, int)
    spliced = collections.defaultdict(list)
    for read in allreads:
        for cigar_tag in read.cigar:
            if cigar_tag[0] == 3:
                spliced[read].append(cigar_tag)

    # write out spliced alignments
    for read in list(spliced.keys()):
        splicedreads.write(read)
    splicedreads.close()
    allreads.close()

    # sort and index spliced reads bam file
    pysam.sort(spliced_bamname, P.snip(spliced_bamname, ".bam"))
    pysam.index(spliced_bamname)

    # read in the spliced reads as a BedTool object
    splicedbam = pybedtools.BedTool(spliced_bamname)

    # perform coverage of spliced reads over intervals - will be twice
    # as many as there should be due to counting both overlapping exons
    spliced_coverage = splicedbam.coverage(gtffile)

    # avoid double counting exons
    E.info("counting spliced reads contributing to transcripts")
    spliced_exons = {}
    c = 0
    for entry in spliced_coverage:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage
    spliced_coverage_in_transcripts = c

    # NOTE: the counting of spliced alignments is not accurate
    spliced_coverage_in_transcripts = float(
        spliced_coverage_in_transcripts) / 2

    ###########################
    # write out the results
    ###########################

    outf.write(str(int(total_alignments)) + "\t")

    # remove half of the coverage assigned to spliced reads
    coverage_in_transcripts = coverage_in_transcripts - \
        spliced_coverage_in_transcripts
    outf.write(str(int(coverage_in_transcripts) -
                   int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((coverage_in_transcripts / total_alignments) * 100)) + "\t")

    # write out spliced counts
    outf.write(str(int(spliced_alignments)) + "\t")
    outf.write(str(int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((spliced_coverage_in_transcripts / spliced_alignments) * 100)))

    outf.close()

    ############################
    # contextualise those that
    # don't fall in transcripts
    ############################

    if options.reference_gtf:
        context_summary = IOTools.openFile(
            IOTools.snip(options.bam_file, ".bam") + ".excluded.context", "w")
        context_summary.write("\t".join(["Feature", "number"]) + "\n")

        # write out the read info as well
        context_file = IOTools.openFile(
            IOTools.snip(options.bam_file, ".bam") + ".excluded", "w")

        context_dict = collections.defaultdict(int)

        # intersect bam - write non-overlapping with transcripts - intersect
        # with reference - write out
        context = pybedbamfile.intersect(gtffile, v=True, bed=True).intersect(
            pybedtools.BedTool(options.reference_gtf), wb=True)
        for entry in context:
            feature = entry[8]
            context_dict[feature] += 1
            context_file.write("\t".join([e for e in entry]) + "\n")

        for feature, value in context_dict.items():
            context_summary.write("\t".join([feature, str(value)]) + "\n")

        context_file.close()
        context_summary.close()

    # write footer and output benchmark information.
    E.Stop()
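
# ---------------------------------------------------------------------
# Hedged sketch, not part of the original script: the CIGAR-based splice
# detection above can also be expressed with the modern pysam API
# (AlignmentFile / cigartuples). The BAM path used at call time is
# illustrative only.
# ---------------------------------------------------------------------
import pysam


def count_spliced_alignments(bam_path):
    """Return (total, spliced) alignment counts; an alignment counts as
    spliced if its CIGAR contains an 'N' operation (BAM op code 3)."""
    total = spliced = 0
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        for read in bam.fetch(until_eof=True):
            if read.is_unmapped or read.cigartuples is None:
                continue
            total += 1
            if any(op == 3 for op, length in read.cigartuples):
                spliced += 1
    return total, spliced

# usage (illustrative): count_spliced_alignments("example.bam")
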
def peekParameters(workingdir,
                   pipeline,
                   on_error_raise=None,
                   prefix=None,
                   update_interface=False,
                   restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in `workingdir`, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in.
    pipeline : string
       Name of the pipeline script. The pipeline is assumed to live
       in the same directory as the current pipeline.
    on_error_raise : Bool
       If set to a boolean, an error will be raised (or not) if there
       is an error during parameter peeking, for example if
       `workingdir` can not be found. If `on_error_raise` is None, it
       will be set to the default, which is to raise an exception
       unless the calling script is imported or the option
       ``--is-test`` has been passed at the command line.
    prefix : string
       Add a prefix to all parameters. This is useful if the
       parameters are added to the configuration dictionary of the
       calling pipeline.
    update_interface : bool
       If True, this method will prefix any options in the
       ``[interface]`` section with `workingdir`. This allows
       transparent access to files in the external pipeline.
    restrict_interface : bool
       If True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.

    '''
    caller_locals = getCallerLocals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not isTest() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    # Attempt to locate directory with pipeline source code. This is a
    # patch as pipelines might be called within the repository
    # directory or from an installed location
    dirname = PARAMS["pipelinedir"]

    # called without a directory, use current directory
    if dirname == "":
        dirname = os.path.abspath(".")
    else:
        # if not exists, assume we want the version located
        # in the directory of the calling script.
        if not os.path.exists(dirname):
            # directory is path of calling script
            dirname = os.path.dirname(caller_locals['__file__'])

    pipeline = os.path.join(dirname, pipeline)
    if not os.path.exists(pipeline):
        if on_error_raise:
            raise ValueError(
                "can't find pipeline at %s" % (pipeline))
        else:
            return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default pipeline directory
    # if no directory has been specified (working dir is set to "?!")
    if ("config" in sys.argv or "check" in sys.argv or "clone" in sys.argv) \
            and workingdir == "?!":
        workingdir = os.path.join(PARAMS.get("pipelinedir"),
                                  IOTools.snip(pipeline, ".py"))

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError(
                "can't find working dir %s" % workingdir)
        else:
            return {}

    statement = "python %s -f -v 0 dump" % pipeline

    os.environ.update(
        {'BASH_ENV': os.path.join(os.environ['HOME'], '.bashrc')})
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               env=os.environ.copy())

    # process.stdin.close()
    stdout, stderr = process.communicate()

    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") %
            (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()

    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]

    if len(stdout) > 1:
        raise ValueError("received multiple configurations")

    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                dump[key] = os.path.join(workingdir, value)

    # keep only interface if so required
    if restrict_interface:
        dump = dict([(k, v) for k, v in dump.items()
                     if k.startswith("interface")])

    # prefix all parameters
    if prefix is not None:
        dump = dict([("%s%s" % (prefix, x), y)
                     for x, y in list(dump.items())])

    return dump
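
# ---------------------------------------------------------------------
# Hedged usage sketch (not from the original source): a calling pipeline
# might import only the [interface] values of a companion pipeline and
# add them to its own configuration under a prefix. The working
# directory and pipeline name below are hypothetical.
# ---------------------------------------------------------------------
def import_annotation_parameters(
        annotations_dir="/path/to/annotations_workdir"):
    """Sketch: pull the [interface] entries of a companion pipeline into
    a prefixed dictionary, e.g. PARAMS.update(import_annotation_parameters())."""
    return peekParameters(
        workingdir=annotations_dir,
        pipeline="pipeline_annotations.py",
        prefix="annotations_",
        update_interface=True,
        restrict_interface=True)
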
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a", "--first-gtf-file", dest="gtf_a", type="string",
        help="supply a gtf file - will compress uncompressed files")

    parser.add_option(
        "-b", "--second-gtf-file", dest="gtf_b", type="string",
        help="supply a second gtf file - will compress uncompressed files")

    parser.add_option("-s", "--scripts-dir", dest="scripts_dir", type="string",
                      help="supply a location for accessory scripts")

    parser.add_option("--no-venn", dest="no_venn", action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    gtf_files = [options.gtf_a, options.gtf_b]

    merged_files = []
    prefices = []
    E.info("merging gtf files")
    for gtf in gtf_files:
        if gtf.endswith(".gtf.gz"):
            outfile = IOTools.snip(gtf, ".gtf.gz") + ".merged.gtf.gz"
            prefices.append(IOTools.snip(gtf, ".gtf.gz"))
            merged_files.append(outfile)
            statement = '''zcat %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            P.execute(statement)
        elif gtf.endswith(".gtf"):
            outfile = IOTools.snip(gtf, ".gtf") + ".merged.gtf.gz"
            prefices.append(IOTools.snip(gtf, ".gtf"))
            merged_files.append(outfile)
            statement = '''cat %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            E.execute(statement)
        else:
            raise ValueError(
                "cannot perform merge on %s: is not a gtf file" % gtf)

    for prefix in prefices:
        if options.gtf_a.find(prefix) != -1:
            gtf_a = prefix + ".merged.gtf.gz"
            prefix_a = prefix
        elif options.gtf_b.find(prefix) != -1:
            gtf_b = prefix + ".merged.gtf.gz"
            prefix_b = prefix

    E.info("intersecting gtf files")

    # intersect the resulting merged files
    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join(
        [prefix_a, prefix_b]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa
                   | python %(scriptsdir)s/gtf2gtf.py
                   --method=merge-transcripts
                   --log=log
                   | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..."
               % (options.gtf_a, options.gtf_b))

        # produce the venn diagram
        intersection_file = intersection_out
        gtf_a_merged = gtf_a
        gtf_b_merged = gtf_b

        # create dictionary key
        gtf_pair = (gtf_a_merged, gtf_b_merged)

        # containers for counts
        count_gtf_merged_a = 0
        count_gtf_merged_b = 0
        count_intersection = 0

        # create GTF iterator objects
        gtf_iterator_a = GTF.iterator(IOTools.openFile(gtf_pair[0]))
        gtf_iterator_b = GTF.iterator(IOTools.openFile(gtf_pair[1]))
        gtf_iterator_intersection = GTF.iterator(
            IOTools.openFile(intersection_file))

        # do the counts for each file
        E.info("counting entries in %s" % gtf_a)
        for entry in gtf_iterator_a:
            count_gtf_merged_a += 1
        print("counts for gtf-a: ", count_gtf_merged_a)

        E.info("counting entries in %s" % gtf_b)
        for entry in gtf_iterator_b:
            count_gtf_merged_b += 1
        print("counts for gtf-b: ", count_gtf_merged_b)

        E.info("counting entries in %s" % intersection_file)
        for entry in gtf_iterator_intersection:
            count_intersection += 1
        print("counts for intersection: ", count_intersection)

        # this is the important bit - take an arbitrary list of numbers to
        # represent the lincRNAs in the refnoncoding set, then use the
        # intersection count to represent the overlapping section in the
        # lincRNA set and add a set of random numbers to make up the
        # remaining, non-overlapping set
        result = {}
        E.info("assembling count lists")
        result[gtf_pair] = {
            "gtf-b": list(map(str, range(count_gtf_merged_b))),
            "gtf-a": list(map(str, range(count_intersection))) +
            list(map(str, [random.random()
                           for i in range(count_intersection,
                                          count_gtf_merged_a)]))}

        R_source = os.path.join(
            os.path.abspath(options.scripts_dir), "venn_diagram.R")
        R.source(R_source)

        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)
        E.info("drawing venn diagram to %s" %
               (prefix_a + "_vs_" + prefix_b + ".overlap.png"))

        R["venn.diagram2"](R.list(A=result[gtf_pair]["gtf-a"],
                                  B=result[gtf_pair]["gtf-b"]),
                           prefix_a + "_vs_" + prefix_b + ".overlap.png",
                           **{'cat.cex': 1.5,
                              'main.fontfamily': "Arial",
                              'cat.pos': FloatVector((0, 0)),
                              'cat.fontfamily': "Arial",
                              'main.cex': 1.8,
                              'height': 1000,
                              'width': 1000,
                              'cex': 2,
                              'fontfamily': "Arial",
                              'lwd': R.c(1, 1),
                              'fill': R.c(R.rgb(0, 0, 0.5, 0.5),
                                          R.rgb(0.5, 0, 0, 0.5)),
                              'category.names': R.c(prefix_a, prefix_b),
                              'margin': R.c(0.1, 0.1, 0.1, 0.1)})

    # write footer and output benchmark information.
    E.Stop()
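
# ---------------------------------------------------------------------
# Illustrative sketch (not from the original script) of the list
# construction used for the venn diagram above: set B is represented by
# arbitrary string labels, and set A re-uses the first
# `count_intersection` of those labels (the shared region) plus unique
# padding labels for its non-overlapping part. The counts are made-up
# numbers.
# ---------------------------------------------------------------------
def build_venn_sets(count_a=120, count_b=200, count_intersection=80):
    set_b = [str(i) for i in range(count_b)]
    set_a = set_b[:count_intersection] + \
        ["a_only_%i" % i for i in range(count_a - count_intersection)]
    # the overlap of the two label lists equals the intersection count
    assert len(set(set_a) & set(set_b)) == count_intersection
    return set_a, set_b
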
def loadCuffdiff(dbhandle, infile, outfile, min_fpkm=1.0):
    '''load results from cuffdiff analysis to database

    This function parses and loads the results of a cuffdiff
    differential expression analysis.
    Parsing is performed by the parseCuffdiff function.

    Multiple tables will be created as cuffdiff outputs information
    on gene, isoform, tss, etc. levels.

    The method converts from ln(fold change) to log2 fold change.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    `min_fpkm`) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    infile : string
        Input filename, output from cuffdiff
    outfile : string
        Output filename in :term:`tsv` format.
    min_fpkm : float
        Minimum fpkm. Genes with an fpkm lower than this will
        be set to status `NOCALL`.

    '''
    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued...

    tmpname = P.getTempFilename(shared=True)

    # ignore promoters and splicing - no fold change column, but sqrt(JS)
    for fn, level in (("cds_exp.diff.gz", "cds"),
                      ("gene_exp.diff.gz", "gene"),
                      ("isoform_exp.diff.gz", "isoform"),
                      # ("promoters.diff.gz", "promotor"),
                      # ("splicing.diff.gz", "splice"),
                      ("tss_group_exp.diff.gz", "tss")):
        tablename = prefix + "_" + level + "_diff"

        infile = os.path.join(indir, fn)

        results = parseCuffdiff(infile, min_fpkm=min_fpkm)
        Expression.writeExpressionResults(tmpname, results)
        P.load(tmpname, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=treatment_name "
               "--add-index=control_name "
               "--add-index=test_id")

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):
        tablename = prefix + "_" + level + "_levels"
        infile = os.path.join(indir, fn)

        P.load(infile, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=tracking_id "
               "--add-index=control_name "
               "--add-index=test_id")

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    inf.readline()
    sample_lookup = {}

    for line in inf:
        line = line.split("\t")
        our_sample_name = IOTools.snip(line[0])
        our_sample_name = re.sub("-", "_", our_sample_name)
        cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
        sample_lookup[cuffdiff_sample_name] = our_sample_name

    inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "sample_fpkms"

        tmpf = P.getTempFilename(".")
        inf = IOTools.openFile(os.path.join(indir, fn)).readlines()
        outf = IOTools.openFile(tmpf, "w")

        samples = []
        genes = {}

        is_first = True
        for line in inf:

            if is_first:
                is_first = False
                continue

            line = line.split()
            gene_id = line[0]
            condition = line[1]
            replicate = line[2]
            fpkm = line[6]
            status = line[8]

            sample_id = condition + "_" + replicate

            if sample_id not in samples:
                samples.append(sample_id)

            # IMS: The following block keeps getting its indenting messed
            # up. It is not part of the 'if sample_id not in samples' block;
            # please make sure it does not get made part of it
            if gene_id not in genes:
                genes[gene_id] = {}
                genes[gene_id][sample_id] = fpkm
            else:
                if sample_id in genes[gene_id]:
                    raise ValueError(
                        'sample_id %s appears twice in file for gene_id %s'
                        % (sample_id, gene_id))
                else:
                    if status != "OK":
                        genes[gene_id][sample_id] = status
                    else:
                        genes[gene_id][sample_id] = fpkm

        samples = sorted(samples)

        # IMS - CDS files might be empty if no cds has been calculated
        # for the genes. In the long term we need to add CDS annotation
        # to denovo predicted genesets; in the meantime just skip if the
        # cds tracking file is empty
        if len(samples) == 0:
            continue

        headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples])
        outf.write(headers + "\n")

        for gene in genes.keys():
            outf.write(gene + "\t")
            s = 0
            while s < len(samples) - 1:
                outf.write(genes[gene][samples[s]] + "\t")
                s += 1
            # IMS: Please be careful with this line. It keeps getting moved
            # into the above while block where it does not belong
            outf.write(genes[gene][samples[len(samples) - 1]] + "\n")

        outf.close()

        P.load(tmpf, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               " --add-index=gene_id")

        os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
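
# ---------------------------------------------------------------------
# Hedged sketch (not part of the original module) of the pivot performed
# above: cuffdiff's long-format *.read_group_tracking rows are turned
# into one wide record per gene, keyed by "<condition>_<replicate>".
# Column positions follow the loop above; the file path is illustrative
# and the status handling is simplified.
# ---------------------------------------------------------------------
import collections
import gzip


def pivot_read_group_tracking(path):
    genes = collections.defaultdict(dict)
    samples = []
    with gzip.open(path, "rt") as inf:
        next(inf)  # skip the header line
        for line in inf:
            fields = line.split()
            gene_id, condition, replicate = fields[0], fields[1], fields[2]
            fpkm, status = fields[6], fields[8]
            sample_id = condition + "_" + replicate
            if sample_id not in samples:
                samples.append(sample_id)
            # keep the FPKM only for rows that cuffdiff flagged as OK
            genes[gene_id][sample_id] = fpkm if status == "OK" else status
    return sorted(samples), genes
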
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a", "--first-gtf-file", dest="gtf_a", type="string",
        help="supply a gtf file - will compress uncompressed files")

    parser.add_option(
        "-b", "--second-gtf-file", dest="gtf_b", type="string",
        help="supply a second gtf file - will compress uncompressed files")

    parser.add_option("-s", "--scripts-dir", dest="scripts_dir", type="string",
                      help="supply a location for accessory scripts")

    parser.add_option("--no-venn", dest="no_venn", action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    gtf_files = [options.gtf_a, options.gtf_b]

    merged_files = []
    prefices = []
    E.info("merging gtf files")
    for gtf in gtf_files:
        if gtf.endswith(".gtf.gz"):
            outfile = IOTools.snip(gtf, ".gtf.gz") + ".merged.gtf.gz"
            prefices.append(IOTools.snip(gtf, ".gtf.gz"))
            merged_files.append(outfile)
            statement = '''zcat %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            P.execute(statement)
        elif gtf.endswith(".gtf"):
            outfile = IOTools.snip(gtf, ".gtf") + ".merged.gtf.gz"
            prefices.append(IOTools.snip(gtf, ".gtf"))
            merged_files.append(outfile)
            statement = '''cat %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            E.execute(statement)
        else:
            raise ValueError(
                "cannot perform merge on %s: is not a gtf file" % gtf)

    for prefix in prefices:
        if options.gtf_a.find(prefix) != -1:
            gtf_a = prefix + ".merged.gtf.gz"
            prefix_a = prefix
        elif options.gtf_b.find(prefix) != -1:
            gtf_b = prefix + ".merged.gtf.gz"
            prefix_b = prefix

    E.info("intersecting gtf files")

    # intersect the resulting merged files
    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join(
        [prefix_a, prefix_b]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa
                   | python %(scriptsdir)s/gtf2gtf.py
                   --method=merge-transcripts
                   --log=log
                   | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..."
               % (options.gtf_a, options.gtf_b))

        # produce the venn diagram
        intersection_file = intersection_out
        gtf_a_merged = gtf_a
        gtf_b_merged = gtf_b

        # create dictionary key
        gtf_pair = (gtf_a_merged, gtf_b_merged)

        # containers for counts
        count_gtf_merged_a = 0
        count_gtf_merged_b = 0
        count_intersection = 0

        # create GTF iterator objects
        gtf_iterator_a = GTF.iterator(IOTools.openFile(gtf_pair[0]))
        gtf_iterator_b = GTF.iterator(IOTools.openFile(gtf_pair[1]))
        gtf_iterator_intersection = GTF.iterator(
            IOTools.openFile(intersection_file))

        # do the counts for each file
        E.info("counting entries in %s" % gtf_a)
        for entry in gtf_iterator_a:
            count_gtf_merged_a += 1
        print("counts for gtf-a: ", count_gtf_merged_a)

        E.info("counting entries in %s" % gtf_b)
        for entry in gtf_iterator_b:
            count_gtf_merged_b += 1
        print("counts for gtf-b: ", count_gtf_merged_b)

        E.info("counting entries in %s" % intersection_file)
        for entry in gtf_iterator_intersection:
            count_intersection += 1
        print("counts for intersection: ", count_intersection)

        # this is the important bit - take an arbitrary list of numbers to
        # represent the lincRNAs in the refnoncoding set, then use the
        # intersection count to represent the overlapping section in the
        # lincRNA set and add a set of random numbers to make up the
        # remaining, non-overlapping set
        result = {}
        E.info("assembling count lists")
        result[gtf_pair] = {
            "gtf-b": list(map(str, range(count_gtf_merged_b))),
            "gtf-a": list(map(str, range(count_intersection))) +
            list(map(str, [random.random()
                           for i in range(count_intersection,
                                          count_gtf_merged_a)]))}

        R_source = os.path.join(
            os.path.abspath(options.scripts_dir), "venn_diagram.R")
        R.source(R_source)

        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)
        E.info("drawing venn diagram to %s" %
               (prefix_a + "_vs_" + prefix_b + ".overlap.png"))

        R["venn.diagram2"](R.list(A=result[gtf_pair]["gtf-a"],
                                  B=result[gtf_pair]["gtf-b"]),
                           prefix_a + "_vs_" + prefix_b + ".overlap.png",
                           **{'cat.cex': 1.5,
                              'main.fontfamily': "Arial",
                              'cat.pos': FloatVector((0, 0)),
                              'cat.fontfamily': "Arial",
                              'main.cex': 1.8,
                              'height': 1000,
                              'width': 1000,
                              'cex': 2,
                              'fontfamily': "Arial",
                              'lwd': R.c(1, 1),
                              'fill': R.c(R.rgb(0, 0, 0.5, 0.5),
                                          R.rgb(0.5, 0, 0, 0.5)),
                              'category.names': R.c(prefix_a, prefix_b),
                              'margin': R.c(0.1, 0.1, 0.1, 0.1)})

    # write footer and output benchmark information.
    E.Stop()
def runDE(design_file,
          counts_file,
          outfile,
          outdir,
          method="deseq",
          spike_file=None):
    '''run DESeq, DESeq2 or EdgeR through :mod:`scripts/runExpression.py`

    The job is split into smaller sections. The order of the input
    data is randomized in order to avoid any biases due to chromosomes
    and break up local correlations. At the end, a q-value is computed
    from all results.

    Arguments
    ---------
    design_file : string
        Filename with experimental design
    counts_file : string
        :term:`tsv` formatted file with counts per windows
    outfile : string
        Output filename in :term:`tsv` format.
    outdir : string
        Directory for additional output files.
    method : string
        Method to use. See :mod:`scripts/runExpression.py`.
    spike_file : string
        Filename with spike-in data to add before processing.
    '''

    if spike_file is None:
        statement = "zcat %(counts_file)s"
    else:
        statement = '''python %(scriptsdir)s/combine_tables.py
        --missing-value=0
        --cat=filename
        --log=%(outfile)s.log
        %(counts_file)s %(spike_file)s
        | python %(scriptsdir)s/csv_cut.py
        --remove filename
        --log=%(outfile)s.log
        '''

    prefix = IOTools.snip(os.path.basename(outfile))
    E.info(prefix)

    # the post-processing strips away the warning,
    # renames the qvalue column to old_qvalue
    # and adds a new qvalue column after recomputing
    # over all windows.
    statement += '''
    | perl %(scriptsdir)s/randomize_lines.pl -h
    | %(cmd-farm)s
    --input-header
    --output-header
    --split-at-lines=200000
    --cluster-options="-l mem_free=16G"
    --log=%(outfile)s.log
    --output-filename-pattern=%(outdir)s/%%s
    --subdirs
    --output-regex-header="^test_id"
    "python %(scriptsdir)s/runExpression.py
    --method=%(method)s
    --tags-tsv-file=-
    --design-tsv-file=%(design_file)s
    --output-filename-pattern=%%DIR%%%(prefix)s_
    --deseq-fit-type=%(deseq_fit_type)s
    --deseq-dispersion-method=%(deseq_dispersion_method)s
    --deseq-sharing-mode=%(deseq_sharing_mode)s
    --edger-dispersion=%(edger_dispersion)f
    --deseq2-design-formula=%(deseq2_model)s
    --deseq2-contrasts=%(deseq2_contrasts)s
    --filter-min-counts-per-row=%(tags_filter_min_counts_per_row)i
    --filter-min-counts-per-sample=%(tags_filter_min_counts_per_sample)i
    --filter-percentile-rowsums=%(tags_filter_percentile_rowsums)i
    --log=%(outfile)s.log
    --fdr=%(edger_fdr)f
    --deseq2-plot=0"
    | perl -p -e "s/qvalue/old_qvalue/"
    | python %(scriptsdir)s/table2table.py
    --log=%(outfile)s.log
    --method=fdr
    --column=pvalue
    --fdr-method=BH
    --fdr-add-column=qvalue
    | gzip
    > %(outfile)s '''

    E.info(statement)

    P.run()
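
# ---------------------------------------------------------------------
# Illustrative sketch (an assumption, not the pipeline's own code): the
# final table2table.py step above recomputes q-values over all windows
# with the Benjamini-Hochberg procedure; the same adjustment could be
# expressed directly in numpy as below.
# ---------------------------------------------------------------------
import numpy as np


def bh_qvalues(pvalues):
    """Benjamini-Hochberg adjusted p-values (q-values)."""
    p = np.asarray(pvalues, dtype=float)
    n = len(p)
    order = np.argsort(p)
    scaled = p[order] * n / (np.arange(n) + 1)
    # enforce monotonicity from the largest p-value downwards
    q = np.minimum.accumulate(scaled[::-1])[::-1]
    out = np.empty(n)
    out[order] = np.minimum(q, 1.0)
    return out
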
def peekParameters(workingdir,
                   pipeline,
                   on_error_raise=None,
                   prefix=None,
                   update_interface=False,
                   restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in `workingdir`, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in.
    pipeline : string
       Name of the pipeline script. The pipeline is assumed to live
       in the same directory as the current pipeline.
    on_error_raise : Bool
       If set to a boolean, an error will be raised (or not) if there
       is an error during parameter peeking, for example if
       `workingdir` can not be found. If `on_error_raise` is None, it
       will be set to the default, which is to raise an exception
       unless the calling script is imported or the option
       ``--is-test`` has been passed at the command line.
    prefix : string
       Add a prefix to all parameters. This is useful if the
       parameters are added to the configuration dictionary of the
       calling pipeline.
    update_interface : bool
       If True, this method will prefix any options in the
       ``[interface]`` section with `workingdir`. This allows
       transparent access to files in the external pipeline.
    restrict_interface : bool
       If True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.

    '''
    caller_locals = getCallerLocals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not isTest() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    # Attempt to locate directory with pipeline source code. This is a
    # patch as pipelines might be called within the repository
    # directory or from an installed location
    dirname = PARAMS["pipelinedir"]

    # called without a directory, use current directory
    if dirname == "":
        dirname = os.path.abspath(".")
    else:
        # if not exists, assume we want the version located
        # in the directory of the calling script.
        if not os.path.exists(dirname):
            # directory is path of calling script
            dirname = os.path.dirname(caller_locals['__file__'])

    pipeline = os.path.join(dirname, pipeline)
    if not os.path.exists(pipeline):
        if on_error_raise:
            raise ValueError("can't find pipeline at %s" % (pipeline))
        else:
            return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default pipeline directory
    # if no directory has been specified (working dir is set to "?!")
    if ("config" in sys.argv or "check" in sys.argv or "clone" in sys.argv) \
            and workingdir == "?!":
        workingdir = os.path.join(PARAMS.get("pipelinedir"),
                                  IOTools.snip(pipeline, ".py"))

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError("can't find working dir %s" % workingdir)
        else:
            return {}

    statement = "python %s -f -v 0 dump" % pipeline
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    # process.stdin.close()
    stdout, stderr = process.communicate()

    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") %
            (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()

    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]

    if len(stdout) > 1:
        raise ValueError("received multiple configurations")

    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                dump[key] = os.path.join(workingdir, value)

    # keep only interface if so required
    if restrict_interface:
        dump = dict([(k, v) for k, v in dump.items()
                     if k.startswith("interface")])

    # prefix all parameters
    if prefix is not None:
        dump = dict([("%s%s" % (prefix, x), y)
                     for x, y in list(dump.items())])

    return dump
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b", "--bam-file", dest="bam_file", type="string",
                      help="supply input bam file name")

    parser.add_option("-g", "--gtf-file", dest="gtf_file", type="string",
                      help="supply input gtf file name")

    parser.add_option("-o", "--outfile", dest="outfile", type="string",
                      help="supply output file name")

    parser.add_option(
        "-G", "--reference-gtf-file", dest="reference_gtf", type="string",
        help="supply reference gtf for context of reads not contributing "
        "to transcripts")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ######################################################
    # for all alignments
    ######################################################

    # open outfile and prepare headers
    outf = open(options.outfile, "w")
    outf.write("\t".join(["total alignments",
                          "alignments in transcripts",
                          "percent alignments in transcripts",
                          "total spliced alignments",
                          "spliced alignments in transcripts",
                          "percent spliced alignments in transcripts"]) + "\n")

    # calculate coverage over transcript file - NB split reads contribute
    # twice to the transcript
    # use BedTool object
    pybedbamfile = pybedtools.BedTool(options.bam_file)

    # count alignments
    E.info("counting total number of alignments and spliced alignments")
    total_alignments = 0
    spliced_alignments = 0

    for alignment in pybedbamfile:
        cigar = alignment[5]
        if cigar.find("N") != -1:  # N signifies split read
            total_alignments += 1
            spliced_alignments += 1
        else:
            total_alignments += 1

    # merge the gtf file to avoid double counting of exons in different
    # transcripts - converts to a bed file
    gtffile = pybedtools.BedTool(options.gtf_file).merge()

    E.info("computing coverage of alignments in %s over intervals in %s" %
           (options.bam_file, options.gtf_file))
    cover = pybedbamfile.coverage(gtffile)

    # make sure that the exons aren't being counted twice - shouldn't be
    # because of merge
    E.info("counting reads contributing to transcripts")
    c = 0
    for entry in cover:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    # sum the coverage across exons from all transcripts
    coverage_in_transcripts = c

    ######################################################
    # for spliced alignments
    ######################################################

    # count total number of spliced alignments
    # requires that the CIGAR string 'N' is present
    # uses pysam to write out a bam file of the spliced reads only
    allreads = pysam.Samfile(options.bam_file)
    spliced_bamname = IOTools.snip(options.bam_file, ".bam") + \
        "_spliced_reads.bam"

    # open file for outputting spliced alignments
    splicedreads = pysam.Samfile(spliced_bamname, "wb", template=allreads)

    # cigar string in pysam for spliced alignment is (3, int)
    spliced = collections.defaultdict(list)
    for read in allreads:
        for cigar_tag in read.cigar:
            if cigar_tag[0] == 3:
                spliced[read].append(cigar_tag)

    # write out spliced alignments
    for read in spliced.keys():
        splicedreads.write(read)
    splicedreads.close()
    allreads.close()

    # sort and index spliced reads bam file
    pysam.sort(spliced_bamname, P.snip(spliced_bamname, ".bam"))
    pysam.index(spliced_bamname)

    # read in the spliced reads as a BedTool object
    splicedbam = pybedtools.BedTool(spliced_bamname)

    # perform coverage of spliced reads over intervals - will be twice
    # as many as there should be due to counting both overlapping exons
    spliced_coverage = splicedbam.coverage(gtffile)

    # avoid double counting exons
    E.info("counting spliced reads contributing to transcripts")
    spliced_exons = {}
    c = 0
    for entry in spliced_coverage:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage
    spliced_coverage_in_transcripts = c

    # NOTE: the counting of spliced alignments is not accurate
    spliced_coverage_in_transcripts = float(
        spliced_coverage_in_transcripts) / 2

    ###########################
    # write out the results
    ###########################

    outf.write(str(int(total_alignments)) + "\t")

    # remove half of the coverage assigned to spliced reads
    coverage_in_transcripts = coverage_in_transcripts - \
        spliced_coverage_in_transcripts
    outf.write(str(int(coverage_in_transcripts) -
                   int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((coverage_in_transcripts / total_alignments) * 100)) + "\t")

    # write out spliced counts
    outf.write(str(int(spliced_alignments)) + "\t")
    outf.write(str(int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((spliced_coverage_in_transcripts / spliced_alignments) * 100)))

    outf.close()

    ############################
    # contextualise those that
    # don't fall in transcripts
    ############################

    if options.reference_gtf:
        context_summary = open(
            IOTools.snip(options.bam_file, ".bam") + ".excluded.context", "w")
        context_summary.write("\t".join(["Feature", "number"]) + "\n")

        # write out the read info as well
        context_file = open(
            IOTools.snip(options.bam_file, ".bam") + ".excluded", "w")

        context_dict = collections.defaultdict(int)

        # intersect bam - write non-overlapping with transcripts - intersect
        # with reference - write out
        context = pybedbamfile.intersect(gtffile, v=True, bed=True).intersect(
            pybedtools.BedTool(options.reference_gtf), wb=True)
        for entry in context:
            feature = entry[8]
            context_dict[feature] += 1
            context_file.write("\t".join([e for e in entry]) + "\n")

        for feature, value in context_dict.iteritems():
            context_summary.write("\t".join([feature, str(value)]) + "\n")

        context_file.close()
        context_summary.close()

    # write footer and output benchmark information.
    E.Stop()
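
# ---------------------------------------------------------------------
# Hedged note, not part of the original script: pysam.sort() above uses
# the old output-prefix calling convention. With recent pysam releases
# the wrapper takes samtools-style arguments instead; a sketch:
# ---------------------------------------------------------------------
import pysam


def sort_and_index(bam_path, sorted_path):
    """Coordinate-sort and index a BAM with the samtools-1.x style API."""
    pysam.sort("-o", sorted_path, bam_path)
    pysam.index(sorted_path)
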
def loadCuffdiff(infile, outfile, min_fpkm=1.0):
    '''load results from differential expression analysis and produce
    summary plots.

    Note: converts from ln(fold change) to log2 fold change.

    The cuffdiff output is parsed.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    fpkm_silent) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    This requires the cummeRbund library to be present in R.
    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued

    dbhandle = sqlite3.connect(PARAMS["database"])

    tmpname = P.getTempFilename(".")

    # ignore promoters and splicing - no fold change column, but sqrt(JS)
    for fn, level in (("cds_exp.diff.gz", "cds"),
                      ("gene_exp.diff.gz", "gene"),
                      ("isoform_exp.diff.gz", "isoform"),
                      # ("promoters.diff.gz", "promotor"),
                      # ("splicing.diff.gz", "splice"),
                      ("tss_group_exp.diff.gz", "tss")):
        tablename = prefix + "_" + level + "_diff"

        infile = os.path.join(indir, fn)
        results = parseCuffdiff(infile, min_fpkm=min_fpkm)

        Expression.writeExpressionResults(tmpname, results)

        statement = '''cat %(tmpname)s
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
        --allow-empty-file
        --add-index=treatment_name
        --add-index=control_name
        --add-index=test_id
        --table=%(tablename)s
        >> %(outfile)s.log
        '''
        P.run()

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):
        tablename = prefix + "_" + level + "_levels"

        statement = '''zcat %(indir)s/%(fn)s
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
        --allow-empty-file
        --add-index=tracking_id
        --table=%(tablename)s
        >> %(outfile)s.log
        '''
        P.run()

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    inf.readline()
    sample_lookup = {}

    for line in inf:
        line = line.split("\t")
        our_sample_name = IOTools.snip(line[0])
        our_sample_name = re.sub("-", "_", our_sample_name)
        cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
        sample_lookup[cuffdiff_sample_name] = our_sample_name

    inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "sample_fpkms"

        tmpf = P.getTempFilename(".")
        inf = IOTools.openFile(os.path.join(indir, fn)).readlines()
        outf = IOTools.openFile(tmpf, "w")

        samples = []
        genes = {}

        x = 0
        for line in inf:
            if x == 0:
                x += 1
                continue

            line = line.split()
            gene_id = line[0]
            condition = line[1]
            replicate = line[2]
            fpkm = line[6]
            status = line[8]

            sample_id = condition + "_" + replicate

            if sample_id not in samples:
                samples.append(sample_id)

            # IMS: The following block keeps getting its indenting messed
            # up. It is not part of the 'if sample_id not in samples' block;
            # please make sure it does not get made part of it
            if gene_id not in genes:
                genes[gene_id] = {}
                genes[gene_id][sample_id] = fpkm
            else:
                if sample_id in genes[gene_id]:
                    raise ValueError(
                        'sample_id %s appears twice in file for gene_id %s'
                        % (sample_id, gene_id))
                else:
                    if status != "OK":
                        genes[gene_id][sample_id] = status
                    else:
                        genes[gene_id][sample_id] = fpkm

        samples = sorted(samples)

        # IMS - CDS files might be empty if no cds has been calculated
        # for the genes. In the long term we need to add CDS annotation
        # to denovo predicted genesets; in the meantime just skip if the
        # cds tracking file is empty
        if len(samples) == 0:
            continue

        headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples])
        outf.write(headers + "\n")

        for gene in genes.iterkeys():
            outf.write(gene + "\t")
            x = 0
            while x < len(samples) - 1:
                outf.write(genes[gene][samples[x]] + "\t")
                x += 1
            # IMS: Please be careful with this line. It keeps getting moved
            # into the above while block where it does not belong
            outf.write(genes[gene][samples[len(samples) - 1]] + "\n")

        outf.close()

        statement = ("cat %(tmpf)s |"
                     " python %(scriptsdir)s/csv2db.py "
                     " %(csv2db_options)s"
                     " --allow-empty-file"
                     " --add-index=gene_id"
                     " --table=%(tablename)s"
                     " >> %(outfile)s.log")
        P.run()

        os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
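
# ---------------------------------------------------------------------
# Small illustration (an assumption, not taken from the source) of the
# ln -> log2 conversion mentioned in the docstring above: cuffdiff
# reports natural-log fold changes, and log2(fc) = ln(fc) / ln(2).
# ---------------------------------------------------------------------
import math


def ln_to_log2(ln_fold_change):
    """Convert a natural-log fold change to a log2 fold change."""
    return ln_fold_change / math.log(2)

# e.g. ln_to_log2(1.386) is roughly 2.0, i.e. a four-fold change
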