def runControlCPC(infile, outfile):

    # farm.py is called from within cpc.sh
    assert iotools.which("farm.py"), \
        "farm.py needs to be in $PATH for cpc to run"

    # Default cpc parameters don't work with later versions of blast
    E.info("Running cpc with blast version: %s" % iotools.which("blastx"))

    result_evidence = P.snip(outfile, ".result") + ".evidence"
    working_dir = "lncRNA_control/cpc"
    statement = ("%(pipeline_scriptsdir)s/cpc.sh"
                 " %(infile)s"
                 " %(outfile)s"
                 " %(working_dir)s"
                 " %(result_evidence)s")
    P.run()
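
# For reference, once P.run() has interpolated the pipeline parameters, the
# statement above resolves to a shell command along these lines (file names
# are illustrative; the positional argument order is fixed by cpc.sh):
#
#   <pipeline_scriptsdir>/cpc.sh transcripts.fa transcripts.result \
#       lncRNA_control/cpc transcripts.evidence
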
def check_executables(filenames):
    """check for the presence/absence of executables"""

    missing = []

    for filename in filenames:
        if not iotools.which(filename):
            missing.append(filename)

    if missing:
        raise ValueError("missing executables: %s" % ",".join(missing))
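
# A minimal usage sketch for check_executables; the tool names below are
# illustrative stand-ins for whatever external dependencies a pipeline has.
def _example_check_executables():
    try:
        check_executables(["samtools", "bedtools", "wigToBigWig"])
    except ValueError as msg:
        # a single message names every tool that iotools.which() could not
        # resolve on $PATH, e.g. "missing executables: samtools,wigToBigWig",
        # so a pipeline can fail early instead of midway through a run
        E.warn(str(msg))
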
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run cgatreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.
    '''

    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        ruffus.pipeline_printout_graph(
            os.path.join(
                targetdir,
                "pipeline.%s" % pipeline_status_format),
            pipeline_status_format,
            ["full"],
            checksum_level=params["ruffus_checksums_level"])

    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    docdir = os.path.join(dirname, "pipeline_docs",
                          iotools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = iotools.which("xvfb-run")

    # permit multiple servers using -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but
    # exits with error when killing process. Thus, ignore return
    # value.
    # print os.getenv("DISPLAY"), "command=", xvfb_command
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these.
    erase_return = "|| true"

    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
    export SPHINX_THEMEDIR=%(themedir)s;
    export PYTHONPATH=%(syspath)s;
    %(xvfb_command)s
    %(report_engine)s-build
    --num-jobs=%(report_threads)s
    sphinx-build
    -b html
    -d %(report_doctrees)s
    -c %(conf_dir)s
    -j %(report_threads)s
    %(docdir)s %(report_html)s
    >& report.log %(erase_return)s )
    '''

    P.run(statement)

    E.info('the report is available at %s' % os.path.abspath(
        os.path.join(params['report_html'], "contents.html")))
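
# For reference, with report_engine == "cgatreport", clean == True and an
# available xvfb-run, the statement above resolves to a shell command along
# these lines (paths, thread counts and the PYTHONPATH come from the
# pipeline configuration and sys.path; the values here are illustrative):
#
#   rm -rf report _cache _static;
#   (export SPHINX_DOCSDIR=/path/to/pipeline_docs/my_pipeline;
#    export SPHINX_THEMEDIR=/path/to/pipeline_docs/themes;
#    export PYTHONPATH=...;
#    xvfb-run -d cgatreport-build --num-jobs=4 sphinx-build -b html
#        -d doctrees -c . -j 4 /path/to/docdir report/html
#        >& report.log || true)
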
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument(
        "--input-filename-fasta", dest="input_filename_fasta", type=str,
        help="filename with reference sequence in fasta format")

    parser.add_argument(
        "--counting-mode", dest="counting_mode", type=str,
        choices=("all", "pileup_defaults"),
        help="counting mode. all=all reads/bases. pileup_defaults="
        "use default pileup thresholds. Options will be added to "
        "--mpileup-options.")

    parser.add_argument(
        "--mpileup-options", dest="mpileup_options", type=str,
        help="pileup options to use")

    parser.set_defaults(
        mpileup_options="",
        counting_mode="all",
        input_filename_fasta=None,
        report_step=1000000,
    )

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv, add_output_options=True)

    bamfile = args[0]

    mpileup_options = args.mpileup_options

    if args.counting_mode == "all":
        mpileup_options += " -Q 0 -B -A"

    read_depth_histogram = collections.defaultdict(int)
    base_depth_histogram = collections.defaultdict(int)

    # deletions are marked by something like -2AA at the first
    # position and a '*' for subsequent positions
    rx_deletions = re.compile("([-][0-9]+|[*])")
    report_step = args.report_step
    npositions = 0

    samtools = iotools.which("samtools")

    statement = (
        "{samtools} mpileup "
        "-f {reference_fasta} "
        "{mpileup_options} "
        "{bamfile} ".format(
            samtools=samtools,
            reference_fasta=args.input_filename_fasta,
            mpileup_options=mpileup_options,
            bamfile=os.path.abspath(bamfile)))

    E.info("running the following statement: {}".format(statement))
    cmd_args = shlex.split(statement)
    proc = subprocess.Popen(
        cmd_args,
        shell=False,
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE,
        cwd=os.path.abspath(os.curdir))

    for line in proc.stdout:
        line = line.decode("utf-8")
        (contig, pos, base, read_depth,
         info, qualities) = line[:-1].split("\t")
        read_depth = int(read_depth)
        pos = int(pos)
        if pos % report_step == 0:
            E.info("working on {}: {}".format(contig, pos))
        ndeletions = len(rx_deletions.findall(info))
        base_depth = read_depth - ndeletions
        read_depth_histogram[read_depth] += 1
        base_depth_histogram[base_depth] += 1

    for line in proc.stderr:
        E.warn(line)

    keys = sorted(set(read_depth_histogram.keys()).union(
        base_depth_histogram.keys()))

    args.stdout.write("depth\tread_depth_positions\tbase_depth_positions\n")
    for key in keys:
        args.stdout.write("{}\t{}\t{}\n".format(
            key,
            read_depth_histogram[key],
            base_depth_histogram[key]))

    E.info("positions tested: {}".format(sum(read_depth_histogram.values())))
    E.stop()
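
# Minimal sketch of the deletion bookkeeping above, using a made-up pileup
# string: '-2AA' flags the start of a two-base deletion on one read and '*'
# marks a deleted base at a later position, so both are subtracted from the
# read depth to obtain the base depth.
def _example_deletion_counting():
    import re
    rx_deletions = re.compile("([-][0-9]+|[*])")
    info = ".,.-2AA.,*"          # 5 reads; findall() -> ["-2", "*"]
    read_depth = 5
    base_depth = read_depth - len(rx_deletions.findall(info))
    assert base_depth == 3
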
def main(argv=None):
    """script main.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed",
                               "bigwig", "bed"),
                      help="output format [default=%default]")

    parser.add_option("-s", "--shift-size", dest="shift", type="int",
                      help="shift reads by a certain amount (ChIP-Seq) "
                      "[%default]")

    parser.add_option("-e", "--extend", dest="extend", type="int",
                      help="extend reads by a certain amount "
                      "(ChIP-Seq) [%default]")

    parser.add_option("-p", "--wiggle-span", dest="span", type="int",
                      help="span of a window in wiggle tracks "
                      "[%default]")

    parser.add_option("-m", "--merge-pairs", dest="merge_pairs",
                      action="store_true",
                      help="merge paired-end reads into a single "
                      "bed interval [default=%default].")

    parser.add_option("--scale-base", dest="scale_base", type="float",
                      help="number of reads/pairs to scale bigwig file to. "
                      "The default is to scale to 1M reads "
                      "[default=%default]")

    parser.add_option("--scale-method", dest="scale_method", type="choice",
                      choices=("none", "reads",),
                      help="scale bigwig output. 'reads' will normalize by "
                      "the total number of reads in the bam file that are "
                      "used to construct the bigwig file. If --merge-pairs "
                      "is used, the number of pairs output will be used for "
                      "normalization. 'none' will not scale the bigwig file "
                      "[default=%default]")

    parser.add_option("--max-insert-size", dest="max_insert_size",
                      type="int",
                      help="only merge if insert size is less than "
                      "# bases. 0 turns off this filter "
                      "[default=%default].")

    parser.add_option("--min-insert-size", dest="min_insert_size",
                      type="int",
                      help="only merge paired-end reads if they are "
                      "at least # bases apart. "
                      "0 turns off this filter. [default=%default]")

    parser.set_defaults(
        samfile=None,
        output_format="wiggle",
        shift=0,
        extend=0,
        span=1,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
        scale_method='none',
        scale_base=1000000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) >= 1:
        options.samfile = args[0]
    if len(args) == 2:
        options.output_filename_pattern = args[1]
    if not options.samfile:
        raise ValueError("please provide a bam file")

    # Read BAM file using Pysam
    samfile = pysam.AlignmentFile(options.samfile, "rb")

    # Create temporary files / folders
    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)
    tmpfile_wig = os.path.join(tmpdir, "wig")
    tmpfile_sizes = os.path.join(tmpdir, "sizes")

    # Create dictionary of contig sizes
    contig_sizes = dict(list(zip(samfile.references, samfile.lengths)))
    # write contig sizes
    outfile_size = iotools.open_file(tmpfile_sizes, "w")
    for contig, size in sorted(contig_sizes.items()):
        outfile_size.write("%s\t%s\n" % (contig, size))
    outfile_size.close()

    # Shift and extend only available for bigwig format
    if options.shift or options.extend:
        if options.output_format != "bigwig":
            raise ValueError(
                "shift and extend only available for bigwig output")

    # Output filename required for bigwig / bigbed computation
    if options.output_format == "bigwig":
        if not options.output_filename_pattern:
            raise ValueError(
                "please specify an output file for bigwig computation.")

        # Define executable to use for binary conversion
        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        # check required executable file is in the path
        executable = iotools.which(executable_name)
        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        # Open output file
        outfile = iotools.open_file(tmpfile_wig, "w")
        E.info("starting output to %s" % tmpfile_wig)
    else:
        outfile = iotools.open_file(tmpfile_wig, "w")
        E.info("starting output to stdout")

    # Set up output write functions
    if options.output_format in ("wiggle", "bigwig"):
        # wiggle is one-based, so add 1, also step-size is 1, so need
        # to output all bases
        if options.span == 1:
            outf = lambda outfile, contig, start, end, val: \
                outfile.write(
                    "".join(["%i\t%i\n" % (x, val)
                             for x in range(start + 1, end + 1)]))
        else:
            outf = SpanWriter(options.span)
    elif options.output_format == "bedgraph":
        # bed is 0-based, open-closed
        outf = lambda outfile, contig, start, end, val: \
            outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val))

    # initialise counters
    ninput, nskipped, ncontigs = 0, 0, 0

    # set output file name
    output_filename_pattern = options.output_filename_pattern
    if output_filename_pattern:
        output_filename = os.path.abspath(output_filename_pattern)

    # shift and extend or merge pairs. Output temporary bed file
    if options.shift > 0 or options.extend > 0 or options.merge_pairs:
        # Workflow 1: convert to bed intervals and use bedtools
        # genomecov to build a coverage file.
        # Convert to bigwig with the UCSC tool bedGraphToBigWig
        if options.merge_pairs:
            # merge pairs using bam2bed
            E.info("merging pairs to temporary file")
            counter = merge_pairs(
                samfile,
                outfile,
                min_insert_size=options.min_insert_size,
                max_insert_size=options.max_insert_size,
                bed_format=3)
            E.info("merging results: {}".format(counter))
            if counter.output == 0:
                raise ValueError("no pairs output after merging")
        else:
            # create bed file with shifted/extended tags
            shift, extend = options.shift, options.extend
            shift_extend = shift + extend
            counter = E.Counter()

            for contig in samfile.references:
                E.debug("output for %s" % contig)
                lcontig = contig_sizes[contig]

                for read in samfile.fetch(contig):
                    pos = read.pos
                    if read.is_reverse:
                        start = max(0, read.pos + read.alen - shift_extend)
                    else:
                        start = max(0, read.pos + shift)

                    # intervals extending beyond contig are removed
                    if start >= lcontig:
                        continue

                    end = min(lcontig, start + extend)
                    outfile.write("%s\t%i\t%i\n" % (contig, start, end))
                    counter.output += 1

        outfile.close()

        if options.scale_method == "reads":
            scale_factor = float(options.scale_base) / counter.output
            E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" %
                   (options.scale_method,
                    counter.output,
                    scale_factor))
            scale = "-scale %f" % scale_factor
        else:
            scale = ""

        # Convert bed file to coverage file (bedgraph)
        tmpfile_bed = os.path.join(tmpdir, "bed")
        E.info("computing coverage")
        # calculate coverage - format is bedgraph
        statement = """bedtools genomecov -bg -i %(tmpfile_wig)s %(scale)s
        -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals()
        E.run(statement)

        # Convert bedgraph to bigwig
        E.info("converting to bigwig")
        tmpfile_sorted = os.path.join(tmpdir, "sorted")
        statement = ("sort -k 1,1 -k2,2n %(tmpfile_bed)s "
                     "> %(tmpfile_sorted)s;"
                     "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s "
                     "%(output_filename_pattern)s" % locals())
        E.run(statement)

    else:

        # Workflow 2: use pysam column iterator to build a
        # wig file. Then convert to bigwig or bedgraph file
        # with UCSC tools.
        def column_iter(iterator):
            start = None
            end = 0
            n = None
            for t in iterator:
                if t.pos - end > 1 or n != t.n:
                    if start is not None:
                        yield start, end, n
                    start = t.pos
                    end = t.pos
                    n = t.n
                end = t.pos
            yield start, end, n

        if options.scale_method != "none":
            raise NotImplementedError(
                "scaling not implemented for pileup method")

        # Bedgraph track definition
        if options.output_format == "bedgraph":
            outfile.write("track type=bedGraph\n")

        for contig in samfile.references:
            # if contig != "chrX": continue
            E.debug("output for %s" % contig)
            lcontig = contig_sizes[contig]

            # Write wiggle header
            if options.output_format in ("wiggle", "bigwig"):
                outfile.write("variableStep chrom=%s span=%i\n" %
                              (contig, options.span))

            # Generate pileup per contig using pysam and iterate over columns
            for start, end, val in column_iter(samfile.pileup(contig)):
                # patch: there was a problem with bam files and reads
                # overextending at the end. These are usually Ns, but
                # need to check as otherwise wigToBigWig fails.
                if lcontig <= end:
                    E.warn("read extending beyond contig: %s: %i > %i" %
                           (contig, end, lcontig))
                    end = lcontig
                    if start >= end:
                        continue

                if val > 0:
                    outf(outfile, contig, start, end, val)

            ncontigs += 1

        # Close output file; use isinstance rather than comparing
        # type(outf) to type(SpanWriter), which always compared against
        # the metaclass and never matched
        if isinstance(outf, SpanWriter):
            outf.flush(outfile)
        else:
            outfile.flush()

        E.info("finished output")

    # Report counters
    E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
           (ninput, ncontigs, nskipped))

    # Convert to binary formats
    if options.output_format == "bigwig":
        outfile.close()

        E.info("starting %s conversion" % executable)
        try:
            retcode = subprocess.call(
                " ".join((executable,
                          tmpfile_wig,
                          tmpfile_sizes,
                          output_filename_pattern)),
                shell=True)
            if retcode != 0:
                E.warn("%s terminated with signal: %i" %
                       (executable, -retcode))
                return -retcode
        except OSError as msg:
            E.warn("Error while executing bigwig: %s" % msg)
            return 1

        E.info("finished bigwig conversion")
    else:
        with open(tmpfile_wig) as inf:
            sys.stdout.write(inf.read())

    # Cleanup temp files
    shutil.rmtree(tmpdir)

    E.stop()
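
# Sketch of the run-length compression performed by the nested column_iter
# above, using stand-in tuples for pysam pileup columns (only .pos and .n
# are accessed). column_iter is local to main(), so its logic is repeated
# here for illustration.
def _example_column_iter():
    import collections
    Column = collections.namedtuple("Column", "pos n")

    def column_iter(iterator):
        start, end, n = None, 0, None
        for t in iterator:
            if t.pos - end > 1 or n != t.n:
                if start is not None:
                    yield start, end, n
                start = t.pos
                end = t.pos
                n = t.n
            end = t.pos
        yield start, end, n

    # depth 2 at positions 10-11, a gap, then depth 1 at positions 15-16:
    # adjacent columns with equal depth collapse into one interval
    columns = [Column(10, 2), Column(11, 2), Column(15, 1), Column(16, 1)]
    assert list(column_iter(columns)) == [(10, 11, 2), (15, 16, 1)]
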
def getRunStatement(self, infile, outfile, controlfile):
    """
    Generate a specific run statement for each peakcaller class
    """

    # select location of the spp script to run
    if self.PARAMS_PEAKCALLER["spp_options_idr_script"] == "default":
        executable = iotools.which("run_spp.R")
    elif self.PARAMS_PEAKCALLER["spp_options_idr_script"] == "nodups":
        executable = iotools.which("run_spp_nodups.R")
    else:
        executable = self.PARAMS_PEAKCALLER["spp_options_idr_script"]

    # os.path.exists() returns False rather than raising, so test the
    # result directly instead of wrapping it in try/except
    if not executable or not os.path.exists(executable):
        raise IOError("SPP script not found: %s" % executable)

    # select the threshold for lax peak calling
    if self.PARAMS_PEAKCALLER["spp_options_npeaks"]:
        if self.PARAMS_PEAKCALLER["spp_options_fdr"]:
            raise Exception("Values specified for both SPP options"
                            " -npeaks and -fdr; please select one"
                            " option or the other, but not both")
        else:
            threshold = "-npeaks=" + \
                str(self.PARAMS_PEAKCALLER["spp_options_npeaks"])
    elif self.PARAMS_PEAKCALLER["spp_options_fdr"]:
        threshold = "-fdr=" + \
            str(self.PARAMS_PEAKCALLER["spp_options_fdr"])
    else:
        raise Exception("Must specify a value for one of"
                        " spp_options_npeaks or spp_options_fdr")

    # build run statement for spp.
    # -savn is output.npeak.file (passed as NULL,
    #  means filename based on infile)
    # -out is output.result.file
    # -odir defaults to os.path.dirname(infile)
    # -savn is save narrowpeak file
    # -savr is save regionpeak file
    # (run_spp.R script throws an error if region peak is not output).
    statement = [("Rscript %(executable)s"
                  " -c=%(infile)s"
                  " -i=%(controlfile)s"
                  " %(threshold)s"
                  " -savn"
                  " -savr")]

    # add additional options
    statement.append(self.PARAMS_PEAKCALLER["spp_options_parameters"])

    # specify outfile
    # MM: this was hard-coded to a non-existent directory
    # changed to stats directory
    statement.append(" -rf"
                     " -out=./stats/phantomPeakStatsReps.tab"
                     " >& %(outfile)s")

    statement = (" ".join(statement) % locals())

    return statement
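
# For reference, with spp_options_idr_script == "default", an npeaks
# threshold of 300000 and no extra parameters, the returned statement reads
# roughly as follows (paths and file names are illustrative):
#
#   Rscript /path/to/run_spp.R -c=ChIP.bam -i=Input.bam -npeaks=300000
#       -savn -savr  -rf -out=./stats/phantomPeakStatsReps.tab >& ChIP.spp
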