def main():
    cmnd = get_args()

    ## resolve projects
    project = dxencode.resolve_project(PROJECT_NAME)
    print 'Project: ' + project.describe()['name']
    pid = project.get_id()

    counts = {}
    n = 0
    summaries = dxpy.find_data_objects(classname='file', folder='/runs',
                                       name='*_summary.txt', recurse=True,
                                       name_mode='glob', project=pid,
                                       return_handler=False)
    while summaries:
        try:
            flink = dxpy.dxlink(summaries.next())
            n = n + 1
        except StopIteration:
            break

        fd = dxpy.describe(flink)
        fn = "fastqc/%s" % fd['name']
        if not os.path.isfile(fn):
            print 'Downloading: %s from %s' % (fn, fd['folder'])
            try:
                dxpy.download_dxfile(flink, fn)
            except Exception, e:
                print "Error %s" % e
        parse_summary(fn, counts)
def main(outfn, assembly, debug, key, keyfile, dryrun, force, analysis_ids=None, infile=None, project=None): if debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) if infile is not None: infile = dxpy.DXFile(infile) dxpy.download_dxfile(infile.get_id(), "infile") ids = open("infile",'r') elif analysis_ids is not None: ids = analysis_ids else: logger.error("Must supply one of --infile or a list of one or more analysis-ids") return authid, authpw, server = common.processkey(key, keyfile) keypair = (authid,authpw) for (i, analysis_id) in enumerate(ids): logger.info('%s' %(analysis_id)) accessioned_files = accession_analysis(analysis_id, keypair, server, assembly, dryrun, force) print accessioned_files common.touch(outfn) outfile = dxpy.upload_local_file(outfn) output = {} output["outfile"] = dxpy.dxlink(outfile) return output
def main(quants_a, quants_b): # tool_versions.py --applet $script_name --appver $script_ver sw_versions = subprocess.check_output(['tool_versions.py', '-a', APP_SCRIPT, '-av', APP_VER]) dxfile_a = dxpy.DXFile(quants_a) dxfile_b = dxpy.DXFile(quants_b) print "* Downloading files..." dxpy.download_dxfile(dxfile_a.get_id(), "quants_a") dxpy.download_dxfile(dxfile_b.get_id(), "quants_b") print "* Runnning MAD.R..." mad_output = subprocess.check_output(['Rscript', '/usr/bin/MAD.R', 'quants_a', 'quants_b']) quants_a_name = dxfile_a.name.split('.') quants_b_name = dxfile_b.name.split('.') filename = quants_a_name[0] + '_' + quants_b_name[0] + '_' + quants_a_name[1] + '_mad_plot.png' subprocess.check_call(['mv', "MAplot.png", filename]) print "* package properties..." qc_metrics = {} qc_metrics["MAD.R"] = json.loads(mad_output) meta_string = json.dumps(qc_metrics) print json.dumps(qc_metrics,indent=4) props = {} props["SW"] = sw_versions print "* Upload Plot..." plot_dxfile = dxpy.upload_local_file(filename,properties=props,details=qc_metrics) return { "metadata": meta_string, "mad_plot": plot_dxfile }
def main(BAMs, params='USE_THREADING=true SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT'):
    inputFiles = []
    for i in range(len(BAMs)):
        fh = dxpy.DXFile(BAMs[i])
        dxpy.download_dxfile(fh.get_id(), "input%d.bam" % (i))

    name = dxpy.DXFile(BAMs[0]).describe()['name'].rstrip(".bam")

    # Fill in your application code here.
    command = "java -Xmx4g -jar /opt/jar/MergeSamFiles.jar OUTPUT=%s.bam %s" % (name, params)
    for i in range(len(BAMs)):
        command += " INPUT=input%d.bam" % (i)
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    BAM = dxpy.upload_local_file("%s.bam" % name)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["BAM"] = dxpy.dxlink(BAM)

    return output
def main(inputs, prefix=None):
    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)

    # uses last extension - presumably they are all the same
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    if prefix:
        pooled_filename = prefix + "_pooled%s.gz" % (extension)
    else:
        pooled_filename = \
            '-'.join([splitext(splitext(fn)[0])[0] for fn in input_filenames]) + \
            "_pooled%s.gz" % (extension)

    out, err = common.run_pipe([
        'gzip -dc %s' % (' '.join(input_filenames)),
        'gzip -cn'],
        outfile=pooled_filename)

    pooled = dxpy.upload_local_file(pooled_filename)

    output = {
        "pooled": dxpy.dxlink(pooled)
    }

    return output
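# The pooling step above relies on a run_pipe-style helper to chain shell
# stages without writing intermediate files. The sketch below is only an
# illustrative approximation of such a helper (the real common.run_pipe may
# differ in error handling and return values); it assumes each element of
# `steps` is a shell command string and that `outfile`, when given, captures
# the final stage's stdout.
import subprocess

def run_pipe_sketch(steps, outfile=None):
    procs = []
    prev = None
    fh = open(outfile, 'wb') if outfile else None
    for i, step in enumerate(steps):
        last = (i == len(steps) - 1)
        stdout = fh if (last and fh) else subprocess.PIPE
        p = subprocess.Popen(step, shell=True, stdin=prev, stdout=stdout)
        if prev is not None:
            prev.close()  # close the parent's copy so SIGPIPE reaches upstream stages
        prev = p.stdout
        procs.append(p)
    out, err = procs[-1].communicate()
    if fh:
        fh.close()
    return out, err

# Hypothetical usage, mirroring the call above:
# out, err = run_pipe_sketch(['gzip -dc a.gz b.gz', 'gzip -cn'], outfile='pooled.gz')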
def main(input_file):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    input_file = dxpy.DXFile(input_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(input_file.get_id(), "input_file")

    # Fill in your application code here.
    subprocess.check_call("fastq_quality_trimmer -t 20 -Q 33 -i input_file -o output_file", shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    output_file = dxpy.upload_local_file("output_file")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["output_file"] = dxpy.dxlink(output_file)

    return output
def unpack_tarball(input_tarball):
    """
    Unpacks the tarball with the specified file ID and returns a string
    containing the directory where it was unpacked.
    """
    tempdir = tempfile.mkdtemp()
    print "Working in " + tempdir
    tarball_filename = os.path.join(tempdir, "input.tar.gz")
    dxpy.download_dxfile(input_tarball, tarball_filename)
    checkout_dir = os.path.join(tempdir, "unpackdest")
    os.mkdir(checkout_dir)
    subprocess.check_call(['tar', '-xzf', tarball_filename, '-C', checkout_dir, '--warning=no-timestamp'])
    # TODO: instead of guessing the directory name to be a name that
    # generates no warnings, have the client send the directory name
    # that was used on its end.
    try:
        appname = json.load(open(os.path.join(tempdir, "unpackdest", "dxapp.json"))).get("name", "unpackdest")
        if appname != "unpackdest":
            os.rename(os.path.join(tempdir, "unpackdest"), os.path.join(tempdir, appname))
            checkout_dir = os.path.join(tempdir, appname)
    except:
        pass
    return checkout_dir
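# A minimal, hypothetical caller for unpack_tarball() above. `input_tarball`
# would be a DNAnexus file ID or dxlink passed to the job; reading dxapp.json
# here is only to illustrate what the unpacked checkout contains.
import json
import os

checkout_dir = unpack_tarball(input_tarball)
with open(os.path.join(checkout_dir, "dxapp.json")) as fh:
    manifest = json.load(fh)
print("Unpacked app '%s' into %s" % (manifest.get("name"), checkout_dir))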
def main(psmcfa, psmc, outname, xchr, timemax, window): # The following line(s) initialize your data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. psmcfa = dxpy.DXFile(psmcfa) psmc = dxpy.DXFile(psmc) # The following line(s) download your file inputs to the local file system # using variable names for the filenames. dxpy.download_dxfile(psmcfa.get_id(), "psmcfa") dxpy.download_dxfile(psmc.get_id(), "psmc") # Fill in your application code here. (tmaxNew, parfile) = writeRecalFile('psmc', timemax, window, xchr) subprocess.check_call(['psmc', '-t', str(round(tmaxNew,4)), '-i', parfile, '-o', outname, 'psmcfa']) # The following line(s) use the Python bindings to upload your file outputs # after you have created them on the local file system. It assumes that you # have used the output field name for the filename for each output, but you # can change that behavior to suit your needs. outfile = dxpy.upload_local_file(outname); # The following line fills in some basic dummy output and assumes # that you have created variables to represent your output with # the same name as your output fields. output = {} output["outfile"] = dxpy.dxlink(outfile) return output
def postprocess(**inputs): kwargs = inputs["kwargs"] subjob_outputs = inputs["subjob_outputs"] print "\nMerging outputs from {n} subjobs".format(n=len(subjob_outputs)) output_prefix = kwargs["output_prefix"] variant_suffixes = kwargs["variant_suffixes"] app_output_fn = {} for subjob_output in subjob_outputs: for type, id in subjob_output.iteritems(): file_id = id["$dnanexus_link"] filename = output_prefix + "_" + variant_suffixes[type] print "Downloading " + str(file_id) + " into " + filename dxpy.download_dxfile(dxid=file_id, filename=filename, append=True) app_output_fn[type] = filename postprocess_outputs = {} need_to_renumber = ["deletions", "short_inserts", "tandem_duplications", "inversions", "large_inserts"] for type, fn in app_output_fn.iteritems(): out_fn = fn if type in need_to_renumber: out_fn = RenumberMergedOutput(fn, fn+"_renumbered") print "\nUploading {file} as {fn}".format(file=out_fn, fn=fn) postprocess_outputs[type] = dxpy.dxlink(dxpy.upload_local_file(out_fn, name=fn)) if kwargs["export_vcf"]: DownloadRefFasta(kwargs["reference_fasta"]) postprocess_outputs["vcf"] = ExportVCF(kwargs=kwargs, output_path=output_prefix, ref_fn="reference_fasta") return postprocess_outputs
def main(input_bams): # Initialize data object inputs on the platform # into dxpy.DXDataObject instances. input_bams = [dxpy.DXFile(item) for item in input_bams] # Download each file input to a new directory in the the local file system # using variable names for the filenames. # Construct output filenames. # Dispatch jobs to a pool of workers. out_paths = [] pool = Pool() # default is pool of cpu_count() workers for i, bam in enumerate(input_bams): dirname = str(i) filename = bam.name os.mkdir(dirname) in_path = os.path.join(dirname, filename) dxpy.download_dxfile(bam.get_id(), in_path) out_path = os.path.join(dirname, "scrub-" + filename) out_paths.append(out_path) pool.apply_async(scrub, (in_path, out_path)) # Close the worker pool and block until all jobs are complete. pool.close() pool.join() # Populate output fields and return. scrubbed_bams = [dxpy.upload_local_file(path) for path in out_paths] output = { "scrubbed_bams": [dxpy.dxlink(output_bam) for output_bam in scrubbed_bams] } return output
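# The scrub() worker dispatched through Pool.apply_async above is not shown.
# A minimal placeholder consistent with its (in_path, out_path) call signature
# might look like the sketch below; 'scrub_bam' is a hypothetical command-line
# tool standing in for whatever the real worker runs.
import subprocess

def scrub(in_path, out_path):
    """Hypothetical worker: run an external scrubbing tool on one BAM.

    Any function that takes (in_path, out_path) and writes the scrubbed
    BAM to out_path would fit the dispatch loop above.
    """
    subprocess.check_call(['scrub_bam', in_path, out_path])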
def _install_dep_bundle(self, bundle):
    if bundle["id"].get("$dnanexus_link", "").startswith("file-"):
        self.log("Downloading bundled file {name}".format(**bundle))
        dxpy.download_dxfile(bundle["id"], bundle["name"])
        self.run("dx-unpack '{}'".format(bundle["name"]))
    else:
        self.log('Skipping bundled dependency "{name}" because it does not refer to a file'.format(**bundle))
def scatter(orig_reads, split_size): # Fill in code here to do whatever is necessary to scatter the # input. if DEBUG: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) splitsize = split_size * 1000000 * 4 # each FQ read is 4 lines os.mkdir('splits') for f in orig_reads: reads_filename = dxpy.describe(f)['name'] reads_basename = strip_extensions(reads_filename, STRIP_EXTENSIONS) dxpy.download_dxfile(dxpy.DXFile(f).get_id(), reads_filename) reads_root_name = simplify_name() or reads_basename logger.info('* RUNNING /bin/zcat %s | /usr/bin/split -l %d -d - %s ' % (reads_filename, splitsize, 'splits/' + reads_root_name)) split_out = subprocess.check_output('/bin/zcat %s | /usr/bin/split -l %d -d - %s ' % (reads_filename, splitsize, 'splits/' + reads_root_name), shell=True) # can't shlex because of | logger.info(split_out) splits = os.listdir('splits') logger.info("* Return from scatter: %s *" % splits) # SHould we gzip here? return { "array_of_scattered_input": [ dxpy.dxlink(dxpy.upload_local_file('splits/' + split_file)) for split_file in splits] }
def merge_map_reports(map_report_set, target_root): '''Merges techrep map_reports.''' # Working on map_reports now all_reports="" biorep_map_report = target_root + '_map_report.txt' append_line("### Combined Bismark map report for several technical replicates ###\n",biorep_map_report) for techrep_map_report_dlink in map_report_set: file_desc = dxpy.describe(techrep_map_report_dlink) file_root = file_desc['name'] file_root = file_root.replace('_techrep_bismark_map_report.txt','') file_root = file_root.replace('_bismark_map_report.txt','') file_root = file_root.replace('_map_report.txt','') techrep_map_report = file_root + '_techrep_map_report.txt' append_line("###################################",biorep_map_report) append_line("### Map report for ${file_root} ###",biorep_map_report) print "* Downloading %s_techrep_bismark_map_report.txt file..." % file_root dxpy.download_dxfile(techrep_map_report_dlink, techrep_map_report) run_cmd('cat ' + techrep_map_report, out=biorep_map_report,append=True) if len(all_reports) == 0: all_reports = techrep_map_report else: all_reports += ',' + techrep_map_report if all_reports == techrep_map_report: # only one run_cmd('mv %s %s' % (techrep_map_report,biorep_map_report) ) all_reports = biorep_map_report return (biorep_map_report,all_reports)
def main(sam_file, probability):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    sam_file = dxpy.DXFile(sam_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(sam_file.get_id(), "sam_file")

    if probability < 0 or probability > 1:
        raise dxpy.AppError("Probability parameter determines the fraction of mappings included in the output. Must be between 0 and 1.")

    subprocess.check_call(" ".join(["java", "-Xmx2g", "-jar", "/usr/local/bin/DownsampleSam.jar",
                                    "INPUT=sam_file", "OUTPUT=downsampled_sam",
                                    "PROBABILITY=" + str(probability)]), shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    downsampled_sam = dxpy.upload_local_file("downsampled_sam")
    downsampled_sam.rename(sam_file.describe()['name'] + "_downsampled")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["downsampled_sam"] = dxpy.dxlink(downsampled_sam)

    return output
def _download_one_file(file_rec, idir):
    src_file = file_rec['src_file_id']
    trg_file = os.path.join(idir, file_rec['trg_fname'])
    print("downloading file: " + src_file + " to filesystem: " + trg_file)
    sys.stdout.flush()
    dxpy.download_dxfile(src_file, trg_file)
    return file_rec
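# _download_one_file() takes and returns the whole file_rec, which makes it easy
# to fan out over a worker pool. The driver below is an illustrative sketch only;
# it assumes `file_recs` is a list of dicts with 'src_file_id' and 'trg_fname'
# keys, as used above, and `download_all` is a hypothetical helper name.
from multiprocessing import Pool

def download_all(file_recs, idir, nworkers=4):
    # Download every record in parallel; results come back in file_recs order.
    pool = Pool(nworkers)
    results = [pool.apply_async(_download_one_file, (rec, idir)) for rec in file_recs]
    pool.close()
    pool.join()
    return [r.get() for r in results]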
def main(input_bam, paired=True, params=''): # The following line(s) initialize your data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. input_bam = dxpy.DXFile(input_bam) base_name = remove_extensions(input_bam.describe()['name'], [".bam", ".BAM", ".sam", ".SAM"]) # The following line(s) download your file inputs to the local file system # using variable names for the filenames. dxpy.download_dxfile(input_bam.get_id(), "input.bam") # Fill in your application code here. command = "java -Xmx6g -jar /opt/jar/SamToFastq.jar INPUT=input.bam F=%s_1.fastq" % base_name if paired: command += " F2=%s_2.fastq" % base_name subprocess.check_call(command, shell=True) # The following line(s) use the Python bindings to upload your file outputs # after you have created them on the local file system. It assumes that you # have used the output field name for the filename for each output, but you # can change that behavior to suit your needs. output = {} fastq_file = dxpy.upload_local_file("%s_1.fastq" % base_name); output["fastq_file"] = dxpy.dxlink(fastq_file) if paired: paired_fastq_file = dxpy.upload_local_file("%s_2.fastq" % base_name); output["paired_fastq_file"] = dxpy.dxlink(paired_fastq_file) return output
def combine_files(countDXlinks, resultfn): """The 'gather' subjob of the applet. Arguments: countDXlinks (list[dict]): list of DXlinks to process job output files. resultfn (str): Filename to use for job output file. Returns: DXLink for the main function to return as the job output. Note: Only the DXLinks are passed as parameters. Subjobs work on a fresh instance so files must be downloaded to the machine """ if resultfn.endswith(".bam"): resultfn = resultfn[:-4] + '.txt' sum_reads = 0 with open(resultfn, 'w') as f: for i, dxlink in enumerate(countDXlinks): dxfile = dxpy.DXFile(dxlink) filename = "countfile{0}".format(i) dxpy.download_dxfile(dxfile, filename) with open(filename, 'r') as fsub: for line in fsub: sum_reads += parse_line_for_readcount(line) f.write(line) f.write('Total Reads: {0}'.format(sum_reads)) countDXFile = dxpy.upload_local_file(resultfn) countDXlink = dxpy.dxlink(countDXFile.get_id()) return {"countDXLink": countDXlink}
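# For context, a gather step like combine_files() is usually launched with
# dxpy.new_dxjob and its result wired through an output reference. The sketch
# below is illustrative only: `count_subjobs`, `output_name`, the per-chunk
# output field 'countDXlink', and the applet output field 'count_file' are
# assumptions, not taken from the applet itself.
gather_job = dxpy.new_dxjob(
    {"countDXlinks": [j.get_output_ref("countDXlink") for j in count_subjobs],
     "resultfn": output_name},
    "combine_files")

# The applet's own output becomes a reference to the gather subjob's output.
output = {"count_file": gather_job.get_output_ref("countDXLink")}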
def main(quants_a, quants_b): # tool_versions.py --applet $script_name --appver $script_ver sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json']) dxfile_a = dxpy.DXFile(quants_a) dxfile_b = dxpy.DXFile(quants_b) print "* Downloading files..." dxpy.download_dxfile(dxfile_a.get_id(), "quants_a") dxpy.download_dxfile(dxfile_b.get_id(), "quants_b") # Create and appropriate name for output files out_root = root_name_from_pair(dxfile_a.name.split('.')[0],dxfile_b.name.split('.')[0]) mad_plot_file = out_root + '_mad_plot.png' # DX/ENCODE independent script is found in resources/usr/bin print "* Runnning MAD.R..." mad_output = subprocess.check_output(['Rscript', '/usr/bin/MAD.R', 'quants_a', 'quants_b']) subprocess.check_call(['mv', "MAplot.png", mad_plot_file]) print "* package properties..." qc_metrics = {} qc_metrics["MAD.R"] = json.loads(mad_output) meta_string = json.dumps(qc_metrics) print json.dumps(qc_metrics,indent=4) props = {} props["SW"] = sw_versions print "* Upload Plot..." plot_dxfile = dxpy.upload_local_file(mad_plot_file,properties=props,details=qc_metrics) return { "metadata": meta_string, "mad_plot": plot_dxfile }
def geneBody_coverage(BAM_file, BED_file): dxpy.download_dxfile(BED_file, "genes.bed") dxpy.download_dxfile(BAM_file, "mappings.bam") # split mappings into chunks that can be done on a single worker # all mappings are loaded into RAM so can only do 5 million at a time run_shell(" ".join(["samtools", "view", "mappings.bam", "|", "split", "-l 10000000", "-", "split_map"])) run_shell(" ".join(["samtools", "view", "-H", "mappings.bam", ">", "header_only.sam"])) files = os.listdir(".") jobs = [] for f in files: if f.startswith("split_map"): # add header run_shell(" ".join(["cat", "header_only.sam", f, ">", "temp.sam"])) # convert to BAM run_shell(" ".join(["samtools", "view", "-S", "-b", "temp.sam", ">", "temp.bam"])) # upload file split_bam = dxpy.upload_local_file("temp.bam") # run analysis jobs.append(dxpy.new_dxjob({"BAM_file":dxpy.dxlink(split_bam.get_id()), "BED_file":BED_file}, "run_gbc")) run_shell( "ls -l" ) gbc_agg_input = {"sub_reports":[]} for j in jobs: gbc_agg_input["sub_reports"].append({"job":j.get_id(), "field":"file"}) agg_job = dxpy.new_dxjob(gbc_agg_input, "gbc_agg").get_id() return {"results":{"job":agg_job, "field":"cover"}}
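# Several of these RSeQC wrappers call a run_shell() helper whose definition is
# not shown. A minimal equivalent, assuming it simply runs the string through the
# shell and fails loudly on a non-zero exit, could be:
import subprocess

def run_shell(command):
    # Echo the command, then run it through the shell; raise on failure.
    print("Running: %s" % command)
    subprocess.check_call(command, shell=True)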
def bedmethyl_io(cx_report_dxlink, chrom_sizes_dxlink, target_root, qc_metrics, props): '''subjob runs cxrepo-bed.py, bedToBigBed on mem3_hdd2_x8''' print "* bedmethyl_io(): Retrieve CX report and chrom.sizes..." run_cmd('mkdir -p output/') cx_report = target_root + ".CX_report.txt" chrom_sizes = "chrom.sizes" dxpy.download_dxfile(chrom_sizes_dxlink, chrom_sizes) dxpy.download_dxfile(cx_report_dxlink, 'output/' + cx_report) (CpG_bed,CHG_bed,CHH_bed,CpG_bb,CHG_bb,CHH_bb) = bedmethyl(target_root, cx_report, chrom_sizes) print "* bedmethyl_io(): Storing bedmethyl results..." CpG_bed_dxfile = dxpy.upload_local_file(CpG_bed,properties=props,details=qc_metrics) CHG_bed_dxfile = dxpy.upload_local_file(CHG_bed,properties=props,details=qc_metrics) CHH_bed_dxfile = dxpy.upload_local_file(CHH_bed,properties=props,details=qc_metrics) CpG_bb_dxfile = dxpy.upload_local_file(CpG_bb,properties=props,details=qc_metrics) CHG_bb_dxfile = dxpy.upload_local_file(CHG_bb,properties=props,details=qc_metrics) CHH_bb_dxfile = dxpy.upload_local_file(CHH_bb,properties=props,details=qc_metrics) print "* bedmethyl_io(): Check storage..." run_cmd('ls -l') run_cmd('df -k .') return { "CpG_bed_dxlink": dxpy.dxlink(CpG_bed_dxfile), "CHG_bed_dxlink": dxpy.dxlink(CHG_bed_dxfile), "CHH_bed_dxlink": dxpy.dxlink(CHH_bed_dxfile), "CpG_bb_dxlink": dxpy.dxlink(CpG_bb_dxfile), "CHG_bb_dxlink": dxpy.dxlink(CHG_bb_dxfile), "CHH_bb_dxlink": dxpy.dxlink(CHH_bb_dxfile) }
def test_alignment_count(applet_id, project_id, folder, tmpdir): """Run BWA on a FASTQ file and verify that the number of alignments produced is correct. """ # Recall that applet_id is set in the associated conftest.py, which either # gets it from the command line or builds the applet and retrieves its id. # And tmpdir is some pytest magic. It's type is py.path.local.LocalPath. # It's strpath property just returns a string. applet = dxpy.DXApplet(applet_id) input_dict = {"fastq": dxpy.dxlink(SAMPLE_FASTQ), "genomeindex_targz": dxpy.dxlink(HS37D5_BWA_INDEX)} job = applet.run(input_dict, instance_type="mem1_ssd1_x16", folder=folder, project=project_id) job.wait_on_done() output_bam_dxfile = dxpy.DXFile(job.describe()["output"]["bam"]) local_filename = os.path.join(tmpdir.strpath, "test.bam") dxpy.download_dxfile(output_bam_dxfile.get_id(), local_filename) count_alignments_cmd = "samtools view {bam} | wc -l".format( bam=local_filename) num_alignments = int(subprocess.check_output(count_alignments_cmd, shell=True)) assert num_alignments == 1951476
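# The test above relies on fixtures defined in the associated conftest.py, which
# is not shown here. The sketch below illustrates one way such fixtures could be
# provided (command-line options only; the "build the applet" branch mentioned in
# the test's comment is omitted). Fixture names match the test's parameters;
# everything else is an assumption.
import dxpy
import pytest

def pytest_addoption(parser):
    parser.addoption("--applet-id", action="store", help="ID of a pre-built applet to test")
    parser.addoption("--project-id", action="store", help="Project to run test jobs in")

@pytest.fixture(scope="session")
def applet_id(request):
    return request.config.getoption("--applet-id")

@pytest.fixture(scope="session")
def project_id(request):
    return request.config.getoption("--project-id") or dxpy.PROJECT_CONTEXT_ID

@pytest.fixture(scope="session")
def folder():
    return "/test_runs"  # placeholder output folder for test jobs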
def main(inputs): # The following line(s) initialize your data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. input_filenames = [] for input_file in inputs: dxf = dxpy.DXFile(input_file) input_filenames.append(dxf.name) dxpy.download_dxfile(dxf.get_id(), dxf.name) extension = splitext(splitext(input_filenames[-1])[0])[1] # uses last extension - presumably they are all the same pooled_filename = "-".join([splitext(splitext(fn)[0])[0] for fn in input_filenames]) + "_pooled%s.gz" % (extension) out, err = run_pipe(["gzip -dc %s" % (" ".join(input_filenames)), "gzip -c"], outfile=pooled_filename) pooled = dxpy.upload_local_file(pooled_filename) # The following line fills in some basic dummy output and assumes # that you have created variables to represent your output with # the same name as your output fields. output = {} output["pooled"] = dxpy.dxlink(pooled) return output
def main(fastq, genomeindex_targz): print "something else" fastq_dxfile = dxpy.DXFile(fastq) dxpy.download_dxfile(fastq_dxfile.get_id(), "input.fastq") genome_dxfile = dxpy.DXFile(genomeindex_targz) dxpy.download_dxfile(genome_dxfile.get_id(), "genome.tar.gz") os.makedirs("genome") tar_cmd = "tar xzvf genome.tar.gz -C genome" subprocess.check_call(tar_cmd, shell=True) genome_file = glob.glob("genome/*.bwt")[0] genome_file = re.sub("\.bwt$", "", genome_file) bwa_cmd = ( "bwa mem -t {nproc} {genome} {fastq} | " "samtools view -u -S - | " "samtools sort -m 256M -@ {nproc} - output".format( nproc=multiprocessing.cpu_count(), genome=genome_file, fastq="input.fastq" ) ) subprocess.check_call(bwa_cmd, shell=True) bam = dxpy.upload_local_file("output.bam") # The following line fills in some basic dummy output and assumes # that you have created variables to represent your output with # the same name as your output fields. output = {} output["bam"] = dxpy.dxlink(bam) return output
def read_duplication(BAM_file):
    dxpy.download_dxfile(BAM_file, "mappings.bam")
    run_shell(" ".join(["read_duplication.py", "-i mappings.bam", "-o read_dup"]))
    run_shell(" ".join(["cat", "read_dup.pos.DupRate.xls", "read_dup.seq.DupRate.xls", ">", "read_dup.txt"]))
    results_id = dxpy.upload_local_file("read_dup.txt", wait_on_close=True).get_id()
    return {"results": results_id}
def merge_extract(bam_set, map_report_set, dme_ix_dxlink, uncompress_bam, props): '''subjob runs bismark_methylation_extractor on mem1_hdd2_x32''' (target_root,biorep_bam) = merge_bams(bam_set, 32) (biorep_map,all_reports) = merge_map_reports(map_report_set, target_root) (qc_metrics, reads, biorep_bam_qc) = biorep_bam_qc_metrics(target_root, all_reports) print "* merge_extract(): Retrieve and uncompress index..." dme_ix = "dme_index.tar.gz" dxpy.download_dxfile(dme_ix_dxlink, dme_ix) run_cmd('tar -zxf ' + dme_ix) # NOTE: Better to use sam and let extractor use more threads, but this takes up precious storage (alignments, ncores) = bam_or_sam(biorep_bam, uncompress_bam, target_root) bismark_simple_extract(target_root, alignments, ncores) qc_metrics = bismark_qc_metrics(target_root, qc_metrics) print "* Retrieve split report..." append_line("\n===== bismark_methylation_extractor: splitting_report =====",biorep_bam_qc) run_cmd('cat %s_splitting_report.txt' % target_root,out=biorep_bam_qc,append=True,silent=True) # TODO: Is this even needed? Currently we do to get the size! #if len(bam_set) > 1: # Wouldn't need to do this unless there is a merge # print "* merge_extract(): Storing biorep bam..." # props_ex = props.copy() # props_ex.update({ 'reads': str(reads) }) # biorep_bam_dxlink = dxpy.dxlink(dxpy.upload_local_file(biorep_bam,properties=props_ex,details=qc_metrics,wait_on_close=True)) #else: # biorep_bam_dxlink = bam_set[0] print "* merge_extract(): Storing extraction results..." biorep_bam_qc_dxfile = dxpy.upload_local_file(biorep_bam_qc,properties=props,details=qc_metrics) biorep_map_dxfile = dxpy.upload_local_file(biorep_map, properties=props,details=qc_metrics) split_report_dxfile = dxpy.upload_local_file(target_root+'_splitting_report.txt') split_report_dxfile = dxpy.upload_local_file(target_root+'_splitting_report.txt') chrom_sizes_dxfile = dxpy.upload_local_file('input/chrom.sizes') mbias_report_dxfile = dxpy.upload_local_file(target_root+'_mbias_report.txt',properties=props,details=qc_metrics) CpG_context_dxfile = dxpy.upload_local_file('output/CpG_context_%s.txt' % (target_root)) CHG_context_dxfile = dxpy.upload_local_file('output/CHG_context_%s.txt' % (target_root)) CHH_context_dxfile = dxpy.upload_local_file('output/CHH_context_%s.txt' % (target_root)) print "* merge_extract(): Check storage..." run_cmd('ls -l') run_cmd('df -k .') return { #"biorep_bam_dxlink": biorep_bam_dxfile, "biorep_bam_qc_dxlink": dxpy.dxlink(biorep_bam_qc_dxfile), "biorep_map_dxlink": dxpy.dxlink(biorep_map_dxfile), "CpG_context_dxlink": dxpy.dxlink(CpG_context_dxfile), "CHG_context_dxlink": dxpy.dxlink(CHG_context_dxfile), "CHH_context_dxlink": dxpy.dxlink(CHH_context_dxfile), "split_report_dxlink": dxpy.dxlink(split_report_dxfile), "chrom_sizes_dxlink": dxpy.dxlink(chrom_sizes_dxfile), "mbias_report_dxlink": dxpy.dxlink(mbias_report_dxfile), "target_root": target_root, "qc_metrics": qc_metrics }
def main(input_bam, paired_end): input_bam_file = dxpy.DXFile(input_bam) input_bam_filename = input_bam_file.name input_bam_basename = input_bam_file.name.rstrip('.bam') dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename) intermediate_TA_filename = input_bam_basename + ".tagAlign" if paired_end: end_infix = 'PE2SE' else: end_infix = 'SE' final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz' subprocess.check_output('ls -l', shell=True) # =================== # Create tagAlign file # =================== out, err = common.run_pipe([ "bamToBed -i %s" % (input_bam_filename), r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""", "tee %s" % (intermediate_TA_filename), "gzip -cn"], outfile=final_TA_filename) subprocess.check_output('ls -l', shell=True) # ================ # Create BEDPE file # ================ if paired_end: final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt" final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam" command = \ "samtools sort -@ %d -n %s %s" \ % (cpu_count(), input_bam_filename, final_nmsrt_bam_prefix) logger.info(command) subprocess.check_call(shlex.split(command)) final_BEDPE_filename = input_bam_basename + ".bedpe.gz" out, err = common.run_pipe([ "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename), "gzip -cn"], outfile=final_BEDPE_filename) subprocess.check_output('ls -l', shell=True) tagAlign_file = dxpy.upload_local_file(final_TA_filename) if paired_end: BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename) output = {} output["tagAlign_file"] = dxpy.dxlink(tagAlign_file) if paired_end: output["BEDPE_file"] = dxpy.dxlink(BEDPE_file) return output
def read_distribution(BAM_file, BED_file):
    dxpy.download_dxfile(BAM_file, "mappings.bam")
    dxpy.download_dxfile(BED_file, "genes.bed")
    run_shell(" ".join(["read_distribution.py", "-i mappings.bam", "-r genes.bed", ">", "read_dist.txt"]))
    results_id = dxpy.upload_local_file("read_dist.txt", wait_on_close=True).get_id()
    return {"results": results_id}
def run_gbc(BAM_file, BED_file):
    dxpy.download_dxfile(BED_file, "genes.bed")
    dxpy.download_dxfile(BAM_file, "mappings.bam")
    run_shell(" ".join(["geneBody_coverage.py", "-i mappings.bam", "-r genes.bed", "-o geneBody"]))
    results_id = dxpy.upload_local_file("geneBody.geneBodyCoverage.txt", wait_on_close=True).get_id()
    return {"file": results_id}
def inner_distance(BAM_file, BED_file):
    dxpy.download_dxfile(BED_file, "genes.bed")
    dxpy.download_dxfile(BAM_file, "mappings.bam")
    run_shell(" ".join(["inner_distance.py", "-i mappings.bam", "-r genes.bed", "-o inner", "-l -303", "-u 5002"]))
    results_id = dxpy.upload_local_file("inner.inner_distance_freq.txt", wait_on_close=True).get_id()
    return {"results": results_id}
def signal_io(bedgraph_gz_dxlink, chrom_sizes_dxlink, target_root, qc_metrics, props):
    '''subjob runs bedGraphToBigWig on mem3_hdd2_x8'''
    print "* signal_io(): Retrieve bedgraph and chrom.sizes..."
    bedGraph = target_root + ".bedGraph"
    bedGraph_gz = bedGraph + ".gz"
    chrom_sizes = "chrom.sizes"
    dxpy.download_dxfile(bedgraph_gz_dxlink, bedGraph_gz)
    dxpy.download_dxfile(chrom_sizes_dxlink, chrom_sizes)

    bigWig = signal(target_root, bedGraph_gz, chrom_sizes)

    print "* signal_io(): Storing signal results..."
    bigWig_dxfile = dxpy.upload_local_file(bigWig, properties=props, details=qc_metrics, cleanup=True)

    print "* signal_io(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {"bigWig_dxlink": dxpy.dxlink(bigWig_dxfile)}
def main(tumor_bam, normal_bam, reference, params='-F vcf'): # The following line(s) initialize your data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. tumor_bam = dxpy.DXFile(tumor_bam) normal_bam = dxpy.DXFile(normal_bam) reference = dxpy.DXFile(reference) # The following line(s) download your file inputs to the local file system # using variable names for the filenames. dxpy.download_dxfile(tumor_bam.get_id(), "tumor.bam") dxpy.download_dxfile(normal_bam.get_id(), "normal.bam") dxpy.download_dxfile(reference.get_id(), "ref.fa.gz") # The following line extracts the name from the file object so that # outputs can be named intelligently. It is not automatically generated by # the app wizard. name = tumor_bam.describe()['name'] + "_vs_" + normal_bam.describe( )['name'] # Append file extension based on whether the output will be VCF or now if "-F vcf" in params: name += ".vcf" else: name += ".snp" # Fill in your application code here. subprocess.check_call("gzip -d ref.fa.gz", shell=True) subprocess.check_call( "bam-somaticsniper -f ref.fa %s tumor.bam normal.bam %s" % (params, name), shell=True) # The following line(s) use the Python bindings to upload your file outputs # after you have created them on the local file system. It assumes that you # have used the output field name for the filename for each output, but you # can change that behavior to suit your needs. snps = dxpy.upload_local_file(name) # The following line fills in some basic dummy output and assumes # that you have created variables to represent your output with # the same name as your output fields. output = {} output["snps"] = dxpy.dxlink(snps) return output
def process(file_obj, file_meta): # Change the following to process whatever input this stage # receives. You may also want to copy and paste the logic to download # and upload files here as well if this stage receives file input # and/or makes file output. print file_obj print file_meta filename = dxpy.describe(file_obj)['name'] basename = filename.rstrip('.gz') dx_file = dxpy.download_dxfile(file_obj, filename) print "Run Validate Files" validate_args = validate_map.get(file_meta['file_format']) assembly = file_meta.get('assembly') if assembly: chromInfo = ['-chromInfo=%s/%s/chrom.sizes' % (encValData, assembly)] else: chromInfo = ['-chromInfo=%s/hg19/chrom.sizes' % encValData] print subprocess.check_output(['ls','-l']) valid = "Not validated yet" if validate_args is not None: print("Validating file.") validation_command = ['validateFiles'] + ['-verbose=2'] + validate_args + chromInfo + ['-doReport'] + [filename] try: print " ".join(validation_command) valid = subprocess.check_output(validation_command) except subprocess.CalledProcessError as e: pass #valid = "Process Error" print(e.output) #raise print valid print subprocess.check_output(['ls','-l']) print "Upload result" report_dxfile = dxpy.upload_local_file("%s.report" % filename) print report_dxfile ## is_valid == 'Error count 0' return { "report": report_dxfile, "validation": valid }
def process(fastq): # Change the following to process whatever input this stage # receives. You may also want to copy and paste the logic to download # and upload files here as well if this stage receives file input # and/or makes file output. print fastq reads_filename = dxpy.describe(fastq)['name'] reads_basename = reads_filename.rstrip('.gz').rstrip('.fq').rstrip( '.fastq') reads_file = dxpy.download_dxfile(fastq, "fastq.gz") subprocess.check_call(['mkdir', 'output']) print "Run QC" fqc_command = "/usr/bin/FastQC/fastqc fastq.gz -o output" print fqc_command stdio = subprocess.check_output(shlex.split(fqc_command)) print stdio print subprocess.check_output(['ls', '-l', 'output']) subprocess.check_call(['unzip', 'output/fastq_fastqc.zip']) print "Upload results" subprocess.check_call( ['mv', 'fastq_fastqc/fastqc_data.txt', "%s_data.txt" % reads_basename]) subprocess.check_call( ['mv', 'fastq_fastqc/summary.txt', "%s_summary.txt" % reads_basename]) subprocess.check_call( ['mv', 'output/fastq_fastqc.zip', "%s_fastqc.zip" % reads_basename]) report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename) summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename) zip_dxfile = dxpy.upload_local_file("%s_fastqc.zip" % reads_basename) print report_dxfile return { "report": report_dxfile, "summary": summary_dxfile, "zip": zip_dxfile }
def main(bam_file, ref_vcf_file, eval_vcf_file, qual_cutoff, depth_cutoff, bed_file=None):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    bam_file = dxpy.DXFile(bam_file)
    if bed_file is not None:
        bed_file = dxpy.DXFile(bed_file)
    ref_vcf_file = dxpy.DXFile(ref_vcf_file)
    eval_vcf_file = dxpy.DXFile(eval_vcf_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(bam_file.get_id(), "bam_file")
    dxpy.download_dxfile(ref_vcf_file.get_id(), "ref_vcf_file")
    if bed_file is not None:
        dxpy.download_dxfile(bed_file.get_id(), "bed_file")

    # Fill in your application code here.

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    sites_for_manual_review = dxpy.upload_local_file("sites_for_manual_review")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["sites_for_manual_review"] = dxpy.dxlink(sites_for_manual_review)
    output["number_of_missed_sites"] = number_of_missed_sites
    output["found_sites"] = found_sites
    output["Sensitivity"] = Sensitivity
    output["specificity"] = specificity

    return output
def main(quants_a, quants_b, annotations): # tool_versions.py --applet $script_name --appver $script_ver sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json']) dxfile_a = dxpy.DXFile(quants_a) dxfile_b = dxpy.DXFile(quants_b) dxfile_anno = dxpy.DXFile(annotations) print "* Downloading files..." dxpy.download_dxfile(dxfile_a.get_id(), "quants_a.tsv") dxpy.download_dxfile(dxfile_b.get_id(), "quants_b.tsv") dxpy.download_dxfile(dxfile_anno.get_id(), "annotations.gtf.gz") # Create and appropriate name for output files out_root = root_name_from_pair(dxfile_a.name.split('.')[0],dxfile_b.name.split('.')[0]) print "* Expecting output: '"+out_root+"_srna_mad_plot.png'" # Must move sub-scripts into current dir so they will be found by srna-mad-qc.sh subprocess.check_call(['mv', "/usr/bin/extract_gene_ids.awk", '.']) subprocess.check_call(['mv', "/usr/bin/sum_srna_expression.awk", '.']) subprocess.check_call(['mv', "/usr/bin/MAD.R", '.']) # DX/ENCODE independent script is found in resources/usr/bin print "* ===== Calling DNAnexus and ENCODE independent script... =====" subprocess.check_call(['srna_mad_qc.sh','annotations.gtf.gz','quants_a.tsv','quants_b.tsv',out_root]) print "* ===== Returned from dnanexus and encodeD independent script =====" mad_plot_file = out_root + '_mad_plot.png' mad_qc_file = out_root + '_mad_qc.txt' print "* package properties..." qc_metrics = {} f_qc = open(mad_qc_file, 'r') mad_output = f_qc.read() f_qc.close() mad_output = mad_output.replace("NA","-1") qc_metrics["MAD.R"] = json.loads(mad_output) meta_string = json.dumps(qc_metrics) print json.dumps(qc_metrics,indent=4) props = {} props["SW"] = sw_versions print "* Upload Plot..." plot_dxfile = dxpy.upload_local_file(mad_plot_file,properties=props,details=qc_metrics) return { "metadata": meta_string, "mad_plot": plot_dxfile }
def run_bwa_backtrack_paired(fastq_file, fastq_file2, genome_fasta_file, genome_index_file, mark_duplicates, logger): """Runs BWA-backtrack on a pair of FASTQ files.""" fastq_file = dxpy.DXFile(fastq_file) fastq_file2 = dxpy.DXFile(fastq_file2) genome_fasta_file = dxpy.DXFile(genome_fasta_file) genome_index_file = dxpy.DXFile(genome_index_file) dxpy.download_dxfile(fastq_file.get_id(), "sample.fastq.gz") dxpy.download_dxfile(fastq_file2.get_id(), "sample_2.fastq.gz") dxpy.download_dxfile(genome_fasta_file.get_id(), "genome.fa.gz") dxpy.download_dxfile(genome_index_file.get_id(), "genome.tar.gz") subprocess.check_call("tar xzvf genome.tar.gz", shell=True) num_cores = str(cpu_count()) run_cmd( "bwa-0.6.2 aln -t " + num_cores + " genome.fa.gz sample.fastq.gz > sample.sai", logger) run_cmd( "bwa-0.6.2 aln -t " + num_cores + " genome.fa.gz sample_2.fastq.gz > sample_2.sai", logger) run_cmd( "bwa-0.6.2 sampe -P genome.fa.gz sample.sai sample_2.sai sample.fastq.gz sample_2.fastq.gz" + " > sample0.sam", logger) run_cmd("java -jar /CleanSam.jar INPUT=sample0.sam OUTPUT=sample1.bam", logger) run_cmd("samtools sort -@ " + num_cores + " sample1.bam sample", logger) if mark_duplicates: run_cmd( "java -jar /MarkDuplicates.jar " + "INPUT=sample.bam OUTPUT=sample_deduped.bam METRICS_FILE=/dev/null", logger) subprocess.check_call("mv sample_deduped.bam sample.bam", shell=True)
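# run_bwa_backtrack_paired() above ends after producing sample.bam locally;
# presumably the caller uploads it. The helper below is an illustrative sketch of
# that final step, assuming an output field named "bam" and naming the result
# after the input FASTQ; `upload_alignments` is a hypothetical name, and
# `fastq_file` is expected to be the dxpy.DXFile handler created above.
def upload_alignments(fastq_file, bam_path="sample.bam"):
    """Hypothetical wrap-up: upload the sorted (and possibly deduplicated) BAM
    and return a dxlink, mirroring the output pattern of the other applets here."""
    output_name = fastq_file.name.replace(".fastq.gz", ".bam")
    bam_dxfile = dxpy.upload_local_file(bam_path, name=output_name)
    return {"bam": dxpy.dxlink(bam_dxfile)}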
def coverage(CpG_context_dxlink, CHG_context_dxlink, CHH_context_dxlink, dme_ix_dxlink, target_root): '''subjob runs bismark2bedGraph and coverage2cytosine on mem3_hdd2_x8''' print "* coverage(): Retrieve context files and index..." CpG_context = 'CpG_context_%s.txt' % target_root CHG_context = 'CHG_context_%s.txt' % target_root CHH_context = 'CHH_context_%s.txt' % target_root run_cmd('mkdir -p output/') dxpy.download_dxfile(CpG_context_dxlink, 'output/%s.gz' % CpG_context) dxpy.download_dxfile(CHG_context_dxlink, 'output/%s.gz' % CHG_context) dxpy.download_dxfile(CHH_context_dxlink, 'output/%s.gz' % CHH_context) dme_ix = "dme_index.tar.gz" dxpy.download_dxfile(dme_ix_dxlink, dme_ix) print "* coverage(): Uncompress..." run_cmd('tar -zxf ' + dme_ix) run_cmd('gunzip output/%s.gz' % CpG_context) run_cmd('gunzip output/%s.gz' % CHG_context) run_cmd('gunzip output/%s.gz' % CHH_context) (bedGraph_gz, cx_report) = bismark_coverage(target_root, CpG_context, CHG_context, CHH_context) print "* coverage(): Storing coverage results..." cx_report_dxfile = dxpy.upload_local_file(cx_report) bedgraph_gz_dxfile = dxpy.upload_local_file(bedGraph_gz) print "* coverage(): Check storage..." run_cmd('ls -l') run_cmd('df -k .') return { "cx_report_dxlink": dxpy.dxlink(cx_report_dxfile), "bedgraph_gz_dxlink": dxpy.dxlink(bedgraph_gz_dxfile) }
def main(bam1, bam2, RE_site_bed): dxpy.download_dxfile(bam1, "input1.bam") dxpy.download_dxfile(bam2, "input2.bam") dxpy.download_dxfile(RE_site_bed, "RE.bed") command = "cp -r /miniconda ~; cp -r /.conda ~; bash -c 'PATH=/home/dnanexus/miniconda/miniconda2/bin:$PATH; source activate gitar; which python; python /usr/bin/HiCtool_hifive.arg.py input1.bam input2.bam RE.bed .'" print(command) subprocess.call(command, shell=True) fend_object_hdf5_filename = "./fend_object.hdf5" HiC_data_object_hdf5_filename = "./HiC_data_object.hdf5" HiC_distance_function_hdf5_filename = "./HiC_distance_function.hdf5" HiC_norm_binning_hdf5_filename = "./HiC_norm_binning.hdf5" HiC_project_object_hdf5_filename = "./HiC_project_object.hdf5" #fend_object_hdf5_file = dxpy.upload_local_file(fend_object_hdf5_filename, folder=outdir) #HiC_data_object_hdf5_file = dxpy.upload_local_file(HiC_data_object_hdf5_filename, folder=outdir) #HiC_distance_function_hdf5_file= dxpy.upload_local_file(HiC_distance_function_hdf5_filename, folder=outdir) #HiC_norm_binning_hdf5_file= dxpy.upload_local_file(HiC_norm_binning_hdf5_filename, folder=outdir) #HiC_project_object_hdf5_file= dxpy.upload_local_file(HiC_project_object_hdf5_filename, folder=outdir) fend_object_hdf5_file = dxpy.upload_local_file(fend_object_hdf5_filename) HiC_data_object_hdf5_file = dxpy.upload_local_file( HiC_data_object_hdf5_filename) HiC_distance_function_hdf5_file = dxpy.upload_local_file( HiC_distance_function_hdf5_filename) HiC_norm_binning_hdf5_file = dxpy.upload_local_file( HiC_norm_binning_hdf5_filename) HiC_project_object_hdf5_file = dxpy.upload_local_file( HiC_project_object_hdf5_filename) return { "fend_object_hdf5": fend_object_hdf5_file, "HiC_data_object_hdf5": HiC_data_object_hdf5_file, "HiC_distance_function_hdf5": HiC_distance_function_hdf5_file, "HiC_norm_binning_hdf5": HiC_norm_binning_hdf5_file, "HiC_project_object_hdf5": HiC_project_object_hdf5_file }
def main(rep1_peaks, rep2_peaks, pooled_peaks, idr_threshold, rank, interactive): # Initialize the data object inputs on the platform into # dxpy.DXDataObject instances. idr_version = 1 rep1_peaks_file = dxpy.DXFile(rep1_peaks) rep2_peaks_file = dxpy.DXFile(rep2_peaks) pooled_peaks_file = dxpy.DXFile(pooled_peaks) rep1_peaks_filename = rep1_peaks_file.name rep2_peaks_filename = rep2_peaks_file.name pooled_peaks_filename = pooled_peaks_file.name # Download the file inputs to the local file system. dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_filename) dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_filename) dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_filename) rep1_peaks_filename = uncompress(rep1_peaks_filename) rep2_peaks_filename = uncompress(rep2_peaks_filename) pooled_peaks_filename = uncompress(pooled_peaks_filename) print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT) #rep1_vs_rep2_prefix = '%s_vs_%s.IDRv%d' %(os.path.basename(rep1_peaks_filename), os.path.basename(rep2_peaks_filename), idr_version) rep1_vs_rep2_prefix = '%sv%s.IDRv%d' %(os.path.basename(rep1_peaks_filename)[0:11], os.path.basename(rep2_peaks_filename)[0:11], idr_version) pooled_common_peaks_IDR_filename, IDR_overlap_narrowpeak_filename = run_idr( rep1_peaks_filename, rep2_peaks_filename, pooled_peaks_filename, rep1_vs_rep2_prefix, rank=rank, idr_version=idr_version, interactive=interactive) # ============================= # Get peaks passing the IDR threshold # ============================= if idr_version == 1: awk_string = r"""awk 'BEGIN{OFS="\t"} $14<=%2.2f {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}'""" %(idr_threshold) elif idr_version ==2: awk_string = r"""awk 'BEGIN{OFS="\t"} $12>=%2.2f {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}'""" %(-math.log10(idr_threshold)) final_IDR_thresholded_filename = rep1_vs_rep2_prefix + '.IDR%2.2f.narrowPeak' %(idr_threshold) run_pipe([ 'cat %s' %(pooled_common_peaks_IDR_filename), awk_string, 'sort -k7n,7n' #'gzip -c' ], final_IDR_thresholded_filename) npeaks_pass_filename = rep1_vs_rep2_prefix + '-npeaks-aboveIDR.txt' wc_output = subprocess.check_output(shlex.split('wc -l %s' %(final_IDR_thresholded_filename))) with open(npeaks_pass_filename, 'w') as fh: fh.write(wc_output) line_count = wc_output.split()[0] n_peaks = int(line_count) #TODO batch consistency plot # The following line(s) use the Python bindings to upload your file outputs # after you have created them on the local file system. It assumes that you # have used the output field name for the filename for each output, but you # can change that behavior to suit your needs. 
output = {} if idr_version == 1: IDR_overlap_narrowpeak_filename = compress(IDR_overlap_narrowpeak_filename) overlapped_peaks = dxpy.upload_local_file(IDR_overlap_narrowpeak_filename) EM_fit_output = dxpy.upload_local_file(rep1_vs_rep2_prefix + '-em.sav') empirical_curves_output = dxpy.upload_local_file(rep1_vs_rep2_prefix + '-uri.sav') EM_parameters_log = dxpy.upload_local_file(rep1_vs_rep2_prefix + '-Rout.txt') output.update({ "EM_fit_output": dxpy.dxlink(EM_fit_output), "empirical_curves_output": dxpy.dxlink(empirical_curves_output), "overlapped_peaks": dxpy.dxlink(overlapped_peaks) }) elif idr_version == 2: EM_fit_output = None empirical_curves_output = None overlapped_peaks = None EM_parameters_log = dxpy.upload_local_file(rep1_vs_rep2_prefix + '.log.txt') IDR2_plot = dxpy.upload_local_file(pooled_common_peaks_IDR_filename + '.png') output.update({ "IDR2_plot": dxpy.dxlink(IDR2_plot) }) npeaks_pass = dxpy.upload_local_file(npeaks_pass_filename) IDR_output = dxpy.upload_local_file(compress(pooled_common_peaks_IDR_filename)) IDR_peaks = dxpy.upload_local_file(compress(final_IDR_thresholded_filename)) # # return { "app_output_field": postprocess_job.get_output_ref("answer"), ...} # subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT) output.update({ "EM_parameters_log": dxpy.dxlink(EM_parameters_log), "npeaks_pass": dxpy.dxlink(npeaks_pass), "IDR_output": dxpy.dxlink(IDR_output), "IDR_peaks": dxpy.dxlink(IDR_peaks), "N": n_peaks }) logging.info("Exiting with output: %s", output) return output
def main(experiment, control, xcor_scores_input, chrom_sizes, narrowpeak_as, gappedpeak_as, broadpeak_as, genomesize): # Initialize data object inputs on the platform # into dxpy.DXDataObject instances. experiment = dxpy.DXFile(experiment) control = dxpy.DXFile(control) xcor_scores_input = dxpy.DXFile(xcor_scores_input) chrom_sizes = dxpy.DXFile(chrom_sizes) narrowPeak_as = dxpy.DXFile(narrowpeak_as) gappedPeak_as = dxpy.DXFile(gappedpeak_as) broadPeak_as = dxpy.DXFile(broadpeak_as) # Download the file inputs to the local file system # and use their own filenames. dxpy.download_dxfile(experiment.get_id(), experiment.name) dxpy.download_dxfile(control.get_id(), control.name) dxpy.download_dxfile(xcor_scores_input.get_id(), xcor_scores_input.name) dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes.name) dxpy.download_dxfile(narrowPeak_as.get_id(), narrowPeak_as.name) dxpy.download_dxfile(gappedPeak_as.get_id(), gappedPeak_as.name) dxpy.download_dxfile(broadPeak_as.get_id(), broadPeak_as.name) #Define the output filenames peaks_dirname = 'peaks' if not os.path.exists(peaks_dirname): os.makedirs(peaks_dirname) prefix = experiment.name if prefix.endswith('.gz'): prefix = prefix[:-3] narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix) gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix) broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix) narrowPeak_gz_fn = narrowPeak_fn + ".gz" gappedPeak_gz_fn = gappedPeak_fn + ".gz" broadPeak_gz_fn = broadPeak_fn + ".gz" narrowPeak_bb_fn = "%s.bb" % (narrowPeak_fn) gappedPeak_bb_fn = "%s.bb" % (gappedPeak_fn) broadPeak_bb_fn = "%s.bb" % (broadPeak_fn) fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix) pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix) #Extract the fragment length estimate from column 3 of the cross-correlation scores file with open(xcor_scores_input.name, 'r') as fh: firstline = fh.readline() fraglen = firstline.split()[2] #third column print "Fraglen %s" % (fraglen) #=========================================== # Generate narrow peaks and preliminary signal tracks #============================================ command = 'macs2 callpeak ' + \ '-t %s -c %s ' %(experiment.name, control.name) + \ '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \ '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' %(genomesize, fraglen) print command returncode = common.block_on(command) print "MACS2 exited with returncode %d" % (returncode) assert returncode == 0, "MACS2 non-zero return" # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000) rescaled_narrowpeak_fn = common.rescale_scores('%s/%s_peaks.narrowPeak' % (peaks_dirname, prefix), scores_col=5) # Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_<peakRank> pipe = [ 'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn), r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""", 'tee %s' % (narrowPeak_fn), 'gzip -c' ] print pipe out, err = common.run_pipe(pipe, '%s' % (narrowPeak_gz_fn)) # remove additional files #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed #=========================================== # Generate Broad and Gapped Peaks #============================================ command = 'macs2 callpeak ' + \ '-t %s -c %s ' %(experiment.name, control.name) + \ '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \ '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' 
%(genomesize, fraglen) print command returncode = common.block_on(command) print "MACS2 exited with returncode %d" % (returncode) assert returncode == 0, "MACS2 non-zero return" # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000) rescaled_broadpeak_fn = common.rescale_scores('%s/%s_peaks.broadPeak' % (peaks_dirname, prefix), scores_col=5) # Sort by Col8 (for broadPeak) or Col 14(for gappedPeak) in descending order and replace long peak names in Column 4 with Peak_<peakRank> pipe = [ 'sort -k 8gr,8gr %s' % (rescaled_broadpeak_fn), r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""", 'tee %s' % (broadPeak_fn), 'gzip -c' ] print pipe out, err = common.run_pipe(pipe, '%s' % (broadPeak_gz_fn)) # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000) rescaled_gappedpeak_fn = common.rescale_scores('%s/%s_peaks.gappedPeak' % (peaks_dirname, prefix), scores_col=5) pipe = [ 'sort -k 14gr,14gr %s' % (rescaled_gappedpeak_fn), r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""", 'tee %s' % (gappedPeak_fn), 'gzip -c' ] print pipe out, err = common.run_pipe(pipe, '%s' % (gappedPeak_gz_fn)) # remove additional files #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed #=========================================== # For Fold enrichment signal tracks #============================================ # This file is a tab delimited file with 2 columns Col1 (chromosome name), Col2 (chromosome size in bp). command = 'macs2 bdgcmp ' + \ '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \ '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \ '--outdir %s -o %s_FE.bdg ' %(peaks_dirname, prefix) + \ '-m FE' print command returncode = common.block_on(command) print "MACS2 exited with returncode %d" % (returncode) assert returncode == 0, "MACS2 non-zero return" # Remove coordinates outside chromosome sizes (stupid MACS2 bug) pipe = [ 'slopBed -i %s/%s_FE.bdg -g %s -b 0' % (peaks_dirname, prefix, chrom_sizes.name), 'bedClip stdin %s %s/%s.fc.signal.bedgraph' % (chrom_sizes.name, peaks_dirname, prefix) ] print pipe out, err = common.run_pipe(pipe) #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg # Convert bedgraph to bigwig command = 'bedGraphToBigWig ' + \ '%s/%s.fc.signal.bedgraph ' %(peaks_dirname, prefix) + \ '%s ' %(chrom_sizes.name) + \ '%s' %(fc_signal_fn) print command returncode = common.block_on(command) print "bedGraphToBigWig exited with returncode %d" % (returncode) assert returncode == 0, "bedGraphToBigWig non-zero return" #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph #=========================================== # For -log10(p-value) signal tracks #============================================ # Compute sval = min(no. of reads in ChIP, no. 
of reads in control) / 1,000,000 out, err = common.run_pipe(['gzip -dc %s' % (experiment.name), 'wc -l']) chipReads = out.strip() out, err = common.run_pipe(['gzip -dc %s' % (control.name), 'wc -l']) controlReads = out.strip() sval = str(min(float(chipReads), float(controlReads)) / 1000000) print "chipReads = %s, controlReads = %s, sval = %s" % (chipReads, controlReads, sval) returncode = common.block_on( 'macs2 bdgcmp ' + \ '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \ '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \ '--outdir %s -o %s_ppois.bdg ' %(peaks_dirname, prefix) + \ '-m ppois -S %s' %(sval)) print "MACS2 exited with returncode %d" % (returncode) assert returncode == 0, "MACS2 non-zero return" # Remove coordinates outside chromosome sizes (stupid MACS2 bug) pipe = [ 'slopBed -i %s/%s_ppois.bdg -g %s -b 0' % (peaks_dirname, prefix, chrom_sizes.name), 'bedClip stdin %s %s/%s.pval.signal.bedgraph' % (chrom_sizes.name, peaks_dirname, prefix) ] print pipe out, err = common.run_pipe(pipe) #rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg # Convert bedgraph to bigwig command = 'bedGraphToBigWig ' + \ '%s/%s.pval.signal.bedgraph ' %(peaks_dirname, prefix) + \ '%s ' %(chrom_sizes.name) + \ '%s' %(pvalue_signal_fn) print command returncode = common.block_on(command) print "bedGraphToBigWig exited with returncode %d" % (returncode) assert returncode == 0, "bedGraphToBigWig non-zero return" #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg #=========================================== # Generate bigWigs from beds to support trackhub visualization of peak files #============================================ narrowPeak_bb_fname = common.bed2bb('%s' % (narrowPeak_fn), chrom_sizes.name, narrowPeak_as.name, bed_type='bed6+4') gappedPeak_bb_fname = common.bed2bb('%s' % (gappedPeak_fn), chrom_sizes.name, gappedPeak_as.name, bed_type='bed12+3') broadPeak_bb_fname = common.bed2bb('%s' % (broadPeak_fn), chrom_sizes.name, broadPeak_as.name, bed_type='bed6+3') #Temporary during development to create empty files just to get the applet to exit for fn in [ narrowPeak_fn, gappedPeak_fn, broadPeak_fn, narrowPeak_bb_fn, gappedPeak_bb_fn, broadPeak_bb_fn, fc_signal_fn, pvalue_signal_fn ]: common.block_on('touch %s' % (fn)) # Upload the file outputs narrowPeak = dxpy.upload_local_file(narrowPeak_gz_fn) gappedPeak = dxpy.upload_local_file(gappedPeak_gz_fn) broadPeak = dxpy.upload_local_file(broadPeak_gz_fn) narrowPeak_bb = dxpy.upload_local_file(narrowPeak_bb_fn) gappedPeak_bb = dxpy.upload_local_file(gappedPeak_bb_fn) broadPeak_bb = dxpy.upload_local_file(broadPeak_bb_fn) fc_signal = dxpy.upload_local_file(fc_signal_fn) pvalue_signal = dxpy.upload_local_file(pvalue_signal_fn) # Build the output structure. output = { "narrowpeaks": dxpy.dxlink(narrowPeak), "gappedpeaks": dxpy.dxlink(gappedPeak), "broadpeaks": dxpy.dxlink(broadPeak), "narrowpeaks_bb": dxpy.dxlink(narrowPeak_bb), "gappedpeaks_bb": dxpy.dxlink(gappedPeak_bb), "broadpeaks_bb": dxpy.dxlink(broadPeak_bb), "fc_signal": dxpy.dxlink(fc_signal), "pvalue_signal": dxpy.dxlink(pvalue_signal) } return output
def main(rep1_ta, ctl1_ta, rep1_xcor, rep1_paired_end, npeaks, nodups, chrom_sizes, spp_version, rep2_ta=None, ctl2_ta=None, rep2_xcor=None, rep2_paired_end=None, as_file=None, idr_peaks=False, fragment_length=None, spp_instance=None): rep1_ta_file = dxpy.DXFile(rep1_ta) dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name) rep1_ta_filename = rep1_ta_file.name ntags_rep1 = common.count_lines(rep1_ta_filename) simplicate_experiment = rep1_ta and not rep2_ta if simplicate_experiment: logger.info( "No rep2 tags specified so processing as a simplicate experiment.") else: logger.info( "Rep1 and rep2 tags specified so processing as a replicated experiment." ) if not simplicate_experiment: assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported' rep2_ta_file = dxpy.DXFile(rep2_ta) dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name) rep2_ta_filename = rep2_ta_file.name ntags_rep2 = common.count_lines(rep2_ta_filename) paired_end = rep1_paired_end unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta ctl1_ta_file = dxpy.DXFile(ctl1_ta) dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name) ctl1_ta_filename = ctl1_ta_file.name if not unary_control: ctl2_ta_file = dxpy.DXFile(ctl2_ta) dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name) ctl2_ta_filename = ctl2_ta_file.name else: ctl2_ta_file = ctl1_ta_file ctl2_ta_filename = ctl1_ta_file.name ntags_ctl1 = common.count_lines(ctl1_ta_filename) ntags_ctl2 = common.count_lines(ctl2_ta_filename) rep1_control = ctl1_ta # default. May be changed later. rep1_ctl_msg = "control rep1" rep2_control = ctl2_ta # default. May be changed later. rep2_ctl_msg = "control rep2" rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)] if not simplicate_experiment: rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename)) rep_info.extend([(ntags_ctl1, 'control 1', ctl1_ta_filename), (ntags_ctl2, 'control 2', ctl2_ta_filename)]) for n, name, filename in rep_info: logger.info("Found %d tags in %s file %s" % (n, name, filename)) subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT) if not simplicate_experiment: pool_applet = dxpy.find_one_data_object( classname='applet', name='pool', project=dxpy.PROJECT_CONTEXT_ID, zero_ok=False, more_ok=False, return_handler=True) pool_replicates_subjob = \ pool_applet.run( {"inputs": [rep1_ta, rep2_ta], "prefix": 'pooled_reps'}, name='Pool replicates') pooled_replicates = pool_replicates_subjob.get_output_ref("pooled") pooled_replicates_xcor_subjob = \ xcor_only( pooled_replicates, paired_end, spp_version, name='Pool cross-correlation') if unary_control: logger.info("Only one control supplied.") if not simplicate_experiment: logger.info( "Using one control for both replicate 1 and 2 and for the pool." ) rep2_control = rep1_control control_for_pool = rep1_control pool_ctl_msg = "one control" else: pool_controls_subjob = pool_applet.run( { "inputs": [ctl1_ta, ctl2_ta], "prefix": "PL_ctls" }, name='Pool controls') pooled_controls = pool_controls_subjob.get_output_ref("pooled") # always use the pooled controls for the pool control_for_pool = pooled_controls pool_ctl_msg = "pooled controls" # use the pooled controls for the reps depending on the ratio of rep to # control reads ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2) if ratio_ctl_reads < 1: ratio_ctl_reads = 1 / ratio_ctl_reads ratio_cutoff = 1.2 if ratio_ctl_reads > ratio_cutoff: logger.info( "Number of reads in controls differ by > factor of %f. Using pooled controls." 
% (ratio_cutoff)) rep1_control = pooled_controls rep2_control = pooled_controls else: if ntags_ctl1 < ntags_rep1: logger.info( "Fewer reads in control replicate 1 than experiment replicate 1. Using pooled controls for replicate 1." ) rep1_control = pooled_controls rep1_ctl_msg = "pooled controls" elif not simplicate_experiment and ntags_ctl2 < ntags_rep2: logger.info( "Fewer reads in control replicate 2 than experiment replicate 2. Using pooled controls for replicate 2." ) rep2_control = pooled_controls rep2_ctl_msg = "pooled controls" else: logger.info("Using distinct controls for replicate 1 and 2.") rep1_control = ctl1_ta # default. May be changed later. rep2_control = ctl2_ta # default. May be changed later. rep1_ctl_msg = "control rep1" rep2_ctl_msg = "control rep2" common_args = { 'chrom_sizes': chrom_sizes, 'spp_version': spp_version, 'as_file': as_file, 'spp_instance': spp_instance } if fragment_length is not None: common_args.update({'fragment_length': fragment_length}) rep1_peaks_subjob = spp(rep1_ta, rep1_control, rep1_xcor, bigbed=True, name='Rep1 peaks vs %s' % (rep1_ctl_msg), prefix='R1', **common_args) if not simplicate_experiment: rep2_peaks_subjob = spp(rep2_ta, rep2_control, rep2_xcor, bigbed=True, name='Rep2 peaks vs %s' % (rep2_ctl_msg), prefix='R2', **common_args) pooled_peaks_subjob = spp( pooled_replicates, control_for_pool, pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"), bigbed=True, name='Pooled peaks vs %s' % (pool_ctl_msg), prefix='PL', **common_args) output = { 'rep1_peaks': rep1_peaks_subjob.get_output_ref("peaks"), 'rep1_peaks_bb': rep1_peaks_subjob.get_output_ref("peaks_bb"), 'rep1_xcor_plot': rep1_peaks_subjob.get_output_ref("xcor_plot"), 'rep1_xcor_scores': rep1_peaks_subjob.get_output_ref("xcor_scores") } if not simplicate_experiment: output.update({ 'rep2_peaks': rep2_peaks_subjob.get_output_ref("peaks"), 'rep2_peaks_bb': rep2_peaks_subjob.get_output_ref("peaks_bb"), 'rep2_xcor_plot': rep2_peaks_subjob.get_output_ref("xcor_plot"), 'rep2_xcor_scores': rep2_peaks_subjob.get_output_ref("xcor_scores"), 'pooled_peaks': pooled_peaks_subjob.get_output_ref("peaks"), 'pooled_peaks_bb': pooled_peaks_subjob.get_output_ref("peaks_bb"), 'pooled_xcor_plot': pooled_peaks_subjob.get_output_ref("xcor_plot"), 'pooled_xcor_scores': pooled_peaks_subjob.get_output_ref("xcor_scores") }) if idr_peaks: # also call peaks on pseudoreplicates for IDR pseudoreplicator_applet = \ dxpy.find_one_data_object( classname='applet', name='pseudoreplicator', project=dxpy.PROJECT_CONTEXT_ID, zero_ok=False, more_ok=False, return_handler=True) rep1_pr_subjob = \ pseudoreplicator_applet.run( {"input_tags": rep1_ta, "prefix": 'R1PR'}, name='Pseudoreplicate rep1 -> R1PR1,2') rep1pr1_peaks_subjob = spp( rep1_pr_subjob.get_output_ref("pseudoreplicate1"), rep1_control, rep1_xcor, bigbed=False, name='R1PR1 peaks vs %s' % (rep1_ctl_msg), prefix='R1PR1', **common_args) rep1pr2_peaks_subjob = spp( rep1_pr_subjob.get_output_ref("pseudoreplicate2"), rep1_control, rep1_xcor, bigbed=False, name='R1PR2 peaks vs %s' % (rep1_ctl_msg), prefix='R1PR2', **common_args) output.update({ 'rep1pr1_peaks': rep1pr1_peaks_subjob.get_output_ref("peaks"), 'rep1pr2_peaks': rep1pr2_peaks_subjob.get_output_ref("peaks") }) if not simplicate_experiment: rep2_pr_subjob = \ pseudoreplicator_applet.run( {"input_tags": rep2_ta, "prefix": 'R2PR'}, name='Pseudoreplicate rep2 -> R2PR1,2') pool_pr1_subjob = pool_applet.run( { "inputs": [ rep1_pr_subjob.get_output_ref("pseudoreplicate1"), 
rep2_pr_subjob.get_output_ref("pseudoreplicate1") ], "prefix": 'PPR1' }, name='Pool R1PR1+R2PR1 -> PPR1') pool_pr2_subjob = pool_applet.run( { "inputs": [ rep1_pr_subjob.get_output_ref("pseudoreplicate2"), rep2_pr_subjob.get_output_ref("pseudoreplicate2") ], "prefix": 'PPR2' }, name='Pool R1PR2+R2PR2 -> PPR2') rep2pr1_peaks_subjob = spp( rep2_pr_subjob.get_output_ref("pseudoreplicate1"), rep2_control, rep2_xcor, bigbed=False, name='R2PR1 peaks vs %s' % (rep2_ctl_msg), prefix='R2PR1', **common_args) rep2pr2_peaks_subjob = spp( rep2_pr_subjob.get_output_ref("pseudoreplicate2"), rep2_control, rep2_xcor, bigbed=False, name='R2PR2 peaks vs %s' % (rep2_ctl_msg), prefix='R2PR2', **common_args) pooledpr1_peaks_subjob = spp( pool_pr1_subjob.get_output_ref("pooled"), control_for_pool, pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"), bigbed=False, name='PPR1 peaks vs %s' % (pool_ctl_msg), prefix='PPR1', **common_args) pooledpr2_peaks_subjob = spp( pool_pr2_subjob.get_output_ref("pooled"), control_for_pool, pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"), bigbed=False, name='PPR2 peaks vs %s' % (pool_ctl_msg), prefix='PPR2', **common_args) output.update({ 'rep2pr1_peaks': rep2pr1_peaks_subjob.get_output_ref("peaks"), 'rep2pr2_peaks': rep2pr2_peaks_subjob.get_output_ref("peaks"), 'pooledpr1_peaks': pooledpr1_peaks_subjob.get_output_ref("peaks"), 'pooledpr2_peaks': pooledpr2_peaks_subjob.get_output_ref("peaks"), }) return output
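# The pooled-vs-distinct control decision above (depth ratio cutoff of 1.2,
# then a fewer-reads-than-experiment check) is easy to lose in the subjob
# plumbing. A pure-function sketch that mirrors that logic for a replicated
# experiment; the return values are labels only, not dx links:
def choose_controls(ntags_rep1, ntags_rep2, ntags_ctl1, ntags_ctl2,
                    ratio_cutoff=1.2):
    ratio = float(ntags_ctl1) / float(ntags_ctl2)
    if ratio < 1:
        ratio = 1 / ratio
    if ratio > ratio_cutoff:
        return ('pooled', 'pooled')   # controls differ too much in depth
    if ntags_ctl1 < ntags_rep1:
        return ('pooled', 'ctl2')     # control 1 shallower than rep 1
    elif ntags_ctl2 < ntags_rep2:
        return ('ctl1', 'pooled')     # control 2 shallower than rep 2
    return ('ctl1', 'ctl2')           # distinct controls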
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed, chrom_sizes, as_file=None, prefix=None): # The following line(s) initialize your data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. experiment_file = dxpy.DXFile(experiment) control_file = dxpy.DXFile(control) xcor_scores_input_file = dxpy.DXFile(xcor_scores_input) chrom_sizes_file = dxpy.DXFile(chrom_sizes) chrom_sizes_filename = chrom_sizes_file.name dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename) if bigbed: as_file_file = dxpy.DXFile(as_file) as_file_filename = as_file_file.name dxpy.download_dxfile(as_file_file.get_id(), as_file_filename) # The following line(s) download your file inputs to the local file system # using variable names for the filenames. experiment_filename = experiment_file.name dxpy.download_dxfile(experiment_file.get_id(), experiment_filename) control_filename = control_file.name dxpy.download_dxfile(control_file.get_id(), control_filename) xcor_scores_input_filename = xcor_scores_input_file.name dxpy.download_dxfile(xcor_scores_input_file.get_id(), xcor_scores_input_filename) if not prefix: output_filename_prefix = experiment_filename.rstrip('.gz').rstrip('.tagAlign') else: output_filename_prefix = prefix peaks_filename = output_filename_prefix + '.regionPeak' final_peaks_filename = peaks_filename + '.gz' #spp adds .gz, so this is the file name that's actually created xcor_plot_filename = output_filename_prefix + '.pdf' xcor_scores_filename = output_filename_prefix + '.ccscores' print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT) fraglen_column = 3 # third column in the cross-correlation scores input file with open(xcor_scores_input_filename, 'r') as f: line = f.readline() fragment_length = int(line.split('\t')[fraglen_column-1]) print "Read fragment length: %d" %(fragment_length) #run_spp_command = subprocess.check_output('which run_spp.R', shell=True) spp_tarball = '/phantompeakqualtools/spp_1.10.1.tar.gz' if nodups: run_spp = '/phantompeakqualtools/run_spp_nodups.R' else: run_spp = '/phantompeakqualtools/run_spp.R' #install spp subprocess.check_call('ls -l', shell=True) subprocess.check_call(shlex.split('R CMD INSTALL %s' %(spp_tarball))) spp_command = "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s" %(run_spp, cpu_count(), experiment_filename, control_filename, npeaks, fragment_length, peaks_filename, xcor_plot_filename, xcor_scores_filename) print spp_command # process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE) # for line in iter(process.stdout.readline, ''): # sys.stdout.write(line) subprocess.check_call(shlex.split(spp_command)) #when one of the peak coordinates are an exact multiple of 10, spp (R) outputs the coordinate in scientific notation #this changes any such coodinates to decimal notation #this assumes 10-column output and that the 2nd and 3rd columns are coordinates #slopBed adjusts feature end coordinates that go off the end of the chromosome #bedClip removes any features that are still not within the boundaries of the chromosome fix_coordinate_peaks_filename = output_filename_prefix + '.fixcoord.regionPeak' out, err = common.run_pipe([ "gzip -dc %s" %(final_peaks_filename), "tee %s" %(peaks_filename), r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""", 'slopBed -i stdin -g %s -b 0' %(chrom_sizes_filename), 'bedClip stdin %s %s' 
%(chrom_sizes_filename, fix_coordinate_peaks_filename) ]) #These lines transfer the peaks files to the temporary workspace for debugging later #Only at the end are the final files uploaded that will be returned from the applet dxpy.upload_local_file(peaks_filename) dxpy.upload_local_file(fix_coordinate_peaks_filename) n_spp_peaks = common.count_lines(peaks_filename) print "%s peaks called by spp" %(n_spp_peaks) print "%s of those peaks removed due to bad coordinates" %(n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)) print "First 50 peaks" print subprocess.check_output('head -50 %s' %(fix_coordinate_peaks_filename), shell=True, stderr=subprocess.STDOUT) if bigbed: peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename) if peaks_bb_filename: peaks_bb = dxpy.upload_local_file(peaks_bb_filename) if not filecmp.cmp(peaks_filename,fix_coordinate_peaks_filename): print "Returning peaks with fixed coordinates" print subprocess.check_output(shlex.split('gzip %s' %(fix_coordinate_peaks_filename))) final_peaks_filename = fix_coordinate_peaks_filename + '.gz' print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT) #print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT) #print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT) peaks = dxpy.upload_local_file(final_peaks_filename) xcor_plot = dxpy.upload_local_file(xcor_plot_filename) xcor_scores = dxpy.upload_local_file(xcor_scores_filename) output = {} output["peaks"] = dxpy.dxlink(peaks) output["xcor_plot"] = dxpy.dxlink(xcor_plot) output["xcor_scores"] = dxpy.dxlink(xcor_scores) if bigbed and peaks_bb_filename: output["peaks_bb"] = dxpy.dxlink(peaks_bb) return output
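# The fragment length above is read from the third tab-delimited column of the
# cross-correlation scores file. A small sketch of that parse; the comma
# handling covers spp's occasional comma-separated estimate list and is an
# assumption beyond what the applet does:
def read_fragment_length(xcor_scores_path, column=3):
    with open(xcor_scores_path) as fh:
        fields = fh.readline().rstrip('\n').split('\t')
    return int(fields[column - 1].split(',')[0])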
def post_extraction(CpG_context_dxlink, CHG_context_dxlink, CHH_context_dxlink, dme_ix_dxlink, target_root, qc_metrics, props): '''runs everything after bismark simple extraction in the main instance''' print "* post_extraction(): Retrieve context files and index..." CpG_context = 'CpG_context_%s.txt' % target_root CHG_context = 'CHG_context_%s.txt' % target_root CHH_context = 'CHH_context_%s.txt' % target_root run_cmd('mkdir -p output/') dxpy.download_dxfile(CpG_context_dxlink, 'output/%s.gz' % CpG_context) dxpy.download_dxfile(CHG_context_dxlink, 'output/%s.gz' % CHG_context) dxpy.download_dxfile(CHH_context_dxlink, 'output/%s.gz' % CHH_context) dme_ix = "dme_index.tar.gz" dxpy.download_dxfile(dme_ix_dxlink, dme_ix) print "* post_extraction(): Uncompress..." run_cmd('tar -zxf ' + dme_ix) run_cmd('mv input/chrom.sizes .') chrom_sizes = "chrom.sizes" run_cmd('gunzip output/%s.gz' % CpG_context) run_cmd('gunzip output/%s.gz' % CHG_context) run_cmd('gunzip output/%s.gz' % CHH_context) # First coverage: (bedGraph, cx_report) = bismark_coverage(target_root, CpG_context, CHG_context, CHH_context, gzip=False, cleanup=True) # Next beds (CpG_bed, CHG_bed, CHH_bed, CpG_bb, CHG_bb, CHH_bb) = bedmethyl(target_root, cx_report, chrom_sizes, cleanup=True) # Finally signal bigWig = signal(target_root, bedGraph, chrom_sizes, cleanup=True) print "* post_extraction(): Storing results..." CpG_bed_dxfile = dxpy.upload_local_file(CpG_bed, properties=props, details=qc_metrics) CHG_bed_dxfile = dxpy.upload_local_file(CHG_bed, properties=props, details=qc_metrics) CHH_bed_dxfile = dxpy.upload_local_file(CHH_bed, properties=props, details=qc_metrics) CpG_bb_dxfile = dxpy.upload_local_file(CpG_bb, properties=props, details=qc_metrics) CHG_bb_dxfile = dxpy.upload_local_file(CHG_bb, properties=props, details=qc_metrics) CHH_bb_dxfile = dxpy.upload_local_file(CHH_bb, properties=props, details=qc_metrics) bigWig_dxfile = dxpy.upload_local_file(bigWig, properties=props, details=qc_metrics) print "* post_extraction(): Check storage..." run_cmd('ls -l') run_cmd('df -k .') return { "CpG_bed_dxlink": dxpy.dxlink(CpG_bed_dxfile), "CHG_bed_dxlink": dxpy.dxlink(CHG_bed_dxfile), "CHH_bed_dxlink": dxpy.dxlink(CHH_bed_dxfile), "CpG_bb_dxlink": dxpy.dxlink(CpG_bb_dxfile), "CHG_bb_dxlink": dxpy.dxlink(CHG_bb_dxfile), "CHH_bb_dxlink": dxpy.dxlink(CHH_bb_dxfile), "bigWig_dxlink": dxpy.dxlink(bigWig_dxfile) }
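# run_cmd() is used throughout these applets but defined elsewhere. Judging
# from the call sites (plain shell commands, optional 'out=', 'append=',
# 'silent='), a minimal stand-in might look like this; a sketch under those
# assumptions, not the pipeline's actual helper:
import subprocess

def run_cmd(cmd, out=None, append=False, silent=False):
    if not silent:
        print("* Running: %s" % cmd)
    if out is None:
        subprocess.check_call(cmd, shell=True)
    else:
        mode = 'a' if append else 'w'
        with open(out, mode) as fh:
            subprocess.check_call(cmd, shell=True, stdout=fh)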
def crop(reads1_file, reads2_file, crop_length, debug): if debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) logger.setLevel(logging.INFO) if crop_length == 'native': output = dict( zip(["cropped_reads1", "cropped_reads2"], [reads1_file, reads2_file])) else: reads1_filename = dxpy.describe(reads1_file)['name'] reads1_basename = strip_extensions(reads1_filename, STRIP_EXTENSIONS) dxpy.download_dxfile(reads1_file, reads1_filename) if reads2_file: end_string = "PE" reads2_filename = dxpy.describe(reads2_file)['name'] reads2_basename = \ strip_extensions(reads2_filename, STRIP_EXTENSIONS) dxpy.download_dxfile(reads2_file, reads2_filename) output_fwd_paired_filename = reads1_basename + '-crop-paired.fq.gz' output_fwd_unpaired_filename = \ reads1_basename + '-crop-unpaired.fq.gz' output_rev_paired_filename = reads2_basename + '-crop-paired.fq.gz' output_rev_unpaired_filename = \ reads2_basename + '-crop-unpaired.fq.gz' SE_output_filename = None else: end_string = "SE" reads2_filename = None reads2_basename = None output_fwd_paired_filename = None output_fwd_unpaired_filename = None output_rev_paired_filename = None output_rev_unpaired_filename = None SE_output_filename = reads1_basename + "-crop.fq.gz" crop_command = ' '.join([ s for s in [ 'java -jar', TRIMMOMATIC_PATH, end_string, '-threads %d' % (cpu_count()), reads1_filename, reads2_filename, SE_output_filename, output_fwd_paired_filename, output_fwd_unpaired_filename, output_rev_paired_filename, output_rev_unpaired_filename, 'MINLEN:%s' % (crop_length), 'CROP:%s' % (crop_length) ] if s ]) logger.info("Cropping with: %s" % (crop_command)) print(subprocess.check_output(shlex.split(crop_command))) print(subprocess.check_output(shlex.split('ls -l'))) if SE_output_filename: SE_output = dxpy.upload_local_file(SE_output_filename) cropped_reads = [dxpy.dxlink(SE_output), None] else: output_fwd_paired = \ dxpy.upload_local_file(output_fwd_paired_filename) output_rev_paired = \ dxpy.upload_local_file(output_rev_paired_filename) cropped_reads = [ dxpy.dxlink(output_fwd_paired), dxpy.dxlink(output_rev_paired) ] output = dict(zip(["cropped_reads1", "cropped_reads2"], cropped_reads)) logger.info("returning from crop with output %s" % (output)) return output
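# strip_extensions() and STRIP_EXTENSIONS are referenced above but defined
# elsewhere in the applet. A plausible minimal version (an assumption, not the
# real helper) peels known fastq suffixes off the filename in order, so
# 'reads.fq.gz' becomes 'reads':
STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq']

def strip_extensions(filename, extensions):
    basename = filename
    for ext in extensions:
        if basename.endswith(ext):
            basename = basename[:-len(ext)]
    return basename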
def merge_extract_full(bam_set, map_report_set, dme_ix_dxlink, uncompress_bam, props): '''subjob runs bismark_methylation_extractor on mem1_hdd2_x32''' (target_root, biorep_bam) = merge_bams(bam_set, 32) (biorep_map, all_reports) = merge_map_reports(map_report_set, target_root) (qc_metrics, reads, biorep_bam_qc) = biorep_bam_qc_metrics(target_root, all_reports) print "* merge_extract_full(): Retrieve and uncompress index..." dme_ix = "dme_index.tar.gz" dxpy.download_dxfile(dme_ix_dxlink, dme_ix) run_cmd('tar -zxf ' + dme_ix) # NOTE: Better to use sam and let extractor use more threads, but this takes up precious storage (alignments, ncores) = bam_or_sam(biorep_bam, uncompress_bam, target_root) bismark_full_extract(target_root, alignments, ncores) qc_metrics = bismark_qc_metrics(target_root, qc_metrics) print "* merge_extract_full(): Retrieve split report..." append_line( "\n===== bismark_methylation_extractor: splitting_report =====", biorep_bam_qc) run_cmd('cat %s_splitting_report.txt' % target_root, out=biorep_bam_qc, append=True, silent=True) # TODO: Is this even needed? Currently we do to get the size! #if len(bam_set) > 1: # Wouldn't need to do this unless there is a merge # print "* merge_extract(): Storing biorep bam..." # props_ex = props.copy() # props_ex.update({ 'reads': str(reads) }) # biorep_bam_dxlink = dxpy.dxlink(dxpy.upload_local_file(biorep_bam,properties=props_ex,details=qc_metrics,wait_on_close=True)) #else: # biorep_bam_dxlink = bam_set[0] print "* merge_extract_full(): Storing extraction results..." biorep_bam_qc_dxfile = dxpy.upload_local_file(biorep_bam_qc, properties=props, details=qc_metrics) biorep_map_dxfile = dxpy.upload_local_file(biorep_map, properties=props, details=qc_metrics) run_cmd('pigz output/%s.CX_report.txt' % target_root) cx_report_dxfile = dxpy.upload_local_file('output/%s.CX_report.txt.gz' % target_root) bedgraph_gz_dxfile = dxpy.upload_local_file('output/%s.bedGraph.gz' % target_root) chrom_sizes_dxfile = dxpy.upload_local_file('input/chrom.sizes') split_report_dxfile = dxpy.upload_local_file(target_root + '_splitting_report.txt') mbias_report_dxfile = dxpy.upload_local_file(target_root + '_mbias_report.txt', properties=props, details=qc_metrics) print "* merge_extract_full(): Check storage..." run_cmd('ls -l') run_cmd('df -k .') return { #"biorep_bam_dxlink": biorep_bam_dxfile, "biorep_bam_qc_dxlink": dxpy.dxlink(biorep_bam_qc_dxfile), "biorep_map_dxlink": dxpy.dxlink(biorep_map_dxfile), "split_report_dxlink": dxpy.dxlink(split_report_dxfile), "cx_report_dxlink": dxpy.dxlink(cx_report_dxfile), "bedgraph_gz_dxlink": dxpy.dxlink(bedgraph_gz_dxfile), "chrom_sizes_dxlink": dxpy.dxlink(chrom_sizes_dxfile), "mbias_report_dxlink": dxpy.dxlink(mbias_report_dxfile), "target_root": target_root, "qc_metrics": qc_metrics }
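# bam_or_sam() above decides whether the methylation extractor reads the
# merged bam directly or an uncompressed sam (the inline comment notes sam
# allows more extractor threads at the cost of disk). A hedged sketch of one
# plausible implementation; the thread split is an assumption:
import subprocess

def bam_or_sam(biorep_bam, uncompress_bam, target_root, ncores=32):
    if not uncompress_bam:
        # Keep the bam and leave cores for decompression inside the extractor.
        return (biorep_bam, ncores // 2)
    sam_file = target_root + '.sam'
    with open(sam_file, 'w') as fh:
        subprocess.check_call(['samtools', 'view', biorep_bam], stdout=fh)
    return (sam_file, ncores)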
def postprocess(indexed_reads, unmapped_reads, reference_tar, bwa_version, samtools_version, debug): if debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) samtools = SAMTOOLS_PATH.get(samtools_version) assert samtools, "samtools version %s is not supported" % ( samtools_version) bwa = BWA_PATH.get(bwa_version) assert bwa, "BWA version %s is not supported" % (bwa_version) logger.info("In postprocess with samtools %s and bwa %s" % (samtools, bwa)) indexed_reads_filenames = [] unmapped_reads_filenames = [] for i, reads in enumerate(indexed_reads): read_pair_number = i + 1 fn = dxpy.describe(reads)['name'] logger.info("indexed_reads %d: %s" % (read_pair_number, fn)) indexed_reads_filenames.append(fn) dxpy.download_dxfile(reads, fn) unmapped = unmapped_reads[i] fn = dxpy.describe(unmapped)['name'] logger.info("unmapped reads %d: %s" % (read_pair_number, fn)) unmapped_reads_filenames.append(fn) dxpy.download_dxfile(unmapped, fn) reference_tar_filename = dxpy.describe(reference_tar)['name'] logger.info("reference_tar: %s" % (reference_tar_filename)) dxpy.download_dxfile(reference_tar, reference_tar_filename) # extract the reference files from the tar reference_dirname = 'reference_files' reference_filename = \ resolve_reference(reference_tar_filename, reference_dirname) logger.info("Using reference file: %s" % (reference_filename)) paired_end = len(indexed_reads) == 2 if paired_end: r1_basename = strip_extensions(unmapped_reads_filenames[0], STRIP_EXTENSIONS) r2_basename = strip_extensions(unmapped_reads_filenames[1], STRIP_EXTENSIONS) reads_basename = r1_basename + r2_basename else: reads_basename = strip_extensions(unmapped_reads_filenames[0], STRIP_EXTENSIONS) raw_bam_filename = '%s.raw.srt.bam' % (reads_basename) raw_bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' % (reads_basename) if paired_end: reads1_filename = indexed_reads_filenames[0] reads2_filename = indexed_reads_filenames[1] unmapped_reads1_filename = unmapped_reads_filenames[0] unmapped_reads2_filename = unmapped_reads_filenames[1] raw_sam_filename = reads_basename + ".raw.sam" badcigar_filename = "badreads.tmp" steps = [ "%s sampe -P %s %s %s %s %s" % (bwa, reference_filename, reads1_filename, reads2_filename, unmapped_reads1_filename, unmapped_reads2_filename), "tee %s" % (raw_sam_filename), r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! 
/^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""", "sort", "uniq" ] out, err = common.run_pipe(steps, badcigar_filename) print(out) if err: logger.error("sampe error: %s" % (err)) steps = [ "cat %s" % (raw_sam_filename), "grep -v -F -f %s" % (badcigar_filename) ] else: # single end reads_filename = indexed_reads_filenames[0] unmapped_reads_filename = unmapped_reads_filenames[0] steps = [ "%s samse %s %s %s" % (bwa, reference_filename, reads_filename, unmapped_reads_filename) ] if samtools_version == "0.1.9": steps.extend([ "%s view -Su -" % (samtools), "%s sort - %s" % (samtools, raw_bam_filename.rstrip('.bam')) ]) # samtools adds .bam else: steps.extend([ "%s view -@%d -Su -" % (samtools, cpu_count()), "%s sort -@%d - %s" % (samtools, cpu_count(), raw_bam_filename.rstrip('.bam')) ]) # samtools adds .bam logger.info("Running pipe: %s" % (steps)) out, err = common.run_pipe(steps) if out: print(out) if err: logger.error("samtools error: %s" % (err)) with open(raw_bam_mapstats_filename, 'w') as fh: subprocess.check_call(shlex.split("%s flagstat %s" % (samtools, raw_bam_filename)), stdout=fh) print(subprocess.check_output('ls -l', shell=True)) mapped_reads = dxpy.upload_local_file(raw_bam_filename) mapping_statistics = dxpy.upload_local_file(raw_bam_mapstats_filename) flagstat_qc = flagstat_parse(raw_bam_mapstats_filename) output = { 'mapped_reads': dxpy.dxlink(mapped_reads), 'mapping_statistics': dxpy.dxlink(mapping_statistics), 'n_mapped_reads': flagstat_qc.get('mapped')[0] # 0 is hi-q reads } logger.info("Returning from postprocess with output: %s" % (output)) return output
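# The awk one-liner above collects names of reads whose CIGAR operation
# lengths (after dropping deletions, which consume no query bases) do not sum
# to the sequence length, so both mates can be grepped out of the raw sam.
# The same check in plain Python, as a sketch:
import re

def bad_cigar_read_names(sam_path):
    bad = set()
    with open(sam_path) as sam:
        for line in sam:
            if line.startswith('@'):
                continue
            fields = line.rstrip('\n').split('\t')
            cigar, seq = fields[5], fields[9]
            if cigar == '*':
                continue
            cigar = re.sub(r'[0-9]+D', '', cigar)
            span = sum(int(n) for n in re.findall(r'([0-9]+)[A-Z]', cigar))
            if span != len(seq):
                bad.add(fields[0])
    return bad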
def merge_bams(bam_set, ncores): '''Merges techrep bams into biorep bam.''' # NOTE: dme-align produces *_techrep_bismark.bam and dme-extract merges 1+ techrep bams into a *_bismark_biorep.bam. # The reason for the name 'word' order is so thal older *_bismark.bam alignments are recognizable as techrep bams target_root = "" merged = "" tech_reps = "" exp_id = "" rep_tech = "" for techrep_bam_dlink in reversed(bam_set): file_desc = dxpy.describe(techrep_bam_dlink) file_root = file_desc['name'] print "* Working on '" + str(techrep_bam_dlink) + "' " + file_root file_root = file_root.replace('_techrep_bismark.bam', '') file_root = file_root.replace('_bismark.bam', '') if len(target_root) == 0: target_root = file_root else: target_root = file_root + '_' + target_root if len(merged) == 0: target_root += '_bismark_biorep' merged = 's merged as' # Try to simplify the names if os.path.isfile('/usr/bin/parse_property.py'): if len(exp_id) == 0: file_path = file_desc['folder'] + '/' + file_desc['name'] exp_id = subprocess.check_output(shlex.split('parse_property.py -f %s --project %s --exp_id -q' \ % (file_desc['id'], file_desc['project']) )) exp_id = ''.join(exp_id.split()) # Remove \n, etc. if len(exp_id) > 0: print "* Discovered exp_id: '%s'" % exp_id if len(exp_id) > 0: rep_tech = subprocess.check_output(shlex.split('parse_property.py -f %s --project %s --rep_tech -q' \ % (file_desc['id'], file_desc['project']) )) rep_tech = ''.join(rep_tech.split()) # Remove \n, etc. if len(rep_tech) > 0: print "* Discovered rep_tech: '%s'" % rep_tech if len(tech_reps) > 0: tech_reps = tech_reps + '_' + rep_tech else: tech_reps = rep_tech print "* Downloading %s_techrep_bismark.bam file..." % file_root dxpy.download_dxfile(techrep_bam_dlink, file_root + '_techrep_bismark.bam') if not os.path.isfile("sofar.bam"): run_cmd('mv %s_techrep_bismark.bam sofar.bam' % file_root) else: print "* Merging in %s_techrep_bismark.bam..." % file_root # NOTE: keeps the first header run_cmd('samtools cat sofar.bam %s_techrep_bismark.bam' % file_root, out='merging.bam') run_cmd('mv merging.bam sofar.bam') run_cmd('rm %s_techrep_bismark.bam' % file_root) # STORAGE IS LIMITED if len(exp_id) > 0 and len(tech_reps) > 0: target_root = '%s_%s_bismark_biorep' % (exp_id, tech_reps) print "* Name biorep bam as: %s.bam" % target_root else: print "* Long biorep bam to be named: %s.bam" % target_root # At this point there is a 'sofar.bam' with one or more input bams if len(merged) == 0: target_root = file_root + "_bismark_biorep" run_cmd('mv sofar.bam %s.bam' % target_root) print "* Only one input file '%s.bam', no merging required." % target_root else: # sorting needed due to samtools cat print "* Sorting merged bam..." run_cmd('samtools sort -@ %d -m 1600M -f sofar.bam %s.bam' % (ncores, target_root)) run_cmd('rm sofar.bam') # STORAGE IS LIMITED print "* Files merged into '%s.bam'" % target_root return (target_root, target_root + '.bam')
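# The merge loop above concatenates techrep bams one at a time with
# 'samtools cat' (keeping the first header) and sorts once at the end. A
# condensed sketch of the same idea for a list of local bam paths, assuming
# the same legacy samtools 'sort -f' syntax used above; names are
# illustrative:
import subprocess

def merge_techrep_bams(bam_paths, target_root, ncores=8):
    out_bam = target_root + '.bam'
    if len(bam_paths) == 1:
        subprocess.check_call(['mv', bam_paths[0], out_bam])
        return out_bam
    subprocess.check_call('samtools cat -o sofar.bam ' + ' '.join(bam_paths),
                          shell=True)
    # samtools cat only concatenates, so coordinate-sort the result.
    subprocess.check_call('samtools sort -@ %d -m 1600M -f sofar.bam %s'
                          % (ncores, out_bam), shell=True)
    return out_bam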
def peaks_stanza(accession, url, name, n, tracktype='bigBed 6 +', lowpass=[], dx=None): return_string = \ "\t\ttrack %s%d\n" %(accession,n) + \ "\t\tbigDataUrl %s\n" %(url) + \ "\t\tshortLabel %s\n" %(name[:17]) + \ "\t\tparent %sviewpeaks on\n" %(accession) + \ "\t\ttype %s\n" %(tracktype) + \ "\t\tvisibility dense\n" + \ "\t\tview PK\n" + \ "\t\tpriority %d\n\n" %(n) n_stanzas = 1 if not lowpass: lowpass = [] if isinstance(lowpass, int): lowpass = [lowpass] extra_stanza_count = 0 for (i, cutoff) in enumerate(lowpass, start=1): fn = dx.get_id() if not os.path.isfile(fn): dxpy.download_dxfile(dx.get_id(), fn) cutoffstr = '-lt%d' % (cutoff) outfn = fn + cutoffstr print fn, os.path.getsize(fn), subprocess.check_output( 'wc -l %s' % (fn), shell=True).split()[0] bed_fn = fn + '.bed' common.block_on('bigBedToBed %s %s' % (fn, bed_fn)) common.run_pipe([ 'cat %s' % (bed_fn), r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (($3-$2) < %d) {print $0}}'""" % (cutoff) ], outfn) print outfn, os.path.getsize(outfn), subprocess.check_output( 'wc -l %s' % (outfn), shell=True).split()[0] if tracktype == 'bigBed 6 +': as_file = 'narrowPeak.as' elif tracktype == 'bigBed 12 +': as_file = 'gappedPeak.as' else: print "Cannot match tracktype %s to any .as file" % (tracktype) bb_fn = common.bed2bb(outfn, 'mm10.chrom.sizes', as_file) newdx = dxpy.upload_local_file(filename=bb_fn, folder="/tracks", wait_on_close=True) new_url, headers = newdx.get_download_url(duration=sys.maxint, preauthenticated=True) new_lines = [ "\t\ttrack %s%d" % (accession, n + i), "\t\tbigDataUrl %s" % (new_url), "\t\tshortLabel %s" % (name[:17 - len(cutoffstr)] + cutoffstr), "\t\tparent %sviewpeaks on" % (accession), "\t\ttype %s" % (tracktype), "\t\tvisibility dense", "\t\tview PK", "\t\tpriority %d\n\n" % (n + i) ] new_stanza = '\n'.join(new_lines) return_string += new_stanza n_stanzas += 1 os.remove(bed_fn) os.remove(bb_fn) os.remove(outfn) os.remove(fn) return (return_string, n_stanzas)
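# The low-pass option above keeps only peaks shorter than each cutoff (the
# awk test on $3-$2) before rebuilding the bigBed. The equivalent filter in
# plain Python over an uncompressed BED file, as a sketch:
def lowpass_filter_bed(bed_in, bed_out, max_len):
    with open(bed_in) as src, open(bed_out, 'w') as dst:
        for line in src:
            fields = line.split('\t')
            if int(fields[2]) - int(fields[1]) < max_len:
                dst.write(line)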
def main(pipe_file, file_meta, key=None, debug=False, skipvalidate=True): # The following line(s) initialize your data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. #files = [dxpy.DXFile(item) for item in files] # The following line(s) download your file inputs to the local file system # using variable names for the filenames. #for i, f in enumerate(files): # dxpy.download_dxfile(f.get_id(), "files-" + str(i)) # Split your work into parallel tasks. As an example, the # following generates 10 subjobs running with the same dummy # input. encd.logger = logging.getLogger("Applet.dxe") if debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) (AUTHID, AUTHPW, SERVER) = encd.processkey(key) f_des = dxpy.describe(pipe_file) filename = f_des['name'] fid = f_des['id'] folder = dxpy.DXFile(fid, project=dxpy.PROJECT_CONTEXT_ID).folder logger.info("* Downloading file from dx to local...") start = datetime.now() dx_file = dxpy.download_dxfile(pipe_file, filename) end = datetime.now() duration = end - start logger.info("* Download in %.2f seconds" % duration.seconds) if filename.endswith('.bed') or filename.endswith('.gff'): subprocess.check_call(['gzip', filename]) filename = filename + '.gz' # gathering metadata file_meta['submitted_file_name'] = "%s/%s" % (folder, filename) file_meta['md5sum'] = dx.calc_md5(filename).hexdigest() file_meta['file_size'] = os.path.getsize(filename) if "aliases" not in file_meta: file_meta["aliases"] = [] file_meta["aliases"].append("dnanexus:" + fid) if file_meta.get('accession') != None: file_meta[ "status"] = "upload failed" # Can only repost to same accession if status is upload failed. if not skipvalidate: logger.info("* Validating: %s (%s)" % (filename, folder)) start = datetime.now() v = validate(filename, file_meta) end = datetime.now() duration = end - start logger.info("* Validated in %.2f seconds" % duration.seconds) else: v = {'validation': 'Not Run'} if v['validation'] == "Error count 0\n" or v['validation'].find( 'Not Run') == 0: ## yes with CR logger.info("* Posting file and metadata to ENCODEd...") f_obj = encd.post_file(filename, file_meta, SERVER, AUTHID, AUTHPW) v['accession'] = f_obj.get('accession', "NOT POSTED") if v['accession'] == "NOT POSTED": v['accession'] = f_obj.get("external_accession", "NOT POSTED") if v['accession'] == "NOT POSTED": v['accession'] = file_meta.get("external_accession", "NOT POSTED") print "* Returned f_obj..." print json.dumps(f_obj, indent=4, sort_keys=True) raise # This will ensure that splashdown doesn't continue uploading. 
post_status = f_obj.get('status', 'upload failed') if post_status == 'upload failed': logger.info("* Post ERROR on %s to '%s': %s" % (filename, v['accession'], post_status)) # NOTE: need to set the accession to dx file nonetheless, since the file object was created in encodeD else: logger.info("* Posted %s to '%s'" % (filename, v['accession'])) # update pipe_file md5sum and accession properties dx.file_set_property(fid, 'md5sum', file_meta['md5sum'], proj_id=dxpy.PROJECT_CONTEXT_ID, verbose=True) acc_key = dx.property_accesion_key(SERVER) if post_status == 'upload failed': acc_key = acc_key + ' upload failed' acc = dx.file_set_property(fid, acc_key, v['accession'], proj_id=dxpy.PROJECT_CONTEXT_ID, verbose=True) if acc == None or acc != v['accession']: logger.info("* Failed to update '%s' to '%s' in file properties" % (acc_key, v['accession'])) else: logger.info("* Updated '%s' to '%s' in file properties" % (acc_key, acc)) #logger.debug(json.dumps(f_obj, indent=4, sort_keys=True)) if post_status == 'upload failed': raise # This will ensure that splashdown doesn't continue uploading. else: logger.info("* File invalid: %s" % v['validation']) v['accession'] = "NOT POSTED" return v
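# Sketch of the metadata fields assembled above before posting to ENCODEd.
# calc_md5 is a stand-in for dx.calc_md5 (assumed to return a hashlib-style
# object, matching the .hexdigest() call above); the keys mirror those used
# above:
import hashlib, os

def calc_md5(path, chunk=4 * 1024 * 1024):
    md5 = hashlib.md5()
    with open(path, 'rb') as fh:
        for block in iter(lambda: fh.read(chunk), b''):
            md5.update(block)
    return md5

def gather_file_meta(file_meta, folder, filename, fid):
    file_meta['submitted_file_name'] = '%s/%s' % (folder, filename)
    file_meta['md5sum'] = calc_md5(filename).hexdigest()
    file_meta['file_size'] = os.path.getsize(filename)
    file_meta.setdefault('aliases', []).append('dnanexus:' + fid)
    return file_meta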
def histone(args, analysis, experiment_accession, first_analysis): authid, authpw, server = processkey(args.key) keypair = (authid, authpw) stages = analysis.get('stages') peaks_stage = next( stage for stage in stages if stage['execution']['name'] == "ENCODE Peaks")['execution'] replicated_stages = [ stage['execution'] for stage in stages if 'Final' in stage['execution']['name'] ] # this is just a cheap way of determining singlicate or replicate analysis # singlicate analyses have no rescue_ratio singlicate_analysis = all(stage['output'].get('rep2_signal') is None for stage in replicated_stages) output_names = [ 'rep1_narrowpeaks_bb', 'rep1_gappedpeaks_bb', 'rep1_pvalue_signal', 'rep1_fc_signal', ] if singlicate_analysis else [ 'rep1_narrowpeaks_bb', 'rep2_narrowpeaks_bb', 'pooled_narrowpeaks_bb', 'rep1_gappedpeaks_bb', 'rep2_gappedpeaks_bb', 'pooled_gappedpeaks_bb', 'rep1_pvalue_signal', 'rep2_pvalue_signal', 'pooled_pvalue_signal', 'rep1_fc_signal', 'rep2_fc_signal', 'pooled_fc_signal' ] outputs = dict( zip(output_names, [{ 'dx': dxpy.DXFile(peaks_stage['output'][output_name]) } for output_name in output_names])) output_names.insert(3, 'replicated_narrowpeaks_bb') outputs.update({ 'replicated_narrowpeaks_bb': { 'dx': dxpy.DXFile( next(stage['execution']['output']['overlapping_peaks_bb'] for stage in stages if stage['execution']['name'] == 'Final narrowpeaks')) } }) output_names.insert(7, 'replicated_gappedpeaks_bb') outputs.update({ 'replicated_gappedpeaks_bb': { 'dx': dxpy.DXFile( next(stage['execution']['output']['overlapping_peaks_bb'] for stage in stages if stage['execution']['name'] == 'Final gappedpeaks')) } }) track_directory = os.path.join(args.ddir, experiment_accession) url_base = urlparse.urljoin(args.turl, experiment_accession + '/') #print "url_base %s" %(url_base) if not args.nodownload and not os.path.exists(track_directory): os.makedirs(track_directory) if first_analysis: if os.path.exists(args.tdbpath): if args.truncate: trackDb = open(args.tdbpath, 'w') else: trackDb = open(args.tdbpath, 'a') else: if not os.path.exists(os.path.dirname(args.tdbpath)): os.makedirs(os.path.dirname(args.tdbpath)) trackDb = open(args.tdbpath, 'w') else: trackDb = open(args.tdbpath, 'a') for (output_name, output) in outputs.iteritems(): local_path = os.path.join(track_directory, output['dx'].name) print output_name, output['dx'].get_id(), local_path if not args.nodownload: dxpy.download_dxfile(output['dx'].get_id(), local_path) outputs[output_name].update({'local_path': local_path}) #print "Joining %s and %s" %(url_base, os.path.basename(local_path)) if args.dxf: url, headers = output['dx'].get_download_url(duration=sys.maxint, preauthenticated=True) outputs[output_name].update({'url': url}) else: outputs[output_name].update({ 'url': urlparse.urljoin(url_base, os.path.basename(local_path)) }) #print outputs[output_name]['url'] experiment = encoded_get( urlparse.urljoin(server, '/experiments/%s' % (experiment_accession)), keypair) description = '%s %s %s %s' % ( experiment['target']['label'], experiment['replicates'][0]['library'] ['biosample']['biosample_term_name'], experiment['replicates'][0]['library']['biosample'].get('life_stage'), experiment['replicates'][0]['library']['biosample'].get('age_display')) longLabel = 'E3 Histone ChIP - %s - %s' % (experiment_accession, description) if args.tag: longLabel += ' - %s' % (args.tag) trackDb.write(composite_stanza(experiment_accession, longLabel)) first_peaks = True first_signal = True priority = 1 for (n, output_name) in enumerate(output_names, 
start=1): if output_name.endswith('narrowpeaks_bb'): if first_peaks: trackDb.write(viewpeaks_stanza(experiment_accession)) first_peaks = False stanzas, n_stanzas = peaks_stanza(experiment_accession, outputs[output_name]['url'], output_name, priority, tracktype="bigBed 6 +", lowpass=args.lowpass, dx=outputs[output_name]['dx']) trackDb.write(stanzas) priority += n_stanzas elif output_name.endswith('gappedpeaks_bb'): if first_peaks: trackDb.write(viewpeaks_stanza(experiment_accession)) first_peaks = False stanzas, n_stanzas = peaks_stanza(experiment_accession, outputs[output_name]['url'], output_name, priority, tracktype="bigBed 12 +", lowpass=args.lowpass, dx=outputs[output_name]['dx']) trackDb.write(stanzas) priority += n_stanzas elif output_name.endswith('_signal'): if first_signal: trackDb.write(viewsignal_stanza(experiment_accession)) first_signal = False trackDb.write( signal_stanza(experiment_accession, outputs[output_name]['url'], output_name, priority, tracktype="bigWig")) priority += 1 trackDb.close()
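# The output_names list above is built by hand for the singlicate and
# replicated cases. The same selection as a small helper, which keeps the
# bigBed/signal key names in one place; a sketch only:
def trackable_output_names(singlicate):
    kinds = ['narrowpeaks_bb', 'gappedpeaks_bb', 'pvalue_signal', 'fc_signal']
    reps = ['rep1'] if singlicate else ['rep1', 'rep2', 'pooled']
    return ['%s_%s' % (rep, kind) for kind in kinds for rep in reps]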
def main(input_vcf, reference, input_bam=None, annotation_vcf=None, comparison_vcf=None, dbsnp=None, genes=None, gatk_annotator_params='', snpeff_build_params='-gtf22 -v', snpeff_annotate_params='-v -onlyCoding true -i vcf -o vcf'): # The following line(s) initialize your data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. input_vcf = dxpy.DXFile(input_vcf) reference = dxpy.DXFile(reference) ref_name = reference.describe()['name'].replace(".gz", "") if genes != None: genes = dxpy.DXFile(genes) genes_name = genes.describe()['name'] if annotation_vcf != None: annotation_vcf = dxpy.DXFile(annotation_vcf) annotation_name = annotation_vcf.describe()['name'] if comparison_vcf != None: comparison_vcf = dxpy.DXFile(comparison_vcf) comparison_name = comparison_vcf.describe()['name'] if dbsnp != None: print "dbsnp present" dbsnp = dxpy.DXFile(dbsnp) dbsnp_name = dbsnp.describe()['name'] if input_bam != None: input_bam = dxpy.DXFile(input_bam) bam_name = input_bam.describe()['name'] base_name = input_vcf.describe()['name'].replace(".vcf", '') vcf_name = input_vcf.describe()['name'] # The following line(s) download your file inputs to the local file system # using variable names for the filenames. dxpy.download_dxfile(input_vcf.get_id(), "%s" % vcf_name) dxpy.download_dxfile(reference.get_id(), "%s.gz" % ref_name) if genes != None: dxpy.download_dxfile(genes.get_id(), "%s" % genes_name) if annotation_vcf != None: dxpy.download_dxfile(annotation_vcf.get_id(), "%s" % annotation_name) if comparison_vcf != None: dxpy.download_dxfile(comparison_vcf.get_id(), "%s" % comparison_name) if dbsnp != None: dxpy.download_dxfile(dbsnp.get_id(), "%s" % dbsnp_name) if input_bam != None: dxpy.download_dxfile(input_bam.get_id(), "%s" % bam_name) # Fill in your application code here. subprocess.check_call("gzip -d %s.gz" % ref_name, shell=True) if genes != None: subprocess.check_call("mv %s /snpEff_2_0_5/data/genomes/%s" % (ref_name, ref_name), shell=True) genes_file = open("/snpEff_2_0_5/snpEff.config", "a+") genes_file.write("\n%s.genome : Custom_species\n" % ref_name.replace(".fa", "")) genes_file.close() subprocess.check_call("mkdir /snpEff_2_0_5/data/%s" % ref_name.replace(".fa", ""), shell=True) subprocess.check_call( "mv %s /snpEff_2_0_5/data/%s/%s" % (genes_name, ref_name.replace(".fa", ""), genes_name), shell=True) #Build the snpeff database subprocess.check_call( "java -Xmx4g -jar /snpEff_2_0_5/snpEff.jar build -c /snpEff_2_0_5/snpEff.config %s %s" % (snpeff_build_params, ref_name.replace(".fa", "")), shell=True) # Produce snpeff annotation file subprocess.check_call( "java -Xmx4g -jar /snpEff_2_0_5/snpEff.jar -c /snpEff_2_0_5/snpEff.config %s %s %s > snpeff.vcf" % (snpeff_annotate_params, ref_name.replace(".fa", ""), vcf_name), shell=True) ref_name = "/snpEff_2_0_5/data/genomes/%s" try: subprocess.check_call("tabix -p vcf %s" % dbsnp_name, shell=True) except: print "Tried tabix indexing dbsnp file and failed. 
Proceeding as though file is uncompressed VCF" annotate_command = "java -Xmx4g -jar /opt/jar/GenomeAnalysisTK.jar -T VariantAnnotator -R %s --variant %s -L %s -o %s_annotated.vcf %s" % ( ref_name, vcf_name, vcf_name, base_name, gatk_annotator_params) if dbsnp != None: annotate_command += " --dbsnp %s" % dbsnp_name if input_bam != None: annotate_command += " -I %s" % bam_name if genes != None: annotate_command += " -A SnpEff --snpEffFile snpeff.vcf" if annotation_vcf != None: annotate_command += " -resource %s" % annotation_name if comparison_vcf != None: annotate_command += " -comp %s" % comparison_name subprocess.check_call(annotate_command, shell=True) # The following line(s) use the Python bindings to upload your file outputs # after you have created them on the local file system. It assumes that you # have used the output field name for the filename for each output, but you # can change that behavior to suit your needs. annotated_variants = dxpy.upload_local_file("%s_annotated.vcf" % base_name) # The following line fills in some basic dummy output and assumes # that you have created variables to represent your output with # the same name as your output fields. output = {} output["annotated_variants"] = dxpy.dxlink(annotated_variants) return output
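# The VariantAnnotator invocation above is assembled by string concatenation
# and run through the shell. The same assembly as an argument list (avoiding
# shell quoting issues); the flags and jar path are the ones used above, while
# the function name and optional-argument handling are a sketch:
def build_annotator_command(ref, vcf, out_prefix, extra_params='',
                            dbsnp=None, bam=None, snpeff_vcf=None,
                            resource_vcf=None, comparison_vcf=None):
    cmd = ['java', '-Xmx4g', '-jar', '/opt/jar/GenomeAnalysisTK.jar',
           '-T', 'VariantAnnotator', '-R', ref, '--variant', vcf,
           '-L', vcf, '-o', '%s_annotated.vcf' % out_prefix]
    cmd += extra_params.split()
    if dbsnp:
        cmd += ['--dbsnp', dbsnp]
    if bam:
        cmd += ['-I', bam]
    if snpeff_vcf:
        cmd += ['-A', 'SnpEff', '--snpEffFile', snpeff_vcf]
    if resource_vcf:
        cmd += ['-resource', resource_vcf]
    if comparison_vcf:
        cmd += ['-comp', comparison_vcf]
    return cmd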
def postprocess(indexed_reads, unmapped_reads, reference_tar, bwa_version, samtools_version): print "In postprocess with:" if samtools_version == "0.1.19": samtools = "/usr/local/bin/samtools-0.1.19/samtools" elif samtools_version == "1.0": samtools = "/usr/local/bin/samtools-1.0/bin/samtools" else: samtools = "/usr/local/bin/samtools-0.1.19/samtools" if bwa_version == "0.7.7": bwa = "bwa0.7.7" elif bwa_version == "0.7.10": bwa = "bwa0.7.10" else: print "BWA version %s not supported, defaulting to 0.7.7" bwa = "bwa0.7.7" print "samtools version: %s" %(samtools) print "bwa version %s" %(bwa) indexed_reads_filenames = [] unmapped_reads_filenames = [] for i,reads in enumerate(indexed_reads): read_pair_number = i+1 fn = dxpy.describe(reads)['name'] print "indexed_reads %d: %s" %(read_pair_number, fn) indexed_reads_filenames.append(fn) dxpy.download_dxfile(reads,fn) unmapped = unmapped_reads[i] fn = dxpy.describe(unmapped)['name'] print "unmapped reads %d: %s" %(read_pair_number, fn) unmapped_reads_filenames.append(fn) dxpy.download_dxfile(unmapped,fn) reference_tar_filename = dxpy.describe(reference_tar)['name'] print "reference_tar: %s" %(reference_tar_filename) dxpy.download_dxfile(reference_tar, reference_tar_filename) # extract the reference files from the tar if reference_tar_filename.endswith('.gz') or reference_tar_filename.endswith('.tgz'): tar_command = 'tar -xzvf %s' %(reference_tar_filename) else: tar_command = 'tar -xvf %s' %(reference_tar_filename) print "Unpacking %s" %(reference_tar_filename) print subprocess.check_output(shlex.split(tar_command)) reference_filename = resolve_reference() paired_end = len(indexed_reads) == 2 if paired_end: r1_basename = unmapped_reads_filenames[0].rstrip('.gz').rstrip('.fq').rstrip('.fastq') r2_basename = unmapped_reads_filenames[1].rstrip('.gz').rstrip('.fq').rstrip('.fastq') reads_basename = r1_basename + r2_basename else: reads_basename = unmapped_reads_filenames[0].rstrip('.gz').rstrip('.fq').rstrip('.fastq') raw_bam_filename = '%s.raw.srt.bam' %(reads_basename) raw_bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' %(reads_basename) if paired_end: reads1_filename = indexed_reads_filenames[0] reads2_filename = indexed_reads_filenames[1] unmapped_reads1_filename = unmapped_reads_filenames[0] unmapped_reads2_filename = unmapped_reads_filenames[1] raw_sam_filename = reads_basename + ".raw.sam" badcigar_filename = "badreads.tmp" steps = [ "%s sampe -P %s %s %s %s %s" %(bwa, reference_filename, reads1_filename, reads2_filename, unmapped_reads1_filename, unmapped_reads2_filename), "tee %s" %(raw_sam_filename), r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! 
/^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""", "sort", "uniq" ] out,err = run_pipe(steps,badcigar_filename) if err: print "sampe error: %s" %(err) steps = [ "cat %s" %(raw_sam_filename), "grep -v -F -f %s" %(badcigar_filename)] else: #single end reads_filename = indexed_reads_filenames[0] unmapped_reads_filename = unmapped_reads_filenames[0] steps = [ "%s samse %s %s %s" %(bwa, reference_filename, reads_filename, unmapped_reads_filename) ] if samtools_version == "0.1.9": steps.extend(["%s view -Su -" %(samtools), "%s sort - %s" %(samtools, raw_bam_filename.rstrip('.bam')) ]) # samtools adds .bam else: steps.extend(["%s view -@%d -Su -" %(samtools, cpu_count()), "%s sort -@%d - %s" %(samtools, cpu_count(), raw_bam_filename.rstrip('.bam')) ]) # samtools adds .bam print "Running pipe:" print steps out,err = run_pipe(steps) if out: print "samtools output: %s" %(out) if err: print "samtools error: %s" %(err) with open(raw_bam_mapstats_filename, 'w') as fh: subprocess.check_call(shlex.split("%s flagstat %s" \ %(samtools, raw_bam_filename)), stdout=fh) print subprocess.check_output('ls', shell=True) mapped_reads = dxpy.upload_local_file(raw_bam_filename) mapping_statistics = dxpy.upload_local_file(raw_bam_mapstats_filename) output = { "mapped_reads": dxpy.dxlink(mapped_reads), "mapping_statistics": dxpy.dxlink(mapping_statistics) } print "Returning from post with output: %s" %(output) return output
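# The tail of the pipeline above switches samtools syntax by version (the
# pre-1.0 sort takes an output prefix and no -@). Note that the branch tests
# samtools_version == "0.1.9" while the versions handled earlier in the
# function are "0.1.19" and "1.0", so that branch may never be taken. A sketch
# of just the version-dependent steps, using slicing rather than
# rstrip('.bam') to drop the suffix:
from multiprocessing import cpu_count

def sam_to_sorted_bam_steps(samtools, samtools_version, out_bam):
    prefix = out_bam[:-4] if out_bam.endswith('.bam') else out_bam
    if samtools_version.startswith("0.1"):
        return ["%s view -Su -" % samtools,
                "%s sort - %s" % (samtools, prefix)]   # old samtools adds .bam
    return ["%s view -@%d -Su -" % (samtools, cpu_count()),
            "%s sort -@%d - %s" % (samtools, cpu_count(), prefix)]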
def main(input_bam, paired_end, spp_version): # The following line(s) initialize your data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. input_bam_file = dxpy.DXFile(input_bam) input_bam_filename = input_bam_file.name input_bam_basename = input_bam_file.name.rstrip('.bam') dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename) intermediate_TA_filename = input_bam_basename + ".tagAlign" if paired_end: end_infix = 'PE2SE' else: end_infix = 'SE' final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz' # =================== # Create tagAlign file # =================== out, err = common.run_pipe([ "bamToBed -i %s" % (input_bam_filename), r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""", "tee %s" % (intermediate_TA_filename), "gzip -cn" ], outfile=final_TA_filename) # ================ # Create BEDPE file # ================ if paired_end: final_BEDPE_filename = input_bam_basename + ".bedpe.gz" # need namesorted bam to make BEDPE final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt" final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam" samtools_sort_command = \ "samtools sort -n %s %s" % (input_bam_filename, final_nmsrt_bam_prefix) logger.info(samtools_sort_command) subprocess.check_output(shlex.split(samtools_sort_command)) out, err = common.run_pipe([ "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename), "gzip -cn" ], outfile=final_BEDPE_filename) # ================================= # Subsample tagAlign file # ================================ logger.info("Intermediate tA md5: %s" % (common.md5(intermediate_TA_filename))) NREADS = 15000000 if paired_end: end_infix = 'MATE1' else: end_infix = 'SE' subsampled_TA_filename = \ input_bam_basename + \ ".filt.nodup.sample.%d.%s.tagAlign.gz" % (NREADS/1000000, end_infix) steps = [ 'grep -v "chrM" %s' % (intermediate_TA_filename), 'shuf -n %d --random-source=%s' % (NREADS, intermediate_TA_filename) ] if paired_end: steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""]) steps.extend(['gzip -cn']) out, err = common.run_pipe(steps, outfile=subsampled_TA_filename) logger.info("Subsampled tA md5: %s" % (common.md5(subsampled_TA_filename))) # Calculate Cross-correlation QC scores CC_scores_filename = subsampled_TA_filename + ".cc.qc" CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf" # CC_SCORE FILE format # Filename <tab> # numReads <tab> # estFragLen <tab> # corr_estFragLen <tab> # PhantomPeak <tab> # corr_phantomPeak <tab> # argmin_corr <tab> # min_corr <tab> # phantomPeakCoef <tab> # relPhantomPeakCoef <tab> # QualityTag spp_tarball = SPP_VERSION_MAP.get(spp_version) assert spp_tarball, "spp version %s is not supported" % (spp_version) # install spp subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball))) # run spp run_spp_command = '/phantompeakqualtools/run_spp_nodups.R' out, err = common.run_pipe([ "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" % (run_spp_command, subsampled_TA_filename, cpu_count(), CC_plot_filename, CC_scores_filename) ]) out, err = common.run_pipe( [r"""sed -r 's/,[^\t]+//g' %s""" % (CC_scores_filename)], outfile="temp") out, err = common.run_pipe(["mv temp %s" % (CC_scores_filename)]) tagAlign_file = dxpy.upload_local_file(final_TA_filename) if paired_end: BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename) CC_scores_file = dxpy.upload_local_file(CC_scores_filename) CC_plot_file = dxpy.upload_local_file(CC_plot_filename) xcor_qc = xcor_parse(CC_scores_filename) # Return the 
outputs output = { "tagAlign_file": dxpy.dxlink(tagAlign_file), "CC_scores_file": dxpy.dxlink(CC_scores_file), "CC_plot_file": dxpy.dxlink(CC_plot_file), "paired_end": paired_end, "RSC": float(xcor_qc.get('relPhantomPeakCoef')), "NSC": float(xcor_qc.get('phantomPeakCoef')), "est_frag_len": float(xcor_qc.get('estFragLen')) } if paired_end: output.update({"BEDPE_file": dxpy.dxlink(BEDPE_file)}) return output
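# xcor_parse() above turns the single-line spp .cc.qc file into a dict keyed
# by the column names spelled out in the CC_SCORE FILE format comment. A
# plausible minimal version (an assumption; the real helper is defined
# elsewhere in the applet):
def xcor_parse(cc_scores_path):
    headers = ['Filename', 'numReads', 'estFragLen', 'corr_estFragLen',
               'PhantomPeak', 'corr_phantomPeak', 'argmin_corr', 'min_corr',
               'phantomPeakCoef', 'relPhantomPeakCoef', 'QualityTag']
    with open(cc_scores_path) as fh:
        values = fh.readline().rstrip('\n').split('\t')
    return dict(zip(headers, values))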
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks, chrom_sizes, as_file, blacklist=None): #TODO for now just taking the peak files. This applet should actually call IDR instead of #putting that in the workflow populator script # Initialize the data object inputs on the platform into # dxpy.DXDataObject instances. reps_peaks_file = dxpy.DXFile(reps_peaks) r1pr_peaks_file = dxpy.DXFile(r1pr_peaks) r2pr_peaks_file = dxpy.DXFile(r2pr_peaks) pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks) chrom_sizes_file = dxpy.DXFile(chrom_sizes) as_file_file = dxpy.DXFile(as_file) if blacklist is not None: blacklist_file = dxpy.DXFile(blacklist) blacklist_filename = 'blacklist_%s' % (blacklist_file.name) dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename) blacklist_filename = common.uncompress(blacklist_filename) # Download the file inputs to the local file system. #Need to prepend something to ensure the local filenames will be unique reps_peaks_filename = 'true_%s' % (reps_peaks_file.name) r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name) r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name) pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name) chrom_sizes_filename = chrom_sizes_file.name as_file_filename = as_file_file.name dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename) dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename) dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename) dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename) dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename) dxpy.download_dxfile(as_file_file.get_id(), as_file_filename) print subprocess.check_output('ls -l', shell=True) reps_peaks_filename = common.uncompress(reps_peaks_filename) r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename) r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename) pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename) Nt = common.count_lines(reps_peaks_filename) print "%d peaks from true replicates" % (Nt) N1 = common.count_lines(r1pr_peaks_filename) print "%d peaks from rep1 self-pseudoreplicates" % (N1) N2 = common.count_lines(r2pr_peaks_filename) print "%d peaks from rep2 self-pseudoreplicates" % (N2) Np = common.count_lines(pooledpr_peaks_filename) print "%d peaks from pooled pseudoreplicates" % (Np) conservative_set_filename = '%s_final_conservative.narrowPeak' % ( experiment) if blacklist is not None: blacklist_filter(reps_peaks_filename, conservative_set_filename, blacklist_filename) else: conservative_set_filename = reps_peaks_filename Ncb = common.count_lines(conservative_set_filename) print "%d peaks blacklisted from the conservative set" % (Nt - Ncb) if Nt >= Np: peaks_to_filter_filename = reps_peaks_filename No = Nt else: peaks_to_filter_filename = pooledpr_peaks_filename No = Np optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment) if blacklist is not None: blacklist_filter(peaks_to_filter_filename, optimal_set_filename, blacklist_filename) else: optimal_set_filename = peaks_to_filter_filename Nob = common.count_lines(optimal_set_filename) print "%d peaks blacklisted from the optimal set" % (No - Nob) rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt)) self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2)) if rescue_ratio > 2 and self_consistency_ratio > 2: reproducibility = 'fail' elif rescue_ratio > 2 or self_consistency_ratio > 2: reproducibility = 'borderline' else: reproducibility = 'pass' 
output = {} #bedtobigbed often fails, so skip creating the bb if it does conservative_set_bb_filename = common.bed2bb(conservative_set_filename, chrom_sizes_filename, as_file_filename) optimal_set_bb_filename = common.bed2bb(optimal_set_filename, chrom_sizes_filename, as_file_filename) if conservative_set_bb_filename: conservative_set_bb_output = dxpy.upload_local_file( conservative_set_bb_filename) output.update( {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)}) if optimal_set_bb_filename: optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename) output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)}) output.update({ "Nt": Nt, "N1": N1, "N2": N2, "Np": Np, "conservative_set": dxpy.dxlink( dxpy.upload_local_file( common.compress(conservative_set_filename))), "optimal_set": dxpy.dxlink( dxpy.upload_local_file(common.compress(optimal_set_filename))), "rescue_ratio": rescue_ratio, "self_consistency_ratio": self_consistency_ratio, "reproducibility_test": reproducibility }) logging.info("Exiting with output: %s", output) return output
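# The reproducibility verdict above reduces to two ratios compared against a
# factor-of-2 threshold. The same logic as a pure function, which mirrors the
# code above and is convenient to unit-test:
def reproducibility_test(Nt, Np, N1, N2, cutoff=2.0):
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))
    if rescue_ratio > cutoff and self_consistency_ratio > cutoff:
        verdict = 'fail'
    elif rescue_ratio > cutoff or self_consistency_ratio > cutoff:
        verdict = 'borderline'
    else:
        verdict = 'pass'
    return rescue_ratio, self_consistency_ratio, verdict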
def main(rep1_ta, rep2_ta, ctl1_ta, ctl2_ta, rep1_xcor, rep2_xcor, rep1_paired_end, rep2_paired_end, chrom_sizes, genomesize, narrowpeak_as, gappedpeak_as, broadpeak_as): if not rep1_paired_end == rep2_paired_end: raise ValueError('Mixed PE/SE not supported (yet)') paired_end = rep1_paired_end # The following lines initialize the data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. rep1_ta_file = dxpy.DXFile(rep1_ta) rep2_ta_file = dxpy.DXFile(rep2_ta) unary_control = ctl1_ta == ctl2_ta ctl1_ta_file = dxpy.DXFile(ctl1_ta) ctl2_ta_file = dxpy.DXFile(ctl2_ta) rep1_xcor_file = dxpy.DXFile(rep1_xcor) rep2_xcor_file = dxpy.DXFile(rep2_xcor) # The following line(s) download your file inputs to the local file system # using variable names for the filenames. dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name) dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name) dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name) dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name) dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_file.name) dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_file.name) rep1_ta_filename = rep1_ta_file.name rep2_ta_filename = rep2_ta_file.name ctl1_ta_filename = ctl1_ta_file.name ctl2_ta_filename = ctl2_ta_file.name rep1_xcor_filename = rep1_xcor_file.name rep2_xcor_filename = rep2_xcor_file.name ntags_rep1 = count_lines(rep1_ta_filename) ntags_rep2 = count_lines(rep2_ta_filename) ntags_ctl1 = count_lines(ctl1_ta_filename) ntags_ctl2 = count_lines(ctl2_ta_filename) for n, name, filename in [(ntags_rep1, 'replicate 1', rep1_ta_filename), (ntags_rep2, 'replicate 2', rep2_ta_filename), (ntags_ctl1, 'control 1', ctl1_ta_filename), (ntags_ctl2, 'control 2', ctl2_ta_filename)]: print "Found %d tags in %s file %s" % (n, name, filename) print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT) pool_applet = dxpy.find_one_data_object(classname='applet', name='pool', zero_ok=False, more_ok=False, return_handler=True) pool_replicates_subjob = pool_applet.run({"inputs": [rep1_ta, rep2_ta]}) pooled_replicates = pool_replicates_subjob.get_output_ref("pooled") rep1_control = ctl1_ta #default. May be changed later. rep2_control = ctl2_ta #default. May be changed later. if unary_control: print "Only one control supplied. Using it for both replicate 1 and 2 and for the pool." control_for_pool = rep1_control else: pool_controls_subjob = pool_applet.run({"inputs": [ctl1_ta, ctl2_ta]}) pooled_controls = pool_controls_subjob.get_output_ref("pooled") #always use the pooled controls for the pool control_for_pool = pooled_controls #use the pooled controls for the reps depending on the ration of rep to control reads ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2) if ratio_ctl_reads < 1: ratio_ctl_reads = 1 / ratio_ctl_reads ratio_cutoff = 1.2 if ratio_ctl_reads > ratio_cutoff: print "Number of reads in controls differ by > factor of %f. Using pooled controls." % ( ratio_cutoff) rep1_control = pooled_controls rep2_control = pooled_controls else: if ntags_ctl1 < ntags_rep1: print "Fewer reads in control replicate 1 than experiment replicate 1. Using pooled controls for replicate 1." rep1_control = pooled_controls elif ntags_ctl2 < ntags_rep2: print "Fewer reads in control replicate 2 than experiment replicate 2. Using pooled controls for replicate 2." rep2_control = pooled_controls else: print "Using distinct controls for replicate 1 and 2." 
pseudoreplicator_applet = dxpy.find_one_data_object( classname='applet', name='pseudoreplicator', zero_ok=False, more_ok=False, return_handler=True) rep1_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep1_ta}) rep2_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep2_ta}) pool_pr1_subjob = pool_applet.run({ "inputs": [ rep1_pr_subjob.get_output_ref("pseudoreplicate1"), rep2_pr_subjob.get_output_ref("pseudoreplicate1") ] }) pool_pr2_subjob = pool_applet.run({ "inputs": [ rep1_pr_subjob.get_output_ref("pseudoreplicate2"), rep2_pr_subjob.get_output_ref("pseudoreplicate2") ] }) pooled_replicates_xcor_subjob = xcor_only(pooled_replicates, paired_end) rep1_pr1_xcor_subjob = xcor_only( rep1_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end) rep1_pr2_xcor_subjob = xcor_only( rep1_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end) rep2_pr1_xcor_subjob = xcor_only( rep2_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end) rep2_pr2_xcor_subjob = xcor_only( rep2_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end) pool_pr1_xcor_subjob = xcor_only(pool_pr1_subjob.get_output_ref("pooled"), paired_end) pool_pr2_xcor_subjob = xcor_only(pool_pr2_subjob.get_output_ref("pooled"), paired_end) common_args = { 'chrom_sizes': chrom_sizes, 'genomesize': genomesize, 'narrowpeak_as': narrowpeak_as, 'gappedpeak_as': gappedpeak_as, 'broadpeak_as': broadpeak_as } common_args.update({'prefix': 'r1'}) rep1_peaks_subjob = macs2(rep1_ta, rep1_control, rep1_xcor, **common_args) common_args.update({'prefix': 'r2'}) rep2_peaks_subjob = macs2(rep2_ta, rep2_control, rep2_xcor, **common_args) common_args.update({'prefix': 'pool'}) pooled_peaks_subjob = macs2( pooled_replicates, control_for_pool, pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"), **common_args) common_args.update({'prefix': 'r1pr1'}) rep1pr1_peaks_subjob = macs2( rep1_pr_subjob.get_output_ref("pseudoreplicate1"), rep1_control, rep1_pr1_xcor_subjob.get_output_ref("CC_scores_file"), **common_args) common_args.update({'prefix': 'r1pr2'}) rep1pr2_peaks_subjob = macs2( rep1_pr_subjob.get_output_ref("pseudoreplicate2"), rep1_control, rep1_pr2_xcor_subjob.get_output_ref("CC_scores_file"), **common_args) common_args.update({'prefix': 'r2pr1'}) rep2pr1_peaks_subjob = macs2( rep2_pr_subjob.get_output_ref("pseudoreplicate1"), rep2_control, rep2_pr1_xcor_subjob.get_output_ref("CC_scores_file"), **common_args) common_args.update({'prefix': 'r2pr2'}) rep2pr2_peaks_subjob = macs2( rep2_pr_subjob.get_output_ref("pseudoreplicate2"), rep2_control, rep2_pr2_xcor_subjob.get_output_ref("CC_scores_file"), **common_args) common_args.update({'prefix': 'ppr1'}) pooledpr1_peaks_subjob = macs2( pool_pr1_subjob.get_output_ref("pooled"), control_for_pool, pool_pr1_xcor_subjob.get_output_ref("CC_scores_file"), **common_args) common_args.update({'prefix': 'ppr2'}) pooledpr2_peaks_subjob = macs2( pool_pr2_subjob.get_output_ref("pooled"), control_for_pool, pool_pr2_xcor_subjob.get_output_ref("CC_scores_file"), **common_args) output = { 'rep1_narrowpeaks': rep1_peaks_subjob.get_output_ref("narrowpeaks"), 'rep1_gappedpeaks': rep1_peaks_subjob.get_output_ref("gappedpeaks"), 'rep1_broadpeaks': rep1_peaks_subjob.get_output_ref("broadpeaks"), 'rep1_narrowpeaks_bb': rep1_peaks_subjob.get_output_ref("narrowpeaks_bb"), 'rep1_gappedpeaks_bb': rep1_peaks_subjob.get_output_ref("gappedpeaks_bb"), 'rep1_broadpeaks_bb': rep1_peaks_subjob.get_output_ref("broadpeaks_bb"), 'rep1_fc_signal': rep1_peaks_subjob.get_output_ref("fc_signal"), 'rep1_pvalue_signal': 
rep1_peaks_subjob.get_output_ref("pvalue_signal"), 'rep2_narrowpeaks': rep2_peaks_subjob.get_output_ref("narrowpeaks"), 'rep2_gappedpeaks': rep2_peaks_subjob.get_output_ref("gappedpeaks"), 'rep2_broadpeaks': rep2_peaks_subjob.get_output_ref("broadpeaks"), 'rep2_narrowpeaks_bb': rep2_peaks_subjob.get_output_ref("narrowpeaks_bb"), 'rep2_gappedpeaks_bb': rep2_peaks_subjob.get_output_ref("gappedpeaks_bb"), 'rep2_broadpeaks_bb': rep2_peaks_subjob.get_output_ref("broadpeaks_bb"), 'rep2_fc_signal': rep2_peaks_subjob.get_output_ref("fc_signal"), 'rep2_pvalue_signal': rep2_peaks_subjob.get_output_ref("pvalue_signal"), 'pooled_narrowpeaks': pooled_peaks_subjob.get_output_ref("narrowpeaks"), 'pooled_gappedpeaks': pooled_peaks_subjob.get_output_ref("gappedpeaks"), 'pooled_broadpeaks': pooled_peaks_subjob.get_output_ref("broadpeaks"), 'pooled_narrowpeaks_bb': pooled_peaks_subjob.get_output_ref("narrowpeaks_bb"), 'pooled_gappedpeaks_bb': pooled_peaks_subjob.get_output_ref("gappedpeaks_bb"), 'pooled_broadpeaks_bb': pooled_peaks_subjob.get_output_ref("broadpeaks_bb"), 'pooled_fc_signal': pooled_peaks_subjob.get_output_ref("fc_signal"), 'pooled_pvalue_signal': pooled_peaks_subjob.get_output_ref("pvalue_signal"), 'rep1pr1_narrowpeaks': rep1pr1_peaks_subjob.get_output_ref("narrowpeaks"), 'rep1pr1_gappedpeaks': rep1pr1_peaks_subjob.get_output_ref("gappedpeaks"), 'rep1pr1_broadpeaks': rep1pr1_peaks_subjob.get_output_ref("broadpeaks"), 'rep1pr1_fc_signal': rep1pr1_peaks_subjob.get_output_ref("fc_signal"), 'rep1pr1_pvalue_signal': rep1pr1_peaks_subjob.get_output_ref("pvalue_signal"), 'rep1pr2_narrowpeaks': rep1pr2_peaks_subjob.get_output_ref("narrowpeaks"), 'rep1pr2_gappedpeaks': rep1pr2_peaks_subjob.get_output_ref("gappedpeaks"), 'rep1pr2_broadpeaks': rep1pr2_peaks_subjob.get_output_ref("broadpeaks"), 'rep1pr2_fc_signal': rep1pr2_peaks_subjob.get_output_ref("fc_signal"), 'rep1pr2_pvalue_signal': rep1pr2_peaks_subjob.get_output_ref("pvalue_signal"), 'rep2pr1_narrowpeaks': rep2pr1_peaks_subjob.get_output_ref("narrowpeaks"), 'rep2pr1_gappedpeaks': rep2pr1_peaks_subjob.get_output_ref("gappedpeaks"), 'rep2pr1_broadpeaks': rep2pr1_peaks_subjob.get_output_ref("broadpeaks"), 'rep2pr1_fc_signal': rep2pr1_peaks_subjob.get_output_ref("fc_signal"), 'rep2pr1_pvalue_signal': rep2pr1_peaks_subjob.get_output_ref("pvalue_signal"), 'rep2pr2_narrowpeaks': rep2pr2_peaks_subjob.get_output_ref("narrowpeaks"), 'rep2pr2_gappedpeaks': rep2pr2_peaks_subjob.get_output_ref("gappedpeaks"), 'rep2pr2_broadpeaks': rep2pr2_peaks_subjob.get_output_ref("broadpeaks"), 'rep2pr2_fc_signal': rep2pr2_peaks_subjob.get_output_ref("fc_signal"), 'rep2pr2_pvalue_signal': rep2pr2_peaks_subjob.get_output_ref("pvalue_signal"), 'pooledpr1_narrowpeaks': pooledpr1_peaks_subjob.get_output_ref("narrowpeaks"), 'pooledpr1_gappedpeaks': pooledpr1_peaks_subjob.get_output_ref("gappedpeaks"), 'pooledpr1_broadpeaks': pooledpr1_peaks_subjob.get_output_ref("broadpeaks"), 'pooledpr1_fc_signal': pooledpr1_peaks_subjob.get_output_ref("fc_signal"), 'pooledpr1_pvalue_signal': pooledpr1_peaks_subjob.get_output_ref("pvalue_signal"), 'pooledpr2_narrowpeaks': pooledpr2_peaks_subjob.get_output_ref("narrowpeaks"), 'pooledpr2_gappedpeaks': pooledpr2_peaks_subjob.get_output_ref("gappedpeaks"), 'pooledpr2_broadpeaks': pooledpr2_peaks_subjob.get_output_ref("broadpeaks"), 'pooledpr2_fc_signal': pooledpr2_peaks_subjob.get_output_ref("fc_signal"), 'pooledpr2_pvalue_signal': pooledpr2_peaks_subjob.get_output_ref("pvalue_signal") } return output
def main(base_name, At, An, Bt, Bn, Ct, Cn, Dt, Dn, Et=None, En=None, Ft=None, Fn=None, Gt=None, Gn=None, Ht=None, Hn=None, It=None, In=None, Jt=None, Jn=None, Kt=None, Kn=None, Lt=None, Ln=None, Mt=None, Mn=None, Nt=None, Nn=None, Ot=None, On=None, Pt=None, Pn=None, Qt=None, Qn=None, Rt=None, Rn=None, St=None, Sn=None, Tt=None, Tn=None): # The following line(s) initialize your data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. At = dxpy.DXFile(At) Bt = dxpy.DXFile(Bt) Ct = dxpy.DXFile(Ct) Dt = dxpy.DXFile(Dt) if Et is not None: Et = dxpy.DXFile(Et) if Ft is not None: Ft = dxpy.DXFile(Ft) if Gt is not None: Gt = dxpy.DXFile(Gt) if Ht is not None: Ht = dxpy.DXFile(Ht) if It is not None: It = dxpy.DXFile(It) if Jt is not None: Jt = dxpy.DXFile(Jt) if Kt is not None: Kt = dxpy.DXFile(Kt) if Lt is not None: Lt = dxpy.DXFile(Lt) if Mt is not None: Mt = dxpy.DXFile(Mt) if Nt is not None: Nt = dxpy.DXFile(Nt) if Ot is not None: Ot = dxpy.DXFile(Ot) if Pt is not None: Pt = dxpy.DXFile(Pt) if Qt is not None: Qt = dxpy.DXFile(Qt) if Rt is not None: Rt = dxpy.DXFile(Rt) if St is not None: St = dxpy.DXFile(St) if Tt is not None: Tt = dxpy.DXFile(Tt) # The following line(s) download your file inputs to the local file system # using variable names for the filenames. dxpy.download_dxfile(At.get_id(), "At") dxpy.download_dxfile(Bt.get_id(), "Bt") dxpy.download_dxfile(Ct.get_id(), "Ct") dxpy.download_dxfile(Dt.get_id(), "Dt") if Et is not None: dxpy.download_dxfile(Et.get_id(), "Et") if Ft is not None: dxpy.download_dxfile(Ft.get_id(), "Ft") if Gt is not None: dxpy.download_dxfile(Gt.get_id(), "Gt") if Ht is not None: dxpy.download_dxfile(Ht.get_id(), "Ht") if It is not None: dxpy.download_dxfile(It.get_id(), "It") if Jt is not None: dxpy.download_dxfile(Jt.get_id(), "Jt") if Kt is not None: dxpy.download_dxfile(Kt.get_id(), "Kt") if Lt is not None: dxpy.download_dxfile(Lt.get_id(), "Lt") if Mt is not None: dxpy.download_dxfile(Mt.get_id(), "Mt") if Nt is not None: dxpy.download_dxfile(Nt.get_id(), "Nt") if Ot is not None: dxpy.download_dxfile(Ot.get_id(), "Ot") if Pt is not None: dxpy.download_dxfile(Pt.get_id(), "Pt") if Qt is not None: dxpy.download_dxfile(Qt.get_id(), "Qt") if Rt is not None: dxpy.download_dxfile(Rt.get_id(), "Rt") if St is not None: dxpy.download_dxfile(St.get_id(), "St") if Tt is not None: dxpy.download_dxfile(Tt.get_id(), "Tt") # Fill in your application code here. 
total_t = ['At', 'Bt', 'Ct', 'Dt'] if Et is not None: total_t = total_t + ['Et'] if Ft is not None: total_t = total_t + ['Ft'] if Gt is not None: total_t = total_t + ['Gt'] if Ht is not None: total_t = total_t + ['Ht'] if It is not None: total_t = total_t + ['It'] if Jt is not None: total_t = total_t + ['Jt'] if Kt is not None: total_t = total_t + ['Kt'] if Lt is not None: total_t = total_t + ['Lt'] if Mt is not None: total_t = total_t + ['Mt'] if Nt is not None: total_t = total_t + ['Nt'] if Ot is not None: total_t = total_t + ['Ot'] if Pt is not None: total_t = total_t + ['Pt'] if Qt is not None: total_t = total_t + ['Qt'] if Rt is not None: total_t = total_t + ['Rt'] if St is not None: total_t = total_t + ['St'] if Tt is not None: total_t = total_t + ['Tt'] total_n = [ An, Bn, Cn, Dn, En, Fn, Gn, Hn, In, Jn, Kn, Ln, Mn, Nn, On, Pn, Qn, Rn, Sn, Tn ] total_n = [x for x in total_n if x is not None] TPM = list(map(read_TPM, total_t)) cts = generate_table(TPM, total_n) meta = generate_meta(list(total_n)) with localconverter(ro.default_converter + pandas2ri.converter): r.assign("cts", cts) r.assign("meta", meta) r('dds <- DESeqDataSetFromMatrix(countData = cts,\ colData = meta,\ design = ~ condition)') r('dds <- DESeq(dds)') r('vsd <- vst(dds, blind=FALSE)') r('plotPCA(vsd, intgroup=c("condition"))') string = 'ggsave("' + base_name + '_PCA.pdf")' r(string) r('PCA_information <- plotPCA(vsd, intgroup=c("condition"),returnData=TRUE)' ) string = 'write.csv(as.data.frame(PCA_information),file="' + base_name + '_table_PCA.csv' + '")' r(string) # The following line(s) use the Python bindings to upload your file outputs # after you have created them on the local file system. It assumes that you # have used the output field name for the filename for each output, but you # can change that behavior to suit your needs. plot = dxpy.upload_local_file(base_name + "_PCA.pdf") csv = dxpy.upload_local_file(base_name + "_table_PCA.csv") # The following line fills in some basic dummy output and assumes # that you have created variables to represent your output with # the same name as your output fields. output = {} output["plot"] = dxpy.dxlink(plot) output["csv"] = dxpy.dxlink(csv) return output
def main(rep1_ta, rep2_ta, ctl1_ta, ctl2_ta, rep1_xcor, rep2_xcor, npeaks, nodups, rep1_paired_end, rep2_paired_end, chrom_sizes, as_file=None, idr_peaks=False): if not rep1_paired_end == rep2_paired_end: raise ValueError('Mixed PE/SE not supported (yet)') paired_end = rep1_paired_end # The following lines initialize the data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. rep1_ta_file = dxpy.DXFile(rep1_ta) rep2_ta_file = dxpy.DXFile(rep2_ta) unary_control = ctl1_ta == ctl2_ta ctl1_ta_file = dxpy.DXFile(ctl1_ta) ctl2_ta_file = dxpy.DXFile(ctl2_ta) rep1_xcor_file = dxpy.DXFile(rep1_xcor) rep2_xcor_file = dxpy.DXFile(rep2_xcor) # The following line(s) download your file inputs to the local file system # using variable names for the filenames. dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name) dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name) dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name) dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name) dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_file.name) dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_file.name) rep1_ta_filename = rep1_ta_file.name rep2_ta_filename = rep2_ta_file.name ctl1_ta_filename = ctl1_ta_file.name ctl2_ta_filename = ctl2_ta_file.name rep1_xcor_filename = rep1_xcor_file.name rep2_xcor_filename = rep2_xcor_file.name ntags_rep1 = count_lines(rep1_ta_filename) ntags_rep2 = count_lines(rep2_ta_filename) ntags_ctl1 = count_lines(ctl1_ta_filename) ntags_ctl2 = count_lines(ctl2_ta_filename) for n, name, filename in [(ntags_rep1, 'replicate 1', rep1_ta_filename), (ntags_rep2, 'replicate 2', rep2_ta_filename), (ntags_ctl1, 'control 1', ctl1_ta_filename), (ntags_ctl2, 'control 2', ctl2_ta_filename)]: print "Found %d tags in %s file %s" % (n, name, filename) print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT) pool_applet = dxpy.find_one_data_object(classname='applet', name='pool', project=dxpy.PROJECT_CONTEXT_ID, zero_ok=False, more_ok=False, return_handler=True) pool_replicates_subjob = pool_applet.run({"inputs": [rep1_ta, rep2_ta]}) pooled_replicates = pool_replicates_subjob.get_output_ref("pooled") pooled_replicates_xcor_subjob = xcor_only(pooled_replicates, paired_end) rep1_control = ctl1_ta #default. May be changed later. rep2_control = ctl2_ta #default. May be changed later. if unary_control: print "Only one control supplied. Using it for both replicate 1 and 2 and for the pool." control_for_pool = rep1_control else: pool_controls_subjob = pool_applet.run({"inputs": [ctl1_ta, ctl2_ta]}) pooled_controls = pool_controls_subjob.get_output_ref("pooled") #always use the pooled controls for the pool control_for_pool = pooled_controls #use the pooled controls for the reps depending on the ration of rep to control reads ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2) if ratio_ctl_reads < 1: ratio_ctl_reads = 1 / ratio_ctl_reads ratio_cutoff = 1.2 if ratio_ctl_reads > ratio_cutoff: print "Number of reads in controls differ by > factor of %f. Using pooled controls." % ( ratio_cutoff) rep1_control = pooled_controls rep2_control = pooled_controls else: if ntags_ctl1 < ntags_rep1: print "Fewer reads in control replicate 1 than experiment replicate 1. Using pooled controls for replicate 1." rep1_control = pooled_controls elif ntags_ctl2 < ntags_rep2: print "Fewer reads in control replicate 2 than experiment replicate 2. Using pooled controls for replicate 2." 
rep2_control = pooled_controls else: print "Using distinct controls for replicate 1 and 2." rep1_peaks_subjob = spp(rep1_ta, rep1_control, rep1_xcor, chrom_sizes=chrom_sizes, bigbed=True, as_file=as_file) rep2_peaks_subjob = spp(rep2_ta, rep2_control, rep2_xcor, chrom_sizes=chrom_sizes, bigbed=True, as_file=as_file) pooled_peaks_subjob = spp( pooled_replicates, control_for_pool, pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"), chrom_sizes=chrom_sizes, bigbed=True, as_file=as_file) output = { 'rep1_peaks': rep1_peaks_subjob.get_output_ref("peaks"), 'rep1_peaks_bb': rep1_peaks_subjob.get_output_ref("peaks_bb"), 'rep1_xcor_plot': rep1_peaks_subjob.get_output_ref("xcor_plot"), 'rep1_xcor_scores': rep1_peaks_subjob.get_output_ref("xcor_scores"), 'rep2_peaks': rep2_peaks_subjob.get_output_ref("peaks"), 'rep2_peaks_bb': rep2_peaks_subjob.get_output_ref("peaks_bb"), 'rep2_xcor_plot': rep2_peaks_subjob.get_output_ref("xcor_plot"), 'rep2_xcor_scores': rep2_peaks_subjob.get_output_ref("xcor_scores"), 'pooled_peaks': pooled_peaks_subjob.get_output_ref("peaks"), 'pooled_peaks_bb': pooled_peaks_subjob.get_output_ref("peaks_bb"), 'pooled_xcor_plot': pooled_peaks_subjob.get_output_ref("xcor_plot"), 'pooled_xcor_scores': pooled_peaks_subjob.get_output_ref("xcor_scores") } if idr_peaks: #also call peaks on pseudoreplicates for IDR pseudoreplicator_applet = dxpy.find_one_data_object( classname='applet', name='pseudoreplicator', project=dxpy.PROJECT_CONTEXT_ID, zero_ok=False, more_ok=False, return_handler=True) rep1_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep1_ta}) rep2_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep2_ta}) pool_pr1_subjob = pool_applet.run({ "inputs": [ rep1_pr_subjob.get_output_ref("pseudoreplicate1"), rep2_pr_subjob.get_output_ref("pseudoreplicate1") ] }) pool_pr2_subjob = pool_applet.run({ "inputs": [ rep1_pr_subjob.get_output_ref("pseudoreplicate2"), rep2_pr_subjob.get_output_ref("pseudoreplicate2") ] }) rep1_pr1_xcor_subjob = xcor_only( rep1_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end) rep1_pr2_xcor_subjob = xcor_only( rep1_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end) rep2_pr1_xcor_subjob = xcor_only( rep2_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end) rep2_pr2_xcor_subjob = xcor_only( rep2_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end) pool_pr1_xcor_subjob = xcor_only( pool_pr1_subjob.get_output_ref("pooled"), paired_end) pool_pr2_xcor_subjob = xcor_only( pool_pr2_subjob.get_output_ref("pooled"), paired_end) rep1pr1_peaks_subjob = spp( rep1_pr_subjob.get_output_ref("pseudoreplicate1"), rep1_control, rep1_pr1_xcor_subjob.get_output_ref("CC_scores_file"), chrom_sizes=chrom_sizes, bigbed=False) rep1pr2_peaks_subjob = spp( rep1_pr_subjob.get_output_ref("pseudoreplicate2"), rep1_control, rep1_pr2_xcor_subjob.get_output_ref("CC_scores_file"), chrom_sizes=chrom_sizes, bigbed=False) rep2pr1_peaks_subjob = spp( rep2_pr_subjob.get_output_ref("pseudoreplicate1"), rep2_control, rep2_pr1_xcor_subjob.get_output_ref("CC_scores_file"), chrom_sizes=chrom_sizes, bigbed=False) rep2pr2_peaks_subjob = spp( rep2_pr_subjob.get_output_ref("pseudoreplicate2"), rep2_control, rep2_pr2_xcor_subjob.get_output_ref("CC_scores_file"), chrom_sizes=chrom_sizes, bigbed=False) pooledpr1_peaks_subjob = spp( pool_pr1_subjob.get_output_ref("pooled"), control_for_pool, pool_pr1_xcor_subjob.get_output_ref("CC_scores_file"), chrom_sizes=chrom_sizes, bigbed=False) pooledpr2_peaks_subjob = spp( pool_pr2_subjob.get_output_ref("pooled"), 
control_for_pool, pool_pr2_xcor_subjob.get_output_ref("CC_scores_file"), chrom_sizes=chrom_sizes, bigbed=False) output.update({ 'rep1pr1_peaks': rep1pr1_peaks_subjob.get_output_ref("peaks"), 'rep1pr1_xcor_plot': rep1pr1_peaks_subjob.get_output_ref("xcor_plot"), 'rep1pr1_xcor_scores': rep1pr1_peaks_subjob.get_output_ref("xcor_scores"), 'rep1pr2_peaks': rep1pr2_peaks_subjob.get_output_ref("peaks"), 'rep1pr2_xcor_plot': rep1pr2_peaks_subjob.get_output_ref("xcor_plot"), 'rep1pr2_xcor_scores': rep1pr2_peaks_subjob.get_output_ref("xcor_scores"), 'rep2pr1_peaks': rep2pr1_peaks_subjob.get_output_ref("peaks"), 'rep2pr1_xcor_plot': rep2pr1_peaks_subjob.get_output_ref("xcor_plot"), 'rep2pr1_xcor_scores': rep2pr1_peaks_subjob.get_output_ref("xcor_scores"), 'rep2pr2_peaks': rep2pr2_peaks_subjob.get_output_ref("peaks"), 'rep2pr2_xcor_plot': rep2pr2_peaks_subjob.get_output_ref("xcor_plot"), 'rep2pr2_xcor_scores': rep2pr2_peaks_subjob.get_output_ref("xcor_scores"), 'pooledpr1_peaks': pooledpr1_peaks_subjob.get_output_ref("peaks"), 'pooledpr1_xcor_plot': pooledpr1_peaks_subjob.get_output_ref("xcor_plot"), 'pooledpr1_xcor_scores': pooledpr1_peaks_subjob.get_output_ref("xcor_scores"), 'pooledpr2_peaks': pooledpr2_peaks_subjob.get_output_ref("peaks"), 'pooledpr2_xcor_plot': pooledpr2_peaks_subjob.get_output_ref("xcor_plot"), 'pooledpr2_xcor_scores': pooledpr2_peaks_subjob.get_output_ref("xcor_scores"), }) return output
def main(input_SAM, deviations=None, histogram_width=None, min_percent=None, metric_acc_level=None, ref=None, is_sorted=None, stop_after=None): # The following line(s) download your file inputs to the local file system # using variable names for the filenames. dxpy.download_dxfile(input_SAM, "input") if ref != None: dxpy.download_dxfile(ref, "ref.fa") command = "java -Xmx2g -jar /CollectInsertSizeMetrics.jar" command += " INPUT=input" command += " OUTPUT=insert_distribution.txt" command += " HISTOGRAM_FILE=histogram.pdf" if deviations != None: command += " DEVIATIONS=" + str(deviations) if histogram_width != None: command += " HISTOGRAM_WIDTH=" + str(histogram_width) if min_percent != None: command += " MINIMUM_PCT=" + str(histogram_width) if metric_acc_level != None: for level in metric_acc_level: command += " METRIC_ACCUMULATION_LEVEL=" + str(level) if ref != None: command += " REFERENCE_SEQUENCE=ref.fa" if is_sorted != None: if is_sorted: command += " ASSUME_SORTED=true" else: command += " ASSUME_SORTED=false" if stop_after != None: command += " STOP_AFTER=" + str(stop_after) print "Executing:" print command # CALL the command here: subprocess.check_call(command, shell=True) # The following line(s) use the Python bindings to upload your file outputs # after you have created them on the local file system. It assumes that you # have used the output field name for the filename for each output, but you # can change that behavior to suit your needs. histogram = dxpy.upload_local_file("histogram.pdf") histogram.rename( dxpy.DXFile(input_SAM).describe()['name'] + "_histogram.pdf") output_dist = dxpy.upload_local_file("insert_distribution.txt") output_dist.rename( dxpy.DXFile(input_SAM).describe()['name'] + "_insert_dist.txt") # The following line fills in some basic dummy output and assumes # that you have created variables to represent your output with # the same name as your output fields. output = {} output["histogram"] = dxpy.dxlink(histogram) output["output"] = dxpy.dxlink(output_dist) return output