def main(**job_inputs):
    output = {}
    os.mkdir('/home/dnanexus/split')

    # Make k-mer histogram with meryl
    _write_generator_file(job_inputs['sequences_fastx'], GENERATOR_FILENAME)
    _run_meryl(output, job_inputs["sequences_fastx"], job_inputs["k_mer_size"],
               job_inputs["is10x"])

    # Run GenomeScope
    mem_in_b = dx_utils.run_cmd(
        "head -n1 /proc/meminfo | awk '{print int($2*0.6*1024)}'",
        returnOutput=True)
    read_length = _get_read_length()
    cmd = ['Rscript', './genomescope.R', "mer_counts.tsv",
           str(job_inputs['k_mer_size']), str(read_length), './',
           str(MAX_KMER_COVERAGE)]
    _run_cmd(cmd)
    genomescope_summary = _get_genomescope_summary('summary.txt',
                                                   os.path.exists('model.txt'))

    # Upload the output files.
    output['genomescope_figures'] = [
        dxpy.upload_local_file('plot.png',
                               name='{0}.gs.png'.format(job_inputs['output_prefix'])),
        dxpy.upload_local_file('plot.log.png',
                               name='{0}.gs.log.png'.format(job_inputs['output_prefix']))
    ]
    output['genomescope_files'] = [
        dxpy.upload_local_file('summary.txt',
                               name='{0}.summary.txt'.format(job_inputs['output_prefix'])),
        dxpy.upload_local_file('progress.txt',
                               name='{0}.progress.txt'.format(job_inputs['output_prefix']))
    ]
    # If GenomeScope failed to converge for some reason, there will be no model.txt file.
    if os.path.exists('model.txt'):
        output['genomescope_files'].append(
            dxpy.upload_local_file('model.txt',
                                   name='{0}.model.txt'.format(job_inputs['output_prefix'])))
    output.update(genomescope_summary)
    return output
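# `_run_cmd` is referenced above but not defined in this snippet; a minimal
# sketch of what such a helper might look like (an assumption, not the
# original implementation):
def _run_cmd(cmd):
    """Echo a command list and run it, raising on a non-zero exit."""
    print(' '.join(cmd))
    subprocess.check_call(cmd)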
def coverage(CpG_context_dxlink, CHG_context_dxlink, CHH_context_dxlink,
             dme_ix_dxlink, target_root):
    '''subjob runs bismark2bedGraph and coverage2cytosine on mem3_hdd2_x8'''
    print "* coverage(): Retrieve context files and index..."
    CpG_context = 'output/CpG_context_%s.txt' % target_root
    CHG_context = 'output/CHG_context_%s.txt' % target_root
    CHH_context = 'output/CHH_context_%s.txt' % target_root
    run_cmd('mkdir -p output/')
    dxpy.download_dxfile(CpG_context_dxlink, CpG_context)
    dxpy.download_dxfile(CHG_context_dxlink, CHG_context)
    dxpy.download_dxfile(CHH_context_dxlink, CHH_context)
    dme_ix = "dme_index.tar.gz"
    dxpy.download_dxfile(dme_ix_dxlink, dme_ix)

    print "* coverage(): Uncompress index..."
    run_cmd('tar -zxf ' + dme_ix)

    (bedGraph_gz, cx_report) = bismark_coverage(target_root, CpG_context,
                                                CHG_context, CHH_context)

    print "* coverage(): Storing coverage results..."
    cx_report_dxfile = dxpy.upload_local_file(cx_report)
    bedgraph_gz_dxfile = dxpy.upload_local_file(bedGraph_gz)

    print "* coverage(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        "cx_report_dxlink": dxpy.dxlink(cx_report_dxfile),
        "bedgraph_gz_dxlink": dxpy.dxlink(bedgraph_gz_dxfile)
    }
def main(HiC_norm_binning_hdf5, HiC_data_object_hdf5, fend_object_hdf5,
         chromosome, contact_matrix_binsize, chrlen_file):
    dxpy.download_dxfile(HiC_norm_binning_hdf5, "HiC_norm_binning.hdf5")
    dxpy.download_dxfile(HiC_data_object_hdf5, "HiC_data_object.hdf5")
    dxpy.download_dxfile(fend_object_hdf5, "fend_object.hdf5")
    dxpy.download_dxfile(chrlen_file, "chrlen_file")

    command = ("cp -r /miniconda ~; cp -r /.conda ~; "
               "bash -c 'PATH=/home/dnanexus/miniconda/miniconda2/bin:$PATH; "
               "source activate gitar; which python; "
               "python /usr/bin/HiCtool_norm_contact_matrix.arg.py "
               "HiC_norm_binning.hdf5 {chromosome} {contact_matrix_binsize} "
               "chrlen_file .'").format(
                   chromosome=chromosome,
                   contact_matrix_binsize=contact_matrix_binsize)
    print(command)
    subprocess.call(command, shell=True)

    observed_contact_matrix_filename = subprocess.check_output(
        "ls -1 HiCtool_observed_contact_matrix*.txt", shell=True).strip()
    normalized_fend_contact_matrix_filename = subprocess.check_output(
        "ls -1 HiCtool_normalized_fend_contact_matrix*.txt", shell=True).strip()
    normalized_enrich_contact_matrix_filename = subprocess.check_output(
        "ls -1 HiCtool_normalized_enrich_contact_matrix*.txt", shell=True).strip()
    expected_fend_contact_matrix_filename = subprocess.check_output(
        "ls -1 HiCtool_expected_fend_contact_matrix*.txt", shell=True).strip()
    expected_enrich_contact_matrix_filename = subprocess.check_output(
        "ls -1 HiCtool_expected_enrich_contact_matrix*.txt", shell=True).strip()

    observed_contact_matrix_file = dxpy.upload_local_file(observed_contact_matrix_filename)
    normalized_fend_contact_matrix_file = dxpy.upload_local_file(normalized_fend_contact_matrix_filename)
    normalized_enrich_contact_matrix_file = dxpy.upload_local_file(normalized_enrich_contact_matrix_filename)
    expected_fend_contact_matrix_file = dxpy.upload_local_file(expected_fend_contact_matrix_filename)
    expected_enrich_contact_matrix_file = dxpy.upload_local_file(expected_enrich_contact_matrix_filename)

    return {
        "observed_contact_matrix": observed_contact_matrix_file,
        "normalized_fend_contact_matrix": normalized_fend_contact_matrix_file,
        "normalized_enrich_contact_matrix": normalized_enrich_contact_matrix_file,
        "expected_fend_contact_matrix": expected_fend_contact_matrix_file,
        "expected_enrich_contact_matrix": expected_enrich_contact_matrix_file
    }
def create_index_file(bam_filename, bam_dxlink):
    """Create Index file. Sorts BAM if needed."""
    print("Creating Index file.")
    index_filename = "{bam}.bai".format(bam=bam_filename)
    cmd_index = ['samtools', 'index', bam_filename]
    sorted_filename = bam_filename
    try:
        run_cmd(cmd_index)
    except NotIndexedException:
        print("Sorting BAM")
        sorted_filename = bam_filename[:-4] + '.sorted.bam'
        cmd_sort = ['samtools', 'sort', bam_filename,
                    bam_filename[:-4] + '.sorted']
        run_cmd(cmd_sort)
        print("Indexing BAM")
        index_cmd = ['samtools', 'index', sorted_filename]
        index_filename = "{sorted_bam_name}.bai".format(
            sorted_bam_name=sorted_filename)
        run_cmd(index_cmd)
    finally:
        index_file_link = dxpy.dxlink(dxpy.upload_local_file(index_filename))
        aligned_sorted_bam = dxpy.dxlink(dxpy.upload_local_file(sorted_filename))
        return aligned_sorted_bam, index_file_link
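# `NotIndexedException` and `run_cmd` are assumed helpers: `samtools index`
# exits non-zero on an unsorted BAM, so a wrapper might translate that exit
# status into the exception caught above. A hypothetical sketch:
class NotIndexedException(Exception):
    """Raised when samtools cannot index a (presumably unsorted) BAM."""

def run_cmd(cmd):
    proc = subprocess.Popen(cmd, stderr=subprocess.PIPE)
    _, err = proc.communicate()
    if proc.returncode != 0:
        raise NotIndexedException(err)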
def get_use_bases_mask(self, output_folder):
    '''
    command = "python calculate_use_bases_mask.py {runinfoFile} {sampleSheet} {lane}"
    gbsc_utils.createSubprocess(cmd=command)
    '''
    misc_subfolder = output_folder + '/miscellany'
    run_info_file = 'RunInfo.xml'
    command = 'python calculate_use_bases_mask.py %s ' % run_info_file
    command += '%s ' % self.sample_sheet
    command += '%d ' % int(self.lane_index)
    command += '%d' % int(self.bcl2fastq_version)
    stdout, stderr = self.createSubprocess(cmd=command, pipeStdout=True)
    self.use_bases_mask = stdout
    print 'This is use_bases_mask value: %s' % self.use_bases_mask

    use_bases_mask_file = 'use_bases_mask.txt'
    with open(use_bases_mask_file, 'w') as OUT:
        OUT.write(self.use_bases_mask)
    dxpy.upload_local_file(filename=use_bases_mask_file,
                           properties=None,
                           project=self.lane_project_id,
                           folder=misc_subfolder,
                           parents=True)
    return self.use_bases_mask
def main(bam1, bam2, RE_site_bed):
    dxpy.download_dxfile(bam1, "input1.bam")
    dxpy.download_dxfile(bam2, "input2.bam")
    dxpy.download_dxfile(RE_site_bed, "RE.bed")

    command = ("cp -r /miniconda ~; cp -r /.conda ~; "
               "bash -c 'PATH=/home/dnanexus/miniconda/miniconda2/bin:$PATH; "
               "source activate gitar; which python; "
               "python /usr/bin/HiCtool_hifive.arg.py input1.bam input2.bam RE.bed .'")
    print(command)
    subprocess.call(command, shell=True)

    fend_object_hdf5_filename = "./fend_object.hdf5"
    HiC_data_object_hdf5_filename = "./HiC_data_object.hdf5"
    HiC_distance_function_hdf5_filename = "./HiC_distance_function.hdf5"
    HiC_norm_binning_hdf5_filename = "./HiC_norm_binning.hdf5"
    HiC_project_object_hdf5_filename = "./HiC_project_object.hdf5"

    # fend_object_hdf5_file = dxpy.upload_local_file(fend_object_hdf5_filename, folder=outdir)
    # HiC_data_object_hdf5_file = dxpy.upload_local_file(HiC_data_object_hdf5_filename, folder=outdir)
    # HiC_distance_function_hdf5_file = dxpy.upload_local_file(HiC_distance_function_hdf5_filename, folder=outdir)
    # HiC_norm_binning_hdf5_file = dxpy.upload_local_file(HiC_norm_binning_hdf5_filename, folder=outdir)
    # HiC_project_object_hdf5_file = dxpy.upload_local_file(HiC_project_object_hdf5_filename, folder=outdir)
    fend_object_hdf5_file = dxpy.upload_local_file(fend_object_hdf5_filename)
    HiC_data_object_hdf5_file = dxpy.upload_local_file(HiC_data_object_hdf5_filename)
    HiC_distance_function_hdf5_file = dxpy.upload_local_file(HiC_distance_function_hdf5_filename)
    HiC_norm_binning_hdf5_file = dxpy.upload_local_file(HiC_norm_binning_hdf5_filename)
    HiC_project_object_hdf5_file = dxpy.upload_local_file(HiC_project_object_hdf5_filename)

    return {
        "fend_object_hdf5": fend_object_hdf5_file,
        "HiC_data_object_hdf5": HiC_data_object_hdf5_file,
        "HiC_distance_function_hdf5": HiC_distance_function_hdf5_file,
        "HiC_norm_binning_hdf5": HiC_norm_binning_hdf5_file,
        "HiC_project_object_hdf5": HiC_project_object_hdf5_file
    }
def main(HiC_norm_binning_hdf5, HiC_data_object_hdf5, fend_object_hdf5,
         contact_matrix_binsize, chrlen_file):
    dxpy.download_dxfile(HiC_norm_binning_hdf5, "HiC_norm_binning.hdf5")
    dxpy.download_dxfile(HiC_data_object_hdf5, "HiC_data_object.hdf5")
    dxpy.download_dxfile(fend_object_hdf5, "fend_object.hdf5")
    dxpy.download_dxfile(chrlen_file, "chrlen_file")

    FH_chrlen = open("chrlen_file", "r")
    chrs = [line.strip().split('\t')[0] for line in FH_chrlen]

    matrix_list_filename = "HiCtool_matrix_list.txt"
    fout = open(matrix_list_filename, "w")
    for chr in chrs:
        command = ("cp -r /miniconda ~; cp -r /.conda ~; "
                   "bash -c 'PATH=/home/dnanexus/miniconda/miniconda2/bin:$PATH; "
                   "source activate gitar; which python; "
                   "python /usr/bin/HiCtool_norm_contact_matrix.arg.py "
                   "HiC_norm_binning.hdf5 {chromosome} {contact_matrix_binsize} "
                   "chrlen_file .'").format(
                       chromosome=chr,
                       contact_matrix_binsize=contact_matrix_binsize)
        print(command)
        subprocess.call(command, shell=True)
        observed_matrix_filename = subprocess.check_output(
            "ls -1 HiCtool_observed_contact_matrix_chr{chr}_*txt".format(chr=chr),
            shell=True).strip()
        normalized_fend_matrix_filename = subprocess.check_output(
            "ls -1 HiCtool_normalized_fend_contact_matrix_chr{chr}_*txt".format(chr=chr),
            shell=True).strip()
        normalized_enrich_matrix_filename = subprocess.check_output(
            "ls -1 HiCtool_normalized_enrich_contact_matrix_chr{chr}_*txt".format(chr=chr),
            shell=True).strip()
        fout.write("chr" + chr + "\t" + observed_matrix_filename + "\t" +
                   normalized_fend_matrix_filename + "\t" +
                   normalized_enrich_matrix_filename + '\n')
    fout.close()
    FH_chrlen.close()

    all_contact_matrices_filename = "HiCtool_contact_matrices.tar.gz"
    subprocess.call("tar -czf {gzfile} HiCtool_*contact_matrix*txt".format(
        gzfile=all_contact_matrices_filename), shell=True)

    all_contact_matrices_file = dxpy.upload_local_file(all_contact_matrices_filename)
    matrix_list_file = dxpy.upload_local_file(matrix_list_filename)

    return {
        "all_contact_matrices": all_contact_matrices_file,
        "matrix_list": matrix_list_file
    }
def main(input_bam, paired=True, params=''):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    input_bam = dxpy.DXFile(input_bam)
    base_name = remove_extensions(input_bam.describe()['name'],
                                  [".bam", ".BAM", ".sam", ".SAM"])

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(input_bam.get_id(), "input.bam")

    # Fill in your application code here.
    command = "java -Xmx6g -jar /opt/jar/SamToFastq.jar INPUT=input.bam F=%s_1.fastq" % base_name
    if paired:
        command += " F2=%s_2.fastq" % base_name
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    output = {}
    fastq_file = dxpy.upload_local_file("%s_1.fastq" % base_name)
    output["fastq_file"] = dxpy.dxlink(fastq_file)
    if paired:
        paired_fastq_file = dxpy.upload_local_file("%s_2.fastq" % base_name)
        output["paired_fastq_file"] = dxpy.dxlink(paired_fastq_file)
    return output
def process(fastq):
    # Change the following to process whatever input this stage
    # receives. You may also want to copy and paste the logic to download
    # and upload files here as well if this stage receives file input
    # and/or makes file output.
    print fastq
    reads_filename = dxpy.describe(fastq)["name"]
    # Note: str.rstrip() strips a trailing *character set*, not a suffix, so
    # the original rstrip(".gz").rstrip(".fq").rstrip(".fastq") chain could
    # mangle some names; strip the extensions explicitly instead.
    reads_basename = reads_filename
    for ext in (".gz", ".fq", ".fastq"):
        if reads_basename.endswith(ext):
            reads_basename = reads_basename[:-len(ext)]
    reads_file = dxpy.download_dxfile(fastq, "fastq.gz")

    subprocess.check_call(["mkdir", "output"])
    print "Run QC"
    fqc_command = "/usr/bin/FastQC/fastqc fastq.gz -o output"
    print fqc_command
    stdio = subprocess.check_output(shlex.split(fqc_command))
    print stdio
    print subprocess.check_output(["ls", "-l", "output"])
    subprocess.check_call(["unzip", "output/fastq_fastqc.zip"])

    print "Upload results"
    subprocess.check_call(["mv", "fastq_fastqc/fastqc_data.txt",
                           "%s_data.txt" % reads_basename])
    subprocess.check_call(["mv", "fastq_fastqc/summary.txt",
                           "%s_summary.txt" % reads_basename])
    subprocess.check_call(["mv", "output/fastq_fastqc.zip",
                           "%s_fastqc.zip" % reads_basename])
    report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename)
    summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename)
    zip_dxfile = dxpy.upload_local_file("%s_fastqc.zip" % reads_basename)
    print report_dxfile
    return {"report": report_dxfile, "summary": summary_dxfile, "zip": zip_dxfile}
def create_final_set_of_peak_calls(job_inputs):
    replicate_idr_prefixes = [r.replace('.tar.gz', '')
                              for r in job_inputs['replicate_idr_files']]
    pseudo_replicate_idr_prefixes = [r.replace('.tar.gz', '')
                                     for r in job_inputs['pseudo_replicate_idr_files']]
    pooled_pseudo_replicate_idr_prefix = \
        job_inputs['pooled_pseudo_replicate_idr_files'].replace('.tar.gz', '')

    (num_peaks_each_rep,
     num_peaks_each_pseudo_rep,
     numPeaks_Rep0) = get_thresholds(replicate_idr_prefixes,
                                     pseudo_replicate_idr_prefixes,
                                     pooled_pseudo_replicate_idr_prefix,
                                     job_inputs['replicate_peaks_threshold'],
                                     job_inputs['pseudo_replicate_peaks_threshold'],
                                     job_inputs['pooled_pseudo_replicate_peaks_threshold'])
    max_numPeaks_Rep = max(num_peaks_each_rep)

    pooled_replicates_peaks_fn = download_and_gunzip_file(
        job_inputs['pooled_replicate_peaks_file'])
    coi = {'signal.value': 7, 'p.value': 8, 'q.value': 9}[job_inputs['ranking_measure']]

    cmd = 'sort -k{0}nr,{0}nr "{1}" | head -n {2} | gzip -c > "{3}_conservative.regionPeak.gz"'.format(
        coi, pooled_replicates_peaks_fn, max_numPeaks_Rep, job_inputs['output_prefix'])
    print cmd
    subprocess.check_output(cmd, shell=True)

    opt_thresh = max(max_numPeaks_Rep, numPeaks_Rep0)
    cmd = 'sort -k{0}nr,{0}nr "{1}" | head -n {2} | gzip -c > "{3}_optimal.regionPeak.gz"'.format(
        coi, pooled_replicates_peaks_fn, opt_thresh, job_inputs['output_prefix'])
    print cmd
    subprocess.check_output(cmd, shell=True)

    conservative_result = dxpy.upload_local_file(
        '{0}_conservative.regionPeak.gz'.format(job_inputs['output_prefix']))
    optimal_result = dxpy.upload_local_file(
        '{0}_optimal.regionPeak.gz'.format(job_inputs['output_prefix']))

    return {'conservative_peak_calls': dxpy.dxlink(conservative_result),
            'optimal_peak_calls': dxpy.dxlink(optimal_result),
            'num_peaks_each_rep': num_peaks_each_rep,
            'num_peaks_each_pseudo_rep': num_peaks_each_pseudo_rep,
            'num_peaks_pooled_pseudo_rep': numPeaks_Rep0}
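# `download_and_gunzip_file` is defined elsewhere; a plausible sketch of its
# behavior, assuming it returns the local, decompressed filename:
def download_and_gunzip_file(dxlink):
    dxf = dxpy.DXFile(dxlink)
    dxpy.download_dxfile(dxf.get_id(), dxf.name)
    if dxf.name.endswith('.gz'):
        subprocess.check_call(['gunzip', '-f', dxf.name])
        return dxf.name[:-3]
    return dxf.name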
def makeInputsBwa():
    try:
        contigset_importer = dxpy.DXApplet(
            dxpy.find_data_objects(
                classname="applet",
                properties={"name": "fasta_contigset_importer"}).next()['id'])
        reads_importer = dxpy.DXApplet(
            dxpy.find_data_objects(
                classname="applet",
                properties={"name": "Letter Space FASTQ importer"}).next()['id'])
    except StopIteration:
        raise Exception("fasta_contigset_importer or Letter Space FASTQ importer "
                        "not found, please upload them")

    genome_archive = dxpy.upload_local_file(
        os.path.join(test_resources_dir, "hg19_chr22.fa.xz"), wait_on_close=True)
    contigset_importer_input = {"name": "hg19_chr22",
                                "sequence_file": dxpy.dxlink(genome_archive)}
    print "Running fasta_contigset_importer with", contigset_importer_input
    job = contigset_importer.run(contigset_importer_input)
    job.wait_on_done()
    contig_set = job.describe()["output"]["contig_set"]

    left_reads = dxpy.upload_local_file(
        os.path.join(test_resources_dir, "small_left.fq"), wait_on_close=True)
    right_reads = dxpy.upload_local_file(
        os.path.join(test_resources_dir, "small_right.fq"), wait_on_close=True)
    # left_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "SRR188205_1_1M.fastq.xz"), wait_on_close=True)
    # right_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "SRR188205_2_1M.fastq.xz"), wait_on_close=True)

    reads_importer_input = {"left_file": dxpy.dxlink(left_reads),
                            "right_file": dxpy.dxlink(right_reads)}
    print "Running LetterSpaceFileObjectToReadsTable with", reads_importer_input
    job = reads_importer.run(reads_importer_input)
    job.wait_on_done()
    reads = job.describe()["output"]["reads"]

    return {"reads": [reads] * 3, "reference": contig_set}
def test_get_applet_with_asset(self):
    bundle_name = "test-bundle-depends.tar.gz"
    bundle_tmp_dir = tempfile.mkdtemp()
    os.mkdir(os.path.join(bundle_tmp_dir, "a"))
    with open(os.path.join(bundle_tmp_dir, 'a', 'foo.txt'), 'w') as file_in_bundle:
        file_in_bundle.write('foo\n')
    subprocess.check_call(['tar', '-czf', os.path.join(bundle_tmp_dir, bundle_name),
                           '-C', os.path.join(bundle_tmp_dir, 'a'), '.'])
    bundle_file = dxpy.upload_local_file(
        filename=os.path.join(bundle_tmp_dir, bundle_name),
        project=self.project,
        wait_on_close=True)

    asset_file = dxpy.upload_local_file(
        filename=os.path.join(bundle_tmp_dir, bundle_name),
        project=self.project,
        wait_on_close=True)
    dxrecord_details = {"archiveFileId": {"$dnanexus_link": asset_file.get_id()}}
    dxrecord = dxpy.new_dxrecord(project=self.project,
                                 types=["AssetBundle"],
                                 details=dxrecord_details,
                                 name='asset-lib-test',
                                 properties={"version": "0.0.1"})
    dxrecord.close()
    asset_bundle_id = dxrecord.get_id()
    asset_file.set_properties({"AssetBundle": asset_bundle_id})

    code_str = """#!/bin/bash
main(){
    echo 'Hello World'
}
"""
    app_spec = {
        "name": "asset_depends",
        "dxapi": "1.0.0",
        "runSpec": {
            "code": code_str,
            "interpreter": "bash",
            "assetDepends": [{"id": asset_bundle_id}],
            "bundledDepends": [{"name": bundle_name,
                                "id": {"$dnanexus_link": bundle_file.get_id()}}]
        },
        "inputSpec": [],
        "outputSpec": [],
        "version": "1.0.0"
    }
    app_dir = self.write_app_directory("asset_depends", json.dumps(app_spec))
    asset_applet_id = json.loads(run("dx build --json {app_dir}".format(app_dir=app_dir)))["id"]
    with chdir(tempfile.mkdtemp()):
        run("dx get --omit-resources " + asset_applet_id)
        self.assertTrue(os.path.exists("asset_depends"))
        self.assertFalse(os.path.exists(os.path.join("asset_depends", "resources")))
        self.assertTrue(os.path.exists(os.path.join("asset_depends", "dxapp.json")))
        applet_spec = json.load(open(os.path.join("asset_depends", "dxapp.json")))
        self.assertEqual([{"name": "asset-lib-test",
                           "project": self.project,
                           "folder": "/",
                           "version": "0.0.1"}],
                         applet_spec["runSpec"]["assetDepends"])
        self.assertEqual([{"name": bundle_name,
                           "id": {"$dnanexus_link": bundle_file.get_id()}}],
                         applet_spec["runSpec"]["bundledDepends"])
def main(input_bam, paired_end):
    input_bam_file = dxpy.DXFile(input_bam)
    input_bam_filename = input_bam_file.name
    # str.rstrip() strips a character set rather than the literal '.bam'
    # suffix, so remove the extension explicitly.
    input_bam_basename = input_bam_file.name
    if input_bam_basename.endswith('.bam'):
        input_bam_basename = input_bam_basename[:-4]
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    subprocess.check_output('ls -l', shell=True)

    # ===================
    # Create tagAlign file
    # ===================
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)

    subprocess.check_output('ls -l', shell=True)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        command = \
            "samtools sort -@ %d -n %s %s" \
            % (cpu_count(), input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(command)
        subprocess.check_call(shlex.split(command))
        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)

    subprocess.check_output('ls -l', shell=True)

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    output = {}
    output["tagAlign_file"] = dxpy.dxlink(tagAlign_file)
    if paired_end:
        output["BEDPE_file"] = dxpy.dxlink(BEDPE_file)
    return output
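# `common.run_pipe` is an external helper; a rough sketch of its assumed
# behavior (join the shell fragments with pipes, optionally redirect the last
# stage to `outfile`, and return (stdout, stderr)):
def run_pipe(steps, outfile=None):
    pipeline = ' | '.join(steps)
    if outfile:
        pipeline += ' > ' + outfile
    proc = subprocess.Popen(pipeline, shell=True,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return proc.communicate()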
def postprocess(bam_files, report_files, bam_root, nthreads=8, use_cat=False,
                use_sort=False):
    # This is the "gather" phase which aggregates and performs any
    # additional computation after the "map" (and therefore after all
    # the "process") jobs are done.
    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    logger.debug("** In Postprocess - refactored dme-merge-bams - *")

    versions = "Unknown"
    if os.path.isfile(VERSION_SCRIPT):
        try:
            versions = subprocess.check_output(
                shlex.split('tool_versions.py --dxjson dnanexus-executable.json'))
        except:
            pass

    merged_bam = merge_bams(bam_files, bam_root, use_cat, use_sort, nthreads)
    (merged_report, report_file_names) = merge_reports(bam_root, report_files, bam_root)
    (merged_qc, nreads, metadata) = merge_qc(bam_root, report_file_names)

    props = {
        'SW': versions,
        'reads': nreads,
    }
    output = {
        "bam_techrep": dxpy.dxlink(
            dxpy.upload_local_file(merged_bam, details=metadata,
                                   properties=props)),
        "bam_techrep_qc": dxpy.dxlink(
            dxpy.upload_local_file(merged_qc, details=metadata,
                                   properties={'SW': versions})),
        "map_techrep": dxpy.dxlink(
            dxpy.upload_local_file(merged_report, details=metadata,
                                   properties={'SW': versions})),
        "reads": nreads,
        "metadata": json.dumps(metadata)
    }
    return output
def main(input_SAM, deviations=None, histogram_width=None, min_percent=None,
         metric_acc_level=None, ref=None, is_sorted=None, stop_after=None):
    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(input_SAM, "input")
    if ref != None:
        dxpy.download_dxfile(ref, "ref.fa")

    command = "java -Xmx2g -jar /CollectInsertSizeMetrics.jar"
    command += " INPUT=input"
    command += " OUTPUT=insert_distribution.txt"
    command += " HISTOGRAM_FILE=histogram.pdf"
    if deviations != None:
        command += " DEVIATIONS=" + str(deviations)
    if histogram_width != None:
        command += " HISTOGRAM_WIDTH=" + str(histogram_width)
    if min_percent != None:
        # Fixed: this previously appended histogram_width instead of min_percent.
        command += " MINIMUM_PCT=" + str(min_percent)
    if metric_acc_level != None:
        for level in metric_acc_level:
            command += " METRIC_ACCUMULATION_LEVEL=" + str(level)
    if ref != None:
        command += " REFERENCE_SEQUENCE=ref.fa"
    if is_sorted != None:
        if is_sorted:
            command += " ASSUME_SORTED=true"
        else:
            command += " ASSUME_SORTED=false"
    if stop_after != None:
        command += " STOP_AFTER=" + str(stop_after)

    print "Executing:"
    print command
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    histogram = dxpy.upload_local_file("histogram.pdf")
    histogram.rename(dxpy.DXFile(input_SAM).describe()['name'] + "_histogram.pdf")
    output_dist = dxpy.upload_local_file("insert_distribution.txt")
    output_dist.rename(dxpy.DXFile(input_SAM).describe()['name'] + "_insert_dist.txt")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["histogram"] = dxpy.dxlink(histogram)
    output["output"] = dxpy.dxlink(output_dist)
    return output
def upload(self, to_upload, **kwargs):
    """Upload a list of files and directories to a directory.

    This is not a batch level operation. If some file errors, the files
    uploaded before will remain present.

    Args:
        to_upload (List[Union[str, OBSUploadObject]]): A list of posix file
            names, directory names, or OBSUploadObject objects to upload.

    Raises:
        ValueError: When source path is not a directory
        TargetExistsError: When destination directory already exists
    """
    dx_upload_objects = [
        name for name in to_upload
        if isinstance(name, OBSUploadObject)
    ]
    all_files_to_upload = utils.walk_files_and_dirs([
        name for name in to_upload
        if not isinstance(name, OBSUploadObject)
    ])
    dx_upload_objects.extend([
        OBSUploadObject(
            f,
            object_name=('/' + self.resource if self.resource else Path('')) /
            utils.file_name_to_object_name(f))
        for f in all_files_to_upload
    ])

    for upload_obj in dx_upload_objects:
        upload_obj.object_name = Path(upload_obj.object_name)
        upload_obj.source = Path(upload_obj.source)
        dest_file = Path('{drive}{project}:{path}'.format(
            drive=self.drive,
            project=self.canonical_project,
            path=upload_obj.object_name))

        if upload_obj.source.isfile():
            dest_is_file = dest_file.isfile()
            if dest_is_file:  # only occurs if upload is called directly with existing objects
                logger.warning(
                    'Destination path ({}) already exists, will not cause '
                    'duplicate file objects on the platform. Skipping...'
                    .format(dest_file))
            else:
                with _wrap_dx_calls():
                    dxpy.upload_local_file(
                        filename=upload_obj.source,
                        project=self.canonical_project,
                        folder='/' + (dest_file.parent.resource or ''),
                        parents=True,
                        name=dest_file.name)
        elif upload_obj.source.isdir():
            dest_file.makedirs_p()
        else:
            raise stor_exceptions.NotFoundError(
                'Source path ({}) does not exist. Please provide a valid source'
                .format(upload_obj.source))
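# Hypothetical usage of `upload`, assuming a stor-style DX path object whose
# drive, project, and resource are already set (the path and filenames are
# illustrative only):
#
#     p = Path('dx://project-xxxx:/data')
#     p.upload(['./local_dir',
#               OBSUploadObject('./notes.txt', object_name='docs/notes.txt')])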
def once():
    try:
        dxpy.upload_local_file(filename=local_path,
                               project=project.get_id(),
                               folder=destFolder,
                               wait_on_close=True)
        return True
    except:
        return False
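# `once` swallows exceptions and reports success as a boolean so that callers
# can retry; a minimal driver sketch (the attempt count and delay are
# assumptions, and `time` must be imported):
def upload_with_retries(attempts=3, delay_sec=5):
    for _ in range(attempts):
        if once():
            return True
        time.sleep(delay_sec)
    return False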
def main(**kwargs):
    dxpy.download_folder(DCC_CREDENTIALS_PROJECT, '.', folder=DCC_CREDENTIALS_FOLDER)
    if 'key' in kwargs:
        key = '-'.join([dxpy.api.system_whoami()['id'], kwargs.pop('key')])
    else:
        key = dxpy.api.system_whoami()['id']
    key_tuple = common.processkey(key, KEYFILE)
    if not key_tuple:
        logger.error("Key %s is not found in the keyfile %s" % (key, KEYFILE))
        raise PortalCredentialsError("Supply a valid keypair ID")
    authid, authpw, server = key_tuple
    if 'url' in kwargs:
        server = kwargs.pop('url')
    keypair = (authid, authpw)

    tokens = ['python3 checkfiles.py']
    for k, v in kwargs.iteritems():
        if isinstance(v, bool):
            if v:
                tokens.append("--" + k.replace('_', '-'))
            continue
        if isinstance(v, str) or isinstance(v, unicode) or isinstance(v, int):
            tokens.append(' '.join(["--" + k.replace('_', '-'), str(v)]))

    if 'dx_file' in kwargs:
        dxfile = dxpy.DXFile(kwargs.get('dx_file'))
        local_file = dxpy.download_dxfile(dxfile, dxfile.name)
        tokens.append("--local-file %s" % (dxfile.name))

    # this is just to get a command string to print that has no secrets
    tokens_safe = deepcopy(tokens)
    tokens_safe.append("--username %s --password %s" % ("." * len(authid),
                                                        "." * len(authpw)))
    tokens_safe.append(server)
    logger.info(' '.join(tokens_safe))

    tokens.append("--username %s --password %s" % (authid, authpw))
    # this needs to be the last token
    tokens.append(server)

    checkfiles_command = ' '.join(tokens)
    subprocess.check_call(shlex.split(checkfiles_command))

    output = {}
    outfilename = kwargs.get('out')
    errfilename = kwargs.get('err')
    if outfilename:
        out = dxpy.upload_local_file(outfilename)
        output.update({'out': dxpy.dxlink(out)})
    if errfilename:
        err = dxpy.upload_local_file(errfilename)
        output.update({'err': dxpy.dxlink(err)})
    return output
def merge_extract(bam_set, map_report_set, dme_ix_dxlink, uncompress_bam, props):
    '''subjob runs bismark_methylation_extractor on mem1_hdd2_x32'''
    (target_root, biorep_bam) = merge_bams(bam_set, 32)
    (biorep_map, all_reports) = merge_map_reports(map_report_set, target_root)
    (qc_metrics, reads, biorep_bam_qc) = biorep_bam_qc_metrics(target_root, all_reports)

    print "* merge_extract(): Retrieve and uncompress index..."
    dme_ix = "dme_index.tar.gz"
    dxpy.download_dxfile(dme_ix_dxlink, dme_ix)
    run_cmd('tar -zxf ' + dme_ix)

    # NOTE: Better to use sam and let extractor use more threads, but this
    # takes up precious storage.
    (alignments, ncores) = bam_or_sam(biorep_bam, uncompress_bam, target_root)

    bismark_simple_extract(target_root, alignments, ncores)
    qc_metrics = bismark_qc_metrics(target_root, qc_metrics)

    print "* Retrieve split report..."
    append_line("\n===== bismark_methylation_extractor: splitting_report =====",
                biorep_bam_qc)
    run_cmd('cat %s_splitting_report.txt' % target_root,
            out=biorep_bam_qc, append=True, silent=True)

    # TODO: Is this even needed? Currently we do to get the size!
    # if len(bam_set) > 1:  # Wouldn't need to do this unless there is a merge
    #     print "* merge_extract(): Storing biorep bam..."
    #     props_ex = props.copy()
    #     props_ex.update({'reads': str(reads)})
    #     biorep_bam_dxlink = dxpy.dxlink(dxpy.upload_local_file(biorep_bam,
    #         properties=props_ex, details=qc_metrics, wait_on_close=True))
    # else:
    #     biorep_bam_dxlink = bam_set[0]

    print "* merge_extract(): Storing extraction results..."
    biorep_bam_qc_dxfile = dxpy.upload_local_file(biorep_bam_qc,
                                                  properties=props, details=qc_metrics)
    biorep_map_dxfile = dxpy.upload_local_file(biorep_map,
                                               properties=props, details=qc_metrics)
    # (The original uploaded the splitting report twice; once is enough.)
    split_report_dxfile = dxpy.upload_local_file(target_root + '_splitting_report.txt')
    chrom_sizes_dxfile = dxpy.upload_local_file('input/chrom.sizes')
    mbias_report_dxfile = dxpy.upload_local_file(target_root + '_mbias_report.txt',
                                                 properties=props, details=qc_metrics)
    CpG_context_dxfile = dxpy.upload_local_file('output/CpG_context_%s.txt' % (target_root))
    CHG_context_dxfile = dxpy.upload_local_file('output/CHG_context_%s.txt' % (target_root))
    CHH_context_dxfile = dxpy.upload_local_file('output/CHH_context_%s.txt' % (target_root))

    print "* merge_extract(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        # "biorep_bam_dxlink": biorep_bam_dxfile,
        "biorep_bam_qc_dxlink": dxpy.dxlink(biorep_bam_qc_dxfile),
        "biorep_map_dxlink": dxpy.dxlink(biorep_map_dxfile),
        "CpG_context_dxlink": dxpy.dxlink(CpG_context_dxfile),
        "CHG_context_dxlink": dxpy.dxlink(CHG_context_dxfile),
        "CHH_context_dxlink": dxpy.dxlink(CHH_context_dxfile),
        "split_report_dxlink": dxpy.dxlink(split_report_dxfile),
        "chrom_sizes_dxlink": dxpy.dxlink(chrom_sizes_dxfile),
        "mbias_report_dxlink": dxpy.dxlink(mbias_report_dxfile),
        "target_root": target_root,
        "qc_metrics": qc_metrics
    }
def process(filename, bucket_url, project, folder, skipvalidate=False):
    # Change the following to process whatever input this stage
    # receives. You may also want to copy and paste the logic to download
    # and upload files here as well if this stage receives file input
    # and/or makes file output.
    logger.debug(filename)

    test = list(dxpy.find_data_objects(classname='file',
                                       folder=folder,
                                       project=project,
                                       name_mode='exact',
                                       name=filename,
                                       return_handler=False))
    if not test or len(test) == 0:
        # cp the file from the bucket
        subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' % (bucket_url)),
                              stderr=subprocess.STDOUT)
        subprocess.check_call(shlex.split('ls -l %s' % (filename)))
        dx_file = dxpy.upload_local_file(filename, project=project, folder=folder)
    else:
        dxpy.download_dxfile(test[0]['id'], filename)
        dx_file = dxpy.dxfile.DXFile(test[0]['id'])

    # str.rstrip() strips a character set, not a suffix; strip the extensions
    # explicitly instead.
    reads_basename = filename
    for ext in ('.gz', '.fq', '.fastq'):
        if reads_basename.endswith(ext):
            reads_basename = reads_basename[:-len(ext)]

    if skipvalidate:
        return {
            "file": dx_file,
            "report": None,
            "summary": None,
            "zip": None
        }

    subprocess.check_call(['mkdir', 'output'])
    logger.info("Run QC")
    fqc_command = "/usr/bin/FastQC/fastqc " + filename + " -o output"
    logger.debug(fqc_command)
    stdio = subprocess.check_output(shlex.split(fqc_command))
    logger.debug(stdio)
    logger.debug(subprocess.check_output(['ls', '-l', 'output']))
    subprocess.check_call(['unzip', "output/%s_fastqc.zip" % reads_basename])

    logger.info("Upload results")
    subprocess.check_call(['mv',
                           "%s_fastqc/fastqc_data.txt" % reads_basename,
                           "%s_data.txt" % reads_basename])
    subprocess.check_call(['mv',
                           "%s_fastqc/summary.txt" % reads_basename,
                           "%s_summary.txt" % reads_basename])
    report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename,
                                           folder=folder, project=project)
    summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename,
                                            folder=folder, project=project)
    zip_dxfile = dxpy.upload_local_file("output/%s_fastqc.zip" % reads_basename,
                                        folder=folder, project=project)
    logger.debug(report_dxfile)
    return {
        "file": dx_file,
        "report": report_dxfile,
        "summary": summary_dxfile,
        "zip": zip_dxfile
    }
def main(cons1, cons2, outroot, xchr=True, recalnums=1, skip=20, timemax=7500000.0):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    cons1 = dxpy.DXFile(cons1)
    cons2 = dxpy.DXFile(cons2)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(cons1.get_id(), "cons1")
    dxpy.download_dxfile(cons2.get_id(), "cons2")

    outname1 = outroot + '.psmcfa'
    outname2 = outroot + '.psmc'

    # Create the psmcfa file.
    createPSMCfa('cons1', 'cons2', outname1, skip)
    print 'Generated the PSMC fasta file.'
    sys.stdout.flush()

    # Run psmc the first time.
    subprocess.check_call(['psmc', '-t', '15', '-r', '5', '-p', "4+25*2+4+6",
                           '-o', 'test.psmc', outname1])
    print 'Done with first run of PSMC.'
    sys.stdout.flush()

    # Run the recal script and run psmc again.
    while (recalnums > 1):
        (tmaxNew, parfile) = writeRecalFile('test.psmc', timemax, skip, xchr)
        subprocess.check_call(['psmc', '-t', str(round(tmaxNew, 4)), '-i', parfile,
                               '-o', 'test.psmc', outname1])
        recalnums -= 1
        print 'Recals left', recalnums
        sys.stdout.flush()
    (tmaxNew, parfile) = writeRecalFile('test.psmc', timemax, skip, xchr)
    subprocess.check_call(['psmc', '-t', str(round(tmaxNew, 4)), '-i', parfile,
                           '-o', outname2, outname1])
    print 'Finished final recalibration run.'

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    outfile1 = dxpy.upload_local_file(outname1)
    outfile2 = dxpy.upload_local_file(outname2)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["outfile1"] = dxpy.dxlink(outfile1)
    output["outfile2"] = dxpy.dxlink(outfile2)
    return output
def process(scattered_input, dme_ix, ncpus, reads_root):
    # Fill in code here to process the input and create output.
    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    dme_ix = dxpy.DXFile(dme_ix)
    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(dme_ix.get_id(), "index.tgz")
    fq = dxpy.DXFile(scattered_input)
    name = fq.describe()['name']
    dxpy.download_dxfile(fq.get_id(), name)

    bam_root = name + '_techrep'

    logger.info("* === Calling DNAnexus and ENCODE independent script... ===")
    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug(subprocess.check_output(shlex.split('head %s' % name)))
    if os.path.isfile(ALIGN_SCRIPT):
        logger.debug("** Executable %s exists" % ALIGN_SCRIPT)
    else:
        logger.debug("** Executable %s DOES NOT exist" % ALIGN_SCRIPT)
        exit(1)
    align_cmd = '%s index.tgz %s %s %s no_stats' % (ALIGN_SCRIPT, name,
                                                    str(ncpus), bam_root)
    logger.debug('** command line: %s' % align_cmd)
    map_out = subprocess.check_output(shlex.split(align_cmd))
    logger.info("* === Returned from dname_align_se ===")

    # As always, you can choose not to return output if the
    # "postprocess" stage does not require any input, e.g. rows have
    # been added to a GTable that has been created in advance. Just
    # make sure that the "postprocess" job does not run until all
    # "process" jobs have finished by making it wait for "map" to
    # finish using the depends_on argument (this is already done for
    # you in the invocation of the "postprocess" job in "main").
    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug("** OUTPUT DIR: %s" % os.listdir('output/'))

    os.rename(bam_root + '_bismark.bam', bam_root + '.bam')
    return {
        "bam_file": dxpy.dxlink(dxpy.upload_local_file(bam_root + '.bam')),
        "report_file": dxpy.dxlink(
            dxpy.upload_local_file(bam_root + '_bismark_map_report.txt'))
    }
def create_sample_sheet(self, output_folder):
    '''
    Description:
    '''
    misc_subfolder = output_folder + '/miscellany'

    command = 'python create_sample_sheet.py -r %s ' % self.run_name
    command += '-t %s ' % self.lims_token
    command += '-u %s ' % self.lims_url
    command += '-b %d ' % int(self.bcl2fastq_version)
    command += '-l %d' % int(self.lane_index)
    stdout, stderr = self.createSubprocess(cmd=command, pipeStdout=True)
    self.sample_sheet = '%s_L%d_samplesheet.csv' % (self.run_name, self.lane_index)
    stdout_elements = stdout.split()
    self.sample_sheet = stdout_elements[1]
    print 'This is the self.sample_sheet: %s' % self.sample_sheet

    # DEV: This is a dirty hack. Need to fix issue in LIMS ASAP -PBR 6/6/2016
    if self.seq_instrument not in ['Cooper', 'Gadget']:
        print 'This is not a HiSeq 4000 run: need to RC i5s'
        self.reverse_complement_i5(self.sample_sheet)
    else:
        print 'This is a HiSeq 4000 run; indexes are fine'

    # Reverse complement i5 index keys in barcode_dict
    dual_index = False
    barcode_dict_rci5 = {}
    for key in self.barcode_dict.keys():
        indexes = key.split('-')
        if len(indexes) > 1:
            dual_index = True
            index_i7 = indexes[0]
            index_rci5 = reverse_complement(indexes[1])
            barcode_rci5 = '-'.join([index_i7, index_rci5])
            barcode_dict_rci5[barcode_rci5] = self.barcode_dict[key]
    if dual_index == True:
        self.barcode_dict = barcode_dict_rci5

    # DEV: insert check so that samplesheet is only uploaded if does not exist.
    # Also, maybe add it to output?
    dxpy.upload_local_file(filename=self.sample_sheet,
                           properties=None,
                           project=self.lane_project_id,
                           folder=misc_subfolder,
                           parents=True)
    return self.sample_sheet
def sort_bam(job_inputs):
    input_bam = dxpy.DXFile(job_inputs['input_bam'])
    fn = input_bam.describe()['name']
    dxpy.download_dxfile(input_bam.get_id(), fn)

    # Sort and optionally remove unmapped and multimapped reads.
    sorted_ofn = os.path.splitext(fn)[0] + '_sorted.bam'
    cmd = '/sambamba sort -t {0} -o /dev/stdout {1} '.format(
        multiprocessing.cpu_count() - 1, fn)
    if job_inputs['quality_filter']:
        cmd += ('| /sambamba view -f bam -F "(mapping_quality > 1) and not unmapped" '
                '-o /dev/stdout /dev/stdin ')
    cmd += '> ' + sorted_ofn
    print cmd
    subprocess.check_call(cmd, shell=True)

    # Count mapped, unique reads.
    cmd = ('/sambamba view -f bam -F "(mapping_quality > 1) and not unmapped" -c '
           + sorted_ofn)
    print cmd
    num_uniquely_mapped_reads = int(subprocess.check_output(cmd, shell=True).strip())

    pcr_bottleneck_coefficient = calc_pcr_bottleneck_coefficient(sorted_ofn)

    final_ofn = sorted_ofn
    if job_inputs['remove_duplicates']:
        deduped_ofn = os.path.splitext(sorted_ofn)[0] + '_deduped.bam'
        md_metrics_ofn = os.path.splitext(sorted_ofn)[0] + '_deduped_metrics.txt'
        cmd = get_java_cmd()
        cmd += (' -jar /MarkDuplicates.jar I={0} O={1} METRICS_FILE={2} '
                'ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT '
                'REMOVE_DUPLICATES=true ').format(sorted_ofn, deduped_ofn,
                                                  md_metrics_ofn)
        print cmd
        subprocess.check_call(cmd, shell=True)
        bam_file = dxpy.dxlink(dxpy.upload_local_file(deduped_ofn).get_id())
        metrics_file = dxpy.dxlink(dxpy.upload_local_file(md_metrics_ofn).get_id())
        final_ofn = deduped_ofn
    else:
        bam_file = dxpy.dxlink(dxpy.upload_local_file(sorted_ofn).get_id())
        metrics_file = None

    return {
        'output_bam': bam_file,
        'dedup_metrics_file': metrics_file,
        'qc_uniquely_mapped_reads': num_uniquely_mapped_reads,
        'qc_pcr_bottleneck_coefficient': pcr_bottleneck_coefficient
    }
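# `get_java_cmd` is not shown; a plausible sketch that sizes the JVM heap from
# the machine's memory (the 90% fraction is an assumption):
def get_java_cmd():
    mem_kb = int(subprocess.check_output(
        "head -n1 /proc/meminfo | awk '{print $2}'", shell=True))
    return 'java -Xmx%dm' % (mem_kb * 9 / 10 / 1024)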
def map_reads_minimap2(reads, genome_fastagz, genome_mmi, datatype):
    # Download inputs
    reads = [dx_utils.download_and_gunzip_file(f, skip_decompress=True)
             for f in reads]
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ref_genome_mmi = dx_utils.download_and_gunzip_file(genome_mmi)

    # Configure preset params
    if datatype == 'PacBio':
        preset_param = 'map-pb'
    else:
        preset_param = 'map-ont'

    # Iterate over reads files
    output_ofns = []
    for read in reads:
        # (Regex tidied: the dot before "gz" is now escaped and the redundant
        # {1} quantifier dropped; behavior is unchanged.)
        output_prefix = re.sub(r"\.(fastq|fasta|fa|fq)(\.gz)?$", "", read)
        ofn = '{0}.mapped.bam'.format(output_prefix)
        # Get help info
        dx_utils.run_cmd(['minimap2', '-h'])
        # Call minimap2 and sort the output
        minimap2_cmd = ['minimap2', '-ax', preset_param, ref_genome, read]
        view_cmd = ['sambamba', 'view', '--sam-input', '--format=bam',
                    '--compression-level=0', '/dev/stdin']
        sort_cmd = ['sambamba', 'sort',
                    '-m', '{0}G'.format(int(dx_utils.get_memory(suffix='G'))),
                    '-o', ofn,
                    '-t', str(multiprocessing.cpu_count()),
                    '/dev/stdin']
        dx_utils.run_pipe(minimap2_cmd, view_cmd, sort_cmd)
        # Index the sorted BAM
        dx_utils.run_cmd(['sambamba', 'index', ofn])
        # Append to outputs
        output_ofns.append(ofn)

    return {
        'mapped_reads': [dxpy.dxlink(dxpy.upload_local_file(ofn))
                         for ofn in output_ofns],
        'mapped_reads_index': [dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai'))
                               for ofn in output_ofns]
    }
def combine_files(countDXlinks, resultfn):
    """The 'gather' subjob of the applet.

    Arguments:
        countDXlinks (list[dict]): list of DXlinks to process job output files.
        resultfn (str): Filename to use for job output file.

    Returns:
        DXLink for the main function to return as the job output.

    Note: Only the DXLinks are passed as parameters.
    Subjobs work on a fresh instance so files must be downloaded to the machine.
    """
    if resultfn.endswith(".bam"):
        resultfn = resultfn[:-4] + '.txt'

    sum_reads = 0
    with open(resultfn, 'w') as f:
        for i, dxlink in enumerate(countDXlinks):
            dxfile = dxpy.DXFile(dxlink)
            filename = "countfile{0}".format(i)
            dxpy.download_dxfile(dxfile, filename)
            with open(filename, 'r') as fsub:
                for line in fsub:
                    sum_reads += parse_line_for_readcount(line)
                    f.write(line)
        f.write('Total Reads: {0}'.format(sum_reads))

    countDXFile = dxpy.upload_local_file(resultfn)
    countDXlink = dxpy.dxlink(countDXFile.get_id())

    return {"countDXLink": countDXlink}
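# `parse_line_for_readcount` is not shown; a plausible sketch, assuming each
# count-file line leads with an integer read count:
def parse_line_for_readcount(line):
    try:
        return int(line.split()[0])
    except (IndexError, ValueError):
        return 0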
def main(fastq, genomeindex_targz):
    print "something else"
    fastq_dxfile = dxpy.DXFile(fastq)
    dxpy.download_dxfile(fastq_dxfile.get_id(), "input.fastq")
    genome_dxfile = dxpy.DXFile(genomeindex_targz)
    dxpy.download_dxfile(genome_dxfile.get_id(), "genome.tar.gz")

    os.makedirs("genome")
    tar_cmd = "tar xzvf genome.tar.gz -C genome"
    subprocess.check_call(tar_cmd, shell=True)
    genome_file = glob.glob("genome/*.bwt")[0]
    genome_file = re.sub("\.bwt$", "", genome_file)

    bwa_cmd = ("bwa mem -t {nproc} {genome} {fastq} | "
               "samtools view -u -S - | "
               "samtools sort -m 256M -@ {nproc} - output".format(
                   nproc=multiprocessing.cpu_count(),
                   genome=genome_file,
                   fastq="input.fastq"))
    subprocess.check_call(bwa_cmd, shell=True)

    bam = dxpy.upload_local_file("output.bam")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["bam"] = dxpy.dxlink(bam)
    return output
def main(BAMs, params='USE_THREADING=true SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT'):
    inputFiles = []
    for i in range(len(BAMs)):
        fh = dxpy.DXFile(BAMs[i])
        dxpy.download_dxfile(fh.get_id(), "input%d.bam" % (i))

    # str.rstrip() strips a character set, not the literal '.bam' suffix;
    # strip the extension explicitly.
    name = dxpy.DXFile(BAMs[0]).describe()['name']
    if name.endswith(".bam"):
        name = name[:-4]

    # Fill in your application code here.
    command = "java -Xmx4g -jar /opt/jar/MergeSamFiles.jar OUTPUT=%s.bam %s" % (name, params)
    for i in range(len(BAMs)):
        command += " INPUT=input%d.bam" % (i)
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    BAM = dxpy.upload_local_file("%s.bam" % name)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["BAM"] = dxpy.dxlink(BAM)
    return output
def run_applet_with_flags(self, flag_list, num_files, file_size_bytes):
    with temporary_project('TestDXBashHelpers.test_app1 temporary project') as dxproj:
        env = update_environ(DX_PROJECT_CONTEXT_ID=dxproj.get_id())

        # Upload file
        self.create_file_of_size("A.txt", file_size_bytes)
        remote_file = dxpy.upload_local_file(filename="A.txt",
                                             project=dxproj.get_id(),
                                             folder='/')

        # Build the applet, patching in the bash helpers from the
        # local checkout
        applet_id = build_app_with_bash_helpers(
            os.path.join(TEST_APPS, 'benchmark'), dxproj.get_id())

        # Add several files to the output
        applet_args = []
        applet_args.extend(['-iref=A.txt'] * num_files)
        cmd_args = ['dx', 'run', '--yes', '--watch',
                    '--instance-type=mem1_ssd1_x2', applet_id]
        cmd_args.extend(applet_args)
        cmd_args.extend(flag_list)
        run(cmd_args, env=env)
def geneBody_coverage(BAM_file, BED_file):
    dxpy.download_dxfile(BED_file, "genes.bed")
    dxpy.download_dxfile(BAM_file, "mappings.bam")

    # Split mappings into chunks that can be done on a single worker: all
    # mappings in a chunk are loaded into RAM, so cap each chunk at
    # 10 million SAM lines.
    run_shell(" ".join(["samtools", "view", "mappings.bam", "|",
                        "split", "-l 10000000", "-", "split_map"]))
    run_shell(" ".join(["samtools", "view", "-H", "mappings.bam",
                        ">", "header_only.sam"]))

    files = os.listdir(".")
    jobs = []
    for f in files:
        if f.startswith("split_map"):
            # add header
            run_shell(" ".join(["cat", "header_only.sam", f, ">", "temp.sam"]))
            # convert to BAM
            run_shell(" ".join(["samtools", "view", "-S", "-b",
                                "temp.sam", ">", "temp.bam"]))
            # upload file
            split_bam = dxpy.upload_local_file("temp.bam")
            # run analysis
            jobs.append(dxpy.new_dxjob({"BAM_file": dxpy.dxlink(split_bam.get_id()),
                                        "BED_file": BED_file}, "run_gbc"))

    run_shell("ls -l")

    gbc_agg_input = {"sub_reports": []}
    for j in jobs:
        gbc_agg_input["sub_reports"].append({"job": j.get_id(), "field": "file"})
    agg_job = dxpy.new_dxjob(gbc_agg_input, "gbc_agg").get_id()

    return {"results": {"job": agg_job, "field": "cover"}}
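# `run_shell` is assumed to be a thin wrapper over the shell (needed here
# because the commands use pipes and redirection); a minimal sketch:
def run_shell(command):
    print(command)
    subprocess.check_call(command, shell=True)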
def main(input_file):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    input_file = dxpy.DXFile(input_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(input_file.get_id(), "input_file")

    # Fill in your application code here.
    subprocess.check_call(
        "fastq_quality_trimmer -t 20 -Q 33 -i input_file -o output_file",
        shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    output_file = dxpy.upload_local_file("output_file")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["output_file"] = dxpy.dxlink(output_file)
    return output
def scatter(orig_reads, split_size):
    # Fill in code here to do whatever is necessary to scatter the input.
    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    splitsize = split_size * 1000000 * 4  # each FQ read is 4 lines
    os.mkdir('splits')

    for f in orig_reads:
        reads_filename = dxpy.describe(f)['name']
        reads_basename = strip_extensions(reads_filename, STRIP_EXTENSIONS)
        dxpy.download_dxfile(dxpy.DXFile(f).get_id(), reads_filename)
        reads_root_name = simplify_name() or reads_basename

        logger.info('* RUNNING /bin/zcat %s | /usr/bin/split -l %d -d - %s '
                    % (reads_filename, splitsize, 'splits/' + reads_root_name))
        split_out = subprocess.check_output(
            '/bin/zcat %s | /usr/bin/split -l %d -d - %s '
            % (reads_filename, splitsize, 'splits/' + reads_root_name),
            shell=True)  # can't shlex because of |
        logger.info(split_out)
    splits = os.listdir('splits')
    logger.info("* Return from scatter: %s *" % splits)

    # Should we gzip here?
    return {
        "array_of_scattered_input": [
            dxpy.dxlink(dxpy.upload_local_file('splits/' + split_file))
            for split_file in splits]
    }
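# `strip_extensions` and `STRIP_EXTENSIONS` are defined elsewhere; a minimal
# sketch, assuming STRIP_EXTENSIONS is a tuple of suffixes such as
# ('.fastq.gz', '.fq.gz', '.fastq', '.fq'):
def strip_extensions(filename, extensions):
    for ext in extensions:
        if filename.endswith(ext):
            return filename[:-len(ext)]
    return filename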
def make_indexed_reference(ref_ID):
    run_shell("dx-contigset-to-fasta %s reference.fasta" % ref_ID)
    ref_details = dxpy.DXRecord(ref_ID).get_details()
    ref_name = dxpy.DXRecord(ref_ID).describe()['name']

    # call bowtie2-build
    run_shell("bowtie2-build reference.fasta indexed_ref")
    # package it into an archive for uploading
    run_shell("XZ_OPT=-0 tar -cJf reference.tar.xz indexed_ref*")

    indexed_ref_dxfile = dxpy.upload_local_file("reference.tar.xz",
                                                hidden=True, wait_on_close=True)

    indexed_ref_record = dxpy.new_dxrecord(
        name=ref_name + " (indexed for Bowtie2)",
        types=["BowtieLetterContigSetV2"],
        details={'index_archive': dxpy.dxlink(indexed_ref_dxfile.get_id()),
                 'original_contigset': dxpy.dxlink(ref_ID)})
    indexed_ref_record.close()

    '''
    # TODO: dxpy project workspace convenience functions
    if "projectWorkspace" in job:
        indexed_ref_record.clone(job["projectWorkspace"])
    '''

    return indexed_ref_record.get_id()
def main(inputs, prefix=None):
    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)

    # uses last extension - presumably they are all the same
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    if prefix:
        pooled_filename = prefix + "_pooled%s.gz" % (extension)
    else:
        pooled_filename = \
            '-'.join([splitext(splitext(fn)[0])[0] for fn in input_filenames]) \
            + "_pooled%s.gz" % (extension)
    out, err = common.run_pipe([
        'gzip -dc %s' % (' '.join(input_filenames)),
        'gzip -cn'],
        outfile=pooled_filename)

    pooled = dxpy.upload_local_file(pooled_filename)

    output = {
        "pooled": dxpy.dxlink(pooled)
    }
    return output
def main(contig_set):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    contig_set = dxpy.DXRecord(contig_set)

    # The following line extracts the name from the file object so that
    # outputs can be named intelligently. It is not automatically generated by
    # the app wizard.
    name = contig_set.describe()['name'].replace(".fa", "")

    # Fill in your application code here.
    subprocess.check_call("dx-contigset-to-fasta %s %s.fa" % (contig_set.get_id(), name),
                          shell=True)
    subprocess.check_call("gzip %s.fa" % name, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    fasta_gz = dxpy.upload_local_file("%s.fa.gz" % name)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["fasta_gz"] = dxpy.dxlink(fasta_gz)
    return output
def main(quants_a, quants_b):
    # tool_versions.py --applet $script_name --appver $script_ver
    sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])

    dxfile_a = dxpy.DXFile(quants_a)
    dxfile_b = dxpy.DXFile(quants_b)

    print "* Downloading files..."
    dxpy.download_dxfile(dxfile_a.get_id(), "quants_a")
    dxpy.download_dxfile(dxfile_b.get_id(), "quants_b")

    # Create an appropriate root name for output files.
    out_root = root_name_from_pair(dxfile_a.name.split('.')[0], dxfile_b.name.split('.')[0])
    mad_plot_file = out_root + '_mad_plot.png'

    # The DX/ENCODE-independent script is found in resources/usr/bin.
    print "* Running MAD.R..."
    mad_output = subprocess.check_output(['Rscript', '/usr/bin/MAD.R', 'quants_a', 'quants_b'])
    subprocess.check_call(['mv', "MAplot.png", mad_plot_file])

    print "* Packaging properties..."
    qc_metrics = {}
    qc_metrics["MAD.R"] = json.loads(mad_output)
    meta_string = json.dumps(qc_metrics)
    print json.dumps(qc_metrics, indent=4)
    props = {}
    props["SW"] = sw_versions

    print "* Uploading plot..."
    plot_dxfile = dxpy.upload_local_file(mad_plot_file, properties=props, details=qc_metrics)

    return {"metadata": meta_string, "mad_plot": plot_dxfile}
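# root_name_from_pair is defined elsewhere in the applet. A plausible
# sketch, assuming it derives a shared root from two related file names;
# hypothetical, not the original implementation:
def root_name_from_pair(name_a, name_b):
    root = os.path.commonprefix([name_a, name_b]).rstrip('_-.')
    return root if root else name_a + '_vs_' + name_b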
def main(psmcfa, psmc, outname, xchr, timemax, window):
    # Initialize the data object inputs as dxpy.DXFile instances.
    psmcfa = dxpy.DXFile(psmcfa)
    psmc = dxpy.DXFile(psmc)

    # Download the file inputs to the local file system.
    dxpy.download_dxfile(psmcfa.get_id(), "psmcfa")
    dxpy.download_dxfile(psmc.get_id(), "psmc")

    # Write a recalibrated parameter file, then rerun psmc with it.
    (tmaxNew, parfile) = writeRecalFile('psmc', timemax, window, xchr)
    subprocess.check_call(['psmc', '-t', str(round(tmaxNew, 4)),
                           '-i', parfile, '-o', outname, 'psmcfa'])

    # Upload the file output created on the local file system.
    outfile = dxpy.upload_local_file(outname)

    output = {}
    output["outfile"] = dxpy.dxlink(outfile)
    return output
def produce_qc_report(individual_json_outputs, sample_name, output_project,
                      output_folder, properties=None):
    """Combine the various statistics collected into a single dict for output."""
    # Avoid a mutable default argument: a shared dict would leak mutations
    # across calls.
    properties = dict(properties or {})
    output = {'Sample name': sample_name}
    misc_subfolder = output_folder + '/miscellany'

    # Merge each stat dict into the combined output, keyed by stat name.
    for j in individual_json_outputs:
        for k in j:
            if k in output:
                output[k].update(j[k])
            else:
                output[k] = j[k]

    ofn = sample_name + '_stats.json'
    with open(ofn, 'w') as output_fh:
        output_fh.write(json.dumps(output))

    properties['file_type'] = 'qc_stats'
    output_json_file = dxpy.upload_local_file(filename=ofn,
                                              project=output_project,
                                              properties=properties,
                                              folder=misc_subfolder,
                                              parents=True)

    return {'combined_json_file': dxpy.dxlink(output_json_file)}
def main(fastq, genomeindex_targz):
    fastq_dxfile = dxpy.DXFile(fastq)
    dxpy.download_dxfile(fastq_dxfile.get_id(), "input.fastq")
    genome_dxfile = dxpy.DXFile(genomeindex_targz)
    dxpy.download_dxfile(genome_dxfile.get_id(), "genome.tar.gz")

    os.makedirs("genome")
    tar_cmd = "tar xzvf genome.tar.gz -C genome"
    subprocess.check_call(tar_cmd, shell=True)

    # Locate the BWA index by its .bwt file, then strip the suffix to get
    # the index prefix that bwa expects.
    genome_file = glob.glob("genome/*.bwt")[0]
    genome_file = re.sub(r"\.bwt$", "", genome_file)

    bwa_cmd = (
        "bwa mem -t {nproc} {genome} {fastq} | "
        "samtools view -u -S - | "
        "samtools sort -m 256M -@ {nproc} - output".format(
            nproc=multiprocessing.cpu_count(),
            genome=genome_file,
            fastq="input.fastq"
        )
    )
    subprocess.check_call(bwa_cmd, shell=True)

    bam = dxpy.upload_local_file("output.bam")
    output = {}
    output["bam"] = dxpy.dxlink(bam)
    return output
def calc_mismatch_per_cycle_stats(bam_file, aligner, output_project,
                                  output_folder, properties=None):
    # Avoid a mutable default argument: a shared dict would leak mutations
    # across calls.
    properties = dict(properties or {})
    logger = []
    misc_subfolder = output_folder + '/miscellany'

    bam_file = dxpy.DXFile(bam_file)
    bam_filename = bam_file.describe()['name']
    dxpy.download_dxfile(bam_file.get_id(), bam_filename)
    ofn = os.path.splitext(bam_filename)[0] + '.mm_stats'

    # Make the bundled binary executable, then run it.
    cmd = 'chmod +x /bwa_mismatches'
    run_cmd(cmd, logger)
    cmd = '/bwa_mismatches -o {0} -m {1} {2}'.format(ofn, ALIGNERS[aligner], bam_filename)
    run_cmd(cmd, logger)

    properties['file_type'] = 'mismatch_stats'
    mismatch_per_cycle_stats = dxpy.upload_local_file(filename=ofn,
                                                      project=output_project,
                                                      folder=misc_subfolder,
                                                      properties=properties,
                                                      parents=True)

    return {'mismatch_per_cycle_stats': mismatch_per_cycle_stats,
            'tools_used': logger}
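# A minimal sketch of the run_cmd helper assumed above: run a shell command
# and append it to the list that tracks the tools used. Hypothetical, not
# the original implementation.
def run_cmd(cmd, logger):
    logger.append(cmd)
    subprocess.check_call(cmd, shell=True)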
def main(quants_a, quants_b):
    # tool_versions.py --applet $script_name --appver $script_ver
    sw_versions = subprocess.check_output(['tool_versions.py', '-a', APP_SCRIPT, '-av', APP_VER])

    dxfile_a = dxpy.DXFile(quants_a)
    dxfile_b = dxpy.DXFile(quants_b)

    print "* Downloading files..."
    dxpy.download_dxfile(dxfile_a.get_id(), "quants_a")
    dxpy.download_dxfile(dxfile_b.get_id(), "quants_b")

    print "* Running MAD.R..."
    mad_output = subprocess.check_output(['Rscript', '/usr/bin/MAD.R', 'quants_a', 'quants_b'])

    quants_a_name = dxfile_a.name.split('.')
    quants_b_name = dxfile_b.name.split('.')
    filename = quants_a_name[0] + '_' + quants_b_name[0] + '_' + quants_a_name[1] + '_mad_plot.png'
    subprocess.check_call(['mv', "MAplot.png", filename])

    print "* Packaging properties..."
    qc_metrics = {}
    qc_metrics["MAD.R"] = json.loads(mad_output)
    meta_string = json.dumps(qc_metrics)
    print json.dumps(qc_metrics, indent=4)
    props = {}
    props["SW"] = sw_versions

    print "* Uploading plot..."
    plot_dxfile = dxpy.upload_local_file(filename, properties=props, details=qc_metrics)

    return {"metadata": meta_string, "mad_plot": plot_dxfile}
def create_tools_used_json_file(tools_used, output_project, output_folder, properties=None):
    # Avoid a mutable default argument: a shared dict would leak mutations
    # across calls.
    properties = dict(properties or {})
    misc_subfolder = output_folder + '/miscellany'

    tools_used_dict = {}
    tools_used_dict['name'] = get_app_title()
    tools_used_dict['commands'] = []
    for tools in tools_used:
        tools_used_dict['commands'] += tools

    fn = tools_used_dict['name'] + '_tools_used.json'
    with open(fn, 'w') as fh:
        fh.write(json.dumps(tools_used_dict))

    properties['file_type'] = 'tools_used'
    tools_used_json_file = dxpy.upload_local_file(filename=fn,
                                                  project=output_project,
                                                  folder=misc_subfolder,
                                                  properties=properties,
                                                  parents=True)

    return {'tools_used_json_file': tools_used_json_file}
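# get_app_title is defined elsewhere. A plausible sketch, assuming it reads
# the executable name from the running job's description; hypothetical, not
# the original implementation:
def get_app_title():
    return dxpy.describe(dxpy.JOB_ID).get('executableName', 'unknown_app')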
def read_duplication(BAM_file):
    dxpy.download_dxfile(BAM_file, "mappings.bam")
    run_shell(" ".join(["read_duplication.py", "-i mappings.bam", "-o read_dup"]))
    run_shell(" ".join(["cat", "read_dup.pos.DupRate.xls",
                        "read_dup.seq.DupRate.xls", ">", "read_dup.txt"]))
    results_id = dxpy.upload_local_file("read_dup.txt", wait_on_close=True).get_id()
    return {"results": results_id}
def main(BAMs, params='USE_THREADING=true SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT'):
    for i in range(len(BAMs)):
        fh = dxpy.DXFile(BAMs[i])
        dxpy.download_dxfile(fh.get_id(), "input%d.bam" % (i))

    # Derive the merged output name from the first BAM, minus its .bam suffix.
    name = dxpy.DXFile(BAMs[0]).describe()['name']
    if name.endswith(".bam"):
        name = name[:-len(".bam")]

    command = "java -Xmx4g -jar /opt/jar/MergeSamFiles.jar OUTPUT=%s.bam %s" % (name, params)
    for i in range(len(BAMs)):
        command += " INPUT=input%d.bam" % (i)
    subprocess.check_call(command, shell=True)

    # Upload the merged BAM created on the local file system.
    BAM = dxpy.upload_local_file("%s.bam" % name)

    output = {}
    output["BAM"] = dxpy.dxlink(BAM)
    return output
def main(input_file):
    # Initialize the data object input as a dxpy.DXFile instance.
    input_file = dxpy.DXFile(input_file)

    # Download the file input to the local file system.
    dxpy.download_dxfile(input_file.get_id(), "input_file")

    # Trim low-quality bases (Phred < 20, quality offset 33) from read ends.
    subprocess.check_call("fastq_quality_trimmer -t 20 -Q 33 -i input_file -o output_file",
                          shell=True)

    # Upload the file output created on the local file system.
    output_file = dxpy.upload_local_file("output_file")

    output = {}
    output["output_file"] = dxpy.dxlink(output_file)
    return output
def _format_data_file(self, df: DataFile) -> dict:
    if isinstance(df.localizer, UrlLocalizer):
        ul = cast(UrlLocalizer, df.localizer)
        if ul.url.startswith("dx://"):
            return dxpy.dxlink(*ul.url[5:].split(":"))
    file_name = df.local_path.name
    existing_files = list(dxpy.find_data_objects(
        classname="file",
        state="closed",
        name=file_name,
        project=self._project_id,
        folder=self._folder,
        recurse=False
    ))
    if not existing_files:
        # TODO: batch uploads and use dxpy.sugar.transfers.Uploader for
        #  parallelization
        return dxpy.dxlink(dxpy.upload_local_file(
            str(df.path),
            name=file_name,
            project=self._project_id,
            folder=self._folder,
            parents=True,
            wait_on_close=True
        ))
    elif len(existing_files) == 1:
        return dxpy.dxlink(existing_files[0]["id"], self._project_id)
    else:
        raise RuntimeError(
            f"Multiple files with name {file_name} found in "
            f"{self._project_id}:{self._folder}"
        )
def upload_lane_html(self, raw_properties, tags):
    '''Upload lane.html file to DNAnexus project.

    Args:
        raw_properties (dict): Properties with values of different types.
        tags (list): Tags to apply to the uploaded file.

    Returns:
        dict: DXLink to lane.html file on DNAnexus object store.
    '''
    # Convert all property values to strings
    properties = {key: str(value) for key, value in raw_properties.items()}
    properties['file_type'] = 'lane_html'

    project_folder = '{}/miscellany'.format(self.project_path)
    local_file_path = (
        '{}/Reports/html/'.format(LOCAL_OUTPUT) +
        '{}/all/all/all/lane.html'.format(properties['flowcell_id']))
    remote_file_name = '{}_L{}.lane.html'.format(properties['run_name'],
                                                 properties['lane_index'])
    lane_html_dxid = dxpy.upload_local_file(filename=local_file_path,
                                            name=remote_file_name,
                                            properties=properties,
                                            tags=tags,
                                            project=self.project_dxid,
                                            folder=project_folder,
                                            parents=True)
    return dxpy.dxlink(lane_html_dxid)
def main(inputs):
    # Initialize the data object inputs and download them locally.
    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)

    # Uses the last file's inner extension -- presumably they are all the same.
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    pooled_filename = '-'.join(
        [splitext(splitext(fn)[0])[0] for fn in input_filenames]) + "_pooled%s.gz" % (extension)
    out, err = run_pipe(
        ['gzip -dc %s' % (' '.join(input_filenames)), 'gzip -c'],
        outfile=pooled_filename)
    pooled = dxpy.upload_local_file(pooled_filename)

    output = {}
    output["pooled"] = dxpy.dxlink(pooled)
    return output
def upload_tools_used(self, tools_used_dict, raw_properties):
    '''Write console commands to Tools Used file & upload.

    Args:
        tools_used_dict (dict): Description of executables & configurations.
        raw_properties (dict): Properties with values of different types.

    Returns:
        dict: DXLink to "tools used" file on DNAnexus object store.
    '''
    # Convert all property values to strings
    properties = {key: str(value) for key, value in raw_properties.items()}
    properties['file_type'] = 'tools_used'

    # Write file
    local_file_path = 'bcl2fastq_tools_used.json'
    with open(local_file_path, 'w') as TOOLS:
        TOOLS.write(json.dumps(tools_used_dict))

    # Upload file
    project_folder = '{}/miscellany'.format(self.project_path)
    tools_used_dxid = dxpy.upload_local_file(filename=local_file_path,
                                             properties=properties,
                                             project=self.project_dxid,
                                             folder=project_folder,
                                             parents=True)
    return dxpy.dxlink(tools_used_dxid)
def postprocess(**inputs):
    kwargs = inputs["kwargs"]
    subjob_outputs = inputs["subjob_outputs"]
    print "\nMerging outputs from {n} subjobs".format(n=len(subjob_outputs))

    output_prefix = kwargs["output_prefix"]
    variant_suffixes = kwargs["variant_suffixes"]

    # Concatenate each variant type's per-subjob output into one local file.
    app_output_fn = {}
    for subjob_output in subjob_outputs:
        for variant_type, link in subjob_output.iteritems():
            file_id = link["$dnanexus_link"]
            filename = output_prefix + "_" + variant_suffixes[variant_type]
            print "Downloading " + str(file_id) + " into " + filename
            dxpy.download_dxfile(dxid=file_id, filename=filename, append=True)
            app_output_fn[variant_type] = filename

    postprocess_outputs = {}
    need_to_renumber = ["deletions", "short_inserts", "tandem_duplications",
                        "inversions", "large_inserts"]
    for variant_type, fn in app_output_fn.iteritems():
        out_fn = fn
        if variant_type in need_to_renumber:
            out_fn = RenumberMergedOutput(fn, fn + "_renumbered")
        print "\nUploading {file} as {fn}".format(file=out_fn, fn=fn)
        postprocess_outputs[variant_type] = dxpy.dxlink(dxpy.upload_local_file(out_fn, name=fn))

    if kwargs["export_vcf"]:
        DownloadRefFasta(kwargs["reference_fasta"])
        postprocess_outputs["vcf"] = ExportVCF(kwargs=kwargs,
                                               output_path=output_prefix,
                                               ref_fn="reference_fasta")

    return postprocess_outputs
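# RenumberMergedOutput is defined elsewhere in the applet. A sketch of the
# idea only: per-chunk variant indices restart in every concatenated file,
# so rewrite any leading integer index with one running counter. The real
# format-specific logic is not shown here, and this assumes tab-delimited
# records whose first field is that index.
def RenumberMergedOutput(in_fn, out_fn):
    counter = 0
    with open(in_fn) as src, open(out_fn, 'w') as dst:
        for line in src:
            fields = line.split('\t')
            if fields and fields[0].isdigit():
                fields[0] = str(counter)
                counter += 1
            dst.write('\t'.join(fields))
    return out_fn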
def s3cp(accession, key=None):
    (AUTHID, AUTHPW, SERVER) = common.processkey(key, KEYFILE)
    keypair = (AUTHID, AUTHPW)
    url = SERVER + '/search/?type=file&accession=%s&format=json&frame=embedded&limit=all' % (accession)

    # Get the file object
    response = common.encoded_get(url, keypair)
    logger.debug(response)

    # Select the file
    result = response.get('@graph')
    if not result:
        logger.error('Failed to find %s at %s' % (accession, url))
        return None
    f_obj = result[0]
    logger.debug(f_obj)

    # Make the URL that will get redirected - take it from the file object's href property
    encode_url = urlparse.urljoin(SERVER, f_obj.get('href'))
    logger.debug("URL: %s" % (encode_url))
    logger.debug("%s:%s" % (AUTHID, AUTHPW))

    # stream=True avoids actually downloading the file, but it evaluates the redirection
    r = requests.get(encode_url,
                     auth=(AUTHID, AUTHPW),
                     headers={'content-type': 'application/json'},
                     allow_redirects=True,
                     stream=True)
    try:
        r.raise_for_status()
    except requests.HTTPError:
        logger.error('%s href does not resolve' % (f_obj.get('accession')))
    logger.debug("Response: %s", (r))

    # This is the actual S3 https URL after redirection
    s3_url = r.url
    logger.debug(s3_url)
    # Release the connection
    r.close()

    # Split the URL into components and pull out the filename
    o = urlparse.urlparse(s3_url)
    filename = os.path.basename(o.path)
    # Construct the s3 cp URL (with the s3 scheme instead of https)
    bucket_url = S3_SERVER.rstrip('/') + o.path

    # Copy the file from the bucket
    subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' % (bucket_url)),
                          stderr=subprocess.STDOUT)
    subprocess.check_call(shlex.split('ls -l %s' % (filename)))

    dx_file = dxpy.upload_local_file(filename)
    return dx_file
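# common.processkey comes from the ENCODE 'common' module, which is not
# shown here. A rough sketch under the assumption that KEYFILE is a JSON
# keypairs file mapping key names to {'key', 'secret', 'server'} entries:
def processkey(key, keyfile):
    with open(keyfile) as fh:
        keys = json.load(fh)
    entry = keys[key] if key else keys['default']
    return entry['key'], entry['secret'], entry['server']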