def main(fwd_reads, rev_reads, ref_genome):
    fwd_bam = dxpy.new_dxjob(
        {'reads': fwd_reads, 'ref_genome': ref_genome},
        'run_bwa', name='Map forward reads').get_output_ref('output_bam')
    rev_bam = dxpy.new_dxjob(
        {'reads': rev_reads, 'ref_genome': ref_genome},
        'run_bwa', name='Map reverse reads').get_output_ref('output_bam')
    merge_job = dxpy.new_dxjob(
        {'fwd_bam': fwd_bam, 'rev_bam': rev_bam},
        'combine_bams', name='Combine bams')
    output = {
        'output_bam': merge_job.get_output_ref('output_bam'),
        'output_bai': merge_job.get_output_ref('output_bai')
    }
    return output
def main(DX_APP_WIZARD_INPUT_SIGNATURE):
    DX_APP_WIZARD_INITIALIZE_INPUT
    DX_APP_WIZARD_DOWNLOAD_ANY_FILES

    # Split your work into parallel tasks. As an example, the
    # following generates 10 subjobs running with the same dummy
    # input.
    subjobs = []
    for i in range(10):
        subjob_input = {"input1": True}
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The following line creates the job that will perform the
    # "postprocess" step of your app. We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created. Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job. Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list). We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.
    postprocess_job = dxpy.new_dxjob(fn_input={
        "process_outputs": [subjob.get_output_ref("output") for subjob in subjobs]
        }, fn_name="postprocess", depends_on=subjobs)
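# The template above assumes "process" and "postprocess" entry points that are
# not shown here. A minimal sketch of what they might look like follows; the
# entry-point names and the "output"/"answer" field names mirror the template's
# comments, but the bodies are illustrative assumptions, not the real code.
import dxpy

@dxpy.entry_point("process")
def process(input1):
    # Do the per-subjob work here; the template only passes a dummy flag.
    return {"output": input1}

@dxpy.entry_point("postprocess")
def postprocess(process_outputs):
    # Gather whatever the "process" subjobs produced and reduce it.
    return {"answer": process_outputs}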
def geneBody_coverage(BAM_file, BED_file):
    dxpy.download_dxfile(BED_file, "genes.bed")
    dxpy.download_dxfile(BAM_file, "mappings.bam")

    # split mappings into chunks that can be done on a single worker
    # all mappings are loaded into RAM so can only do 5 million at a time
    run_shell(" ".join(["samtools", "view", "mappings.bam", "|", "split", "-l 10000000", "-", "split_map"]))
    run_shell(" ".join(["samtools", "view", "-H", "mappings.bam", ">", "header_only.sam"]))

    files = os.listdir(".")
    jobs = []
    for f in files:
        if f.startswith("split_map"):
            # add header
            run_shell(" ".join(["cat", "header_only.sam", f, ">", "temp.sam"]))
            # convert to BAM
            run_shell(" ".join(["samtools", "view", "-S", "-b", "temp.sam", ">", "temp.bam"]))
            # upload file
            split_bam = dxpy.upload_local_file("temp.bam")
            # run analysis
            jobs.append(dxpy.new_dxjob({"BAM_file": dxpy.dxlink(split_bam.get_id()),
                                        "BED_file": BED_file}, "run_gbc"))

    run_shell("ls -l")

    gbc_agg_input = {"sub_reports": []}
    for j in jobs:
        gbc_agg_input["sub_reports"].append({"job": j.get_id(), "field": "file"})
    agg_job = dxpy.new_dxjob(gbc_agg_input, "gbc_agg").get_id()

    return {"results": {"job": agg_job, "field": "cover"}}
def main(DX_APP_WIZARD_INPUT_SIGNATURE):
    DX_APP_WIZARD_INITIALIZE_INPUT
    DX_APP_WIZARD_DOWNLOAD_ANY_FILES

    # We first create the "scatter" job which will scatter some input
    # (replace with your own input as necessary).
    input_to_scatter = "placeholder value"
    scatter_job = dxpy.new_dxjob(fn_input={"input_to_scatter": input_to_scatter},
                                 fn_name="scatter")

    # We will want to call "process" on each output of "scatter", so
    # we call the "map" entry point to do so. We can also provide
    # here additional input that we want each "process" entry point to
    # receive, e.g. a file ID to which the "process" function should
    # add rows of data.
    map_input = {
        "array_of_scattered_input": scatter_job.get_output_ref("array_of_scattered_input"),
        "process_input": {"additional_input": "file ID, for example"}
    }
    map_job = dxpy.new_dxjob(fn_input=map_input, fn_name="map")

    # Finally, we want the "postprocess" job to run after "map" is
    # done calling "process" on each of its inputs. Note that a job
    # is marked as "done" only after all of its child jobs are also
    # marked "done".
    postprocess_input = {
        "process_outputs": map_job.get_output_ref("process_outputs"),
        "additional_input": "file ID, for example"
    }
    postprocess_job = dxpy.new_dxjob(fn_input=postprocess_input,
                                     fn_name="postprocess",
                                     depends_on=[map_job])
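# The scatter/map/process/postprocess template above leaves the "scatter" entry
# point to the author. A minimal sketch under that assumption: it only needs to
# return an "array_of_scattered_input" field for a "map" entry point (like the
# map_entry_point functions shown later in this collection) to iterate over.
# The chunking logic here is a placeholder, not the original implementation.
import dxpy

@dxpy.entry_point("scatter")
def scatter(input_to_scatter):
    # Replace with real chunking (e.g. split a file list or an interval list);
    # here the placeholder value is simply wrapped in a one-element array.
    pieces = [input_to_scatter]
    return {"array_of_scattered_input": pieces}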
def main(workers, max_files_per_worker=None, threads_per_worker=8,
         worker_launch_delay_seconds=0, smallest=False):
    mkdirs()

    worker_instance_type = "mem2_hdd2_x4"
    if smallest:
        # debugging - run on default instances
        worker_instance_type = None

    # launch workers, each to process a subset of the files
    subjobs = []
    for i in range(workers):
        subjob_input = {
            "workers": workers,
            "max_files_per_worker": max_files_per_worker,
            "whoami": i,
            "threads_per_worker": threads_per_worker,
            "smallest": smallest
        }
        subjobs.append(dxpy.new_dxjob(subjob_input, "process",
                                      instance_type=worker_instance_type))
        if worker_launch_delay_seconds > 0 and i < (workers-1):
            # delay launching each worker to smooth out the load on the remote
            # server
            time.sleep(worker_launch_delay_seconds)

    # schedule postprocessing to reduce statistics
    output_fields = ["files_skipped", "files_transferred", "bytes_transferred"]
    postprocess_job = dxpy.new_dxjob(fn_input={k: [subjob.get_output_ref(k) for subjob in subjobs]
                                               for k in output_fields},
                                     fn_name="postprocess")
    return {k: postprocess_job.get_output_ref(k) for k in output_fields}
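# The "postprocess" reducer used above is not shown. Given that each worker
# returns the three counters listed in output_fields, a plausible sketch is a
# straight element-wise sum; this is an assumption about the real entry point,
# not a copy of it.
import dxpy

@dxpy.entry_point("postprocess")
def postprocess(files_skipped, files_transferred, bytes_transferred):
    # Each argument arrives as a list with one value per "process" subjob.
    return {
        "files_skipped": sum(files_skipped),
        "files_transferred": sum(files_transferred),
        "bytes_transferred": sum(bytes_transferred),
    }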
def main(DX_APP_WIZARD_INPUT_SIGNATURE):
    DX_APP_WIZARD_INITIALIZE_INPUT
    DX_APP_WIZARD_DOWNLOAD_ANY_FILES

    # Split your input to be solved by the next stage of your app.
    # The following assumes you are splitting the input by giving
    # 100000 rows of a GenomicTable per subjob running the "process"
    # entry point.
    row_chunk_size = 100000
    num_rows = DX_APP_WIZARD_||_INPUT.describe()["length"]
    subjobs = []
    for i in range(num_rows / row_chunk_size + (0 if num_rows % row_chunk_size == 0 else 1)):
        subjob_input = {"gtable_id": DX_APP_WIZARD_||_INPUT.get_id(),
                        "start_row": row_chunk_size * i,
                        "end_row": min(row_chunk_size * (i + 1), num_rows)}
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The following line creates the job that will perform the
    # "postprocess" step of your app. We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created. Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job. Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list). We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.
    postprocess_job = dxpy.new_dxjob(fn_input={"process_outputs": [subjob.get_output_ref("output") for subjob in subjobs]},
                                     fn_name="postprocess",
                                     depends_on=subjobs)

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference. If the output field is
    # called "answer", you can pass that on here as follows:
    #
    # return {"app_output_field": postprocess_job.get_output_ref("answer"), ...}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as GTables) which are closed by a job that
    # finishes later. The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    output = {}
def main(worker_max, f_ids, bandwidth, species_name=None):
    """
    Input variables removed:
    """
    _run_cmd('aws --version', True)

    print('file ids: ' + str(f_ids))

    # Remove any files which are already symlinks
    f_ids = filter(lambda x: not _is_symlink(x), f_ids)

    if species_name is None:
        species_name = _get_species_name()

    # Set upload root to user specified directory or project
    projdx = dxpy.DXProject(os.environ['DX_PROJECT_CONTEXT_ID'])
    dir_file = os.path.join(S3_ROOT_FOLDER, species_name, projdx.name)

    # Trim trailing / in upload dir
    dir_file = dir_file.strip('/')
    print('Upload directory: ' + dir_file)

    # Programmatically split files into equal lists based on size and max workers
    split_list_dxlinks = _split_partition(f_ids, worker_max)

    # Select instance type based on user input
    trans_worker_inst = instance_from_bandwidth(bandwidth)

    # Run subjobs on list
    uploadjobs = [dxpy.new_dxjob(
        fn_input={'target_s3': TARGET_S3,
                  'assigned_files': f_group,
                  'up_dir': dir_file},
        fn_name='s3_upload',
        instance_type=trans_worker_inst) for f_group in split_list_dxlinks]

    # Merge S3 status upload reports from subjobs
    report_fileDXLinks = [subjob.get_output_ref('report_file_link')
                          for subjob in uploadjobs]

    print('Creating S3 upload report')
    report_job = dxpy.new_dxjob(
        fn_input={'filelinks': report_fileDXLinks},
        fn_name='create_upload_report')

    # Output merged report
    print('Output final report')
    finalreportDXLink = report_job.get_output_ref('reportDXLink')
    output = {}
    output['upload_report'] = finalreportDXLink

    return output
def concat_pdfs_link(pdf_refs, name):
    job = dxpy.new_dxjob(fn_name='concat_pdfs',
                         fn_input={'pdfs': pdf_refs, 'name': name})
    return job.get_output_ref('pdf')
def main(bam_set, map_report_set, dme_ix, uncompress_bam=True):
    # tool_versions.py --applet $script_name --appver $script_ver
    props = {}
    if os.path.isfile('/usr/bin/tool_versions.py'):
        sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])
        props["SW"] = sw_versions

    print "* Value of bam_set: '" + str(bam_set) + "'"
    print "* Value of map_report_set: '" + str(map_report_set) + "'"
    print "* Value of dme_ix: '" + str(dme_ix) + "'"
    print "* Value of uncompress_bam: '" + str(uncompress_bam) + "'"

    print "* Calling merge_extract()..."
    inp = {
        'bam_set': bam_set,
        'map_report_set': map_report_set,
        'dme_ix_dxlink': dme_ix,
        'uncompress_bam': uncompress_bam,
        'props': props
    }
    extract_job = dxpy.new_dxjob(inp, "merge_extract")
    print "* Kicked off extract() and waiting..."
    extract_job.wait_on_done()  # Wait because we want the qc_metrics to pass to other jobs.
    extract_out = extract_job.describe()['output']
    target_root = extract_out['target_root']
    qc_metrics = extract_out['qc_metrics']

    print "* Calling post_extraction()..."
    post_extraction_out = post_extraction(extract_out["CpG_context_dxlink"],
                                          extract_out["CHG_context_dxlink"],
                                          extract_out["CHH_context_dxlink"],
                                          dme_ix, target_root, qc_metrics, props)

    print "* Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    print "* Finished."

    return {
        # from extract()
        #"bam_biorep": extract_out['biorep_bam_dxlink'],
        "bam_biorep_qc": extract_out['biorep_bam_qc_dxlink'],
        "map_biorep": extract_out['biorep_map_dxlink'],
        "mbias_report": extract_out["mbias_report_dxlink"],
        # from post_extraction()
        "signal": post_extraction_out["bigWig_dxlink"],
        "CpG_bed": post_extraction_out["CpG_bed_dxlink"],
        "CHG_bed": post_extraction_out["CHG_bed_dxlink"],
        "CHH_bed": post_extraction_out["CHH_bed_dxlink"],
        "CpG_bb": post_extraction_out["CpG_bb_dxlink"],
        "CHG_bb": post_extraction_out["CHG_bb_dxlink"],
        "CHH_bb": post_extraction_out["CHH_bb_dxlink"],
        "metadata": json.dumps(qc_metrics)
    }
def main(**job_inputs):
    # If we weren't provided a mmi index for the reference, generate it.
    if 'genome_mmi' not in job_inputs:
        mmi_input = {'genome_fastagz': job_inputs['genome_fastagz']}
        minimap_index_job = dxpy.new_dxjob(mmi_input, 'run_minimap_index')
        job_inputs['genome_mmi'] = minimap_index_job.get_output_ref('genome_mmi')

    output = {'genome_mmi': job_inputs['genome_mmi']}

    # check if we're dealing with pacbio or ONT reads and what the filetype is
    datatype = job_inputs['datatype']
    one_reads_file = dxpy.DXFile(job_inputs['reads'][0]).describe()['name']
    try:
        file_ext = re.search("(fastq|fasta|fa|fq){1}(.gz)?$",
                             one_reads_file, flags=re.I).group(1).lower()
    except AttributeError:
        raise dxpy.AppError("Invalid filetype extension supplied.")

    # for fasta and fastq inputs, run jobs using native minimap2
    jobs = run_minimap2_subjobs(job_inputs)

    output['bam_files'] = [j.get_output_ref('mapped_reads') for j in jobs]
    output['bai_files'] = [j.get_output_ref('mapped_reads_index') for j in jobs]

    return output
def map_entry_point(array_of_scattered_input, process_input):
    # The following calls "process" for each of the items in
    # *array_of_scattered_input*, using as input the item in the
    # array, as well as the rest of the fields in *process_input*.
    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    logger.debug("** in map entry point with %s *" % process_input)
    process_jobs = []
    for item in array_of_scattered_input:
        logger.debug("** scattering: %s *" % item)
        process_input["scattered_input"] = item
        process_jobs.append(dxpy.new_dxjob(fn_input=process_input, fn_name="process"))
    logger.info("* %s scatter jobs started *" % len(array_of_scattered_input))

    bams = []
    reports = []
    for subjob in process_jobs:
        bams.append(subjob.get_output_ref('bam_file'))
        reports.append(subjob.get_output_ref('report_file'))

    return {
        "bam_files": bams,
        "report_files": reports,
    }
def main(fastq_gz_left_reads, fastq_gz_right_reads, indexed_reference,
         reads_per_chunk=25000000, aln_params="",
         sampe_params="-r '@RG\tID:1\tPL:ILLUMINA\tPU:None\tLB:1\tSM:1'"):

    picard_merge = applet("picard_merge_sam_files")
    if picard_merge == None:
        raise dxpy.AppError("unable to find applet called 'picard_merge_sam_files'. Please copy into your project from the collection of developer applets")

    splitter = applet("fastq_splitter")
    if splitter == None:
        raise dxpy.AppError("unable to find applet called 'fastq_splitter'. Please copy into your project from the collection of developer applets")

    bwa_aligner = applet("bwa_aligner")
    if bwa_aligner == None:
        raise dxpy.AppError("unable to find applet called 'bwa_aligner'. Please copy into your project from the collection of developer applets")

    bwa_controller_input = {"left_reads": [], "right_reads": [],
                            "indexed_reference": indexed_reference,
                            "aln_params": aln_params,
                            "sampe_params": sampe_params,
                            "bwa_aligner": bwa_aligner.get_id()}
    bwa_subjobs = []
    for x, y in zip(fastq_gz_left_reads, fastq_gz_right_reads):
        left_job = splitter.run({"fastqgz": x, "reads_per_chunk": reads_per_chunk})
        right_job = splitter.run({"fastqgz": y, "reads_per_chunk": reads_per_chunk})
        bwa_controller_input["left_reads"].append(left_job.get_id())
        bwa_controller_input["right_reads"].append(right_job.get_id())
        bwa_subjobs.extend([left_job, right_job])

    bwa_controller_job = dxpy.new_dxjob(fn_input=bwa_controller_input,
                                        fn_name='bwa_controller',
                                        depends_on=bwa_subjobs)

    picard_merge_job = picard_merge.run({"BAMs": {"job": bwa_controller_job.get_id(), "field": "BAMs"}})

    print picard_merge_job.get_id()

    output = {"BAM": {"job": picard_merge_job.get_id(), "field": "BAM"}}

    return output
def run_pbmm2_subjobs(job_inputs):
    pbi_filenames = {}
    if job_inputs.get('reads_indices'):
        filenames = _get_filenames(job_inputs['reads_indices'])
        for indx, name in enumerate(filenames):
            pbi_filenames[name] = job_inputs['reads_indices'][indx]
    else:
        pbi_filenames = {}

    # now set up and run pbmm2 subjobs for mapping reads
    # group inputs into filesizes
    jobs = []
    # set default target size to 5GB
    for group in _group_movies(job_inputs['reads'], job_inputs['chunk_size']):
        group_fns = _get_filenames(group)
        group_pbis = [pbi_filenames.get(f + '.pbi') for f in group_fns]
        map_reads_input = {
            'bam_files': group,
            'pbi_files': group_pbis,
            'genome_fastagz': job_inputs['genome_fastagz'],
            'genome_mmi': job_inputs['genome_mmi']
        }
        job = dxpy.new_dxjob(map_reads_input, 'map_reads_pbmm2')
        jobs.append(job)

    return jobs
def map_entry_point(array_of_scattered_input, process_input):
    # The following calls "process" for each of the items in
    # *array_of_scattered_input*, using as input the item in the
    # array, as well as the rest of the fields in *process_input*.
    process_jobs = []
    for item in array_of_scattered_input:
        process_input["scattered_input"] = item
        process_jobs.append(dxpy.new_dxjob(fn_input=process_input, fn_name="process"))
    return {"process_outputs": [subjob.get_output_ref("process_output") for subjob in process_jobs]}
def main(reads1, reference_tar, bwa_aln_params, bwa_version, samtools_version, reads2=None):
    # Main entry-point. Parameter defaults assumed to come from dxapp.json.
    # reads1, reference_tar, reads2 are links to DNAnexus files or None

    # This spawns only one or two subjobs for single- or paired-end,
    # respectively. It could also download the files, chunk the reads,
    # and spawn multiple subjobs.

    # Files are downloaded later by subjobs into their own filesystems
    # and uploaded to the project.

    # Initialize file handlers for input files.
    paired_end = reads2 is not None
    unmapped_reads = [r for r in [reads1, reads2] if r]

    subjobs = []
    for reads in unmapped_reads:
        subjob_input = {"reads_file": reads,
                        "reference_tar": reference_tar,
                        "bwa_aln_params": bwa_aln_params,
                        "bwa_version": bwa_version}
        print "Submitting:"
        print subjob_input
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # Create the job that will perform the "postprocess" step.
    # depends_on=subjobs, so blocks on all subjobs
    postprocess_job = dxpy.new_dxjob(fn_input={
        "indexed_reads": [subjob.get_output_ref("output") for subjob in subjobs],
        "unmapped_reads": unmapped_reads,
        "reference_tar": reference_tar,
        "bwa_version": bwa_version,
        "samtools_version": samtools_version},
        fn_name="postprocess",
        depends_on=subjobs)

    mapped_reads = postprocess_job.get_output_ref("mapped_reads")
    mapping_statistics = postprocess_job.get_output_ref("mapping_statistics")

    output = {
        "mapped_reads": mapped_reads,
        "mapping_statistics": mapping_statistics,
        "paired_end": paired_end
    }
    print "Exiting with output: %s" % (output)
    return output
def main(nSimulations, nWorkers):
    # To achieve a target of nSimulations total simulations,
    # an approximately equal share of simulations is delegated to each
    # worker initiated
    subjobs = []
    workerLoads = splitIntoGroups(nSimulations, nWorkers)
    for load in workerLoads:
        subjob_input = {"workerLoad": load}
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The postprocess job depends on all subjobs being "done"
    postprocess_job = dxpy.new_dxjob(fn_input={
        "process_outputs": [subjob.get_output_ref("output") for subjob in subjobs],
        "total_count": nSimulations
        }, fn_name="postprocess", depends_on=subjobs)

    output = {"estimatedPi": postprocess_job.get_output_ref("answer")}
    return output
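# splitIntoGroups is referenced above but not shown. A minimal sketch of a
# helper with that behaviour (an assumption, not the original implementation):
# it divides nSimulations into nWorkers near-equal integer loads that sum to
# the requested total.
def splitIntoGroups(nSimulations, nWorkers):
    base, remainder = divmod(nSimulations, nWorkers)
    # The first `remainder` workers take one extra simulation each.
    return [base + 1 if i < remainder else base for i in range(nWorkers)]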
def run_minimap2_subjobs(job_inputs):
    # group subjobs by filesize chunks
    files_and_filesizes = zip(job_inputs['reads'], _get_filesizes(job_inputs['reads']))

    jobs = []
    for group in dx_utils.schedule_lpt(files_and_filesizes, job_inputs['chunk_size']):
        map_reads_input = {
            'reads': group,
            'genome_fastagz': job_inputs['genome_fastagz'],
            'genome_mmi': job_inputs['genome_mmi'],
            'datatype': job_inputs['datatype']
        }
        job = dxpy.new_dxjob(map_reads_input, 'map_reads_minimap2')
        jobs.append(job)

    return jobs
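# dx_utils.schedule_lpt does the size balancing above. Purely to illustrate the
# longest-processing-time idea, and assuming the helper partitions (item, size)
# pairs into a fixed number of groups, a stand-in could look like the sketch
# below; the real helper's signature and behaviour may differ.
import heapq

def schedule_lpt_sketch(items_and_sizes, num_groups):
    # Sort largest first, then always drop the next item into the group
    # with the smallest running total.
    groups = [[] for _ in range(num_groups)]
    heap = [(0, i) for i in range(num_groups)]
    heapq.heapify(heap)
    for item, size in sorted(items_and_sizes, key=lambda p: p[1], reverse=True):
        total, idx = heapq.heappop(heap)
        groups[idx].append(item)
        heapq.heappush(heap, (total + size, idx))
    return [g for g in groups if g]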
def main(**job_inputs):
    # If we weren't provided a mmi index for the reference, generate it.
    if 'genome_mmi' not in job_inputs:
        mmi_input = {'genome_fastagz': job_inputs['genome_fastagz']}
        minimap_index_job = dxpy.new_dxjob(mmi_input, 'run_minimap_index')
        job_inputs['genome_mmi'] = minimap_index_job.get_output_ref('genome_mmi')

    output = {'genome_mmi': job_inputs['genome_mmi']}

    # check if we're dealing with pacbio or ONT reads and what the filetype is
    datatype = job_inputs['datatype']
    one_reads_file = dxpy.DXFile(job_inputs['reads'][0]).describe()['name']
    try:
        file_ext = re.search("(bam|fastq|fasta|fa|fq){1}(.gz)?$",
                             one_reads_file, flags=re.I).group(1).lower()
    except AttributeError:
        raise dxpy.AppError("Unknown filetype extension supplied.")

    if file_ext == 'bam':
        # input bam files must be pacbio raw reads
        if datatype == 'ONT':
            raise dxpy.AppError("Invalid file input for provided datatype.")
        # for bam input, run jobs using pbmm2
        jobs = run_pbmm2_subjobs(job_inputs)
    else:
        # for fasta and fastq inputs, run jobs using native minimap2
        if job_inputs['pbbamify']:
            print('WARNING: The "Run pbbamify" option is only valid for BAM input')
        jobs = run_minimap2_subjobs(job_inputs)

    output['bam_files'] = [j.get_output_ref('mapped_reads') for j in jobs]
    output['bai_files'] = [j.get_output_ref('mapped_reads_index') for j in jobs]

    return output
def map_contaminant(Contig, Reads):
    # get ID of our mapper
    try:
        bwa = dxpy.DXApp(dxpy.find_apps(name="bwa").next()['id'])
    except StopIteration:
        raise dxpy.AppError("Unable to find app 'bwa'. Please install it to enable contaminant mapping")

    # TODO: find optimal chunk size so we don't launch too many bwa jobs
    map_job = bwa.run({"reads": Reads,
                       "reference": Contig,
                       "discard_unmapped_rows": True,
                       "chunk_size": 10000000})

    total_reads = 0
    for r in Reads:
        desc = dxpy.DXGTable(r).describe()
        current_reads = desc['length']
        if 'sequence2' in desc['columns']:
            current_reads *= 2
        total_reads += current_reads

    # launch a job that will wait for the mapping and calculate what % has mapped
    calc_job = dxpy.new_dxjob({"num_reads": total_reads,
                               "mappings": {"job": map_job.get_id(), "field": "mappings"}},
                              "calc_contam")

    return calc_job.get_id()
def map_contaminant(Contig, Reads):
    # get ID of our mapper
    try:
        bwa = dxpy.DXApp(dxpy.find_apps(name="bwa_mem_fastq_read_mapper").next()['id'])
    except StopIteration:
        raise dxpy.AppError("Unable to find app 'bwa_mem_fastq_read_mapper'. Please install it to enable contaminant mapping")

    # TODO: find optimal chunk size so we don't launch too many bwa jobs
    map_job = bwa.run({"reads": Reads,
                       "reference": Contig,
                       "discard_unmapped_rows": True,
                       "chunk_size": 10000000})

    total_reads = 0
    for r in Reads:
        desc = dxpy.DXGTable(r).describe()
        current_reads = desc['length']
        if 'sequence2' in desc['columns']:
            current_reads *= 2
        total_reads += current_reads

    # launch a job that will wait for the mapping and calculate what % has mapped
    calc_job = dxpy.new_dxjob({"num_reads": total_reads,
                               "mappings": {"job": map_job.get_id(), "field": "mappings"}},
                              "calc_contam")

    return calc_job.get_id()
def main(exp_acc, files_to_fetch=None, skipvalidate=True, key='www', debug=False):
    # Splits the work into parallel tasks: one for each file to fetch.

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    proj_id = os.environ['DX_PROJECT_CONTEXT_ID']
    project = dxpy.DXProject(proj_id)  ## should be default
    logger.debug("* Project: " + proj_id)

    if files_to_fetch != None:
        logger.debug("* f2f_json: " + files_to_fetch)
        file_objs = json.loads(files_to_fetch.encode('ascii'))  # Expect [ {},{},{},... ]
        logger.debug(file_objs)

    # f_obj = { "accession": ,"dx_folder": ,"dx_file_name": ,"enc_file_name": ,"bucket_url": }
    subjobs = []
    if file_objs:
        for f_obj in file_objs:
            skipvalidate_this = skipvalidate
            dx_file_name = f_obj["dx_file_name"]
            if dx_file_name.endswith(".fastq.gz") or dx_file_name.endswith(".fq.gz"):
                skipvalidate_this = True
            logger.debug(f_obj["bucket_url"] + " " + f_obj["enc_file_name"])
            #process(f_obj["enc_file_name"], f_obj["bucket_url"], project.get_id(), f_obj["dx_folder"], f_obj["accession"], \
            #        f_obj["dx_file_name"], skipvalidate_this)
            subjob_input = {
                "enc_file_name": f_obj["enc_file_name"],
                "bucket_url": f_obj["bucket_url"],
                "proj_id": project.get_id(),
                "dx_folder": f_obj["dx_folder"],
                "file_acc": f_obj["accession"],
                "dx_file_name": f_obj["dx_file_name"],
                "skipvalidate": skipvalidate_this
            }
            subjobs.append(dxpy.new_dxjob(subjob_input, "process"))
            #subjobs.append(dxpy.new_dxjob(subjob_input, "noop"))

    # This does not wait for subjob completion as I thought.
    files_fetched = [subjob.get_output_ref("file") for subjob in subjobs]
    logger.debug("Attempting to fetch %d file(s)" % (len(files_fetched)))

    if skipvalidate:
        output = {
            "fetched_count": len(files_fetched),
            "files": files_fetched
        }
    else:
        output = {
            "fetched_count": len(files_fetched),
            "files": files_fetched,
            "reports": [subjob.get_output_ref("report") for subjob in subjobs],
            "summaries": [subjob.get_output_ref("summary") for subjob in subjobs],
            "zips": [subjob.get_output_ref("zip") for subjob in subjobs],
        }

    return output
def RunWithBamInput(kwargs):
    mappings_ids = kwargs["mappings_files"]
    mappings_names = sorted([dxpy.describe(id)["name"] for id in mappings_ids])
    num_threads = kwargs["num_threads_per_instance"]

    bam_config_fn = "bam_config.txt"
    if "bam_config_file" in kwargs:
        print "\nInput has a BAM config file. Need to download and validate bam config file"
        dxpy.download_dxfile(kwargs["bam_config_file"], bam_config_fn)
        ValidateBamConfig(bam_config_fn=bam_config_fn, bam_name_array=mappings_names)
    else:
        if "insert_size" not in kwargs:
            raise dxpy.AppError("Input files are bam files but neither a bam configuration file, nor an insert size was given as an app input.")
        if kwargs["bam_not_produced_by_bwa"]:
            return RunWithPindelInput(kwargs, sam2pindel=True)
        else:
            bam_config_fn = WriteConfigFile(mappings_names=mappings_names,
                                            fn=bam_config_fn,
                                            insert_size=kwargs["insert_size"])

    need_to_index = True
    if "bam_index_files" in kwargs:
        bam_idx_ids = kwargs["bam_index_files"]
        idx_names = sorted([dxpy.describe(id)["name"] for id in bam_idx_ids])
        if CheckBamIdxMatch(bam_names=mappings_names, idx_names=idx_names):
            need_to_index = False
            mappings_names = DownloadFilesFromArray(mappings_ids)
            bam_idx_names = DownloadFilesFromArray(bam_idx_ids)
    if need_to_index:
        mappings_names = DownloadFilesFromArray(mappings_ids)
        if not kwargs["assume_sorted"]:
            mappings_names = SortBams(bam_names=mappings_names, num_threads=num_threads)
        mappings_names, bam_idx_names = IndexBams(mappings_names)

    chrom = kwargs["chromosome"] if "chromosome" in kwargs else "ALL"
    if "chromosome" in kwargs or kwargs["num_instances"] == 1:
        command, output_path = BuildPindelCommand(kwargs=kwargs, chrom=chrom,
                                                  input_fn=bam_config_fn,
                                                  is_pindel_input_type=False)
        output_path = RunPindel(kwargs=kwargs, pindel_command=command, output_path=output_path)
        app_outputs = UploadPindelOutputs(kwargs=kwargs, output_path=output_path)
        if kwargs["export_vcf"]:
            app_outputs["vcf"] = ExportVCF(kwargs=kwargs, output_path=output_path, ref_fn="reference_fasta")
    else:
        subjob_ids = SplitBamForSubjobs(kwargs, mappings_names, bam_config_fn)
        postprocess_inputs = {"subjob_outputs": [job.get_output_ref("subjob_output") for job in subjob_ids],
                              "kwargs": kwargs}
        postprocess_job = dxpy.new_dxjob(fn_input=postprocess_inputs, fn_name="postprocess")
        app_outputs = {"deletions": {"job": postprocess_job.get_id(), "field": "deletions"},
                       "short_inserts": {"job": postprocess_job.get_id(), "field": "short_inserts"},
                       "tandem_duplications": {"job": postprocess_job.get_id(), "field": "tandem_duplications"},
                       "large_inserts": {"job": postprocess_job.get_id(), "field": "large_inserts"},
                       "inversions": {"job": postprocess_job.get_id(), "field": "inversions"},
                       "breakpoints": {"job": postprocess_job.get_id(), "field": "breakpoints"}
                       }
        if kwargs["report_close_mapped_reads"] or kwargs["report_only_close_mapped_reads"]:
            app_outputs["close_mapped_reads"] = {"job": postprocess_job.get_id(), "field": "close_mapped_reads"}
        if kwargs["export_vcf"]:
            app_outputs["vcf"] = {"job": postprocess_job.get_id(), "field": "vcf"}
        #if "breakdancer_calls_file" in kwargs:
        #    app_outputs["breakdancer_outputs"] = {"job": postprocess_job.get_id(), "field": "breakdancer_outputs"}

    dxlinks = []
    if need_to_index:
        if not kwargs["assume_sorted"]:
            for bam in mappings_names:
                uploaded_bam = dxpy.upload_local_file(bam, name=bam.rstrip('.bam') + "_sorted.bam")
                dxlinks.append(dxpy.dxlink(uploaded_bam))
        for idx in bam_idx_names:
            uploaded_idx = dxpy.upload_local_file(idx, name=idx.rstrip('.bam.bai') + "_sorted.bam.bai")
            dxlinks.append(dxpy.dxlink(uploaded_idx))
        app_outputs["sortedbam_and_index_files"] = dxlinks

    return app_outputs
def main(reads1, crop_length, reference_tar, bwa_version, bwa_aln_params,
         samtools_version, debug, reads2=None):
    # Main entry-point. Parameter defaults assumed to come from dxapp.json.
    # reads1, reference_tar, reads2 are links to DNAnexus files or None

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # This spawns only one or two subjobs for single- or paired-end,
    # respectively. It could also download the files, chunk the reads,
    # and spawn multiple subjobs.

    # Files are downloaded later by subjobs into their own filesystems
    # and uploaded to the project.

    # Initialize file handlers for input files.
    paired_end = reads2 is not None

    if crop_length == 'native':
        crop_subjob = None
        unmapped_reads = [reads1, reads2]
    else:
        crop_subjob_input = {
            "reads1_file": reads1,
            "reads2_file": reads2,
            "crop_length": crop_length,
            "debug": debug
        }
        logger.info("Crop job input: %s" % (crop_subjob_input))
        crop_subjob = dxpy.new_dxjob(crop_subjob_input, "crop")
        unmapped_reads = [crop_subjob.get_output_ref("cropped_reads1")]
        if paired_end:
            unmapped_reads.append(crop_subjob.get_output_ref("cropped_reads2"))
        else:
            unmapped_reads.append(None)

    unmapped_reads = [r for r in unmapped_reads if r]

    mapping_subjobs = []
    for reads in unmapped_reads:
        mapping_subjob_input = {
            "reads_file": reads,
            "reference_tar": reference_tar,
            "bwa_aln_params": bwa_aln_params,
            "bwa_version": bwa_version,
            "debug": debug
        }
        logger.info("Mapping job input: %s" % (mapping_subjob_input))
        if crop_subjob:
            mapping_subjobs.append(dxpy.new_dxjob(fn_input=mapping_subjob_input,
                                                  fn_name="process",
                                                  depends_on=[crop_subjob]))
        else:
            mapping_subjobs.append(dxpy.new_dxjob(fn_input=mapping_subjob_input,
                                                  fn_name="process"))

    # Create the job that will perform the "postprocess" step.
    # depends_on=mapping_subjobs, so blocks on all mapping subjobs
    postprocess_job = dxpy.new_dxjob(fn_input={
        "indexed_reads": [subjob.get_output_ref("suffix_array_index") for subjob in mapping_subjobs],
        "unmapped_reads": unmapped_reads,
        "reference_tar": reference_tar,
        "bwa_version": bwa_version,
        "samtools_version": samtools_version,
        "debug": debug},
        fn_name="postprocess",
        depends_on=mapping_subjobs)

    mapped_reads = postprocess_job.get_output_ref("mapped_reads")
    mapping_statistics = postprocess_job.get_output_ref("mapping_statistics")
    n_mapped_reads = postprocess_job.get_output_ref("n_mapped_reads")

    output = {
        "mapped_reads": mapped_reads,
        "crop_length": crop_length,
        "mapping_statistics": mapping_statistics,
        "paired_end": paired_end,
        "n_mapped_reads": n_mapped_reads
    }
    logger.info("Exiting with output: %s" % (output))
    return output
def main(bam_set, map_report_set, dme_ix, uncompress_bam=True):
    # tool_versions.py --applet $script_name --appver $script_ver
    props = {}
    if os.path.isfile('/usr/bin/tool_versions.py'):
        sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])
        props["SW"] = sw_versions

    print "* Value of bam_set: '" + str(bam_set) + "'"
    print "* Value of map_report_set: '" + str(map_report_set) + "'"
    print "* Value of dme_ix: '" + str(dme_ix) + "'"
    print "* Value of uncompress_bam: '" + str(uncompress_bam) + "'"

    print "* Calling merge_extract_full()..."
    inp = {
        'bam_set': bam_set,
        'map_report_set': map_report_set,
        'dme_ix_dxlink': dme_ix,
        'uncompress_bam': uncompress_bam,
        'props': props
    }
    extract_job = dxpy.new_dxjob(inp, "merge_extract_full")
    print "* Kicked off extract() and waiting..."
    extract_job.wait_on_done()  # Wait because we want the qc_metrics to pass to other jobs.
    extract_out = extract_job.describe()['output']
    target_root = extract_out['target_root']
    qc_metrics = extract_out['qc_metrics']

    print "* Calling bedmethyl()..."
    # What is cheaper? bedmethyl and signal in main or farm one out to a separate process?
    bedmethyl_out = bedmethyl_io(extract_out["cx_report_dxlink"],
                                 extract_out["chrom_sizes_dxlink"],
                                 target_root, qc_metrics, props)
    #inp = {
    #    'cx_report_dxlink': extract_out["cx_report_dxlink"],
    #    'chrom_sizes_dxlink': extract_out["chrom_sizes_dxlink"],
    #    'target_root': target_root,
    #    'qc_metrics': qc_metrics,
    #    'props': props
    #}
    #bedmethyl_job = dxpy.new_dxjob(inp, "bedmethyl_io")
    #print "* Kicked off bedmethyl() but not waiting..."

    print "* Calling signal()..."
    signal_out = signal_io(extract_out["bedgraph_gz_dxlink"],
                           extract_out["chrom_sizes_dxlink"],
                           target_root, qc_metrics, props)

    print "* Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    #bedmethyl_job.wait_on_done()  # Wait because we want the qc_metrics to pass to other jobs.
    #bedmethyl_out = bedmethyl_job.describe()['output']

    print "* Finished."

    return {
        # from extract()
        #"bam_biorep": extract_out['biorep_bam_dxlink'],
        "bam_biorep_qc": extract_out['biorep_bam_qc_dxlink'],
        "map_biorep": extract_out['biorep_map_dxlink'],
        "mbias_report": extract_out["mbias_report_dxlink"],
        # from signal()
        "signal": signal_out["bigWig_dxlink"],
        # from bedmethyl()
        "CpG_bed": bedmethyl_out["CpG_bed_dxlink"],
        "CHG_bed": bedmethyl_out["CHG_bed_dxlink"],
        "CHH_bed": bedmethyl_out["CHH_bed_dxlink"],
        "CpG_bb": bedmethyl_out["CpG_bb_dxlink"],
        "CHG_bb": bedmethyl_out["CHG_bb_dxlink"],
        "CHH_bb": bedmethyl_out["CHH_bb_dxlink"],
        "metadata": json.dumps(qc_metrics)
    }
def main(accession, key=None, debug=False, skipvalidate=False):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    #files = [dxpy.DXFile(item) for item in files]

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    #for i, f in enumerate(files):
    #    dxpy.download_dxfile(f.get_id(), "files-" + str(i))

    # Split your work into parallel tasks. As an example, the
    # following generates 10 subjobs running with the same dummy
    # input.
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    (AUTHID, AUTHPW, SERVER) = processkey(key)
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (accession)  # get the experiment object
    logger.debug("%s - %s" % (url, AUTHID))
    response = encoded_get(url, AUTHID, AUTHPW)
    logger.debug(response)
    exp = response.json()
    reps = exp.get('replicates')
    # for some reason cannot write exp json to STDERR/logger
    logger.debug(reps or "No replicates")

    ''' Derive replicate structure and make directories '''
    project = dxpy.DXProject(os.environ['DX_PROJECT_CONTEXT_ID'])  ## should be default
    exp_folder = "%s/%s" % (ROOT_FOLDER, accession)
    #rf = find_or_create_folder(project, ROOT_FOLDER)
    #project.new_fo
    #f = find_or_create_folder(project, exp_folder, root_folder='/'+ROOT_FOLDER)
    for rep in exp['replicates']:
        rep_folder = "%s/rep%s_%s" % (exp_folder,
                                      rep['biological_replicate_number'],
                                      rep['technical_replicate_number'])
        project.new_folder(rep_folder, parents=True)

    subjobs = []
    files = exp.get('files')
    if reps and files:
        for ff in files:
            if ff['file_format'] == 'fastq':
                folder = "%s/rep%s_%s" % (exp_folder,
                                          ff['replicate']['biological_replicate_number'],
                                          ff['replicate']['technical_replicate_number'])
                file_name, bucket_url = get_bucket(SERVER, AUTHID, AUTHPW, ff)
                subjob_input = {
                    "filename": file_name,
                    "bucket_url": bucket_url,
                    "project": project.get_id(),
                    "folder": folder,
                    "skipvalidate": skipvalidate
                }
                subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The following line creates the job that will perform the
    # "postprocess" step of your app. We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created. Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job. Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list). We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference. If the output field in
    # the postprocess function is called "answer", you can pass that
    # on here as follows:
    #
    #return { "FastQC_reports": [ dxpy.dxlink(item) for item in postprocess_job.get_output_ref("report") ]}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as gtables) which will be closed by a job that
    # finishes later. The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    if skipvalidate:
        output = {
            "files": [subjob.get_output_ref("file") for subjob in subjobs]
        }
    else:
        output = {
            "files": [subjob.get_output_ref("file") for subjob in subjobs],
            "reports": [subjob.get_output_ref("report") for subjob in subjobs],
            "summaries": [subjob.get_output_ref("summary") for subjob in subjobs],
            "zips": [subjob.get_output_ref("zip") for subjob in subjobs],
        }

    return output
def main(**job_inputs):
    output = {}
    reportInput = {}

    run_shell("dx-spans-to-bed --output genes.bed " + job_inputs["gene_model"]["$dnanexus_link"])
    bed_id = dxpy.upload_local_file("genes.bed").get_id()

    mappings_id = job_inputs["mappings"]["$dnanexus_link"]

    # get contaminant mapping started if we're doing it:
    if "contaminants" in job_inputs:
        if not "original_reads" in job_inputs:
            raise dxpy.AppError("Original Reads must be input to calculate contamination levels. Please also supply the reads object that corresponds to these RNA-Seq mappings")

        name_input = []
        contam_input = []
        # spawn mappings job for each ContigSet
        for contaminant in job_inputs['contaminants']:
            calc_job = map_contaminant(Reads=job_inputs['original_reads'], Contig=contaminant)
            name_input.append(dxpy.DXRecord(contaminant).describe()['name'])
            contam_input.append({"job": calc_job, "field": "percent_mapped"})

        reportInput['contam'] = contam_input
        reportInput['names'] = name_input
    else:
        reportInput['contam'] = None
        reportInput['names'] = None

    # output mappings as SAM for analysis modules
    run_shell(" ".join(["dx-mappings-to-sam", "--discard_unmapped", "--output mappings.sam", mappings_id]))
    run_shell(" ".join(["samtools", "view", "-S", "-b", "mappings.sam", ">", "mappings.bam"]))
    bam_id = dxpy.upload_local_file("mappings.bam", wait_on_close=True).get_id()

    job1 = dxpy.new_dxjob({'BED_file': bed_id, "BAM_file": dxpy.dxlink(bam_id)}, "geneBody_coverage")

    # if paired then do inner distance calculation
    if "chr2" in dxpy.DXGTable(mappings_id).get_col_names():
        job2 = dxpy.new_dxjob({'BED_file': bed_id, "BAM_file": dxpy.dxlink(bam_id)}, "inner_distance")
    else:
        job2 = None

    job3 = dxpy.new_dxjob({'BED_file': bed_id, "BAM_file": dxpy.dxlink(bam_id)}, "junction_annotation")

    job4 = dxpy.new_dxjob({"BAM_file": dxpy.dxlink(bam_id)}, "read_duplication")

    # implement this one when we can request a large RAM instance - requires 19GB for human genome
    job5 = dxpy.new_dxjob({'BED_file': bed_id, "BAM_file": dxpy.dxlink(bam_id)}, "read_distribution")
    # {"systemRequirements": {"instanceType":"dx_m2.2xlarge"}} )

    reportInput['geneBody'] = {"job": job1.get_id(), "field": "results"}
    if job2 != None:
        reportInput['inner_dist'] = {"job": job2.get_id(), "field": "results"}
    else:
        reportInput['inner_dist'] = None
    reportInput['junc_ann'] = {"job": job3.get_id(), "field": "results"}
    reportInput['read_dup'] = {"job": job4.get_id(), "field": "results"}
    reportInput['read_dist'] = {"job": job5.get_id(), "field": "results"}
    reportInput['mappings'] = job_inputs["mappings"]

    reportJob = dxpy.new_dxjob(reportInput, "generate_report")

    output['report'] = {"job": reportJob.get_id(), "field": "Report"}

    return output
def main(reads, dme_ix, ncpus, splitsize):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    #dx_reads = [dxpy.DXFile(item) for item in reads]

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    # We first create the "scatter" job which will scatter some input
    # (replace with your own input as necessary).
    logger.info("* Start Scatter with %d files %sM read splits *" % (len(reads), splitsize))
    scatter_job = dxpy.new_dxjob(fn_input={
        'orig_reads': reads,
        'split_size': splitsize,
        }, fn_name="scatter")

    # We will want to call "process" on each output of "scatter", so
    # we call the "map" entry point to do so. We can also provide
    # here additional input that we want each "process" entry point to
    # receive, e.g. a GTable ID to which the "process" function should
    # add rows of data.
    reads_root = simplify_name() or strip_extensions(dxpy.describe(reads[0])['name'], STRIP_EXTENSIONS)
    map_input = {
        "array_of_scattered_input": scatter_job.get_output_ref("array_of_scattered_input"),
        "process_input": {"reads_root": reads_root, "ncpus": ncpus, "dme_ix": dme_ix}
    }
    logger.info("* Start Map with: %s *" % map_input)
    map_job = dxpy.new_dxjob(fn_input=map_input, fn_name="map")

    # Finally, we want the "postprocess" job to run after "map" is
    # done calling "process" on each of its inputs. Note that a job
    # is marked as "done" only after all of its child jobs are also
    # marked "done".
    logger.info("* Waiting for map job to finish...")
    postprocess_input = {
        "bam_files": map_job.get_output_ref("bam_files"),
        "report_files": map_job.get_output_ref("report_files"),
        "bam_root": reads_root + '_techrep'
    }
    logger.info("* Start Post process with: %s *" % postprocess_input)
    postprocess_job = dxpy.new_dxjob(fn_input=postprocess_input,
                                     fn_name="postprocess",
                                     depends_on=[map_job])

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.
    #
    # return { "app_output_field": postprocess_job.get_output_ref("final_output"), ...}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as gtables) which will be closed by a job that
    # finishes later. The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    output = {}
    output["bam_techrep"] = dxpy.dxlink(postprocess_job.get_output_ref("bam_techrep"))
    output["bam_techrep_qc"] = dxpy.dxlink(postprocess_job.get_output_ref("bam_techrep_qc"))
    output["map_techrep"] = dxpy.dxlink(postprocess_job.get_output_ref("map_techrep"))
    output["reads"] = postprocess_job.get_output_ref("reads")
    output["metadata"] = postprocess_job.get_output_ref("metadata")

    return output
def SplitBamForSubjobs(kwargs, bam_names, bam_config_fn=None):
    num_threads = kwargs["num_threads_per_instance"]
    print "\nSplitting bam for subjobs"

    # Assuming that all bam files have the same chromosomes (is this safe?)
    subprocess.check_output("samtools view -H {input_bam} > header.txt".format(input_bam=bam_names[0]), shell=True)
    with open('header.txt') as fh:
        header = [line.rstrip('\n') for line in fh]

    print "Input header: "
    for line in header:
        print line

    print "Save unmapped reads as bam files to merge into subjob files"
    unmapped = {}
    for bam in bam_names:
        fn = bam.rstrip('.bam') + '_unmapped'
        command = "samtools view -@ {n} -u -b -f 4 {bam} > {unmapped}".format(n=num_threads, bam=bam, unmapped=fn)
        print command
        subprocess.check_call(command, shell=True)
        unmapped[bam] = fn

    groups = SplitGenomeFromSam(header, kwargs["num_instances"])
    subjobs = []
    subjob_no = 0
    for group in groups:
        group = " ".join(group)
        subjob_bam_fn = []
        for bam in bam_names:
            start_time = time.time()
            print "\nMerging {bam} with unmapped reads for pindel subjobs".format(bam=bam)
            out_fn = bam.rstrip('.bam') + '_' + str(subjob_no) + '.bam'
            command = "samtools view -@ {n} -bh {bam} {group} > tmp.bam".format(n=num_threads, bam=bam, group=group)
            subprocess.check_call(command, shell=True)
            split_command = "samtools merge -@ {n} {out} {unmapped} tmp.bam ".format(n=num_threads, out=out_fn, unmapped=unmapped[bam])
            print split_command
            subprocess.check_call(split_command, shell=True)
            print "Samtools view and merge ran in: {min} minutes".format(min=float((time.time() - start_time) / 60))
            subjob_bam_fn.append(out_fn)

        subjob_kwargs = kwargs.copy()
        subjob_bam_fn, subjob_bam_idx_fn = IndexBams(bam_names=subjob_bam_fn)

        print "Uploading split bam files: " + str(subjob_bam_fn)
        subjob_bam_ids = [dxpy.dxlink(dxpy.upload_local_file(bam)) for bam in subjob_bam_fn]
        print "Uploading split bam index files: " + str(subjob_bam_idx_fn)
        subjob_bam_idx_ids = [dxpy.dxlink(dxpy.upload_local_file(idx)) for idx in subjob_bam_idx_fn]

        subjob_kwargs["mappings_files"] = subjob_bam_ids
        subjob_kwargs["bam_index_files"] = subjob_bam_idx_ids

        print "Updating bam config file for subjob"
        if bam_config_fn:
            new_config_fn = "subjob_config_" + str(subjob_no) + '.txt'
            with open(bam_config_fn, 'r') as config_fh, open(new_config_fn, 'w') as write_fh:
                for line in config_fh:
                    line = line.split('\t')
                    bam_name = line[0]
                    out_fn = bam_name.rstrip('.bam') + '_' + str(subjob_no) + '.bam'
                    write_fh.write(out_fn + '\t' + "\t".join(line[1:]) + '\n')
            print "Uploading new config file: " + str(new_config_fn)
            subjob_kwargs["bam_config_file"] = dxpy.dxlink(dxpy.upload_local_file(new_config_fn))

        job = dxpy.new_dxjob(subjob_kwargs, "process")
        print "Started subjob #{n}: {job_id}".format(n=subjob_no, job_id=job.get_id())
        subjobs.append(job)
        subjob_no += 1

    return subjobs
def main(files):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    #files = [dxpy.DXFile(item) for item in files]

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    #for i, f in enumerate(files):
    #    dxpy.download_dxfile(f.get_id(), "files-" + str(i))

    # Split your work into parallel tasks. As an example, the
    # following generates 10 subjobs running with the same dummy
    # input.
    subjobs = []
    for fastq in files:
        subjob_input = {"fastq": fastq}
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The following line creates the job that will perform the
    # "postprocess" step of your app. We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created. Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job. Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list). We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.
    '''
    postprocess_job = dxpy.new_dxjob(fn_input={
        "report": [subjob.get_output_ref("report") for subjob in subjobs],
        "summary": [subjob.get_output_ref("summary") for subjob in subjobs],
        "zips": [subjob.get_output_ref("zips") for subjob in subjobs],
        }, fn_name="postprocess", depends_on=subjobs)
    '''

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference. If the output field in
    # the postprocess function is called "answer", you can pass that
    # on here as follows:
    #
    #return { "FastQC_reports": [ dxpy.dxlink(item) for item in postprocess_job.get_output_ref("report") ]}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as gtables) which will be closed by a job that
    # finishes later. The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    output = {
        "reports": [subjob.get_output_ref("report") for subjob in subjobs],
        "summaries": [subjob.get_output_ref("summary") for subjob in subjobs],
        "zips": [subjob.get_output_ref("zip") for subjob in subjobs],
    }

    '''
    for job in postprocess_job.get_output_ref("reports"):
        item = dxpy.dxlink(job)
        output['FastQC_reports'].append(item['report'])
        output['FastQC_zip'].append(item['zip'])
        output['FastQC_summary'].append(item['summary'])
    '''

    # output["FastQC_reports"] = [ dxpy.dxlink(item) for item in FastQC_reports]
    # output["FastQC_reports"] = FastQC_reports
    # output["FastQC_zip"] = FastQC_zip
    # output["FastQC_summary"] = FastQC_summary

    return output
def main(fastq_gz_left_reads, fastq_gz_right_reads, indexed_reference, reads_per_chunk=25000000, aln_params="", sampe_params="-r '@RG\tID:1\tPL:ILLUMINA\tPU:None\tLB:1\tSM:1'"): picard_merge = applet("picard_merge_sam_files") if picard_merge == None: raise dxpy.AppError( "unable to find applet called 'picard_merge_sam_files'. Please copy into your project from the collection of developer applets" ) splitter = applet("fastq_splitter") if splitter == None: raise dxpy.AppError( "unable to find applet called 'fastq_splitter'. Please copy into your project from the collection of developer applets" ) bwa_aligner = applet("bwa_aligner") if bwa_aligner == None: raise dxpy.AppError( "unable to find applet called 'bwa_aligner'. Please copy into your project from the collection of developer applets" ) bwa_controller_input = { "left_reads": [], "right_reads": [], "indexed_reference": indexed_reference, "aln_params": aln_params, "sampe_params": sampe_params, "bwa_aligner": bwa_aligner.get_id() } bwa_subjobs = [] for x, y in zip(fastq_gz_left_reads, fastq_gz_right_reads): left_job = splitter.run({ "fastqgz": x, "reads_per_chunk": reads_per_chunk }) right_job = splitter.run({ "fastqgz": y, "reads_per_chunk": reads_per_chunk }) bwa_controller_input["left_reads"].append(left_job.get_id()) bwa_controller_input["right_reads"].append(right_job.get_id()) bwa_subjobs.extend([left_job, right_job]) bwa_controller_job = dxpy.new_dxjob(fn_input=bwa_controller_input, fn_name='bwa_controller', depends_on=bwa_subjobs) picard_merge_job = picard_merge.run( {"BAMs": { "job": bwa_controller_job.get_id(), "field": "BAMs" }}) print picard_merge_job.get_id() output = {"BAM": {"job": picard_merge_job.get_id(), "field": "BAM"}} return output
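# --- Illustrative sketch (not part of the original applet) ---
# The controller above calls a helper named applet() that is not shown. A
# plausible implementation, assuming the developer applets live in the current
# project and are found by name (zero_ok=True returns None when missing, so
# the caller can raise its own dxpy.AppError):
import dxpy

def applet(applet_name):
    return dxpy.find_one_data_object(classname="applet",
                                     name=applet_name,
                                     project=dxpy.WORKSPACE_ID,
                                     zero_ok=True,
                                     more_ok=False,
                                     return_handler=True)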
def main(bam_set, map_report_set, dme_ix, uncompress_bam=True): # tool_versions.py --applet $script_name --appver $script_ver props = {} if os.path.isfile('/usr/bin/tool_versions.py'): sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json']) props["SW"] = sw_versions print "* Value of bam_set: '" + str(bam_set) + "'" print "* Value of map_report_set: '" + str(map_report_set) + "'" print "* Value of dme_ix: '" + str(dme_ix) + "'" print "* Value of uncompress_bam: '" + str(uncompress_bam) + "'" print "* Calling merge_extract()..." inp = { 'bam_set': bam_set, 'map_report_set': map_report_set, 'dme_ix_dxlink': dme_ix, 'uncompress_bam': uncompress_bam, 'props': props } extract_job = dxpy.new_dxjob(inp, "merge_extract") print "* Kicked off merge_extract() and waiting..." extract_job.wait_on_done() # Wait because we want the qc_metrics to pass to other jobs. extract_out = extract_job.describe()['output'] target_root = extract_out['target_root'] qc_metrics = extract_out['qc_metrics'] print "* Calling coverage()..." inp = { 'CpG_context_dxlink': extract_out["CpG_context_dxlink"], 'CHG_context_dxlink': extract_out["CHG_context_dxlink"], 'CHH_context_dxlink': extract_out["CHH_context_dxlink"], 'dme_ix_dxlink': dme_ix, 'target_root': target_root, #'qc_metrics': extract_job.get_output_ref("qc_metrics"), #'props': props } coverage_job = dxpy.new_dxjob(inp, "coverage") print "* Kicked off coverage() and waiting..." print "* Calling bedmethyl()..." inp = { 'cx_report_dxlink': coverage_job.get_output_ref("cx_report_dxlink"), #'cx_report_dxlink': extract_job.get_output_ref("cx_report_dxlink"), 'chrom_sizes_dxlink': extract_out["chrom_sizes_dxlink"], 'target_root': target_root, 'qc_metrics': extract_out["qc_metrics"], 'props': props, } bedmethyl_job = dxpy.new_dxjob(inp, "bedmethyl_io") print "* Kicked off bedmethyl() but not waiting..." coverage_job.wait_on_done() # Already finished by this point coverage_out = coverage_job.describe()['output'] print "* Calling signal()..." # No need for a separate instance unless storage is limited or an instance cheaper than mem3_hdd2_x8 can be used! signal_out = signal_io(coverage_out["bedgraph_gz_dxlink"],extract_out["chrom_sizes_dxlink"],target_root,qc_metrics,props) print "* Check storage..." run_cmd('ls -l') run_cmd('df -k .') print "* Finished." return { # from extract() #"bam_biorep": extract_out['biorep_bam_dxlink'], "bam_biorep_qc": extract_out['biorep_bam_qc_dxlink'], "map_biorep": extract_out['biorep_map_dxlink'], "mbias_report": extract_out["mbias_report_dxlink"], # from signal() "signal": signal_out["bigWig_dxlink"], # from bedmethyl() "CpG_bed": bedmethyl_job.get_output_ref("CpG_bed_dxlink"), "CHG_bed": bedmethyl_job.get_output_ref("CHG_bed_dxlink"), "CHH_bed": bedmethyl_job.get_output_ref("CHH_bed_dxlink"), "CpG_bb": bedmethyl_job.get_output_ref("CpG_bb_dxlink"), "CHG_bb": bedmethyl_job.get_output_ref("CHG_bb_dxlink"), "CHH_bb": bedmethyl_job.get_output_ref("CHH_bb_dxlink"), "metadata": json.dumps(qc_metrics) }
def main(reads, dme_ix, ncpus, splitsize): # The following line(s) initialize your data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. #dx_reads = [dxpy.DXFile(item) for item in reads] # The following line(s) download your file inputs to the local file system # using variable names for the filenames. # We first create the "scatter" job which will scatter some input # (replace with your own input as necessary). logger.info("* Start Scatter with %d files %sM read splits *" % (len(reads), splitsize)) scatter_job = dxpy.new_dxjob(fn_input={ 'orig_reads': reads, 'split_size': splitsize, }, fn_name="scatter") # We will want to call "process" on each output of "scatter", so # we call the "map" entry point to do so. We can also provide # here additional input that we want each "process" entry point to # receive, e.g. a GTable ID to which the "process" function should # add rows of data. reads_root = simplify_name() or strip_extensions(dxpy.describe(reads[0])['name'], STRIP_EXTENSIONS) map_input = { "array_of_scattered_input": scatter_job.get_output_ref("array_of_scattered_input"), "process_input": { "reads_root": reads_root, "ncpus": ncpus, "dme_ix": dme_ix } } logger.info("* Start Map with: %s *" % map_input) map_job = dxpy.new_dxjob(fn_input=map_input, fn_name="map") # Finally, we want the "postprocess" job to run after "map" is # done calling "process" on each of its inputs. Note that a job # is marked as "done" only after all of its child jobs are also # marked "done". logger.info("* Waiting for map job to finish...") postprocess_input = { "bam_files": map_job.get_output_ref("bam_files"), "report_files": map_job.get_output_ref("report_files"), "bam_root": reads_root + '_techrep' } logger.info("* Start Post process with: %s *" % postprocess_input) postprocess_job = dxpy.new_dxjob(fn_input=postprocess_input, fn_name="postprocess", depends_on=[map_job]) # If you would like to include any of the output fields from the # postprocess_job as the output of your app, you should return it # here using a job-based object reference. # # return { "app_output_field": postprocess_job.get_output_ref("final_output"), ...} # # Tip: you can include in your output at this point any open # objects (such as gtables) which will be closed by a job that # finishes later. The system will check to make sure that the # output object is closed and will attempt to clone it out as # output into the parent container only after all subjobs have # finished. output = {} output["bam_techrep"] = dxpy.dxlink(postprocess_job.get_output_ref("bam_techrep")) output["bam_techrep_qc"] = dxpy.dxlink(postprocess_job.get_output_ref("bam_techrep_qc")) output["map_techrep"] = dxpy.dxlink(postprocess_job.get_output_ref("map_techrep")) output["reads"] = postprocess_job.get_output_ref("reads") output["metadata"] = postprocess_job.get_output_ref("metadata") return output
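# --- Illustrative sketch (not part of the original applet) ---
# The "scatter" entry point used above is defined elsewhere. A hypothetical
# sketch of the idea: split each gzipped FASTQ into chunks of split_size
# million reads (4 lines per read) with zcat/split, re-compress, and return the
# uploaded chunks as "array_of_scattered_input". Chunk naming and the pairing
# of read files are simplified here.
import glob
import subprocess
import dxpy

@dxpy.entry_point("scatter")
def scatter(orig_reads, split_size):
    lines_per_chunk = int(split_size) * 1000000 * 4
    for i, reads in enumerate(orig_reads):
        local_fn = "reads_%d.fq.gz" % i
        dxpy.download_dxfile(reads, local_fn)
        subprocess.check_call("zcat %s | split -l %d - chunk_%d_"
                              % (local_fn, lines_per_chunk, i), shell=True)
    scattered = []
    for chunk_fn in sorted(glob.glob("chunk_*")):
        subprocess.check_call(["gzip", chunk_fn])
        scattered.append(dxpy.dxlink(dxpy.upload_local_file(chunk_fn + ".gz")))
    return {"array_of_scattered_input": scattered}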
def main(case_bams=None, normal_bams=None, snv_vcfs=None, cn_reference=None, baits=None, fasta=None, annotation=None, seq_method='hybrid', segment_method='cbs', haploid_x_reference=False, drop_low_coverage=False, exclude_access=None, antitarget_avg_size=None, target_avg_size=None, purity=None, ploidy=None, do_parallel=True): cnvkit("version") # Validate inputs # (from cnvlib.commands._cmd_batch) if cn_reference: bad_flags = [flag for is_used, flag in ( (normal_bams is not None, 'normal_bams'), (fasta, 'fasta'), (baits, 'baits'), (annotation, 'annotation'), (exclude_access, 'exclude_access'), (target_avg_size, 'target_avg_size'), (antitarget_avg_size, 'antitarget_avg_size'), ) if is_used] if bad_flags: raise dxpy.AppError( "If 'cn_reference' is given, options to construct a new " "reference (%s) should not be used:" % ", ".join(bad_flags)) else: if not fasta: raise dxpy.AppError( "Input 'fasta' must be given with the reference genome " "sequence if an existing copy number reference profile " "('cn_reference') is not given.") if seq_method in ('hybrid', 'amplicon') and not baits: raise dxpy.AppError( "For the '%r' sequencing method, input 'baits' (at least) " "must be given with the captured genomic regions if an " "existing copy number reference profile ('cn_reference') " "is not given." % baits) if case_bams: purities = validate_per_tumor(purity, len(case_bams), "purity values", lambda p: 0 < p <= 1) ploidies = validate_per_tumor(ploidy, len(case_bams), "ploidy values", lambda p: p > 0) snv_vcfs = validate_per_tumor(snv_vcfs, len(case_bams), "VCF files") else: purities = ploidies = None # If reference is not given, create one if not cn_reference: print("** About to call 'make_region_beds'") # DBG targets, antitargets = make_region_beds( normal_bams, seq_method, fasta, baits, annotation, exclude_access, antitarget_avg_size, target_avg_size) print("** Finished calling 'make_region_beds'") # DBG normal_cvgs = [] if normal_bams: # 'coverage' of each normal bam in a subjob for nbam in normal_bams: print("** About to launch 'run_coverage'") # DBG job_cvg = dxpy.new_dxjob(fn_name='run_coverage', fn_input={ 'bam': nbam, 'targets': targets, 'antitargets': antitargets, 'do_parallel': do_parallel, }) normal_cvgs.append(job_cvg.get_output_ref('coverages')) print("** Got output ref from 'run_coverage'") # DBG print("** About to launch 'run_reference'") # DBG job_ref = dxpy.new_dxjob(fn_name='run_reference', fn_input={'coverages': normal_cvgs, 'fasta': fasta, 'targets': targets, 'antitargets': (antitargets if seq_method == 'hybrid' else None), 'haploid_x_reference': haploid_x_reference, }) cn_reference = job_ref.get_output_ref('cn_reference') print("** Got output ref from 'run_reference'") # DBG output = {'cn_reference': cn_reference, 'copy_ratios': [], 'copy_segments': [], 'call_segments': [], 'genemetrics': [], 'cnv_beds': [], 'cnv_vcfs': [], 'scatters_png': [], } # Process each test/case/tumor individually using the given/built reference if case_bams: print("** About to process", len(case_bams), "'case_bams'") # DBG for sample_bam, vcf, purity, ploidy in \ zip(case_bams, snv_vcfs, purities, ploidies): print("** About to launch 'run_sample'") # DBG job_sample = dxpy.new_dxjob(fn_name='run_sample', fn_input={ 'sample_bam': sample_bam, 'vcf': vcf, 'purity': purity, 'ploidy': ploidy, 'cn_reference': cn_reference, 'seq_method': seq_method, 'segment_method': segment_method, 'drop_low_coverage': drop_low_coverage, 'haploid_x_reference': haploid_x_reference, 'do_parallel': do_parallel, }) for field in 
('copy_ratios', 'copy_segments', 'call_segments', 'genemetrics'): output[field].append(job_sample.get_output_ref(field)) output['scatters_png'].append(job_sample.get_output_ref('scatter')) output['cnv_beds'].append(job_sample.get_output_ref('bed')) output['cnv_vcfs'].append(job_sample.get_output_ref('vcf')) print("** Got outputs from 'run_sample'") # DBG # Consolidate multi-sample outputs print("** About to launch 'aggregate_outputs'") # DBG job_agg = dxpy.new_dxjob(fn_name='aggregate_outputs', fn_input={'copy_ratios': output['copy_ratios'], 'copy_segments': output['copy_segments'], 'haploid_x_reference': haploid_x_reference}) for field in ('seg', 'heatmap_pdf', 'metrics', 'sexes'): output[field] = job_agg.get_output_ref(field) print("** Got outputs from 'aggregate_outputs'") # DBG print("** All done! Returning output:") from pprint import pprint pprint(output) return output
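# --- Illustrative sketch (not part of the original applet) ---
# validate_per_tumor() is called above but not defined in this excerpt. A
# hedged guess at its behaviour: broadcast a missing value to one entry per
# case BAM, check the list length, and apply an optional per-value predicate.
import dxpy

def validate_per_tumor(values, n_cases, title, criterion=None):
    if values is None:
        return [None] * n_cases
    if len(values) != n_cases:
        raise dxpy.AppError("Number of %s (%d) does not match the number of "
                            "case_bams (%d)" % (title, len(values), n_cases))
    if criterion is not None:
        bad = [v for v in values if v is not None and not criterion(v)]
        if bad:
            raise dxpy.AppError("Invalid %s: %r" % (title, bad))
    return list(values)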
def main(fastq_files, sample_name, output_project, output_folder, properties={}, aligner=None, genome_fasta_file=None, fastq_files2=None, bam_file=None): """Run the various QC programs and output the report files that they produce.""" output = {} json_outputs = [] tools_used = [] # Run fastqc fastqc_jobs = [] fastqc_input = { "fastq_files": fastq_files, "properties": properties, "output_project": output_project, "output_folder": output_folder } if not fastq_files2: fastqc_input["output_name"] = sample_name + "_fastqc.zip" else: fastqc_input["output_name"] = sample_name + "_fastqc_left.zip" fastqc_jobs.append(dxpy.new_dxjob(fastqc_input, "run_fastqc")) if fastq_files2: fastqc_input2 = { "fastq_files": fastq_files2, "output_name": sample_name + "_fastqc_right.zip", "properties": properties, "output_project": output_project, "output_folder": output_folder } fastqc_jobs.append(dxpy.new_dxjob(fastqc_input2, "run_fastqc")) output["fastqc_reports"] = [ job.get_output_ref("fastqc_report") for job in fastqc_jobs ] tools_used += [job.get_output_ref("tools_used") for job in fastqc_jobs] # These tools require a bam file. if (bam_file is not None) and (genome_fasta_file is not None): # Run CollectAlignmentSummaryMetrics casm_input = { "bam_file": bam_file, "genome_fasta_file": genome_fasta_file, "sample_name": sample_name, "properties": properties, "output_project": output_project, "output_folder": output_folder } casm_job = dxpy.new_dxjob(casm_input, "collect_alignment_summary_metrics") output["alignment_summary_metrics"] = casm_job.get_output_ref( "alignment_summary_metrics") json_outputs += [ casm_job.get_output_ref("json_alignment_summary_metrics") ] tools_used += [casm_job.get_output_ref("tools_used")] if (bam_file is not None) and (aligner is not None): # Run Collect Uniqueness Metrics uniqueness_input = { "bam_file": bam_file, "aligner": aligner #"output_project": output_project, #"output_folder": output_folder } uniqueness_job = dxpy.new_dxjob(uniqueness_input, "collect_uniqueness_metrics") json_outputs += [ uniqueness_job.get_output_ref("json_uniqueness_metrics") ] tools_used += [uniqueness_job.get_output_ref("tools_used")] # Run Calc Mismatch Per Cycle Stats mismatch_per_cycle_input = { "bam_file": bam_file, "aligner": aligner, "output_project": output_project, "output_folder": output_folder } mismatch_per_cycle_job = dxpy.new_dxjob( mismatch_per_cycle_input, 'calc_mismatch_per_cycle_stats') output['mismatch_metrics'] = mismatch_per_cycle_job.get_output_ref( 'mismatch_per_cycle_stats') tools_used += [mismatch_per_cycle_job.get_output_ref('tools_used')] # If paired-end reads, run CollectInsertSizeMetrics if (bam_file is not None) and (fastq_files2 is not None) and (genome_fasta_file is not None): cism_input = { "bam_file": bam_file, "genome_fasta_file": genome_fasta_file, "sample_name": sample_name, "properties": properties, "output_project": output_project, "output_folder": output_folder } cism_job = dxpy.new_dxjob(cism_input, "collect_insert_size_metrics") output["insert_size_metrics"] = cism_job.get_output_ref( "insert_size_metrics") json_outputs += [cism_job.get_output_ref("json_insert_size_metrics")] tools_used += [cism_job.get_output_ref("tools_used")] produce_qc_report_input = { "individual_json_outputs": json_outputs, "sample_name": sample_name, "output_project": output_project, "output_folder": output_folder } produce_qc_report_job = dxpy.new_dxjob(produce_qc_report_input, "produce_qc_report") output['json_output_file'] = produce_qc_report_job.get_output_ref( "combined_json_file") 
tools_used_input = { "tools_used": tools_used, "output_project": output_project, "output_folder": output_folder } tools_used_job = dxpy.new_dxjob(tools_used_input, "create_tools_used_json_file") output['tools_used'] = tools_used_job.get_output_ref( 'tools_used_json_file') print 'QC sample output: %s' % output return output
def main(DX_APP_WIZARD_INPUT_SIGNATURE): DX_APP_WIZARD_INITIALIZE_INPUTDX_APP_WIZARD_DOWNLOAD_ANY_FILES # First, create the output GTable that will contain your results. # NOTE: You must specify the columns and indices for a GTable when # you create it, and they are immutable thereafter. # # Note: If you are filtering a GTable or are otherwise happy with # using the same exact columns and indices as your input GTable, # you can easily initialize your new GTable as follows: # # DX_APP_WIZARD_||_OUTPUT = dxpy.new_dxgtable(init_from=DX_APP_WIZARD_||_INPUT) # # In the more general case, you may want to specify different # columns. The following lines assume you would like to create a # GTable with a genomic range index, i.e. there is a string column # for chromosome names and two integer columns for low and high # coordinates. columns = [dxpy.DXGTable.make_column_desc("chr", "string"), dxpy.DXGTable.make_column_desc("lo", "int"), dxpy.DXGTable.make_column_desc("hi", "int"), dxpy.DXGTable.make_column_desc("somedata", "string")] DX_APP_WIZARD_||_OUTPUT = dxpy.new_dxgtable(columns=columns, indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")]) # Split your input to be solved by the next stage of your app. # The following assumes you are splitting the input by giving # 100000 rows of a GenomicTable per subjob running the # "process" entry point. num_rows = DX_APP_WIZARD_||_INPUT.describe()["length"] subjobs = [] for i in range(num_rows / row_chunk_size + (0 if num_rows % row_chunk_size == 0 else 1)): subjob_input = { "input_gtable_id": DX_APP_WIZARD_||_INPUT.get_id(), "start_row": row_chunk_size * i, "end_row": min(row_chunk_size * (i + 1), num_rows), "output_gtable_id": DX_APP_WIZARD_||_OUTPUT.get_id()} subjobs.append(dxpy.new_dxjob(subjob_input, "process")) # The next line creates the job that will perform the # "postprocess" step of your app. It assumes that you do not need # to aggregate any output from your "process" stages (other than # closing the output GTable), but you can add the output of those # stages to the input of your "postprocess" stage easily by adding # the following value as a field in the "fn_input" dict and adding # the parameter to your "postprocess" entry point. # # fn_input={"process_outputs": [subjob.get_output_ref("output") for subjob in subjobs], ...} # # With no other input other than the output GTable ID for the # "postprocess" stage, we will force it to run only after all the # "process" stages have finished running by providing the list of # their DXJob handlers to the "depends_on" field (it accepts # either dxpy handlers or string IDs in the list). postprocess_job = dxpy.new_dxjob(fn_input={ "output_gtable_id": DX_APP_WIZARD_||_OUTPUT.get_id() }, fn_name="postprocess", depends_on=subjobs) # If you would like to include any of the output fields from the # postprocess_job as the output of your app, you should return it # here using a job-based object reference. If the output field is # called "answer", you can pass that on here as follows: # # return {"app_output_field": postprocess_job.get_output_ref("answer"), ...} # # Tip: you can include in your output at this point any open # objects (such as GTables) which are closed by a job that # finishes later. The system will check to make sure that the # output object is closed and will attempt to clone it out as # output into the parent container only after all subjobs have # finished. output = {}
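# --- Illustrative sketch (not part of the original template) ---
# The wizard template above assumes a "process" entry point that reads a row
# range from the input GTable and appends rows to the shared output GTable
# (and that row_chunk_size is defined elsewhere). GTables are a legacy data
# type; this is only a sketch of the pattern the template's comments describe,
# assuming rows come back in the requested column order.
import dxpy

@dxpy.entry_point("process")
def process(input_gtable_id, start_row, end_row, output_gtable_id):
    in_gtable = dxpy.open_dxgtable(input_gtable_id)
    out_gtable = dxpy.open_dxgtable(output_gtable_id)
    columns = ["chr", "lo", "hi", "somedata"]
    for row in in_gtable.iterate_rows(start=start_row, end=end_row,
                                      columns=columns):
        # row holds the values of the requested columns for one record
        out_gtable.add_row(list(row))
    out_gtable.flush()
    return {"output": None}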
def main(mappings_bam, region_size, index_file=None): """The 'scatter' subjob of the applet The main function will perform logic to distribute our job across multiple workers (instances) Returns: output (dict): Contains key "count_file" with value DXLink to job output file. """ print('Creating workspace directory to store downloaded files') os.mkdir(u'workspace') os.chdir(u'workspace') mappings_bam_h = dxpy.DXFile(mappings_bam) filename = mappings_bam_h.name dxpy.download_dxfile(mappings_bam_h.get_id(), filename) # # SECTION: Scatter # ------------------------------------------------------ # Split regions into list of <region size> list # # Create index file if not provided by user. # In order to index bam file needs to be sorted already. # Sort BAM if necessary. # Upload dx file to pass to distributed jobs # regions = parseSAM_header_for_region(filename) split_regions = [regions[i:i + region_size] for i in range(0, len(regions), region_size)] if not index_file: mappings_bam, index_file = create_index_file(filename, mappings_bam) # # SECTION: Processing # ----------------------------------------------------------------------- # Run subjob for each distributed region. # # Note: inputs for subjobs are sent as a dictionary with key value pairs: # key: "region_list" value: [ [], [], ... ](region sections) # key: "mappings_bam" value: sorted bam # key: "index_file" value: bam bai index file # The dictionary keys must match the input of the subjob # # Collect outputs for downstream gather job using dxjob.get_output_ref() # # Note: Programmatically it's possible to intelligently split workload and # create optimized instance types. dxpy.new_dxjob takes the optional # parameter: instance_type # print('creating subjobs') subjobs = [dxpy.new_dxjob( fn_input={"region_list": split, "mappings_bam": mappings_bam, "index_file": index_file}, fn_name="samtoolscount_bam") for split in split_regions] fileDXLinks = [subjob.get_output_ref("readcount_fileDX") for subjob in subjobs] # # SECTION: Gather (Post-processing) # ------------------------------------------------------------------------- # Pass DNAnexus object references to post processing job to combine outputs # # Create dictionary to be returned as output for the job # Dictionary must contain keys matching outputs set in dxapp.json # print('combining outputs') postprocess_job = dxpy.new_dxjob( fn_input={"countDXlinks": fileDXLinks, "resultfn": filename}, fn_name="combine_files") countDXLink = postprocess_job.get_output_ref("countDXLink") output = {} output["count_file"] = countDXLink return output
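# --- Illustrative sketch (not part of the original applet) ---
# The distributed "samtoolscount_bam" entry point is defined elsewhere in the
# applet. A minimal hypothetical version: download the sorted BAM plus index,
# run `samtools view -c` for each region, and upload a per-region count table
# as the "readcount_fileDX" output expected by the controller above.
import subprocess
import dxpy

@dxpy.entry_point("samtoolscount_bam")
def samtoolscount_bam(region_list, mappings_bam, index_file):
    bam_h = dxpy.DXFile(mappings_bam)
    bam_fn = bam_h.name
    dxpy.download_dxfile(bam_h.get_id(), bam_fn)
    dxpy.download_dxfile(index_file, bam_fn + ".bai")

    counts_fn = "readcounts.txt"
    with open(counts_fn, "w") as fh:
        for region in region_list:
            count = subprocess.check_output(
                ["samtools", "view", "-c", bam_fn, region]).decode().strip()
            fh.write("%s\t%s\n" % (region, count))

    return {"readcount_fileDX": dxpy.dxlink(dxpy.upload_local_file(counts_fn))}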
def main(reads1, crop_length, reference_tar, bwa_version, bwa_aln_params, samtools_version, debug, reads2=None): # Main entry-point. Parameter defaults assumed to come from dxapp.json. # reads1, reference_tar, reads2 are links to DNAnexus files or None if debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) # This spawns only one or two subjobs for single- or paired-end, # respectively. It could also download the files, chunk the reads, # and spawn multiple subjobs. # Files are downloaded later by subjobs into their own filesystems # and uploaded to the project. # Initialize file handlers for input files. paired_end = reads2 is not None if crop_length == 'native': crop_subjob = None unmapped_reads = [reads1, reads2] else: crop_subjob_input = { "reads1_file": reads1, "reads2_file": reads2, "crop_length": crop_length, "debug": debug } logger.info("Crop job input: %s" % (crop_subjob_input)) crop_subjob = dxpy.new_dxjob(crop_subjob_input, "crop") unmapped_reads = [crop_subjob.get_output_ref("cropped_reads1")] if paired_end: unmapped_reads.append(crop_subjob.get_output_ref("cropped_reads2")) else: unmapped_reads.append(None) unmapped_reads = [r for r in unmapped_reads if r] mapping_subjobs = [] for reads in unmapped_reads: mapping_subjob_input = { "reads_file": reads, "reference_tar": reference_tar, "bwa_aln_params": bwa_aln_params, "bwa_version": bwa_version, "debug": debug } logger.info("Mapping job input: %s" % (mapping_subjob_input)) if crop_subjob: mapping_subjobs.append(dxpy.new_dxjob( fn_input=mapping_subjob_input, fn_name="process", depends_on=[crop_subjob])) else: mapping_subjobs.append(dxpy.new_dxjob( fn_input=mapping_subjob_input, fn_name="process")) # Create the job that will perform the "postprocess" step. # depends_on=mapping_subjobs, so blocks on all mapping subjobs postprocess_job = dxpy.new_dxjob( fn_input={ "indexed_reads": [ subjob.get_output_ref("suffix_array_index") for subjob in mapping_subjobs], "unmapped_reads": unmapped_reads, "reference_tar": reference_tar, "bwa_version": bwa_version, "samtools_version": samtools_version, "debug": debug}, fn_name="postprocess", depends_on=mapping_subjobs) mapped_reads = postprocess_job.get_output_ref("mapped_reads") mapping_statistics = postprocess_job.get_output_ref("mapping_statistics") n_mapped_reads = postprocess_job.get_output_ref("n_mapped_reads") output = { "mapped_reads": mapped_reads, "crop_length": crop_length, "mapping_statistics": mapping_statistics, "paired_end": paired_end, "n_mapped_reads": n_mapped_reads } logger.info("Exiting with output: %s" % (output)) return output
def main(record_link, worker_id, worker_project, fastqs, output_folder, mark_duplicates=False): output = {"bams": [], "bais": [], "tools_used": []} lane = FlowcellLane(record_link=record_link, fastqs=fastqs) fastq_files = [dxpy.DXFile(item) for item in fastqs] sample_dict = group_files_by_barcode(fastq_files) for barcode in sample_dict: print 'Processing sample: %s' % barcode read_dict = group_files_by_read(sample_dict[barcode]) fastq_files2 = None if "1" in read_dict and "2" in read_dict: # Sample is paired; there should be no files without a 'read' # property of "1" or "2" fastq_files = [dxpy.dxlink(item) for item in read_dict["1"]] fastq_files2 = [dxpy.dxlink(item) for item in read_dict["2"]] else: fastq_files = [dxpy.dxlink(item) for item in read_dict["1"]] print("fastq_files: {}".format(fastq_files)) print("fastq_files2: {}".format(fastq_files2)) mapped_files_properties = { 'barcode': barcode, 'run_date': lane.run_date, 'library_id': lane.library_id, 'lane_id': lane.lane_id, 'mapper': lane.mapper, 'mapping_reference': lane.mapping_reference, 'library_name': lane.library_name } print 'Initiating map sample job' sample_name = 'SCGPM_%s_%s_L%d_%s' % ( lane.library_name, lane.flowcell_id, lane.lane_index, barcode) map_sample_job = dxpy.new_dxjob(fn_input={ "project_id": lane.project_id, "output_folder": output_folder, "fastq_files": fastq_files, "fastq_files2": fastq_files2, "genome_fasta_file": lane.reference_genome_dxid, "genome_index_file": lane.reference_index_dxid, "mapper": lane.mapper, "sample_name": sample_name, "mark_duplicates": mark_duplicates, "applet_id": worker_id, "applet_project": worker_project, "properties": mapped_files_properties }, fn_name="run_map_sample") output["bams"].append({"job": map_sample_job.get_id(), "field": "bam"}) output["bais"].append({"job": map_sample_job.get_id(), "field": "bai"}) output["tools_used"].append({ "job": map_sample_job.get_id(), "field": "tools_used" }) return output
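# --- Illustrative sketch (not part of the original applet) ---
# group_files_by_barcode() and group_files_by_read() are helpers assumed by the
# controller above. A hedged sketch, assuming each FASTQ carries "barcode" and
# "read" properties set upstream (e.g. by the demultiplexing step):
from collections import defaultdict
import dxpy

def group_files_by_barcode(fastq_dxfiles):
    groups = defaultdict(list)
    for dxfile in fastq_dxfiles:
        groups[dxfile.get_properties().get("barcode", "unknown")].append(dxfile)
    return dict(groups)

def group_files_by_read(fastq_dxfiles):
    groups = defaultdict(list)
    for dxfile in fastq_dxfiles:
        groups[dxfile.get_properties().get("read", "1")].append(dxfile)
    return dict(groups)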
def main(rep1_peaks, rep2_peaks, pooled_peaks): # Initialize the data object inputs on the platform into # dxpy.DXDataObject instances. rep1_peaks_file = dxpy.DXFile(rep1_peaks) rep2_peaks_file = dxpy.DXFile(rep2_peaks) rep1_peaks_filename = rep1_peaks_file.name rep2_peaks_filename = rep2_peaks_file.name # Download the file inputs to the local file system. dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_filename) dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_filename) # Find the pooler and pseudoreplicator applets # (assumed to be in the same project as this applet) pool_applet = dxpy.find_one_data_object( classname='applet', name='pool', zero_ok=False, more_ok=False, return_handler=True) pseudoreplicator_applet = dxpy.find_one_data_object( classname='applet', name='pseudoreplicator', zero_ok=False, more_ok=False, return_handler=True) # Dispatch parallel tasks. subjobs = [] # True replicates # Pooled replicates pool_replicates_subjob = pool_applet.run({ "input1": rep1_peaks, "input2": rep2_peaks }) subjobs.append(pool_replicates_subjob) # The following line creates the job that will perform the # "postprocess" step of your app. We've given it an input field # that is a list of job-based object references created from the # "process" jobs we just created. Assuming those jobs have an # output field called "output", these values will be passed to the # "postprocess" job. Because these values are not ready until the # "process" jobs finish, the "postprocess" job WILL NOT RUN until # all job-based object references have been resolved (i.e. the # jobs they reference have finished running). # # If you do not plan to have the "process" jobs create output that # the "postprocess" job will require, then you can explicitly list # the dependencies to wait for those jobs to finish by setting the # "depends_on" field to the list of subjobs to wait for (it # accepts either dxpy handlers or string IDs in the list). We've # included this parameter in the line below as well for # completeness, though it is unnecessary if you are providing # job-based object references in the input that refer to the same # set of jobs. postprocess_job = dxpy.new_dxjob(fn_input={ "process_outputs": [subjob.get_output_ref("pooled") for subjob in subjobs] }, fn_name="postprocess", depends_on=subjobs) pooled_replicates = postprocess_job.get_output_ref("pooled") # The following line(s) use the Python bindings to upload your file outputs # after you have created them on the local file system. It assumes that you # have used the output field name for the filename for each output, but you # can change that behavior to suit your needs. 
subprocess.check_call('touch EM_fit_output',shell=True) subprocess.check_call('touch empirical_curves_output',shell=True) subprocess.check_call('touch EM_parameters_log',shell=True) subprocess.check_call('touch npeaks_pass',shell=True) subprocess.check_call('touch overlapped_peaks',shell=True) subprocess.check_call('touch IDR_output',shell=True) #subprocess.check_call('touch IDR_peaks',shell=True) EM_fit_output = dxpy.upload_local_file("EM_fit_output") empirical_curves_output = dxpy.upload_local_file("empirical_curves_output") EM_parameters_log = dxpy.upload_local_file("EM_parameters_log") npeaks_pass = dxpy.upload_local_file("npeaks_pass") overlapped_peaks = dxpy.upload_local_file("overlapped_peaks") IDR_output = dxpy.upload_local_file("IDR_output") #IDR_peaks = dxpy.upload_local_file("IDR_peaks") # If you would like to include any of the output fields from the # postprocess_job as the output of your app, you should return it # here using a job-based object reference. If the output field in # the postprocess function is called "answer", you can pass that # on here as follows: # # return { "app_output_field": postprocess_job.get_output_ref("answer"), ...} # # Tip: you can include in your output at this point any open # objects (such as gtables) which will be closed by a job that # finishes later. The system will check to make sure that the # output object is closed and will attempt to clone it out as # output into the parent container only after all subjobs have # finished. output = {} output["EM_fit_output"] = dxpy.dxlink(EM_fit_output) output["empirical_curves_output"] = dxpy.dxlink(empirical_curves_output) output["EM_parameters_log"] = dxpy.dxlink(EM_parameters_log) output["npeaks_pass"] = dxpy.dxlink(npeaks_pass) output["overlapped_peaks"] = dxpy.dxlink(overlapped_peaks) output["IDR_output"] = dxpy.dxlink(IDR_output) output["IDR_peaks"] = pooled_replicates logging.info("Exiting with output: %s", output) return output
def main(mappings_bam, region_size, index_file=None): """The 'scatter' subjob of the applet The main function will perform logic to distribute our job across multiple workers (instances) Returns: output (dict): Contains key "count_file" with value DXLink to job output file. """ print 'Creating workspace directory to store downloaded files' os.mkdir(u'workspace') os.chdir(u'workspace') mappings_bam_h = dxpy.DXFile(mappings_bam) filename = mappings_bam_h.name dxpy.download_dxfile(mappings_bam_h.get_id(), filename) # # SECTION: Scatter # ------------------------------------------------------ # Split regions into list of <region size> list # # Create index file if not provided by user. # In order to index bam file needs to be sorted already. # Sort BAM if necessary. # Upload dx file to pass to distributed jobs # regions = parseSAM_header_for_region(filename) split_regions = [regions[i:i + region_size] for i in xrange(0, len(regions), region_size)] if not index_file: mappings_bam, index_file = create_index_file(filename, mappings_bam) # # SECTION: Processing # ----------------------------------------------------------------------- # Run subjob for each distributed region. # # Note: inputs for subjobs are sent as a dictionary with key value pairs: # key: "region_list" value: [ [], [], ... ](region sections) # key: "mappings_bam" value: sorted bam # key: "index_file" value: bam bai index file # The dictionary keys must match the input of the subjob # # Collect outputs for downstream gather job using dxjob.get_output_ref() # # Note: Programmatically it's possible to intelligently split workload and # create optimized instance types. dxpy.new_dxjob takes the optional # parameter: instance_type # print 'creating subjobs' subjobs = [dxpy.new_dxjob( fn_input={"region_list": split, "mappings_bam": mappings_bam, "index_file": index_file}, fn_name="samtoolscount_bam") for split in split_regions] fileDXLinks = [subjob.get_output_ref("readcount_fileDX") for subjob in subjobs] # # SECTION: Gather (Post-processing) # ------------------------------------------------------------------------- # Pass DNAnexus object references to post processing job to combine outputs # # Create dictionary to be returned as output for the job # Dictionary must contain keys matching outputs set in dxapp.json # print 'combining outputs' postprocess_job = dxpy.new_dxjob( fn_input={"countDXlinks": fileDXLinks, "resultfn": filename}, fn_name="combine_files") countDXLink = postprocess_job.get_output_ref("countDXLink") output = {} output["count_file"] = countDXLink return output
def main(bam_files, sampleId, padding, reference, loglevel, number_of_nodes, downsample, downsample_fraction, regions_file=None, indel_vcf=None, dbsnp=None, advanced_rtc_options=None, advanced_ir_options=None, advanced_br_options=None, advanced_pr_options=None): """This is a dx applet that runs on the DNAnexus platform. This will run GATK3 best practices pipeline using scatter gather. This is very useful for processing WGS datasets. This function is the controller of the pipeline, which will scatter data, process it and then gather it for final processing. :param: `bam_files`: :param: `sampleId`: :param: `padding`: :param: `reference`: :param: `loglevel`: :param: `number_of_nodes` :param: `downsample`: :param: `downsample_fraction`: :param: `regions_file`: :param: `indel_vcf`: :param: `dbsnp`: :param: `advanced_rtc_options`: :param: `advanced_ir_options`: :param: `advanced_br_options`: :param: `advanced_pr_options`: """ logger.setLevel(loglevel) logger.info("GATK3 scatter gather controller. Number of nodes for scatter jobs: {0}".format(number_of_nodes)) # Balance jobs based on the file sizes of file from input file_sizes = {} file_objects = {} for bam_file in bam_files: file_size = int(dxpy.DXFile(bam_file).describe()["size"]) file_name = dxpy.DXFile(bam_file).describe()["name"] file_sizes[file_name] = file_size file_objects[file_name] = bam_file balanced_jobs_object = dx_scatter.distribute_files_by_size( file_sizes=file_sizes, dx_file_objects=file_objects, number_of_nodes=number_of_nodes) # GATK in/del realignment phase gatk_rtc_ir_jobs = [] for job_name, file_objects in balanced_jobs_object.items(): logger.info("Create GATK3 Realignment Node") gatk_rtc_ir_jobs.append( dxpy.new_dxjob( fn_input={ "bam_files": file_objects, "reference": reference, "regions_file": regions_file, "padding": padding, "indel_vcf": indel_vcf, "sampleId": sampleId, "advanced_rtc_options": advanced_rtc_options, "advanced_ir_options": advanced_ir_options, "downsample": downsample, "downsample_fraction": downsample_fraction, "loglevel": loglevel }, fn_name="gatk_realignment" ) ) # GATK3 BaseRecalibrator phase # This will gather the input from all the GATK3 Realignment nodes logger.info("Gather all GATK3 Realignment Output") kwargs = { "output_downsample_bams": [job.get_output_ref("output_downsample_bams") for job in gatk_rtc_ir_jobs], "output_realigned_bams": [job.get_output_ref("output_realigned_bams") for job in gatk_rtc_ir_jobs] } gather_gatk_rtc_ir_jobs = dxpy.new_dxjob( fn_input=kwargs, fn_name="gather", depends_on=gatk_rtc_ir_jobs ) # This will send all the realigned BAM files to the BaseRecalibrator node logger.info("Create GATK3 BaseRecalibrator Node") gatk_br_job = dxpy.new_dxjob( fn_input={ "bam_files": gather_gatk_rtc_ir_jobs.get_output_ref("output_downsample_bams") if downsample else gather_gatk_rtc_ir_jobs.get_output_ref("output_realigned_bams"), "reference": reference, "regions_file": regions_file, "padding": padding, "indel_vcf": indel_vcf, "dbsnp": dbsnp, "advanced_br_options": advanced_br_options, "loglevel": loglevel }, fn_name="gatk_base_recalibrator", depends_on=[gather_gatk_rtc_ir_jobs] ) # GATK Apply BQSR gatk_apply_bqsr_jobs = [] for gatk_rtc_ir_job in gatk_rtc_ir_jobs: logger.info("Create GATK3 Apply BQSR Node") gatk_apply_bqsr_jobs.append( dxpy.new_dxjob( fn_input={ "bam_files": gatk_rtc_ir_job.get_output_ref("output_realigned_bams"), "BR_output": gatk_br_job.get_output_ref("output_bqsr"), "reference": reference, "regions_file": regions_file, "padding": padding, "dbsnp": dbsnp, "sampleId": 
sampleId, "advanced_pr_options": advanced_pr_options, "loglevel": loglevel }, fn_name="gatk_apply_bqsr", depends_on = gatk_rtc_ir_jobs + [gatk_br_job] ) ) # Gather all Apply BQSR output and finish the pipeline logger.info("Gather all GATK Apply BQSR calling job outputs") kwargs = { "output_recalibrated_bam": [job.get_output_ref("output_recalibrated_bam") for job in gatk_apply_bqsr_jobs], "output_recalibrated_cram": [job.get_output_ref("output_recalibrated_cram") for job in gatk_apply_bqsr_jobs] } gather_gatk_apply_bqsr_jobs = dxpy.new_dxjob( fn_input=kwargs, fn_name="gather", depends_on=gatk_apply_bqsr_jobs ) output = {} output["output_recalibrated_bam"] = gather_gatk_apply_bqsr_jobs.get_output_ref("output_recalibrated_bam") output["output_recalibrated_cram"] = gather_gatk_apply_bqsr_jobs.get_output_ref("output_recalibrated_cram") return output
def main(files): # The following line(s) initialize your data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. #files = [dxpy.DXFile(item) for item in files] # The following line(s) download your file inputs to the local file system # using variable names for the filenames. #for i, f in enumerate(files): # dxpy.download_dxfile(f.get_id(), "files-" + str(i)) # Split your work into parallel tasks. As an example, the # following generates 10 subjobs running with the same dummy # input. subjobs = [] for file_obj in files: filename = dxpy.describe(file_obj)['name'] encff = re.compile('ENCFF[0-9]{3}[A-Z]{3}') try: file_acc = encff.match(filename).group() except: print "Filename %s is not an ENCODE file" % filename exit(0) file_meta = requests.get(SERVER+'/'+file_acc+'/?frame=embedded', \ auth=(auth['AUTHID'],auth['AUTHPW']), headers=HEADERS).json() subjob_input = { "file_obj": file_obj, "file_meta": file_meta } subjobs.append(dxpy.new_dxjob(subjob_input, "process")) # The following line creates the job that will perform the # "postprocess" step of your app. We've given it an input field # that is a list of job-based object references created from the # "process" jobs we just created. Assuming those jobs have an # output field called "output", these values will be passed to the # "postprocess" job. Because these values are not ready until the # "process" jobs finish, the "postprocess" job WILL NOT RUN until # all job-based object references have been resolved (i.e. the # jobs they reference have finished running). # # If you do not plan to have the "process" jobs create output that # the "postprocess" job will require, then you can explicitly list # the dependencies to wait for those jobs to finish by setting the # "depends_on" field to the list of subjobs to wait for (it # accepts either dxpy handlers or string IDs in the list). We've # included this parameter in the line below as well for # completeness, though it is unnecessary if you are providing # job-based object references in the input that refer to the same # set of jobs. postprocess_job = dxpy.new_dxjob(fn_input={ "report": [subjob.get_output_ref("report") for subjob in subjobs], "valid": [subjob.get_output_ref("validation") for subjob in subjobs] }, fn_name="postprocess", depends_on=subjobs) # If you would like to include any of the output fields from the # postprocess_job as the output of your app, you should return it # here using a job-based object reference. If the output field in # the postprocess function is called "answer", you can pass that # on here as follows: # #return { "FastQC_reports": [ dxpy.dxlink(item) for item in postprocess_job.get_output_ref("report") ]} # # Tip: you can include in your output at this point any open # objects (such as gtables) which will be closed by a job that # finishes later. The system will check to make sure that the # output object is closed and will attempt to clone it out as # output into the parent container only after all subjobs have # finished. validate_reports = [] validations = [] validate_reports.append(postprocess_job.get_output_ref("report")) validations.append(postprocess_job.get_output_ref("validation")) output = {} print validate_reports print validations # output["FastQC_reports"] = [ dxpy.dxlink(item) for item in FastQC_reports] output["validate_reports"] = validate_reports output["validate_errors"] = validations return output
def main(files): # The following line(s) initialize your data object inputs on the platform # into dxpy.DXDataObject instances that you can start using immediately. # files = [dxpy.DXFile(item) for item in files] # The following line(s) download your file inputs to the local file system # using variable names for the filenames. # for i, f in enumerate(files): # dxpy.download_dxfile(f.get_id(), "files-" + str(i)) # Split your work into parallel tasks. As an example, the # following generates 10 subjobs running with the same dummy # input. subjobs = [] for fastq in files: subjob_input = {"fastq": fastq} subjobs.append(dxpy.new_dxjob(subjob_input, "process")) # The following line creates the job that will perform the # "postprocess" step of your app. We've given it an input field # that is a list of job-based object references created from the # "process" jobs we just created. Assuming those jobs have an # output field called "output", these values will be passed to the # "postprocess" job. Because these values are not ready until the # "process" jobs finish, the "postprocess" job WILL NOT RUN until # all job-based object references have been resolved (i.e. the # jobs they reference have finished running). # # If you do not plan to have the "process" jobs create output that # the "postprocess" job will require, then you can explicitly list # the dependencies to wait for those jobs to finish by setting the # "depends_on" field to the list of subjobs to wait for (it # accepts either dxpy handlers or string IDs in the list). We've # included this parameter in the line below as well for # completeness, though it is unnecessary if you are providing # job-based object references in the input that refer to the same # set of jobs. """ postprocess_job = dxpy.new_dxjob(fn_input={ "report": [subjob.get_output_ref("report") for subjob in subjobs], "summary": [subjob.get_output_ref("summary") for subjob in subjobs], "zips": [subjob.get_output_ref("zips") for subjob in subjobs], }, fn_name="postprocess", depends_on=subjobs) """ # If you would like to include any of the output fields from the # postprocess_job as the output of your app, you should return it # here using a job-based object reference. If the output field in # the postprocess function is called "answer", you can pass that # on here as follows: # # return { "FastQC_reports": [ dxpy.dxlink(item) for item in postprocess_job.get_output_ref("report") ]} # # Tip: you can include in your output at this point any open # objects (such as gtables) which will be closed by a job that # finishes later. The system will check to make sure that the # output object is closed and will attempt to clone it out as # output into the parent container only after all subjobs have # finished. output = { "reports": [subjob.get_output_ref("report") for subjob in subjobs], "summaries": [subjob.get_output_ref("summary") for subjob in subjobs], "zips": [subjob.get_output_ref("zip") for subjob in subjobs], } """ for job in postprocess_job.get_output_ref("reports"): item = dxpy.dxlink(job) output['FastQC_reports'].append(item['report']) output['FastQC_zip'].append(item['zip']) output['FastQC_summary'].append(item['summary']) """ # output["FastQC_reports"] = [ dxpy.dxlink(item) for item in FastQC_reports] # output["FastQC_reports"] = FastQC_reports # output["FastQC_zip"] = FastQC_zip # output["FastQC_summary"] = FastQC_summary return output
def main(**job_inputs): job_outputs = {} mappingsTable = dxpy.open_dxgtable(job_inputs["mappings"]["$dnanexus_link"]) mappingsTableId = mappingsTable.get_id() # This controls the degree of parallelism chunks = int(mappingsTable.describe()["length"] / job_inputs["reads_per_job"]) + 1 try: contigSetId = mappingsTable.get_details()["original_contigset"]["$dnanexus_link"] originalContigSet = mappingsTable.get_details()["original_contigset"] except: raise Exception("The original reference genome must be attached as a detail") # In the next major section of code, we construct a variants table. As regions of the genome are passed to each worker # and variants are called on them, the workers will add rows to this table concurrently. variants_schema = [ {"name": "chr", "type": "string"}, {"name": "lo", "type": "int32"}, {"name": "hi", "type": "int32"}, {"name": "ref", "type": "string"}, {"name": "alt", "type": "string"}, {"name": "qual", "type": "double"}, {"name": "ids", "type": "string"}, ] # The information in these tags is elevated into specific columns, so additional columns for these tags will not be created elevatedTags = ["format_GT", "format_DP", "format_AD"] # The info and format tags are extracted from the header printed by samtools # If additional code will add a tag to the output of the program, modify this header to include the tag. # TODO: Allow the table to be created by the first job that finishes to avoid this step. headerInfo = extractHeader("/tmp/header.txt", elevatedTags) description = {} samples = [] indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")] ##The following section creates the sample-specific table columns for k, v in headerInfo["tags"]["info"].iteritems(): variants_schema.append({"name": "info_" + k, "type": translateTagTypeToColumnType(v)}) description[k] = {"name": k, "description": v["description"], "type": v["type"], "number": v["number"]} # For each sample, add the sample-specific columns to the schema, at present only one sample is supported numSamples = 1 for i in range(numSamples): variants_schema.extend( [ {"name": "genotype_" + str(i), "type": "string"}, {"name": "phasing_" + str(i), "type": "string"}, {"name": "type_" + str(i), "type": "string"}, {"name": "variation_qual_" + str(i), "type": "double"}, {"name": "genotype_qual_" + str(i), "type": "double"}, {"name": "coverage_" + str(i), "type": "string"}, {"name": "total_coverage_" + str(i), "type": "int32"}, ] ) indices.append(dxpy.DXGTable.lexicographic_index([["type_" + str(i), "ASC"]], "type_" + str(i))) samples.append("Sample_0") for k, v in headerInfo["tags"]["format"].iteritems(): if "format_" + k not in elevatedTags: variants_schema.append({"name": "format_" + k + "_" + str(i), "type": translateTagTypeToColumnType(v)}) # TODO: Add lexicographic indices when secondary indices are supported variants = dxpy.new_dxgtable(variants_schema, indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")]) tableId = variants.get_id() variants = dxpy.open_dxgtable(tableId) variants.add_types(["Variants", "gri"]) details = { "samples": samples, "original_contigset": job_inputs["reference"], "original_mappings": job_inputs["mappings"], "formats": headerInfo["tags"]["format"], "infos": headerInfo["tags"]["info"], } # if headerInfo.get('filters') != {}: # details['filters'] = headerInfo['filters'] variants.set_details(details) if "output_name" in job_inputs: variants.rename(job_inputs["output_name"]) else: variants.rename(mappingsTable.describe()["name"] + " variant calls by Samtools 
mpileup") # Split the genome into evenly sized regions genomeRegions = splitGenomeLengthLargePieces(originalContigSet, chunks) # Generate the command line arguments needed to run samtools and bcftools samOptions = makeSamtoolsParameters(**job_inputs) bcfOptions = makeBcftoolsParameters(**job_inputs) # The rest of the main function contains the map-reduce functionality. For each genome chunk, an input spec is created for a new child job. # Which specifies reduce_job_inputs = {} for i in range(len(genomeRegions)): if len(genomeRegions[i]) > 0: map_job_inputs = { "mappings_table_id": mappingsTableId, "original_contig_set": contigSetId, "interval": genomeRegions[i], "tableId": tableId, "compress_reference": job_inputs["compress_reference"], "compress_no_call": job_inputs["compress_no_call"], "infer_no_call": job_inputs["infer_no_call"], "sam_options": samOptions, "bcf_options": bcfOptions, "part_number": i, } # Run a "map" job for each chunk, passing in the inputspec from above and looking for a function entry point given as "map" (@dxpy.entry_point('map')) map_job = dxpy.new_dxjob(map_job_inputs, "map") reduce_job_inputs["mapJob" + str(i) + "TableId"] = {"job": map_job.get_id(), "field": "ok"} reduce_job_inputs["tableId"] = tableId # Run a "reduce" job, which only begins once all of the map jobs singal they have completed by sending 'ok':True # The reduce job closes the table. This step is explicitly needed because table closing must wait till the completion of the map jobs # By giving the reduce job the map jobs as input, the reduce job will wait to start. reduce_job = dxpy.new_dxjob(reduce_job_inputs, "reduce") job_outputs = {"variants": {"job": reduce_job.get_id(), "field": "variants"}} return job_outputs
def main(reads1=None, reference_tar=None, bwa_aln_params=None, bwa_version=None, samtools_version=None, reads2=None, input_JSON=None, debug=False): # Main entry-point. Parameter defaults assumed to come from dxapp.json. # reads1, reference_tar, reads2 are links to DNAnexus files or None if debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) # if there is input_JSON, it over-rides any explicit parameters if input_JSON: if 'reads1' in input_JSON: reads1 = input_JSON['reads1'] if 'reads2' in input_JSON: reads2 = input_JSON['reads2'] if 'reference_tar' in input_JSON: reference_tar = input_JSON['reference_tar'] if 'bwa_aln_params' in input_JSON: bwa_aln_params = input_JSON['bwa_aln_params'] if 'bwa_version' in input_JSON: bwa_version = input_JSON['bwa_version'] if 'samtools_version' in input_JSON: samtools_version = input_JSON['samtools_version'] if not reads1: logger.error('reads1 is required, explicitly or in input_JSON') raise Exception # This spawns only one or two subjobs for single- or paired-end, # respectively. It could also download the files, chunk the reads, # and spawn multiple subjobs. # Files are downloaded later by subjobs into their own filesystems # and uploaded to the project. # Initialize file handlers for input files. paired_end = reads2 is not None unmapped_reads = [r for r in [reads1, reads2] if r] subjobs = [] for reads in unmapped_reads: subjob_input = {"reads_file": reads, "reference_tar": reference_tar, "bwa_aln_params": bwa_aln_params, "bwa_version": bwa_version} print "Submitting:" print subjob_input subjobs.append(dxpy.new_dxjob(subjob_input, "process")) # Create the job that will perform the "postprocess" step. depends_on=subjobs, so blocks on all subjobs postprocess_job = dxpy.new_dxjob(fn_input={ "indexed_reads": [subjob.get_output_ref("output") for subjob in subjobs], "unmapped_reads": unmapped_reads, "reference_tar": reference_tar, "bwa_version": bwa_version, "samtools_version": samtools_version }, fn_name="postprocess", depends_on=subjobs) mapped_reads = postprocess_job.get_output_ref("mapped_reads") mapping_statistics = postprocess_job.get_output_ref("mapping_statistics") output = { "mapped_reads": mapped_reads, "mapping_statistics": mapping_statistics, "paired_end": paired_end } output.update({'output_JSON': output.copy()}) print "Exiting with output: %s" %(output) return output
def main(fastq_files, genome_fasta_file, genome_index_file, mapper, project_id, output_folder, mark_duplicates=False, fastq_files2=None, sample_name=None, properties=None): """Spawn subjobs to map each of the FASTQ files (and their pairs, if provided) and merge the BAM files into a single BAM file, which is output.""" if fastq_files2 != None: assert len(fastq_files2) == len(fastq_files), \ "fastq_files2 contains %s elements; expected %s" % (len(fastq_files2), len(fastq_files)) subjobs = [] for i in xrange(len(fastq_files)): subjob_input = { "project_id": project_id, "output_folder": output_folder, "fastq_file": fastq_files[i], "genome_fasta_file": genome_fasta_file, "genome_index_file": genome_index_file, "mapper": mapper, "sample_name": sample_name, "mark_duplicates": mark_duplicates, "properties": properties } if fastq_files2 != None: subjob_input["fastq_file2"] = fastq_files2[i] subjobs.append(dxpy.new_dxjob(subjob_input, "process")) if len(fastq_files) > 1: postprocess_input = { "project_id": project_id, "output_folder": output_folder, "bam_files": [subjob.get_output_ref("bam") for subjob in subjobs], "sample_name": sample_name, "properties": properties } postprocess_job = dxpy.new_dxjob(fn_input=postprocess_input, fn_name="postprocess", depends_on=subjobs) tools_used_input = { "project_id": project_id, "output_folder": output_folder, "tools_used": [ job.get_output_ref("tools_used") for job in (subjobs + [postprocess_job]) ] } tools_used_job = dxpy.new_dxjob(tools_used_input, "create_tools_used_json_file") return { "bam": postprocess_job.get_output_ref("bam"), "bai": postprocess_job.get_output_ref("bai"), "tools_used": tools_used_job.get_output_ref("tools_used_json_file") } else: tools_used_input = { "project_id": project_id, "output_folder": output_folder, "tools_used": [job.get_output_ref('tools_used') for job in subjobs] } tools_used_job = dxpy.new_dxjob(tools_used_input, "create_tools_used_json_file") return { "bam": subjobs[0].get_output_ref("bam"), "bai": subjobs[0].get_output_ref("bai"), "tools_used": tools_used_job.get_output_ref("tools_used_json_file") }
def SplitBamForSubjobs(kwargs, bam_names, bam_config_fn=None): num_threads = kwargs["num_threads_per_instance"] print "\nSplitting bam for subjobs" # Assuming that all bam files have the same chromosomes (is this safe?) subprocess.check_output("samtools view -H {input_bam} > header.txt".format(input_bam=bam_names[0]), shell=True) with open('header.txt') as fh: header = [line.rstrip('\n') for line in fh] print "Input header: " for line in header: print line print "Save unmapped reads as bam files to merge into subjob files" unmapped = {} for bam in bam_names: fn = bam.rstrip('.bam')+'_unmapped' command = "samtools view -@ {n} -u -b -f 4 {bam} > {unmapped}".format(n=num_threads, bam=bam, unmapped=fn) print command subprocess.check_call(command, shell=True) unmapped[bam] = fn groups = SplitGenomeFromSam(header, kwargs["num_instances"]) subjobs = [] subjob_no = 0 for group in groups: group = " ".join(group) subjob_bam_fn = [] for bam in bam_names: start_time = time.time() print "\nMerging {bam} with unmapped reads for pindel subjobs".format(bam=bam) out_fn = bam.rstrip('.bam') + '_' + str(subjob_no) + '.bam' command = "samtools view -@ {n} -bh {bam} {group} > tmp.bam".format(n=num_threads, bam=bam, group=group) subprocess.check_call(command, shell=True) split_command = "samtools merge -@ {n} {out} {unmapped} tmp.bam ".format(n=num_threads, out=out_fn, unmapped=unmapped[bam]) print split_command subprocess.check_call(split_command, shell=True) print "Samtools view and merge ran in: {min} minutes".format(min=float((time.time()-start_time)/60)) subjob_bam_fn.append(out_fn) subjob_kwargs = kwargs.copy() subjob_bam_fn, subjob_bam_idx_fn = IndexBams(bam_names=subjob_bam_fn) print "Uploading split bam files: " + str(subjob_bam_fn) subjob_bam_ids = [dxpy.dxlink(dxpy.upload_local_file(bam)) for bam in subjob_bam_fn] print "Uploading split bam index files: " + str(subjob_bam_idx_fn) subjob_bam_idx_ids = [dxpy.dxlink(dxpy.upload_local_file(idx)) for idx in subjob_bam_idx_fn] subjob_kwargs["mappings_files"] = subjob_bam_ids subjob_kwargs["bam_index_files"] = subjob_bam_idx_ids print "Updating bam config file for subjob" if bam_config_fn: new_config_fn = "subjob_config_" + str(subjob_no) + '.txt' with open(bam_config_fn, 'r') as config_fh, open(new_config_fn, 'w') as write_fh: for line in config_fh: line = line.split('\t') bam_name = line[0] out_fn = bam_name.rstrip('.bam') + '_' + str(subjob_no) + '.bam' write_fh.write(out_fn + '\t' + "\t".join(line[1:]) + '\n') print "Uploading new config file: " + str(new_config_fn) subjob_kwargs["bam_config_file"] = dxpy.dxlink(dxpy.upload_local_file(new_config_fn)) job = dxpy.new_dxjob(subjob_kwargs, "process") print "Started subjob #{n}: {job_id}".format(n=subjob_no, job_id=job.get_id()) subjobs.append(job) subjob_no += 1 return subjobs
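# --- Illustrative note (not part of the original applet) ---
# Several snippets above derive output names with bam.rstrip('.bam'), but
# str.rstrip removes a *set of characters*, not a suffix: "tumor_ba.bam"
# .rstrip('.bam') gives "tumor_", not "tumor_ba". A small suffix-aware helper
# avoids that pitfall; sketch only, not part of the original code.
def strip_suffix(filename, suffix):
    if filename.endswith(suffix):
        return filename[:-len(suffix)]
    return filename

# Example: strip_suffix("sample_a.bam", ".bam") -> "sample_a"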
def main(bam_set, map_report_set, dme_ix, uncompress_bam=True): # tool_versions.py --applet $script_name --appver $script_ver props = {} if os.path.isfile("/usr/bin/tool_versions.py"): sw_versions = subprocess.check_output(["tool_versions.py", "--dxjson", "dnanexus-executable.json"]) props["SW"] = sw_versions print "* Value of bam_set: '" + str(bam_set) + "'" print "* Value of map_report_set: '" + str(map_report_set) + "'" print "* Value of dme_ix: '" + str(dme_ix) + "'" print "* Value of uncompress_bam: '" + str(uncompress_bam) + "'" print "* Calling merge_extract_full()..." inp = { "bam_set": bam_set, "map_report_set": map_report_set, "dme_ix_dxlink": dme_ix, "uncompress_bam": uncompress_bam, "props": props, } extract_job = dxpy.new_dxjob(inp, "merge_extract_full") print "* Kicked off merge_extract_full() and waiting..." extract_job.wait_on_done() # Wait because we want the qc_metrics to pass to other jobs. extract_out = extract_job.describe()["output"] target_root = extract_out["target_root"] qc_metrics = extract_out["qc_metrics"] print "* Calling bedmethyl()..." inp = { "cx_report_dxlink": extract_out["cx_report_dxlink"], "chrom_sizes_dxlink": extract_out["chrom_sizes_dxlink"], "target_root": target_root, "qc_metrics": qc_metrics, "props": props, } bedmethyl_job = dxpy.new_dxjob(inp, "bedmethyl_io") print "* Kicked off bedmethyl() but not waiting..." print "* Calling signal()..." signal_out = signal_io( extract_out["bedgraph_gz_dxlink"], extract_out["chrom_sizes_dxlink"], target_root, qc_metrics, props ) print "* Check storage..." run_cmd("ls -l") run_cmd("df -k .") print "* Finished." return { # from extract() # "bam_biorep": extract_out['biorep_bam_dxlink'], "bam_biorep_qc": extract_out["biorep_bam_qc_dxlink"], "map_biorep": extract_out["biorep_map_dxlink"], "mbias_report": extract_out["mbias_report_dxlink"], # from signal() "signal": signal_out["bigWig_dxlink"], # from bedmethyl() "CpG_bed": bedmethyl_job.get_output_ref("CpG_bed_dxlink"), "CHG_bed": bedmethyl_job.get_output_ref("CHG_bed_dxlink"), "CHH_bed": bedmethyl_job.get_output_ref("CHH_bed_dxlink"), "CpG_bb": bedmethyl_job.get_output_ref("CpG_bb_dxlink"), "CHG_bb": bedmethyl_job.get_output_ref("CHG_bb_dxlink"), "CHH_bb": bedmethyl_job.get_output_ref("CHH_bb_dxlink"), "metadata": json.dumps(qc_metrics), }
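# --- Illustrative sketch (not part of the original applets) ---
# run_cmd() is used by the two methylation controllers above but not defined in
# these excerpts; it is presumably a thin shell wrapper along these lines:
import subprocess

def run_cmd(command):
    print("* Running: %s" % command)
    subprocess.check_call(command, shell=True)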