def process(scattered_input, dme_ix, ncpus, reads_root):
    # Fill in code here to process the input and create output.

    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    dme_ix = dxpy.DXFile(dme_ix)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(dme_ix.get_id(), "index.tgz")
    fq = dxpy.DXFile(scattered_input)
    name = fq.describe()['name']
    dxpy.download_dxfile(fq.get_id(), name)

    bam_root = name + '_techrep'

    logger.info("* === Calling DNAnexus and ENCODE independent script... ===")
    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug(subprocess.check_output(shlex.split('head %s' % name)))
    if os.path.isfile(ALIGN_SCRIPT):
        logger.debug("** Executable %s exists" % ALIGN_SCRIPT)
    else:
        logger.debug("** Executable %s DOES NOT exist" % ALIGN_SCRIPT)
        exit(1)
    align_cmd = '%s index.tgz %s %s %s no_stats' % (ALIGN_SCRIPT, name,
                                                    str(ncpus), bam_root)
    logger.debug('** command line: %s' % align_cmd)
    map_out = subprocess.check_output(shlex.split(align_cmd))
    logger.info("* === Returned from dname_align_se ===")

    # As always, you can choose not to return output if the
    # "postprocess" stage does not require any input, e.g. rows have
    # been added to a GTable that has been created in advance.  Just
    # make sure that the "postprocess" job does not run until all
    # "process" jobs have finished by making it wait for "map" to
    # finish using the depends_on argument (this is already done for
    # you in the invocation of the "postprocess" job in "main").

    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug("** OUTPUT DIR: %s" % os.listdir('output/'))

    os.rename(bam_root + '_bismark.bam', bam_root + '.bam')
    return {
        "bam_file": dxpy.dxlink(dxpy.upload_local_file(bam_root + '.bam')),
        "report_file": dxpy.dxlink(
            dxpy.upload_local_file(bam_root + '_bismark_map_report.txt'))
    }
def download_qc_report(self, download_dir):
    """
    Downloads the QC report from the DNAnexus sequencing results project.

    Args:
        download_dir: `str` - The local directory path to download the QC report to.

    Returns:
        `str`. The filepath to the downloaded QC report.
    """
    if not os.path.isdir(download_dir):
        os.makedirs(download_dir)
    res = dxpy.find_one_data_object(project=self.dx_project_id,
                                    folder=self.DX_QC_REPORT_FOLDER,
                                    name="*_QC_Report.pdf",
                                    name_mode="glob")
    # res will be something like
    # {u'project': u'project-BzqVkxj08kVZbPXk54X0P2JY', u'id': u'file-BzqVkg800Fb0z4437GXJfGY6'}
    # dxpy.find_one_data_object() raises a dxpy.exceptions.DXSearchError() if nothing is found.
    dx_file = dxpy.DXFile(dxid=res["id"], project=res["project"])
    download_file_name = os.path.join(download_dir, dx_file.name)
    msg = "{filename} to {download_dir}.".format(filename=dx_file.name,
                                                 download_dir=download_dir)
    debug_logger.debug("Downloading " + msg)
    dxpy.bindings.dxfile_functions.download_dxfile(dxid=dx_file,
                                                   filename=download_file_name)
    success_logger.info("Downloaded " + msg)
    return download_file_name
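# Usage sketch (added for illustration, not from the original source; the
# instance and directory are hypothetical, and an authenticated dxpy session
# is assumed):
def _example_download_qc_report(seq_result):
    # seq_result would be an instance of the class defining download_qc_report.
    qc_path = seq_result.download_qc_report(download_dir="qc_reports")
    print("QC report saved to {}".format(qc_path))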
def main(input_bam, paired=True, params=''):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    input_bam = dxpy.DXFile(input_bam)
    base_name = remove_extensions(input_bam.describe()['name'],
                                  [".bam", ".BAM", ".sam", ".SAM"])

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(input_bam.get_id(), "input.bam")

    # Fill in your application code here.
    command = "java -Xmx6g -jar /opt/jar/SamToFastq.jar INPUT=input.bam F=%s_1.fastq" % base_name
    if paired:
        command += " F2=%s_2.fastq" % base_name
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    output = {}
    fastq_file = dxpy.upload_local_file("%s_1.fastq" % base_name)
    output["fastq_file"] = dxpy.dxlink(fastq_file)
    if paired:
        paired_fastq_file = dxpy.upload_local_file("%s_2.fastq" % base_name)
        output["paired_fastq_file"] = dxpy.dxlink(paired_fastq_file)

    return output
def calc_mismatch_per_cycle_stats(bam_file, aligner, output_project,
                                  output_folder, properties=None):
    # Avoid a shared mutable default argument; the dict is mutated below.
    if properties is None:
        properties = {}
    logger = []
    misc_subfolder = output_folder + '/miscellany'
    bam_file = dxpy.DXFile(bam_file)
    bam_filename = bam_file.describe()['name']
    dxpy.download_dxfile(bam_file.get_id(), bam_filename)
    ofn = os.path.splitext(bam_filename)[0] + '.mm_stats'

    # Change permissions
    cmd = 'chmod +x /bwa_mismatches'
    run_cmd(cmd, logger)
    cmd = '/bwa_mismatches -o {0} -m {1} {2}'.format(ofn, ALIGNERS[aligner],
                                                     bam_filename)
    run_cmd(cmd, logger)

    properties['file_type'] = 'mismatch_stats'
    mismatch_per_cycle_stats = dxpy.upload_local_file(filename=ofn,
                                                      project=output_project,
                                                      folder=misc_subfolder,
                                                      properties=properties,
                                                      parents=True)

    return {
        'mismatch_per_cycle_stats': mismatch_per_cycle_stats,
        'tools_used': logger
    }
def main(inputs):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)

    # uses last extension - presumably they are all the same
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    pooled_filename = '-'.join(
        [splitext(splitext(fn)[0])[0] for fn in input_filenames]) \
        + "_pooled%s.gz" % (extension)
    out, err = run_pipe(
        ['gzip -dc %s' % (' '.join(input_filenames)),
         'gzip -c'],
        outfile=pooled_filename)
    pooled = dxpy.upload_local_file(pooled_filename)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["pooled"] = dxpy.dxlink(pooled)

    return output
def test_upload_download_remove(self):
    num_test_objs = 10
    min_obj_size = 50
    with NamedTemporaryDirectory(change_dir=True) as tmp_d:
        self.create_dataset(tmp_d, num_test_objs, min_obj_size)
        self.test_dir.upload(['.'])

    which_obj = self.get_dataset_obj_names(num_test_objs)[-1]
    dx_p = self.test_dir / which_obj
    file_h = dxpy.DXFile(dxid=dx_p.canonical_resource,
                         project=dx_p.canonical_project)
    file_h.wait_on_close(20)  # wait for file to go to closed state
    for which_obj in self.get_dataset_obj_names(num_test_objs):
        self.assertTrue((self.test_dir / which_obj).exists())

    with NamedTemporaryDirectory(change_dir=True) as tmp_d:
        self.test_dir.download(tmp_d)
        for which_obj in self.get_dataset_obj_names(num_test_objs):
            self.assertCorrectObjectContents(which_obj, which_obj, min_obj_size)
            (self.test_dir / which_obj).remove()
            # consistency check
            while (self.test_dir / which_obj).exists():
                time.sleep(.5)
            self.assertFalse((self.test_dir / which_obj).exists())
def _download_symbolic_link(dxid, md5digest, project, dest_filename):
    dxfile = dxpy.DXFile(dxid)
    url, _headers = dxfile.get_download_url(preauthenticated=True,
                                            duration=6 * 3600,
                                            project=project)

    # Follow the redirection
    print('Following redirect for ' + url)

    wget_exe = _which("wget")
    if wget_exe is None:
        err_exit("wget is not installed on this system")

    cmd = ["wget", "--tries=5", "--quiet"]
    if os.path.isfile(dest_filename):
        # destination file already exists, resume the download.
        cmd += ["--continue"]
    cmd += ["-O", dest_filename, url]

    try:
        print("Downloading symbolic link with wget")
        subprocess.check_call(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        msg = ""
        if e and e.output:
            msg = e.output.strip()
        err_exit("Failed to call wget: {cmd}\n{msg}\n".format(cmd=str(cmd),
                                                              msg=msg))

    if md5digest is not None:
        _verify(dest_filename, md5digest)
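# Usage sketch (added for illustration; the file and project IDs below are
# made up, and passing md5digest=None skips checksum verification):
def _example_download_symbolic_link():
    _download_symbolic_link("file-xxxx0000000000000000xxxx",
                            md5digest=None,
                            project="project-xxxx0000000000000000xxxx",
                            dest_filename="linked_file.dat")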
def flagstat_parse(dxlink):
    desc = dxpy.describe(dxlink)
    with dxpy.DXFile(desc['id'], mode='r') as flagstat_file:
        if not flagstat_file:
            return None

        # Values are regular expressions; each is replaced below with
        # scores of the form [hiq, lowq].
        qc_dict = {
            'in_total': 'in total',
            'duplicates': 'duplicates',
            'mapped': 'mapped',
            'paired_in_sequencing': 'paired in sequencing',
            'read1': 'read1',
            'read2': 'read2',
            'properly_paired': 'properly paired',
            'with_self_mate_mapped': 'with itself and mate mapped',
            'singletons': 'singletons',
            # i.e. at the end of the line
            'mate_mapped_different_chr': 'with mate mapped to a different chr$',
            # RE, so the parentheses must be escaped
            'mate_mapped_different_chr_hiQ':
                r'with mate mapped to a different chr \(mapQ>=5\)'
        }
        flagstat_lines = flagstat_file.read().splitlines()
        for (qc_key, qc_pattern) in qc_dict.items():
            qc_metrics = next(re.split(qc_pattern, line)
                              for line in flagstat_lines
                              if re.search(qc_pattern, line))
            (hiq, lowq) = qc_metrics[0].split(' + ')
            qc_dict[qc_key] = [int(hiq.rstrip()), int(lowq.rstrip())]
        return qc_dict
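# Illustration (hypothetical flagstat line, not from the original source):
# re.split on the pattern leaves the leading counts, which are then split on
# ' + ' into QC-passed and QC-failed totals, e.g.
#   line = "2000 + 0 in total (QC-passed reads + QC-failed reads)"
#   re.split('in total', line)[0].split(' + ')  ->  ['2000', '0 ']
# so the loop above yields qc_dict['in_total'] == [2000, 0].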
def group_files_by_read(fastq_files):
    """
    Function : Groups a list of FASTQ files by the values of their Read
               property, which indicates the read number. Returns a dict
               mapping each observed value of the property (or 'none' if a
               file does not have a value for the property) to a list of the
               files with that value. Within each group, the files would be
               sorted by their value of the Chunk property (to ensure that
               left and right reads of a given chunk are handled together);
               note that the Chunk sort is currently commented out below.
    Args     : fastq_files - a list of dxpy.DXFile objects representing FASTQ files.
    Returns  : dict.
    """
    # print("Grouping FASTQ files by read number")
    fastq_dxfiles = [dxpy.DXFile(item) for item in fastq_files]
    read_dict = {}
    for fastq_dxfile in fastq_dxfiles:
        props = fastq_dxfile.get_properties()
        read_num = props["read"]
        if read_num not in ["1", "2", "none"]:
            raise dxpy.AppError("%s has invalid Read property: %s"
                                % (fastq_dxfile.get_id(), read_num))
        if read_num not in read_dict:
            read_dict[read_num] = []
        fastq_dxlink = dxpy.dxlink(fastq_dxfile)
        read_dict[read_num].append(fastq_dxlink)
    # for read_num in read_dict:
    #     read_dict[read_num] = sorted(read_dict[read_num], key=chunk_property)
    return read_dict
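# Illustration (hypothetical inputs, not from the original source): two chunk
# files carrying read properties "1" and "2" would be grouped as
#   {"1": [<dxlink to chunk00_R1.fastq.gz>],
#    "2": [<dxlink to chunk00_R2.fastq.gz>]}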
def main():
    inputs_file = open("inputs_stats.txt", 'w')
    print(sys.argv[2])
    workflow = dxpy.DXWorkflow(sys.argv[2].split(":")[-1])
    fh = dxpy.DXFile(sys.argv[1].split(":")[-1])
    if "/Results" in fh.describe()['folder']:
        return
    app_id = sys.argv[3]
    if "applet" in app_id:
        app = dxpy.DXApplet(app_id)
    else:
        app = dxpy.DXApp(app_id)
    w_id = sys.argv[1].split(":")[1]
    existing_inputs = []
    for item in workflow.describe()['stages'][0]['input']:
        existing_inputs.append(item)
    print(existing_inputs)
    for x in app.describe()['inputSpec']:
        print(x)
        if x['class'] == 'file' and x['name'] not in existing_inputs:
            inputs_file.write(x['name'] + "\n")
    inputs_file.close()
def test_alignment_count(applet_id, project_id, folder, tmpdir):
    """Run BWA on a FASTQ file and verify that the number of alignments
    produced is correct.
    """
    # Recall that applet_id is set in the associated conftest.py, which either
    # gets it from the command line or builds the applet and retrieves its id.
    # And tmpdir is some pytest magic. Its type is py.path.local.LocalPath.
    # Its strpath property just returns a string.
    applet = dxpy.DXApplet(applet_id)
    input_dict = {
        "fastq": dxpy.dxlink(SAMPLE_FASTQ),
        "genomeindex_targz": dxpy.dxlink(HS37D5_BWA_INDEX)
    }
    job = applet.run(input_dict,
                     instance_type="mem1_ssd1_x16",
                     folder=folder,
                     project=project_id)
    job.wait_on_done()
    output_bam_dxfile = dxpy.DXFile(job.describe()["output"]["bam"])
    local_filename = os.path.join(tmpdir.strpath, "test.bam")
    dxpy.download_dxfile(output_bam_dxfile.get_id(), local_filename)
    count_alignments_cmd = "samtools view {bam} | wc -l".format(
        bam=local_filename)
    num_alignments = int(
        subprocess.check_output(count_alignments_cmd, shell=True))
    assert num_alignments == 1951476
def main(input_file):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    input_file = dxpy.DXFile(input_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(input_file.get_id(), "input_file")

    # Fill in your application code here.
    subprocess.check_call(
        "fastq_quality_trimmer -t 20 -Q 33 -i input_file -o output_file",
        shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    output_file = dxpy.upload_local_file("output_file")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["output_file"] = dxpy.dxlink(output_file)

    return output
def download_url_create_symlink(self, url, sym_name):
    print("url = {}".format(url))
    tmp_file = "localfile"

    # download [url]
    cmd = ["wget", "--tries=5", "--quiet", "-O", tmp_file, url]
    try:
        print("Downloading original link with wget")
        subprocess.check_call(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        msg = ""
        if e and e.output:
            msg = e.output.strip()
        err_exit("Failed to download with wget: {cmd}\n{msg}\n".format(
            cmd=str(cmd), msg=msg))

    # calculate its md5 checksum
    digest = md5_checksum(tmp_file)
    os.remove(tmp_file)

    # create a symlink on the platform, with the correct checksum
    input_params = {
        'name': sym_name,
        'project': self.proj_id,
        'drive': "drive-PUBLISHED",
        'md5sum': digest,
        'symlinkPath': {
            'object': url
        }
    }
    result = dxpy.api.file_new(input_params=input_params)
    return dxpy.DXFile(dxid=result["id"], project=self.proj_id)
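# Usage sketch (added for illustration; the URL and name are hypothetical):
# creates a DNAnexus file object whose content is served from the external
# URL, with the md5 checksum recorded so the platform can verify downloads.
#   dxfile = obj.download_url_create_symlink(
#       "https://example.com/data/reads.fastq.gz", "reads.fastq.gz")
#   print(dxfile.get_id())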
def combine_files(countDXlinks, resultfn):
    """The 'gather' subjob of the applet.

    Arguments:
        countDXlinks (list[dict]): list of DXlinks to process job output files.
        resultfn (str): Filename to use for job output file.

    Returns:
        DXLink for the main function to return as the job output.

    Note: Only the DXLinks are passed as parameters.
    Subjobs work on a fresh instance so files must be downloaded to the machine.
    """
    if resultfn.endswith(".bam"):
        resultfn = resultfn[:-4] + '.txt'

    sum_reads = 0
    with open(resultfn, 'w') as f:
        for i, dxlink in enumerate(countDXlinks):
            dxfile = dxpy.DXFile(dxlink)
            filename = "countfile{0}".format(i)
            dxpy.download_dxfile(dxfile, filename)
            with open(filename, 'r') as fsub:
                for line in fsub:
                    sum_reads += parse_line_for_readcount(line)
                    f.write(line)
        f.write('Total Reads: {0}'.format(sum_reads))

    countDXFile = dxpy.upload_local_file(resultfn)
    countDXlink = dxpy.dxlink(countDXFile.get_id())

    return {"countDXLink": countDXlink}
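# Gather-stage sketch (added for illustration; the job objects and field
# names are hypothetical): a main entry point could collect the per-shard
# outputs and hand them to combine_files, e.g.
#   count_links = [job.get_output_ref("countDXLink") for job in subjobs]
#   postprocess = dxpy.new_dxjob({"countDXlinks": count_links,
#                                 "resultfn": "combined_counts.txt"},
#                                "combine_files", depends_on=subjobs)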
def main(**job_inputs):
    # If we weren't provided an mmi index for the reference, generate it.
    if 'genome_mmi' not in job_inputs:
        mmi_input = {'genome_fastagz': job_inputs['genome_fastagz']}
        minimap_index_job = dxpy.new_dxjob(mmi_input, 'run_minimap_index')
        job_inputs['genome_mmi'] = minimap_index_job.get_output_ref(
            'genome_mmi')

    output = {'genome_mmi': job_inputs['genome_mmi']}

    # check if we're dealing with pacbio or ONT reads and what the filetype is
    datatype = job_inputs['datatype']
    one_reads_file = dxpy.DXFile(job_inputs['reads'][0]).describe()['name']
    try:
        # note the escaped dot before "gz" so it only matches a ".gz" suffix
        file_ext = re.search(r"(fastq|fasta|fa|fq)(\.gz)?$",
                             one_reads_file, flags=re.I).group(1).lower()
    except AttributeError:
        raise dxpy.AppError("Invalid filetype extension supplied.")

    # for fasta and fastq inputs, run jobs using native minimap2
    jobs = run_minimap2_subjobs(job_inputs)

    output['bam_files'] = [j.get_output_ref('mapped_reads') for j in jobs]
    output['bai_files'] = [
        j.get_output_ref('mapped_reads_index') for j in jobs
    ]

    return output
def _move(self, dest):
    """Moves the data object to a different folder within project.

    Args:
        dest (Path): The destination file/folder path within same project

    Raises:
        ValueError: When attempting to move projects
        DNAnexusError: If attempting to move across projects
    """
    if not self.resource:
        raise ValueError('Cannot move project ({})'.format(self))
    if dest.canonical_project != self.canonical_project:
        # This can be implemented by clone and remove original
        raise DNAnexusError('Cannot move across different projects')
    if self == dest:
        return
    file_handler = dxpy.DXFile(dxid=self.canonical_resource,
                               project=self.canonical_project)
    target_dest, should_rename = self._prep_for_copy(dest)
    with _wrap_dx_calls():
        file_handler.move('/' + (target_dest.parent.resource or ''))
        if should_rename:
            file_handler.rename(dest.name)
    self.clear_cached_properties()
def _clone(self, dest):
    """Clones the data object into the destination path.

    The original file is retained.

    Args:
        dest (Path): The destination file/folder path in a different project

    Raises:
        ValueError: If attempting to clone a project
        DNAnexusError: If cloning within same project
    """
    if not self.resource:
        raise ValueError('Cannot clone project ({})'.format(self))
    if dest.canonical_project == self.canonical_project:
        raise DNAnexusError('Cannot clone within same project')
    file_handler = dxpy.DXFile(dxid=self.canonical_resource,
                               project=self.canonical_project)
    target_dest, should_rename = self._prep_for_copy(dest)
    with _wrap_dx_calls():
        new_file_h = file_handler.clone(
            project=dest.canonical_project,
            folder='/' + (target_dest.parent.resource or ''))
        # no need to rename if we changed destination to include original name
        if should_rename:
            new_file_h.rename(dest.name)
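# Usage sketch (added for illustration; the paths are hypothetical and assume
# the Path-like wrapper class these two methods belong to): _move stays within
# one project, while _clone requires a different destination project.
#   p = Path('dx://project-a:/data/reads.bam')
#   p._move(Path('dx://project-a:/archive/reads.bam'))
#   p._clone(Path('dx://project-b:/data/reads.bam'))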
def main(inputs, prefix=None):
    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)

    # uses last extension - presumably they are all the same
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    if prefix:
        pooled_filename = prefix + "_pooled%s.gz" % (extension)
    else:
        pooled_filename = \
            '-'.join([splitext(splitext(fn)[0])[0]
                      for fn in input_filenames]) + "_pooled%s.gz" % (extension)
    out, err = common.run_pipe([
        'gzip -dc %s' % (' '.join(input_filenames)),
        'gzip -cn'],
        outfile=pooled_filename)
    pooled = dxpy.upload_local_file(pooled_filename)

    output = {
        "pooled": dxpy.dxlink(pooled)
    }

    return output
def download_and_gunzip_file(input_file, skip_decompress=False,
                             additional_pipe=None, create_named_pipe=False,
                             input_filename=None):
    input_file = dxpy.DXFile(input_file)
    if input_filename is None:
        input_filename = input_file.describe()['name']
    ofn = input_filename

    cmd = 'dx download ' + input_file.get_id() + ' -o - '
    if input_filename.endswith('.tar.gz'):
        ofn = 'tar_output_{0}'.format(ofn.replace('.tar.gz', ''))
        cmd += '| tar -zxvf - '
    elif (os.path.splitext(input_filename)[-1] == '.gz') and not skip_decompress:
        cmd += '| gunzip '
        ofn = os.path.splitext(ofn)[0]

    if additional_pipe is not None:
        cmd += '| ' + additional_pipe

    cmd += ' > "{0}"'.format(ofn)

    if create_named_pipe:
        named_pipe_cmd = 'mkfifo {0}'.format(ofn)
        run_cmd(named_pipe_cmd)
        cmd += '&'

    run_cmd(cmd)
    return ofn
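# Example calls (added for illustration; the file ID is made up and run_cmd is
# assumed from the surrounding module):
#   download_and_gunzip_file("file-xxxx")  # download, gunzip if it ends in .gz
#   download_and_gunzip_file("file-xxxx", additional_pipe="head -n 4000")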
def merge_bams(bam_files, bam_root, use_cat, use_sort, nthreads):
    fnames = []
    for bam in bam_files:
        dxbam = dxpy.DXFile(bam)
        dxfn = dxbam.describe()['name']
        logger.info("* Downloading %s... *" % dxfn)
        dxpy.download_dxfile(bam, dxfn)
        fnames.append(dxfn)

    outfile_name = bam_root
    logger.info("* Merged alignments file will be: %s *"
                % (outfile_name + '.bam'))
    if len(fnames) == 1:
        # UNTESTED
        rep_outfile_name = bam_root + '_bismark_biorep'
        logger.info("* Only one input file (%s), no merging required."
                    % fnames[0])
        os.rename(fnames[0], outfile_name + '.bam')
    else:
        if use_cat:
            for fn in fnames:
                if not os.path.isfile('sofar.bam'):
                    os.rename(fn, 'sofar.bam')
                else:
                    logger.info("* Merging...")
                    # NOTE: keeps the first header
                    cat_cmd = 'samtools cat sofar.bam %s' % fn
                    subprocess.check_call(shlex.split(cat_cmd),
                                          stdout=open('merging.bam', 'a'))
                    os.rename('merging.bam', 'sofar.bam')
            # At this point there is a 'sofar.bam' with one or more input bams
            logger.info("* Files merged into %s (via cat) *"
                        % (outfile_name + '.bam'))
        else:
            # use samtools merge
            # UNTESTED
            filelist = " ".join(fnames)
            merge_cmd = 'samtools merge sofar.bam ' + filelist
            logger.info("Merging via merge: %s " % merge_cmd)
            mergeout = subprocess.check_output(shlex.split(merge_cmd))
            # this gets renamed later
            logger.info(mergeout)

        if use_sort:
            # sorting needed due to samtools cat
            # UNTESTED
            sort_cmd = 'samtools sort -@ %s -m 6G -f sofar.bam sorted.bam' % nthreads
            logger.info("* Sorting merged bam: %s" % sort_cmd)
            sortout = subprocess.check_output(shlex.split(sort_cmd))
            logger.info(sortout)
            os.rename('sorted.bam', outfile_name + '.bam')
        else:
            os.rename('sofar.bam', outfile_name + '.bam')

    return outfile_name + '.bam'
def main(quants_a, quants_b, annotations):
    # tool_versions.py --applet $script_name --appver $script_ver
    sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson',
                                           'dnanexus-executable.json'])

    dxfile_a = dxpy.DXFile(quants_a)
    dxfile_b = dxpy.DXFile(quants_b)
    dxfile_anno = dxpy.DXFile(annotations)

    print("* Downloading files...")
    dxpy.download_dxfile(dxfile_a.get_id(), "quants_a.tsv")
    dxpy.download_dxfile(dxfile_b.get_id(), "quants_b.tsv")
    dxpy.download_dxfile(dxfile_anno.get_id(), "annotations.gtf.gz")

    # Create an appropriate name for output files
    out_root = root_name_from_pair(dxfile_a.name.split('.')[0],
                                   dxfile_b.name.split('.')[0])
    print("* Expecting output: '" + out_root + "_srna_mad_plot.png'")

    # Must move sub-scripts into current dir so they will be found by srna-mad-qc.sh
    subprocess.check_call(['mv', "/usr/bin/extract_gene_ids.awk", '.'])
    subprocess.check_call(['mv', "/usr/bin/sum_srna_expression.awk", '.'])
    subprocess.check_call(['mv', "/usr/bin/MAD.R", '.'])

    # DX/ENCODE independent script is found in resources/usr/bin
    print("* ===== Calling DNAnexus and ENCODE independent script... =====")
    subprocess.check_call(['srna_mad_qc.sh', 'annotations.gtf.gz',
                           'quants_a.tsv', 'quants_b.tsv', out_root])
    print("* ===== Returned from dnanexus and encodeD independent script =====")
    mad_plot_file = out_root + '_mad_plot.png'
    mad_qc_file = out_root + '_mad_qc.txt'

    print("* package properties...")
    qc_metrics = {}
    f_qc = open(mad_qc_file, 'r')
    mad_output = f_qc.read()
    f_qc.close()
    mad_output = mad_output.replace("NA", "-1")
    qc_metrics["MAD.R"] = json.loads(mad_output)
    meta_string = json.dumps(qc_metrics)
    print(json.dumps(qc_metrics, indent=4))
    props = {}
    props["SW"] = sw_versions

    print("* Upload Plot...")
    plot_dxfile = dxpy.upload_local_file(mad_plot_file, properties=props,
                                         details=qc_metrics)

    return {
        "metadata": meta_string,
        "mad_plot": plot_dxfile
    }
def main(quants_a, quants_b):
    # tool_versions.py --applet $script_name --appver $script_ver
    sw_versions = subprocess.check_output(
        ['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])

    dxfile_a = dxpy.DXFile(quants_a)
    dxfile_b = dxpy.DXFile(quants_b)

    print("* Downloading files...")
    dxpy.download_dxfile(dxfile_a.get_id(), "quants_a.tsv")
    dxpy.download_dxfile(dxfile_b.get_id(), "quants_b.tsv")

    # Create an appropriate name for output files
    out_root = root_name_from_pair(
        dxfile_a.name.split('.')[0], dxfile_b.name.split('.')[0])
    out_root += '_mad'
    mad_plot_file = out_root + '_plot.png'

    # DX/ENCODE independent script is found in resources/usr/bin
    print("* Running MAD.R...")
    subprocess.check_call(["ls", "-l"])
    # mad_output = subprocess.check_output(['Rscript', '/usr/bin/MAD.R',
    #                                       'quants_a.tsv', 'quants_b.tsv'])
    # subprocess.check_call(['mv', "MAplot.png", mad_plot_file])
    subprocess.check_call(
        ['rampage_mad_qc.sh', 'quants_a.tsv', 'quants_b.tsv', out_root])
    mad_json_file = out_root + '.json'

    print("* package properties...")
    qc_metrics = {}
    # qc_metrics["MAD.R"] = json.loads(mad_output)
    fileH = open(mad_json_file, 'r')
    qc_metrics["MAD.R"] = json.load(fileH)
    fileH.close()
    meta_string = json.dumps(qc_metrics)
    print(json.dumps(qc_metrics, indent=4))
    props = {}
    props["SW"] = sw_versions

    print("* Upload Plot...")
    plot_dxfile = dxpy.upload_local_file(mad_plot_file, properties=props,
                                         details=qc_metrics)

    return {"metadata": meta_string, "mad_plot": plot_dxfile}
def file_get_details(fid, dxfile=None, proj_id=None):
    '''Returns dx file's details as json.'''
    if dxfile is None:
        if proj_id is not None:
            dxfile = dxpy.DXFile(fid, project=proj_id)
        else:
            dxfile = file_handler_from_fid(fid)
    return dxfile.get_details()
def main(bam_file, ref_vcf_file, eval_vcf_file, qual_cutoff, depth_cutoff,
         bed_file=None):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    bam_file = dxpy.DXFile(bam_file)
    if bed_file is not None:
        bed_file = dxpy.DXFile(bed_file)
    ref_vcf_file = dxpy.DXFile(ref_vcf_file)
    eval_vcf_file = dxpy.DXFile(eval_vcf_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(bam_file.get_id(), "bam_file")
    dxpy.download_dxfile(ref_vcf_file.get_id(), "ref_vcf_file")
    dxpy.download_dxfile(eval_vcf_file.get_id(), "eval_vcf_file")
    if bed_file is not None:
        dxpy.download_dxfile(bed_file.get_id(), "bed_file")

    # Fill in your application code here.

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    sites_for_manual_review = dxpy.upload_local_file("sites_for_manual_review")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["sites_for_manual_review"] = dxpy.dxlink(sites_for_manual_review)
    output["number_of_missed_sites"] = number_of_missed_sites
    output["found_sites"] = found_sites
    output["Sensitivity"] = Sensitivity
    output["specificity"] = specificity

    return output
def main(childbam, fatherbam, motherbam, reference, outputbase=None,
         targetbed=None):
    inputbams = [dxpy.DXFile(item)
                 for item in (childbam, fatherbam, motherbam)]
    if len(inputbams) != 3:
        raise dxpy.exceptions.AppError(
            "A trio must consist of three files (%d bam files were provided)"
            % len(inputbams))

    scc("dx download '%s' -o - --no-progress | zcat > reference.fa"
        % dxpy.DXFile(reference).get_id(), shell=True)
    scc(["samtools", "faidx", "reference.fa"])

    targetopt = ''
    if targetbed is not None:
        dxpy.download_dxfile(targetbed, "target.bed")
        targetopt = '-l target.bed'

    if outputbase is None:
        outputbase = inputbams[0].get_properties()['SAMPLE_NAME'] + '.trio'

    # order in trioconfig must be child, father, mother
    trioconfig = open('trioconfig', 'w')
    inputfiles = []
    for i in range(len(inputbams)):
        dxpy.download_dxfile(inputbams[i].get_id(), "inputbams-%d.bam" % i)
        inputfiles.append("inputbams-%d.bam" % i)
        trioconfig.write(str(inputbams[i].get_properties()['SAMPLE_NAME']) + "\n")
    trioconfig.close()

    # could tee to an output bcf file if desired
    # FIXME - need to specify child's gender for non-PAR on X
    #         trioxd for female child, trioxs for male child
    command = """samtools mpileup -uf reference.fa -D -V -C 50 %s %s | \
    bcftools view -s trioconfig -T trioauto -vg - > %s.vcf \
    """ % (targetopt, ' '.join(inputfiles), outputbase)
    print("::: command is:\n\t" + command)
    scc(command, shell=True)

    # bcfout = dxpy.upload_local_file("bcfout")
    vcfout = dxpy.upload_local_file("%s.vcf" % outputbase)

    output = {}
    # output["bcfout"] = dxpy.dxlink(bcfout)
    output["vcfout"] = dxpy.dxlink(vcfout)

    return output
def file_get_properties(fid, dxfile=None, proj_id=None):
    '''Returns dx file's properties.'''
    if dxfile is None:
        if proj_id is not None:
            dxfile = dxpy.DXFile(fid, project=proj_id)
        else:
            dxfile = file_handler_from_fid(fid)
    return dxfile.get_properties()
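# Example (added for illustration; the IDs are made up): both this helper and
# file_get_details above accept a bare file ID, a prebuilt DXFile handler, or
# a file ID plus an explicit project ID.
def _example_file_metadata():
    props = file_get_properties("file-xxxx0000000000000000xxxx")
    details = file_get_details("file-xxxx0000000000000000xxxx",
                               proj_id="project-xxxx0000000000000000xxxx")
    return props, details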
def test_AAA_DownloadResultResults(self):
    job_hash = self.job.describe()
    output_hash = job_hash["output"]["rds"]
    self.assertTrue(len(output_hash) == 1)
    f = dxpy.DXFile(list(output_hash.values())[0],
                    project=job_hash["project"])
    print("TestCase: Downloading %s" % f.name)
    dxpy.download_dxfile(f.id, f.name, project=job_hash["project"])
    self.assertTrue(os.path.isfile(f.name))
def main(**kwargs):
    dxpy.download_folder(DCC_CREDENTIALS_PROJECT, '.',
                         folder=DCC_CREDENTIALS_FOLDER)
    if 'key' in kwargs:
        key = '-'.join([dxpy.api.system_whoami()['id'], kwargs.pop('key')])
    else:
        key = dxpy.api.system_whoami()['id']
    key_tuple = common.processkey(key, KEYFILE)
    if not key_tuple:
        logger.error("Key %s is not found in the keyfile %s" % (key, KEYFILE))
        raise PortalCredentialsError("Supply a valid keypair ID")
    authid, authpw, server = key_tuple
    if 'url' in kwargs:
        server = kwargs.pop('url')
    keypair = (authid, authpw)

    tokens = ['python3 checkfiles.py']
    for k, v in kwargs.iteritems():
        if isinstance(v, bool):
            if v:
                tokens.append("--" + k.replace('_', '-'))
            continue
        if isinstance(v, str) or isinstance(v, unicode) or isinstance(v, int):
            tokens.append(' '.join(["--" + k.replace('_', '-'), str(v)]))

    if 'dx_file' in kwargs:
        dxfile = dxpy.DXFile(kwargs.get('dx_file'))
        local_file = dxpy.download_dxfile(dxfile, dxfile.name)
        tokens.append("--local-file %s" % (dxfile.name))

    # this is just to get a command string to print that has no secrets
    tokens_safe = deepcopy(tokens)
    tokens_safe.append("--username %s --password %s" % ("." * len(authid),
                                                        "." * len(authpw)))
    tokens_safe.append(server)
    logger.info(' '.join(tokens_safe))

    tokens.append("--username %s --password %s" % (authid, authpw))
    # this needs to be the last token
    tokens.append(server)
    checkfiles_command = ' '.join(tokens)
    subprocess.check_call(shlex.split(checkfiles_command))

    output = {}
    outfilename = kwargs.get('out')
    errfilename = kwargs.get('err')
    if outfilename:
        out = dxpy.upload_local_file(outfilename)
        output.update({'out': dxpy.dxlink(out)})
    if errfilename:
        err = dxpy.upload_local_file(errfilename)
        output.update({'err': dxpy.dxlink(err)})
    return output
def download_file(file_dxid):
    """
    Downloads a DNAnexus file object to the current working directory.

    Args    : file_dxid - The ID of a file object on DNAnexus.
    Returns : str. Path to the downloaded file.
    """
    dx_file = dxpy.DXFile(file_dxid)
    filename = dx_file.describe()['name']
    dxpy.download_dxfile(dxid=dx_file.get_id(), filename=filename)
    return filename
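# Example (added for illustration; the file ID is made up):
def _example_download():
    local_path = download_file("file-xxxx0000000000000000xxxx")
    print("Downloaded to %s" % local_path)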
def main(fastq1_gz, fastq2_gz, bowtie_index):
    input_fastq_file1 = dxpy.DXFile(fastq1_gz)
    input_fastq_file1_name = input_fastq_file1.describe()['name']
    if input_fastq_file1_name.endswith('.gz'):
        dxpy.download_dxfile(fastq1_gz, "fastq1.gz")
        subprocess.call("gunzip -q fastq1.gz", shell=True)
    else:
        dxpy.download_dxfile(fastq1_gz, "fastq1")

    input_fastq_file2 = dxpy.DXFile(fastq2_gz)
    input_fastq_file2_name = input_fastq_file2.describe()['name']
    if input_fastq_file2_name.endswith('.gz'):
        dxpy.download_dxfile(fastq2_gz, "fastq2.gz")
        subprocess.call("gunzip -q fastq2.gz", shell=True)
    else:
        dxpy.download_dxfile(fastq2_gz, "fastq2")

    dxpy.download_dxfile(bowtie_index, "bowtie_index.tgz")
    command1 = "tar -xzf bowtie_index.tgz"
    subprocess.call(command1, shell=True)

    # Derive the index basename from the first .bt2 file.
    command4 = "ls -1 *bt2 | head -1 | sed 's/.1.bt2//g' > file_containing_bowtie_index_name"
    subprocess.call(command4, shell=True)
    subprocess.call("ls", shell=True)
    subprocess.call("cat file_containing_bowtie_index_name", shell=True)

    outprefix = 'out'
    command5 = ("cat file_containing_bowtie_index_name | xargs -i sh -c "
                "'preprocessing.sh {{}} fastq1 fastq2 . {outprefix}'"
                ).format(outprefix=outprefix)
    subprocess.call(command5, shell=True)
    print(command5)

    sorted_bam_pe_filename = "{outprefix}_noDup.sort.bam".format(outprefix=outprefix)
    split_bam1_filename = "{outprefix}_pair1.bam".format(outprefix=outprefix)
    split_bam2_filename = "{outprefix}_pair2.bam".format(outprefix=outprefix)

    sorted_bam_pe_file = dxpy.upload_local_file(sorted_bam_pe_filename)
    split_bam1_file = dxpy.upload_local_file(split_bam1_filename)
    split_bam2_file = dxpy.upload_local_file(split_bam2_filename)

    return {
        "sorted_bam_pe": sorted_bam_pe_file,
        "split_bam1": split_bam1_file,
        "split_bam2": split_bam2_file
    }