def main(reorg_conf___=None, reorg_status___=None):
    """Test reorg applet: move the 'stage-outputs' stage's files into two
    fixed test folders and return the relocated output links."""
    job_describe = dxpy.describe(dxpy.JOB_ID)
    stages = dxpy.describe(job_describe["analysis"])["stages"]
    # key is the name of the output and the value is the link of the file.
    output_map = [s['execution']['output'] for s in stages
                  if s['id'] == 'stage-outputs'][0]
    print(output_map)

    file_links = list(output_map.get('output_file').values())
    config_links = list(output_map.get('output_config_file').values())

    container = dxpy.DXProject(dxpy.PROJECT_CONTEXT_ID)
    container.move(destination='/tests/test_reorg/out_1', objects=file_links)
    container.move(destination='/tests/test_reorg/out_2', objects=config_links)

    return {
        "outputs": [output_map.get('output_file'),
                    output_map.get('output_config_file')]
    }
def job_2_app(job_id):
    """Return the id of the app (or applet) that ran *job_id*."""
    try:
        executable_id = dxpy.describe(job_id)['app']
    except KeyError:
        # Jobs launched from applets carry 'applet' instead of 'app'.
        executable_id = dxpy.describe(job_id)['applet']
    return executable_id.strip()
def copy_files(fids, projectId, folder, overwrite=False): '''Copies array of dx file dicts to project:/folder, returning new array of dx file dicts.''' newFids = [] for fid in fids: fileDict = dxpy.describe(FILES[fid]) # FILES contain dxLinks if fileDict['project'] == projectId: # cannot copy into the same project!!! # so just leave in place and pretend that we did! #proj = dxpy.DXProject(projectId) #proj.move(folder,[fid]) newFids.append( fid ) continue # Check to see if file already exists. alreadyThere = find_file(folder+'/'+fileDict['name'],projectId) if alreadyThere is None or overwrite: # remove what is alreadyThere? #if alreadyThere is not None: # proj = dxpy.DXProject(projectId) # proj.remove_objects([alreadyThere]) dxFile = dxpy.get_handler(FILES[fid]) newLink = dxpy.dxlink(dxFile.clone(projectId, folder)) else: newLink = FILES(alreadyThere) if newLink == None: print "ERROR: Failed in copy of '" + fileDict['project'] + ":" + fileDict['name'] + \ "' to '" + projectId + ":" + folder + "'." sys.exit(1) newDict = dxpy.describe(newLink) FILES[newDict['id']] = newLink newFids.append( newDict['id'] ) return newFids
def connect(self):
    """Run the websocket client until the stream ends for good.

    Each pass (re)creates the WebSocketApp and blocks in run_forever().
    When the drop was caused by a server restart, waits and reconnects;
    any other failure propagates to the caller.
    """
    while True:
        # Reset per-connection state before each (re)connect attempt.
        self.error = False
        self.exception = None
        self.closed_code = None
        self.closed_reason = None
        try:
            self._app = WebSocketApp(
                self.url,
                on_open=self.opened,
                on_close=self.closed,
                on_error=self.errored,
                on_message=self.received_message
            )
            # NOTE: cert_reqs=CERT_NONE disables TLS certificate checks here.
            self._app.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})
        except:
            # Bare except is deliberate: swallow the error only when it was
            # caused by a server restart; otherwise re-raise.
            if not self.server_restarted():
                raise
        finally:
            self._app = None
        if self.server_restarted():
            # Instead of trying to reconnect in a retry loop with backoff, run an
            # API call that will do the same and block while it retries.
            logger.warn("Server restart, reconnecting...")
            time.sleep(1)
            dxpy.describe(self.job_id)
        else:
            break
def copy_files(fids, projectId, folder, overwrite=False): '''Copies array of dx file dicts to project:/folder, returning new array of dx file dicts.''' newFids = [] for fid in fids: fileDict = dxpy.describe(FILES[fid]) # FILES contain dxLinks if fileDict['project'] == projectId: # cannot copy into the same project!!! # so just leave in place and pretend that we did! #proj = dxpy.DXProject(projectId) #proj.move(folder,[fid]) newFids.append(fid) continue # Check to see if file already exists. alreadyThere = find_file(folder + '/' + fileDict['name'], projectId) if alreadyThere is None or overwrite: # remove what is alreadyThere? #if alreadyThere is not None: # proj = dxpy.DXProject(projectId) # proj.remove_objects([alreadyThere]) dxFile = dxpy.get_handler(FILES[fid]) newLink = dxpy.dxlink(dxFile.clone(projectId, folder)) else: newLink = FILES(alreadyThere) if newLink == None: print "ERROR: Failed in copy of '" + fileDict['project'] + ":" + fileDict['name'] + \ "' to '" + projectId + ":" + folder + "'." sys.exit(1) newDict = dxpy.describe(newLink) FILES[newDict['id']] = newLink newFids.append(newDict['id']) return newFids
def closed(self, code, reason):
    """Websocket on_close handler: raise on abnormal closure, otherwise
    print a final summary of every seen job (when print_job_info is set)
    and exit non-zero if this job failed or was terminated.
    """
    self.closed_code, self.closed_reason = code, reason
    # 1000 is the websocket normal-closure status code (either side).
    if not (self.closed_code == 1000 or getattr(self.stream.closing, 'code', None) == 1000):
        try:
            # The close reason may carry a JSON payload {type, message}.
            error = json.loads(self.closed_reason)
            raise DXJobLogStreamingException(
                "Error while streaming job logs: {type}: {message}\n".
                format(**error))
        except (KeyError, ValueError):
            # Not JSON (or missing keys): fall back to raw code/reason.
            error = "Error while streaming job logs: {code} {reason}\n".format(
                code=self.closed_code, reason=self.closed_reason)
            raise DXJobLogStreamingException(error)
    elif self.print_job_info:
        if self.job_id not in self.seen_jobs:
            self.seen_jobs[self.job_id] = {}
        for job_id in self.seen_jobs.keys():
            # Re-describe each job to pick up its final state and outputs.
            self.seen_jobs[job_id] = dxpy.describe(job_id)
            print(
                get_find_executions_string(self.seen_jobs[job_id],
                                           has_children=False,
                                           show_outputs=True))
    else:
        self.seen_jobs[self.job_id] = dxpy.describe(self.job_id)
    if self.seen_jobs[self.job_id].get('state') in ['failed', 'terminated']:
        err_exit(code=3)
def reconnect(self):
    """Re-establish the websocket connection after a server restart.

    Instead of retrying in a loop with backoff here, issue an API describe
    call first: dxpy's own retry logic blocks until the server is reachable
    again, after which the socket is rebuilt and connected.
    """
    time.sleep(1)
    dxpy.describe(self.job_id)
    # Re-run the base-class initializer to reset socket state, then connect.
    WebSocketBaseClient.__init__(self, self.url, protocols=None, extensions=None)
    self.connect()
def app_2_version(app_id):
    """Return the executable's version string, or its creation time when no
    'version' field exists (applets); passes the sentinel through."""
    if app_id == 'job_not_found':
        return 'job_not_found'
    try:
        return dxpy.describe(app_id)['version']
    except KeyError:
        # 'created' is milliseconds since the epoch.
        created_secs = dxpy.describe(app_id)['created'] / 1000.0
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(created_secs))
def app_2_version(app_id):
    """Return the executable's version string, or its creation time when no
    'version' field exists (applets)."""
    try:
        version = dxpy.describe(app_id)['version']
    except KeyError:
        # BUG FIX: 'created' is in milliseconds since the epoch; divide by
        # 1000 before passing to time.localtime.  (Matches the sibling
        # app_2_version/start_time helpers elsewhere in this file.)
        version = time.strftime(
            '%Y-%m-%d %H:%M:%S',
            time.localtime(dxpy.describe(app_id)['created'] / 1000.0))
    return version
def process(reads_file, reference_tar, bwa_aln_params, bwa_version): # reads_file, reference_tar should be links to file objects. # reference_tar should be a tar of files generated by bwa index and # the tar should be uncompressed to avoid repeating the decompression. print "In process" if bwa_version == "0.7.7": bwa = "bwa0.7.7" elif bwa_version == "0.7.10": bwa = "bwa0.7.10" else: bwa = "bwa0.7.7" print "Using bwa version %s" %(bwa_version) # Generate filename strings and download the files to the local filesystem reads_filename = dxpy.describe(reads_file)['name'] reads_basename = reads_filename # the order of this list is important. It strips from the right inward, so # the expected right-most extensions should appear first (like .gz) for extension in ['.gz', '.fq', '.fastq', '.fa', '.fasta']: reads_basename = reads_basename.rstrip(extension) reads_file = dxpy.download_dxfile(reads_file,reads_filename) reference_tar_filename = dxpy.describe(reference_tar)['name'] reference_tar_file = dxpy.download_dxfile(reference_tar,reference_tar_filename) # extract the reference files from the tar if reference_tar_filename.endswith('.gz') or reference_tar_filename.endswith('.tgz'): tar_command = 'tar -xzv --no-same-owner --no-same-permissions -f %s' %(reference_tar_filename) else: tar_command = 'tar -xv --no-same-owner --no-same-permissions -f %s' %(reference_tar_filename) print "Unpacking %s" %(reference_tar_filename) print tar_command print subprocess.check_output(shlex.split(tar_command)) reference_filename = resolve_reference() print "Using reference file: %s" %(reference_filename) print subprocess.check_output('ls -l', shell=True) #generate the suffix array index file sai_filename = '%s.sai' %(reads_basename) with open(sai_filename,'w') as sai_file: # Build the bwa command and call bwa bwa_command = "%s aln %s -t %d %s %s" \ %(bwa, bwa_aln_params, cpu_count(), reference_filename, reads_filename) print bwa_command subprocess.check_call(shlex.split(bwa_command), 
stdout=sai_file) print subprocess.check_output('ls -l', shell=True) # Upload the output to the DNAnexus project print "Uploading %s" %(sai_filename) sai_dxfile = dxpy.upload_local_file(sai_filename) process_output = { "output": dxpy.dxlink(sai_dxfile) } print "Returning from process:" print process_output return process_output
def job_2_app(job_id):
    """Map a job ID to the app (or applet) that ran it; the
    'job_not_found' sentinel passes through unchanged."""
    if job_id == 'job_not_found':
        return 'job_not_found'
    try:
        executable_id = dxpy.describe(job_id)['app']
    except KeyError:
        # Applet-based jobs carry 'applet' instead of 'app'.
        executable_id = dxpy.describe(job_id)['applet']
    return executable_id.strip()
def _check_suggestions(app_json, publish=False):
    """
    Examines the specified dxapp.json file and warns about any violations of suggestions guidelines.

    :raises: AppBuilderException for data objects that could not be found
    """
    for input_field in app_json.get('inputSpec', []):
        for suggestion in input_field.get('suggestions', []):
            if 'project' in suggestion:
                try:
                    project = dxpy.api.project_describe(suggestion['project'], {"permissions": True})
                    # Warn when publishing with a non-public suggested project.
                    if 'PUBLIC' not in project['permissions'] and publish:
                        logger.warn('Project {name} NOT PUBLIC!'.format(name=project['name']))
                except dxpy.exceptions.DXAPIError as e:
                    # NOTE(review): non-404 API errors are silently ignored
                    # here — confirm that is intended.
                    if e.code == 404:
                        logger.warn('Suggested project {name} does not exist, or not accessible by user'.format(
                            name=suggestion['project']))
            if 'path' in suggestion:
                try:
                    check_folder_exists(suggestion['project'], suggestion['path'], '')
                except ResolutionError as e:
                    logger.warn('Folder {path} could not be found in project {project}'.format(
                        path=suggestion['path'], project=suggestion['project']))
            if '$dnanexus_link' in suggestion:
                # Bare data-object id directly in the suggestion.
                if suggestion['$dnanexus_link'].startswith(('file-', 'record-', 'gtable-')):
                    try:
                        dnanexus_link = dxpy.describe(suggestion['$dnanexus_link'])
                    except dxpy.exceptions.DXAPIError as e:
                        if e.code == 404:
                            raise dxpy.app_builder.AppBuilderException(
                                'Suggested object {name} could not be found'.format(
                                    name=suggestion['$dnanexus_link']))
                    except Exception as e:
                        raise dxpy.app_builder.AppBuilderException(str(e))
            if 'value' in suggestion:
                if '$dnanexus_link' in suggestion['value']:
                    # Check if we have JSON or string
                    if isinstance(suggestion['value']['$dnanexus_link'], dict):
                        if 'project' in suggestion['value']['$dnanexus_link']:
                            try:
                                dxpy.api.project_describe(suggestion['value']['$dnanexus_link']['project'])
                            except dxpy.exceptions.DXAPIError as e:
                                if e.code == 404:
                                    logger.warn('Suggested project {name} does not exist, or not accessible by user'.format(
                                        name=suggestion['value']['$dnanexus_link']['project']))
                    elif isinstance(suggestion['value']['$dnanexus_link'], basestring):
                        if suggestion['value']['$dnanexus_link'].startswith(('file-', 'record-', 'gtable-')):
                            try:
                                dnanexus_link = dxpy.describe(suggestion['value']['$dnanexus_link'])
                            except dxpy.exceptions.DXAPIError as e:
                                if e.code == 404:
                                    raise dxpy.app_builder.AppBuilderException(
                                        'Suggested object {name} could not be found'.format(
                                            name=suggestion['value']['$dnanexus_link']))
                            except Exception as e:
                                raise dxpy.app_builder.AppBuilderException(str(e))
def get_mapping_analysis(bam):
    """Describe the DNAnexus analysis that produced *bam* (an ENCODE file
    object whose step_run details carry the dx job alias)."""
    try:
        job_alias = next(d['dx_job_id']
                         for d in bam['step_run']['dx_applet_details'])
    except:
        # Log which bam was malformed, then re-raise unchanged.
        logging.error('Failed to find step_run.dx_applet_details in bam %s' % (bam.get('accession')))
        raise
    job_id = re.findall('job-\w*', job_alias)[0]
    parent_analysis = dxpy.describe(job_id)['parentAnalysis']
    return dxpy.describe(parent_analysis)
def get_mapping_analysis(bam):
    """Return the describe dict of the analysis that mapped *bam*."""
    try:
        applet_details = bam['step_run']['dx_applet_details']
        job_alias = next(entry['dx_job_id'] for entry in applet_details)
    except:
        logging.error(
            'Failed to find step_run.dx_applet_details in bam %s' %
            (bam.get('accession')))
        raise
    # Extract the job-xxxx id embedded in the alias string.
    job_id = re.findall('job-\w*', job_alias)[0]
    return dxpy.describe(dxpy.describe(job_id)['parentAnalysis'])
def BuildPindelCommand(kwargs, chrom, input_fn, is_pindel_input_type=False): # Download Reference FASTA reference_fasta_id = kwargs["reference_fasta"] ref_fn = DownloadRefFasta(reference_fasta_id) create_index = True if "fasta_index" in kwargs: fasta_idx_id = kwargs["fasta_index"] if dxpy.describe(fasta_idx_id)["name"].rstrip(".fa.fai") == dxpy.describe(reference_fasta_id)["name"].rstrip(".fa"): dxpy.download_dxfile(fasta_idx_id, ref_fn+".fai") create_index = False if create_index: print "No FASTA index was provided as input. Making one now." samtools_command = "samtools faidx {fasta}".format(fasta=ref_fn) subprocess.check_call(samtools_command, shell=True) print "\nBuilding pindel command from app inputs" command_args = ["pindel"] output_path = "output/" + kwargs["output_prefix"] command_args.append("-o {output_path}".format(output_path=output_path)) command_args.append("-f {fa}".format(fa=ref_fn)) if is_pindel_input_type: command_args.append("-P {pindel_config}".format(pindel_config=input_fn)) else: command_args.append("-i {bam_config}".format(bam_config=input_fn)) command_args.append("-T {option}".format(option=kwargs["num_threads_per_instance"])) command_args.append("-c {chrom}".format(chrom=chrom)) if kwargs["report_only_close_mapped_reads"]: command_args.append("-S {option}".format(option=kwargs["report_only_close_mapped_reads"])) else: command_args.append("-r {option}".format(option=kwargs["report_inversions"])) command_args.append("-t {option}".format(option=kwargs["report_duplications"])) command_args.append("-l {option}".format(option=kwargs["report_long_insertions"])) command_args.append("-k {option}".format(option=kwargs["report_breakpoints"])) command_args.append("-s {option}".format(option=kwargs["report_close_mapped_reads"])) if "breakdancer_calls_file" in kwargs: breakdancer_fn = DownloadFilesFromArray([kwargs["breakdancer_calls_file"]["$dnanexus_link"]])[0] print breakdancer_fn command_args.append("-b {option}".format(option=breakdancer_fn)) if 
"pindel_command_line" in kwargs: advanced_command = kwargs["pindel_command_line"] if advanced_command.startswith("pindel"): advanced_command = advanced_command.replace("pindel", "") command_args.append(advanced_command) command = " ".join(command_args) print command return command, output_path
def process(reads_file, reference_tar, bwa_aln_params, bwa_version): # reads_file, reference_tar should be links to file objects. # reference_tar should be a tar of files generated by bwa index and # the tar should be uncompressed to avoid repeating the decompression. print "In process" if bwa_version == "0.7.7": bwa = "bwa0.7.7" elif bwa_version == "0.7.10": bwa = "bwa0.7.10" else: bwa = "bwa0.7.7" print "Using bwa version %s" %(bwa_version) # Generate filename strings and download the files to the local filesystem reads_filename = dxpy.describe(reads_file)['name'] reads_basename = reads_filename.rstrip('.gz').rstrip('.fq').rstrip('.fastq') reads_file = dxpy.download_dxfile(reads_file,reads_filename) reference_tar_filename = dxpy.describe(reference_tar)['name'] reference_tar_file = dxpy.download_dxfile(reference_tar,reference_tar_filename) # extract the reference files from the tar if reference_tar_filename.endswith('.gz'): tar_command = 'tar -xzvf %s' %(reference_tar_filename) else: tar_command = 'tar -xvf %s' %(reference_tar_filename) print "Unpacking %s" %(reference_tar_filename) print subprocess.check_output(shlex.split(tar_command)) # assume the reference file is the only .fa file reference_filename = subprocess.check_output('ls *.fna', shell=True).rstrip() print subprocess.check_output('ls -l', shell=True) #generate the suffix array index file sai_filename = '%s.sai' %(reads_basename) with open(sai_filename,'w') as sai_file: # Build the bwa command and call bwa bwa_command = "%s aln %s -t %d %s %s" \ %(bwa, bwa_aln_params, cpu_count(), reference_filename, reads_filename) print bwa_command subprocess.check_call(shlex.split(bwa_command), stdout=sai_file) print subprocess.check_output('ls -l', shell=True) # Upload the output to the DNAnexus project print "Uploading %s" %(sai_filename) sai_dxfile = dxpy.upload_local_file(sai_filename) process_output = { "output": dxpy.dxlink(sai_dxfile) } print "Returning from process:" print process_output return process_output
def main(reorg_conf___=None, reorg_status___=None):  # pylint: disable=unused-argument
    """Reorganize the outputs of the current analysis into per-output folders.

    For every output of the 'stage-outputs' stage, computes a destination
    folder (the analysis folder, plus the producing job's requested
    'default_location' when available) and either moves the file (same
    project) or clones it into the current project.
    """
    # find the output stage of the current analysis
    analysis_id = dxpy.describe(dxpy.JOB_ID)["analysis"]
    stages = dxpy.describe(analysis_id)["stages"]
    # retrieve the dictionary containing outputs
    output_map = [
        x["execution"]["output"] for x in stages if x["id"] == "stage-outputs"
    ][0]
    folder_location = dxpy.describe(analysis_id)["folder"]
    project_container = dxpy.DXProject(dxpy.PROJECT_CONTEXT_ID)
    # move required outputfiles to their preferred permanent folders
    for file_identifiers in output_map.values():
        if isinstance(file_identifiers, (list, tuple)):
            # Array output: relocate each file link individually.
            for indvfile in file_identifiers:
                try:
                    # The producing job may declare a 'default_location' run
                    # input naming its preferred subfolder.
                    default_location = dxpy.describe(
                        dxpy.describe(indvfile["$dnanexus_link"])["createdBy"]
                        ["job"])["runInput"]["default_location"]
                    folder = folder_location + "/" + default_location
                except:
                    # Bare except: any lookup failure falls back to the
                    # analysis folder itself.
                    folder = folder_location
                project_container.new_folder(folder, parents=True)
                file_container = dxpy.describe(
                    indvfile["$dnanexus_link"])["project"]
                file_object = dxpy.bindings.DXFile(indvfile["$dnanexus_link"],
                                                   project=file_container)
                if file_container == dxpy.PROJECT_CONTEXT_ID:
                    # Same project: a move is enough.
                    file_object.move(folder)
                else:
                    # Different project: clone into this one.
                    cloned_file = file_object.clone(  # pylint: disable=unused-variable
                        dxpy.PROJECT_CONTEXT_ID, folder=folder)
        elif isinstance(file_identifiers, dict):
            # Single-file output: same handling for the one link.
            if '$dnanexus_link' in file_identifiers:
                try:
                    default_location = dxpy.describe(
                        dxpy.describe(file_identifiers["$dnanexus_link"])
                        ["createdBy"]["job"])["runInput"]["default_location"]
                    folder = folder_location + "/" + default_location
                except:
                    folder = folder_location
                project_container.new_folder(folder, parents=True)
                file_container = dxpy.describe(
                    file_identifiers["$dnanexus_link"])["project"]
                file_object = dxpy.bindings.DXFile(
                    file_identifiers["$dnanexus_link"], project=file_container)
                if file_container == dxpy.PROJECT_CONTEXT_ID:
                    file_object.move(folder)
                else:
                    cloned_file = file_object.clone(dxpy.PROJECT_CONTEXT_ID,
                                                    folder=folder)
def test_dx_project_tagging(self): the_tags = [u"$my.tag", u"secoиdtag", u"тhird тagggg"] # tag run(u"dx tag : \\" + the_tags[0] + u" " + the_tags[1] + u" '" + the_tags[2] + u"'") mytags = dxpy.describe(self.project)['tags'] for tag in the_tags: self.assertIn(tag, mytags) # untag run(u"dx untag : \\" + the_tags[0] + u" '" + the_tags[2] + u"'") mytags = dxpy.describe(self.project)['tags'] self.assertIn(the_tags[1], mytags) for tag in [the_tags[0], the_tags[2]]: self.assertNotIn(tag, mytags)
def scatter(orig_reads, split_size):
    """Split each gzipped FASTQ in *orig_reads* into chunks of *split_size*
    million reads and return dxlinks to the uploaded pieces."""
    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    lines_per_split = split_size * 1000000 * 4  # each FQ read is 4 lines
    os.mkdir('splits')
    for reads_dlink in orig_reads:
        reads_filename = dxpy.describe(reads_dlink)['name']
        reads_basename = strip_extensions(reads_filename, STRIP_EXTENSIONS)
        dxpy.download_dxfile(dxpy.DXFile(reads_dlink).get_id(), reads_filename)
        reads_root_name = simplify_name() or reads_basename
        pipeline = '/bin/zcat %s | /usr/bin/split -l %d -d - %s ' \
            % (reads_filename, lines_per_split, 'splits/' + reads_root_name)
        logger.info('* RUNNING %s' % pipeline)
        # shell=True is required here: the command contains a pipe.
        split_out = subprocess.check_output(pipeline, shell=True)
        logger.info(split_out)

    splits = os.listdir('splits')
    logger.info("* Return from scatter: %s *" % splits)
    # SHould we gzip here?
    return {
        "array_of_scattered_input": [
            dxpy.dxlink(dxpy.upload_local_file('splits/' + split_file))
            for split_file in splits]
    }
def ExportVCF(kwargs, output_path, ref_fn): ref_name_version = dxpy.describe(kwargs["reference_fasta"])["name"] ref_name_version = ref_name_version.rstrip(".fa") vcf_out_fn = kwargs["output_prefix"] + '.pindel.vcf' command_args = ["pindel2vcf"] command_args.append("-r {input}".format(input=ref_fn)) command_args.append("-P {input}".format(input=output_path)) command_args.append("-v {input}".format(input=vcf_out_fn)) if kwargs["vcf_gatk_compatible"]: command_args.append("-G") if "export_vcf_advanced_options" in kwargs: command_args.append(kwargs["export_vcf_advanced_options"]) else: ref_date = str(datetime.date.today()) command_args.append("-R {input}".format(input=ref_name_version)) command_args.append("-d ''") try: vcf_command = " ".join(command_args) print "Executing: " + vcf_command print subprocess.check_output(vcf_command, stderr=subprocess.STDOUT, shell=True) except subprocess.CalledProcessError, e: print e print e.output raise dxpy.AppError( "APP ERROR: App was not able to convert pindel to vcf. Please check pindel2vcf inputs" )
def start_time(job_id):
    """Return the job's startedRunning time as 'YYYY-MM-DD HH:MM:SS';
    the 'job_not_found' sentinel passes through unchanged."""
    if job_id == 'job_not_found':
        return 'job_not_found'
    # 'startedRunning' is milliseconds since the epoch.
    started_ms = dxpy.describe(job_id)['startedRunning']
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(started_ms / 1000.0))
def merge_map_reports(map_report_set, target_root): '''Merges techrep map_reports.''' # Working on map_reports now all_reports="" biorep_map_report = target_root + '_map_report.txt' append_line("### Combined Bismark map report for several technical replicates ###\n",biorep_map_report) for techrep_map_report_dlink in map_report_set: file_desc = dxpy.describe(techrep_map_report_dlink) file_root = file_desc['name'] file_root = file_root.replace('_techrep_bismark_map_report.txt','') file_root = file_root.replace('_bismark_map_report.txt','') file_root = file_root.replace('_map_report.txt','') techrep_map_report = file_root + '_techrep_map_report.txt' append_line("###################################",biorep_map_report) append_line("### Map report for ${file_root} ###",biorep_map_report) print "* Downloading %s_techrep_bismark_map_report.txt file..." % file_root dxpy.download_dxfile(techrep_map_report_dlink, techrep_map_report) run_cmd('cat ' + techrep_map_report, out=biorep_map_report,append=True) if len(all_reports) == 0: all_reports = techrep_map_report else: all_reports += ',' + techrep_map_report if all_reports == techrep_map_report: # only one run_cmd('mv %s %s' % (techrep_map_report,biorep_map_report) ) all_reports = biorep_map_report return (biorep_map_report,all_reports)
def merge_map_reports(map_report_set, target_root): """Merges techrep map_reports.""" # Working on map_reports now all_reports = "" biorep_map_report = target_root + "_map_report.txt" append_line("### Combined Bismark map report for several technical replicates ###\n", biorep_map_report) for techrep_map_report_dlink in map_report_set: file_desc = dxpy.describe(techrep_map_report_dlink) file_root = file_desc["name"] file_root = file_root.replace("_techrep_bismark_map_report.txt", "") file_root = file_root.replace("_bismark_map_report.txt", "") file_root = file_root.replace("_map_report.txt", "") techrep_map_report = file_root + "_techrep_map_report.txt" append_line("###################################", biorep_map_report) append_line("### Map report for ${file_root} ###", biorep_map_report) print "* Downloading %s_techrep_bismark_map_report.txt file..." % file_root dxpy.download_dxfile(techrep_map_report_dlink, techrep_map_report) run_cmd("cat " + techrep_map_report, out=biorep_map_report, append=True) if len(all_reports) == 0: all_reports = techrep_map_report else: all_reports += "," + techrep_map_report if all_reports == techrep_map_report: # only one run_cmd("mv %s %s" % (techrep_map_report, biorep_map_report)) all_reports = biorep_map_report return (biorep_map_report, all_reports)
def test_set_assetbundle_tarball_property(self):
    """Building an asset should stamp the record id onto the tarball's
    'AssetBundle' property."""
    asset_spec = {
        "name": "tarball_property_assetbundle",
        "title": "A human readable name",
        "description": "A detailed description about the asset",
        "version": "0.0.1",
        "distribution": "Ubuntu",
        "release": "12.04"
    }
    asset_dir = self.write_asset_directory("set_tarball_property", json.dumps(asset_spec))

    record_id = json.loads(run('dx build_asset --json ' + asset_dir))['id']
    self.assertIn('record', record_id)
    details = dxpy.describe(record_id, fields={"details"})["details"]
    tarball_id = details["archiveFileId"]["$dnanexus_link"]
    props = dxpy.describe(tarball_id, fields={"properties"})["properties"]
    self.assertEqual(props["AssetBundle"], record_id)
def main(): cmnd = get_args() ## resolve projects project = dxencode.resolve_project(PROJECT_NAME) print 'Project: ' + project.describe()['name'] pid = project.get_id() counts = {} n = 0 summaries = dxpy.find_data_objects(classname='file', folder='/runs', name='*_summary.txt', recurse=True, name_mode='glob', project=pid, return_handler=False) while summaries: try: flink = dxpy.dxlink(summaries.next()) n = n+1 except StopIteration: break fd = dxpy.describe(flink) fn = "fastqc/%s" % fd['name'] if not os.path.isfile(fn): print 'Downloading: %s from %s' % (fn, fd['folder']) try: dxpy.download_dxfile(flink, fn) except Exception, e: print "Error %s" % e parse_summary(fn, counts)
def received_message(self, message):
    """Handle one websocket log message: describe newly-seen jobs, stop on
    the END_LOG sentinel, otherwise dispatch or print the message."""
    message_dict = json.loads(message)

    if (self.print_job_info and 'job' in message_dict
            and message_dict['job'] not in self.seen_jobs):
        new_job_id = message_dict['job']
        self.seen_jobs[new_job_id] = dxpy.describe(new_job_id)
        print(
            get_find_executions_string(
                self.seen_jobs[new_job_id],
                has_children=False,
                show_outputs=False
            )
        )

    stream_ended = (message_dict.get('source') == 'SYSTEM'
                    and message_dict.get('msg') == 'END_LOG')
    if stream_ended:
        self._app.keep_running = False
    elif self.msg_callback:
        self.msg_callback(message_dict)
    else:
        print(self.msg_output_format.format(**message_dict))
def process(fastq): # Change the following to process whatever input this stage # receives. You may also want to copy and paste the logic to download # and upload files here as well if this stage receives file input # and/or makes file output. print fastq reads_filename = dxpy.describe(fastq)["name"] reads_basename = reads_filename.rstrip(".gz").rstrip(".fq").rstrip(".fastq") reads_file = dxpy.download_dxfile(fastq, "fastq.gz") subprocess.check_call(["mkdir", "output"]) print "Run QC" fqc_command = "/usr/bin/FastQC/fastqc fastq.gz -o output" print fqc_command stdio = subprocess.check_output(shlex.split(fqc_command)) print stdio print subprocess.check_output(["ls", "-l", "output"]) subprocess.check_call(["unzip", "output/fastq_fastqc.zip"]) print "Upload results" subprocess.check_call(["mv", "fastq_fastqc/fastqc_data.txt", "%s_data.txt" % reads_basename]) subprocess.check_call(["mv", "fastq_fastqc/summary.txt", "%s_summary.txt" % reads_basename]) subprocess.check_call(["mv", "output/fastq_fastqc.zip", "%s_fastqc.zip" % reads_basename]) report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename) summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename) zip_dxfile = dxpy.upload_local_file("%s_fastqc.zip" % reads_basename) print report_dxfile return {"report": report_dxfile, "summary": summary_dxfile, "zip": zip_dxfile}
def main(): cmnd = get_args() ## resolve projects project = dxencode.resolve_project(PROJECT_NAME) print 'Project: ' + project.describe()['name'] pid = project.get_id() counts = {} n = 0 summaries = dxpy.find_data_objects(classname='file', folder='/runs', name='*_summary.txt', recurse=True, name_mode='glob', project=pid, return_handler=False) while summaries: try: flink = dxpy.dxlink(summaries.next()) n = n + 1 except StopIteration: break fd = dxpy.describe(flink) fn = "fastqc/%s" % fd['name'] if not os.path.isfile(fn): print 'Downloading: %s from %s' % (fn, fd['folder']) try: dxpy.download_dxfile(flink, fn) except Exception, e: print "Error %s" % e parse_summary(fn, counts)
def flagstat_parse(dxlink):
    """Parse a samtools flagstat file (given as a dxlink) into a dict of
    {metric_name: [hiq_count, lowq_count]}; returns None when the file
    handle is falsy.
    """
    desc = dxpy.describe(dxlink)
    with dxpy.DXFile(desc['id'], mode='r') as flagstat_file:
        if not flagstat_file:
            return None
        qc_dict = {
            # values are regular expressions, will be replaced with scores [hiq, lowq]
            'in_total': 'in total',
            'duplicates': 'duplicates',
            'mapped': 'mapped',
            'paired_in_sequencing': 'paired in sequencing',
            'read1': 'read1',
            'read2': 'read2',
            'properly_paired': 'properly paired',
            'with_self_mate_mapped': 'with itself and mate mapped',
            'singletons': 'singletons',
            # i.e. at the end of the line
            'mate_mapped_different_chr': 'with mate mapped to a different chr$',
            # RE so must escape
            'mate_mapped_different_chr_hiQ': 'with mate mapped to a different chr \(mapQ>=5\)'
        }
        flagstat_lines = flagstat_file.read().splitlines()
        for (qc_key, qc_pattern) in qc_dict.items():
            # Take the first line matching the pattern; the part before the
            # label has the form "<hiq> + <lowq> <label>".
            # NOTE(review): next() raises StopIteration if the pattern is
            # absent from the file — confirm inputs always contain all rows.
            qc_metrics = next(re.split(qc_pattern, line)
                              for line in flagstat_lines
                              if re.search(qc_pattern, line))
            (hiq, lowq) = qc_metrics[0].split(' + ')
            qc_dict[qc_key] = [int(hiq.rstrip()), int(lowq.rstrip())]
    return qc_dict
def main(): args = get_args() first_analysis = True for (i, analysis_id) in enumerate(args.infile): analysis_id = analysis_id.strip() try: analysis = dxpy.describe(analysis_id) except: print "Invalid analysis ID %s. Skipping." % (analysis_id) continue experiment_m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', analysis['name']) experiment_accession = experiment_m.group(1) if not experiment_m: print "No accession in %s, skipping." % (analysis['name']) continue if args.pipeline: pipeline = args.pipeline elif analysis['executableName'] == 'histone_chip_seq': pipeline = 'histone' elif analysis['executableName'] == 'tf_chip_seq': pipeline = 'tf' if pipeline == 'histone': histone(args, analysis, experiment_accession, first_analysis) elif pipeline == 'tf': tf(args, analysis, experiment_accession, first_analysis) else: print "Unrecognized pipeline: %s, skipping." % (pipeline) continue first_analysis = False
def main():
    """Drive the histone/tf reporters over analysis IDs read from args.infile.

    The accession is parsed with a '^(ENCSR...) Peaks' regex; which field it
    is parsed from depends on the pipeline (see note below).
    """
    args = get_args()
    first_analysis = True
    for (i, analysis_id) in enumerate(args.infile):
        analysis_id = analysis_id.strip()
        try:
            analysis = dxpy.describe(analysis_id)
        except:
            # Bare except: any describe failure skips this ID.
            print "Invalid analysis ID %s. Skipping." %(analysis_id)
            continue
        if args.pipeline:
            if args.pipeline == 'histone':
                # NOTE(review): histone matches against executableName while
                # tf matches against name — confirm this asymmetry is intended.
                histone_m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks',analysis['executableName'])
                tf_m = None
            elif args.pipeline == 'tf':
                tf_m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks',analysis['name'])
                histone_m = None
        else:
            # No explicit pipeline: try both matchers; whichever hits decides.
            histone_m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks',analysis['executableName'])
            tf_m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks',analysis['name'])
        if histone_m:
            experiment_accession = histone_m.group(1)
            histone(args, analysis, experiment_accession, first_analysis)
        elif tf_m:
            experiment_accession = tf_m.group(1)
            tf(args, analysis, experiment_accession, first_analysis)
        else:
            print "No accession in %s, skipping." %(analysis['executableName'])
            continue
        first_analysis = False
def received_message(self, message):
    """Handle one websocket log message, dropping lines already seen.

    After a reconnect the server replays earlier log lines; the highest
    line number seen per (job, level) is tracked so replays are skipped
    (and the number of skips reported once new lines resume).
    """
    message = json.loads(message.__unicode__())
    if "job" in message and "level" in message and "line" in message:
        # NOTE(review): last_seen_log_lines appears to be a mapping of
        # job -> {level: line}; presumably a defaultdict — confirm.
        last_line = self.last_seen_log_lines[message["job"]].get(
            message["level"], 0)
        if last_line < message["line"]:
            # New line: advance the high-water mark.
            self.last_seen_log_lines[message["job"]][
                message["level"]] = message["line"]
            if self.skipped_messages > 0:
                logger.warn("Skipped {} seen messages".format(
                    self.skipped_messages))
                self.skipped_messages = 0
        else:
            # Replayed line: count it and drop it.
            self.skipped_messages += 1
            return
    if self.print_job_info and 'job' in message and message[
            'job'] not in self.seen_jobs:
        self.seen_jobs[message['job']] = dxpy.describe(message['job'])
        print(
            get_find_executions_string(self.seen_jobs[message['job']],
                                       has_children=False,
                                       show_outputs=False))
    if message.get('source') == 'SYSTEM' and message.get(
            'msg') == 'END_LOG':
        # End of stream: close the socket.
        self.close()
    elif self.msg_callback:
        self.msg_callback(message)
    else:
        print(self.msg_output_format.format(**message))
def test_build_asset_with_valid_destination(self):
    """`dx build_asset --destination proj:/dir/` should place the asset
    record in that project and folder."""
    asset_spec = {
        "name": "foo",
        "title": "A human readable name",
        "description": "A detailed description about the asset",
        "version": "0.0.1",
        "distribution": "Ubuntu",
        "release": "14.04",
        "execDepends": [{
            "name": "python-numpy"
        }]
    }
    asset_dir = self.write_asset_directory("asset_with_valid_destination", json.dumps(asset_spec))

    with testutil.temporary_project() as other_project:
        test_dirname = 'asset_dir'
        run('dx mkdir -p {project}:{dirname}'.format(
            project=other_project.get_id(), dirname=test_dirname))
        destination = other_project.get_id() + ':/' + test_dirname + '/ '
        bundle_id = json.loads(
            run('dx build_asset --json --destination ' + destination + asset_dir))['id']
        self.assertIn('record', bundle_id)
        bundle_desc = dxpy.describe(bundle_id)
        self.assertEqual(bundle_desc['project'], other_project.get_id())
        self.assertEqual(bundle_desc['folder'], '/asset_dir')
def get_notebook_app_versions():
    """
    Get the valid version numbers of the notebook app.
    """
    notebook_apps = dxpy.find_apps(name=NOTEBOOK_APP, all_versions=True)
    return [str(dxpy.describe(app['id'])['version'])
            for app in notebook_apps]
def build(project, folder, version_id, top_dir, path_dict):
    """Build the dxWDL release asset in project:folder (if missing) and stage the fat jar.

    :param project: DXProject handle to build the asset in.
    :param folder: platform folder in which to look for / create the asset.
    :param version_id: release version string, used for jar naming and config.
    :param top_dir: local checkout root where the build is run.
    :param path_dict: paths passed through to the generated configuration file.
    :returns: AssetDesc for the (found or newly built) asset.
    """
    asset = find_asset(project, folder)
    if asset is None:
        # get a copy of the dxfuse executable
        _add_dxfuse_to_resources(top_dir)
        # Create a configuration file
        _gen_config_file(version_id, top_dir, path_dict)
        jar_path = _sbt_assembly(top_dir, version_id)
        # get a copy of the download agent (dxda)
        _download_dxda_into_resources(top_dir)
        make_prerequisits(project, folder, version_id, top_dir)
        asset = find_asset(project, folder)
    # NOTE(review): jar_path is only bound inside the `asset is None` branch
    # above, yet it is used unconditionally below — if the asset already
    # exists, this raises UnboundLocalError. Confirm whether the jar build
    # should happen unconditionally.
    # Move the file to the top level directory
    all_in_one_jar = os.path.join(top_dir, "dxWDL-{}.jar".format(version_id))
    shutil.move(os.path.join(top_dir, jar_path), all_in_one_jar)
    region = dxpy.describe(project.get_id())['region']
    ad = AssetDesc(region, asset.get_id(), project)
    # Hygiene, remove the new configuration file, we
    # don't want it to leak into the next build cycle.
    # os.remove(crnt_conf_path)
    return ad
def merge_map_reports(map_report_set, target_root): '''Merges techrep map_reports.''' # Working on map_reports now all_reports = "" biorep_map_report = target_root + '_map_report.txt' append_line( "### Combined Bismark map report for several technical replicates ###\n", biorep_map_report) for techrep_map_report_dlink in map_report_set: file_desc = dxpy.describe(techrep_map_report_dlink) file_root = file_desc['name'] file_root = file_root.replace('_techrep_bismark_map_report.txt', '') file_root = file_root.replace('_bismark_map_report.txt', '') file_root = file_root.replace('_map_report.txt', '') techrep_map_report = file_root + '_techrep_map_report.txt' append_line("###################################", biorep_map_report) append_line("### Map report for ${file_root} ###", biorep_map_report) print "* Downloading %s_techrep_bismark_map_report.txt file..." % file_root dxpy.download_dxfile(techrep_map_report_dlink, techrep_map_report) run_cmd('cat ' + techrep_map_report, out=biorep_map_report, append=True) if len(all_reports) == 0: all_reports = techrep_map_report else: all_reports += ',' + techrep_map_report if all_reports == techrep_map_report: # only one run_cmd('mv %s %s' % (techrep_map_report, biorep_map_report)) all_reports = biorep_map_report return (biorep_map_report, all_reports)
def ExportVCF(kwargs, output_path, ref_fn): ref_name_version = dxpy.describe(kwargs["reference_fasta"])["name"] ref_name_version = ref_name_version.rstrip(".fa") vcf_out_fn = kwargs["output_prefix"] + '.pindel.vcf' command_args = ["pindel2vcf"] command_args.append("-r {input}".format(input=ref_fn)) command_args.append("-P {input}".format(input=output_path)) command_args.append("-v {input}".format(input=vcf_out_fn)) if kwargs["vcf_gatk_compatible"]: command_args.append("-G") if "export_vcf_advanced_options" in kwargs: command_args.append(kwargs["export_vcf_advanced_options"]) else: ref_date = str(datetime.date.today()) command_args.append("-R {input}".format(input=ref_name_version)) command_args.append("-d ''") try: vcf_command = " ".join(command_args) print "Executing: " + vcf_command print subprocess.check_output(vcf_command, stderr=subprocess.STDOUT, shell=True) except subprocess.CalledProcessError, e: print e print e.output raise dxpy.AppError("APP ERROR: App was not able to convert pindel to vcf. Please check pindel2vcf inputs")
def closed(self, code=None, reason=None):
    """Websocket close handler: classify the close, then report final job state or raise.

    :param code: close code supplied by the server, if any.
    :param reason: close reason supplied by the server, if any.
    :raises DXJobLogStreamingException: when the close was not normal (code != 1000).
    """
    # Determine a close code/reason, preferring whatever the server sent.
    if code:
        self.closed_code = code
        self.closed_reason = reason
    elif not self.error:
        self.closed_code = 1000  # 1000 == normal websocket closure
        self.closed_reason = "Normal"
    elif self.exception and type(self.exception) in {KeyboardInterrupt, SystemExit}:
        # User-initiated interrupt counts as a clean close.
        self.closed_code = 1000
        self.closed_reason = "Connection terminated by client"
    else:
        self.closed_code = 1006  # 1006 == abnormal closure
        self.closed_reason = str(self.exception) if self.exception else "Abnormal"
    if self.closed_code != 1000:
        # The reason may itself be a JSON error document with {type, message};
        # surface its fields if so, otherwise fall back to raw code/reason.
        try:
            error = json.loads(self.closed_reason)
            raise DXJobLogStreamingException(
                "Error while streaming job logs: {type}: {message}\n".format(
                    **error
                )
            )
        except (KeyError, ValueError):
            raise DXJobLogStreamingException(
                "Error while streaming job logs: {code}: {reason}\n".format(
                    code=self.closed_code, reason=self.closed_reason
                )
            )
    elif self.print_job_info:
        # Re-describe every job seen during streaming and print its final state.
        if self.job_id not in self.seen_jobs:
            self.seen_jobs[self.job_id] = {}
        for job_id in self.seen_jobs.keys():
            self.seen_jobs[job_id] = dxpy.describe(job_id)
            print(
                get_find_executions_string(
                    self.seen_jobs[job_id],
                    has_children=False,
                    show_outputs=True
                )
            )
    else:
        self.seen_jobs[self.job_id] = dxpy.describe(self.job_id)
    # Optionally propagate a failure exit code when the watched job did not succeed.
    if (self.exit_on_failed and
            self.seen_jobs[self.job_id].get('state') in {'failed', 'terminated'}):
        err_exit(code=3)
def test_build_asset_with_valid_dxasset(self):
    """Build an asset from a valid spec; verify its project and build-job instance type."""
    asset_spec = {
        "name": "asset_library_name",
        "title": "A human readable name",
        "description": "A detailed description about the asset",
        "version": "0.0.1",
        "distribution": "Ubuntu",
        "release": "12.04",
        "instanceType": "mem1_ssd1_x2",
        "execDepends": [{"name": "python-numpy"}]
    }
    asset_dir = self.write_asset_directory("asset_with_valid_json",
                                           json.dumps(asset_spec))
    build_output = json.loads(run('dx build_asset --json ' + asset_dir))
    bundle_id = build_output['id']
    self.assertIn('record', bundle_id)
    self.assertEqual(dxpy.describe(bundle_id)['project'], self.project)
    # The asset is built by a platform job; it must honor the requested instance type.
    creating_job = dxpy.describe(bundle_id)['createdBy']['job']
    self.assertEqual(dxpy.describe(creating_job)['instanceType'], "mem1_ssd1_x2")
def build(project, folder, version_id, top_dir):
    """Assemble the jar, ensure the asset exists in project:folder, and describe it."""
    sbt_assembly(top_dir)
    existing = find_asset(project, folder)
    if existing is None:
        # No asset yet: build the prerequisites, then look it up again.
        make_prerequisits(project, folder, version_id, top_dir)
        existing = find_asset(project, folder)
    proj_region = dxpy.describe(project.get_id())['region']
    return AssetDesc(proj_region, existing.get_id(), project)
def process(reads_file, reference_tar, bwa_aln_params, bwa_version, debug):
    """Align one reads file with `bwa aln` and upload the suffix-array index (.sai).

    :param reads_file: dxlink to the reads file to align.
    :param reference_tar: dxlink to a tar of `bwa index` output files.
    :param bwa_aln_params: extra flags passed verbatim to `bwa aln`.
    :param bwa_version: key into BWA_PATH selecting the bwa executable.
    :param debug: when true, raise the logger to DEBUG.
    :returns: {"suffix_array_index": dxlink to the uploaded .sai file}.
    """
    # reads_file, reference_tar should be links to file objects.
    # reference_tar should be a tar of files generated by bwa index and
    # the tar should be uncompressed to avoid repeating the decompression.
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    bwa = BWA_PATH.get(bwa_version)
    assert bwa, "BWA version %s is not supported" % (bwa_version)
    logger.info("In process with bwa %s" % (bwa))
    # Generate filename strings and download the files to the local filesystem
    reads_filename = dxpy.describe(reads_file)['name']
    reads_file = dxpy.download_dxfile(reads_file, reads_filename)
    reads_basename = strip_extensions(reads_filename, STRIP_EXTENSIONS)
    reference_tar_filename = dxpy.describe(reference_tar)['name']
    dxpy.download_dxfile(reference_tar, reference_tar_filename)
    # Unpack the reference tar and locate the reference inside it.
    reference_dirname = 'reference_files'
    reference_filename = \
        resolve_reference(reference_tar_filename, reference_dirname)
    logger.info("Using reference file: %s" % (reference_filename))
    print(subprocess.check_output('ls -l', shell=True))
    # generate the suffix array index file
    sai_filename = '%s.sai' % (reads_basename)
    with open(sai_filename, 'w') as sai_file:
        # Build the bwa command and call bwa; stdout (the .sai data) goes
        # straight into the open file handle.
        bwa_command = "%s aln %s -t %d %s %s" \
            % (bwa, bwa_aln_params, cpu_count(), reference_filename, reads_filename)
        logger.info("Running bwa with %s" % (bwa_command))
        subprocess.check_call(shlex.split(bwa_command), stdout=sai_file)
    print(subprocess.check_output('ls -l', shell=True))
    # Upload the output to the DNAnexus project
    logger.info("Uploading suffix array %s" % (sai_filename))
    sai_dxfile = dxpy.upload_local_file(sai_filename)
    output = {"suffix_array_index": dxpy.dxlink(sai_dxfile)}
    logger.info("Returning from process with %s" % (output))
    return output
def closed(self, code=None, reason=None):
    """Websocket close handler: classify the close, report final job state, or raise.

    Unlike the exit_on_failed variant elsewhere, this version always exits
    with code 3 when the watched job ends failed/terminated.

    :param code: close code supplied by the server, if any.
    :param reason: close reason supplied by the server, if any.
    :raises DXJobLogStreamingException: when the close was not normal (code != 1000).
    """
    # Determine a close code/reason, preferring whatever the server sent.
    if code:
        self.closed_code = code
        self.closed_reason = reason
    elif not self.error:
        self.closed_code = 1000  # 1000 == normal websocket closure
        self.closed_reason = "Normal"
    elif self.exception and type(self.exception) in {KeyboardInterrupt, SystemExit}:
        # User-initiated interrupt counts as a clean close.
        self.closed_code = 1000
        self.closed_reason = "Connection terminated by client"
    else:
        self.closed_code = 1006  # 1006 == abnormal closure
        self.closed_reason = str(self.exception) if self.exception else "Abnormal"
    if self.closed_code != 1000:
        # The reason may itself be a JSON error document with {type, message};
        # surface its fields if so, otherwise fall back to raw code/reason.
        try:
            error = json.loads(self.closed_reason)
            raise DXJobLogStreamingException(
                "Error while streaming job logs: {type}: {message}\n".format(
                    **error
                )
            )
        except (KeyError, ValueError):
            raise DXJobLogStreamingException(
                "Error while streaming job logs: {code}: {reason}\n".format(
                    code=self.closed_code, reason=self.closed_reason
                )
            )
    elif self.print_job_info:
        # Re-describe every job seen during streaming and print its final state.
        if self.job_id not in self.seen_jobs:
            self.seen_jobs[self.job_id] = {}
        for job_id in self.seen_jobs.keys():
            self.seen_jobs[job_id] = dxpy.describe(job_id)
            print(
                get_find_executions_string(
                    self.seen_jobs[job_id],
                    has_children=False,
                    show_outputs=True
                )
            )
    else:
        self.seen_jobs[self.job_id] = dxpy.describe(self.job_id)
    # Propagate a failure exit code when the watched job did not succeed.
    if self.seen_jobs[self.job_id].get('state') in {'failed', 'terminated'}:
        err_exit(code=3)
def app_2_upversion(app_id):
    """Return the stripped upstreamVersion detail for an app id.

    Falls back to the hard-coded table when the describe output lacks the
    detail, and to 'NA' when the app is unknown there too.
    """
    details = dxpy.describe(app_id).get('details', {})
    if 'upstreamVersion' in details:
        return details['upstreamVersion'].strip()
    # Detail missing: consult the hard-coded lookup, defaulting to 'NA'.
    return app_id_hardcode_up_version.get(app_id, 'NA')
def description_from_fid(fid,properties=False):
    '''Returns file description object from fid.

    :param fid: key into the FILES dxlink cache, or a raw file id / dxlink.
    :param properties: when True, include file properties in the description.
    '''
    try:
        dxlink = FILES[fid]
    except KeyError:
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt,
        # NameError, etc. A cache miss raises KeyError — catch only that.
        #print >> sys.stderr, "File %s not cached, trying id" % fid)
        dxlink = fid
    return dxpy.describe(dxlink,incl_properties=properties)
def get_notebook_app_versions():
    """Return the version strings of all published versions of the notebook app."""
    return [str(dxpy.describe(found['id'])['version'])
            for found in dxpy.find_apps(name=NOTEBOOK_APP, all_versions=True)]
def _get_sequence_stream(dxf):
    """From the given dxfile, build a shell command that streams its contents
    to stdout, piping through gunzip when the filename ends in .gz."""
    filename = dxpy.describe(dxf)['name']
    command = 'dx cat {0} '.format(dxf['$dnanexus_link'])
    _, extension = os.path.splitext(filename)
    if extension == '.gz':
        command += '| gunzip '
    return command
def description_from_fid(fid, properties=False):
    '''Returns file description object from fid.

    :param fid: key into the FILES dxlink cache, or a raw file id / dxlink.
    :param properties: when True, include file properties in the description.
    '''
    try:
        dxlink = FILES[fid]
    except KeyError:
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt,
        # NameError, etc. A cache miss raises KeyError — catch only that.
        #print >> sys.stderr, "File %s not cached, trying id" % fid)
        dxlink = fid
    return dxpy.describe(dxlink, incl_properties=properties)
def get_control_mapping_stages(peaks_analysis, experiment, keypair, server, reps=None):
    '''Find the control mapping stages feeding a peaks analysis.

    :param peaks_analysis: describe dict of the peaks analysis.
    :param experiment: experiment metadata dict (used for logging only here).
    :param keypair, server: passed through to get_mapping_stages.
    :param reps: replicate numbers to resolve; defaults to [1, 2].
    :returns: list of mapping-stage dicts, or None when any rep's stages
              cannot be found.
    '''
    # FIX: the default was the mutable literal [1,2]; use a None sentinel so
    # the default list is created fresh per call.
    if reps is None:
        reps = [1, 2]
    #Find the control inputs
    logger.debug('in get_control_mapping_stages with peaks_analysis %s; experiment %s; reps %s' %(peaks_analysis['id'], experiment['accession'], reps))
    peaks_stages = peaks_analysis.get('stages')
    # Locate the "ENCODE Peaks" stage; its ctlN_ta inputs are the control tagaligns.
    peaks_stage = next(stage for stage in peaks_stages if stage['execution']['name'] == "ENCODE Peaks")
    tas = [dxpy.describe(peaks_stage['execution']['input']['ctl%s_ta' %(n)]) for n in reps]
    # Walk each control tagalign back to the analysis that produced it.
    mapping_jobs = [dxpy.describe(ta['createdBy']['job']) for ta in tas]
    mapping_analyses = [dxpy.describe(mapping_job['analysis']) for mapping_job in mapping_jobs]
    mapping_stages = []
    for (i,repn) in enumerate(reps):
        mapping_stage = get_mapping_stages(mapping_analyses[i], keypair, server, repn)
        if not mapping_stage:
            logger.error('%s: failed to find mapping stages for rep%d' %(peaks_analysis['id'], repn))
            return None
        else:
            mapping_stages.append(mapping_stage)
    return mapping_stages
def file_path_from_fid(fid,projectToo=False):
    '''Returns the full dx path ("/folder/name", optionally "project:/folder/name") for a file id.'''
    file_desc = description_from_fid(fid)
    folder = file_desc['folder']
    # Avoid a doubled slash when the file sits in the project root.
    if folder == '/':
        path = '/' + file_desc['name']
    else:
        path = folder + '/' + file_desc['name']
    if projectToo:
        path = dxpy.describe(file_desc['project'])['name'] + ':' + path
    return path
def poll_for_server_running(job_id):
    """Block until the server job tags itself SERVER_READY_TAG or fails,
    printing a progress dot per poll; on failure, exit with friendly advice."""
    sys.stdout.write('Waiting for server in {0} to initialize ...'.format(job_id))
    sys.stdout.flush()
    while True:
        desc = dxpy.describe(job_id)
        # Done polling once the server is ready or the job has failed.
        if SERVER_READY_TAG in desc['tags'] or desc['state'] == 'failed':
            break
        time.sleep(SLEEP_PERIOD)
        sys.stdout.write('.')
        sys.stdout.flush()
    # If the server job failed, provide friendly advice.
    if desc['state'] == 'failed':
        msg = RED('Error:') + ' Server failed to run.\n'
        msg += 'You may want to check the job logs by running:'
        msg += BOLD('dx watch {0}'.format(job_id))
        err_exit(msg)
def get_input_spec_patterns():
    ''' Extract the inputSpec patterns, if they exist -- modifed from dx-upload-all-outputs

    Returns a dict of all patterns, with keys equal to the respective
    input parameter names.
    '''
    input_spec = None
    if 'DX_JOB_ID' in environ:
        # works in the cloud, not locally
        job_desc = dxpy.describe(dxpy.JOB_ID)
        if job_desc["function"] == "main":
            # The input spec does not apply for subjobs
            desc = dxpy.describe(job_desc.get("app", job_desc.get("applet")))
            input_spec = desc.get("inputSpec")
    elif 'DX_TEST_DXAPP_JSON' in environ:
        # works only locally
        with open(environ['DX_TEST_DXAPP_JSON']) as fd:
            dxapp_json = json.load(fd)
        input_spec = dxapp_json.get('inputSpec')
    if input_spec is None:
        return {}
    # Map each parameter name to its legal patterns; a pattern is legal only
    # when it contains a wildcard, so anything without "*" is dropped.
    return {
        spec['name']: [p for p in spec['patterns'] if "*" in p]
        for spec in input_spec
        if 'patterns' in spec
    }
def test_dx_object_tagging(self):
    """Exercise dx tag / dx untag, including the -a (all matches) flag, with unicode tags."""
    tags = [u"Σ1=n", u"helloo0", u"ωω"]
    # tag
    first_record = run(u"dx new record Ψ --brief").strip()
    run(u"dx tag Ψ " + u" ".join(tags))
    current = dxpy.describe(first_record)['tags']
    for tag in tags:
        self.assertIn(tag, current)
    # untag: remove the first two, the third must survive
    run(u"dx untag Ψ " + u" ".join(tags[:2]))
    current = dxpy.describe(first_record)['tags']
    for tag in tags[:2]:
        self.assertNotIn(tag, current)
    self.assertIn(tags[2], current)
    # -a flag: a second record with the same name; -a must hit both
    second_record = run(u"dx new record Ψ --brief").strip()
    self.assertNotEqual(first_record, second_record)
    run(u"dx tag -a Ψ " + u" ".join(tags))
    current = dxpy.describe(first_record)['tags']
    for tag in tags:
        self.assertIn(tag, current)
    second_current = dxpy.describe(second_record)['tags']
    for tag in tags:
        self.assertIn(tag, second_current)
    # untag -a: both records end up with no tags at all
    run(u"dx untag -a Ψ " + u" ".join(tags))
    current = dxpy.describe(first_record)['tags']
    self.assertEqual(len(current), 0)
    second_current = dxpy.describe(second_record)['tags']
    self.assertEqual(len(second_current), 0)
def DownloadFilesFromArray(input_ids): print "\nDownloading {n} files".format(n=len(input_ids)) if len(input_ids) < 1: raise dxpy.AppInternalError("No files were given as input") filenames = [] start_time = time.time() for id in input_ids: fn = dxpy.describe(id)["name"] filenames.append(fn) dxpy.download_dxfile(dxid=id, filename=fn) print "Downloaded {files} in {min} minutes".format(files=sorted(filenames), min=float((time.time()-start_time)/60)) return sorted(filenames)