def get_jobs_per_project(projects):
    """
    Group the names of recently created jobs by project and by job state.

    Args:
        projects (list): project handlers; ``describe()`` is called on each
            one and must return a dict with the keys "id" and "name"

    Returns:
        tuple: ``(project2jobs, project_no_run)`` where

            - project2jobs (dict): maps ``(project_name, project_id)`` to a
              dict of job state -> list of job names, covering the jobs
              returned by ``dx.find_jobs(..., created_after="-24h")``
            - project_no_run (list): names of the projects for which that
              search returned no job
    """
    project2jobs = defaultdict(lambda: defaultdict(list))
    project_no_run = []

    for project in projects:
        # Describe once and reuse the result instead of once per field.
        project_desc = project.describe()
        project_id = project_desc["id"]
        project_name = project_desc["name"]
        log.info(f'Get job per {project_name} started')

        jobs = list(dx.find_jobs(project=project_id, created_after="-24h"))
        if not jobs:
            project_no_run.append(project_name)
            continue

        state2jobs = project2jobs[(project_name, project_id)]
        for job in jobs:
            # Same here: a single describe() gives both name and state.
            job_desc = dx.DXJob(job["id"]).describe()
            state2jobs[job_desc["state"]].append(job_desc["name"])

    return project2jobs, project_no_run
def find_jobs(self):
    """Launch a job from a new applet and check ``dxpy.find_jobs`` finds it.

    The search is restricted by launching user, applet, project, origin job
    and parent job. It must return exactly the job launched here, together
    with a describe hash whose fields match that job.
    """
    dxapplet = dxpy.DXApplet()
    dxapplet.new(name="test_applet",
                 inputSpec=[{"name": "chromosomes", "class": "record"},
                            {"name": "rowFetchChunk", "class": "int"}],
                 outputSpec=[{"name": "mappings", "class": "record"}],
                 runSpec={"code": "def main(): pass",
                          "interpreter": "python2.7",
                          "execDepends": [{"name": "python-numpy"}]})
    dxrecord = dxpy.new_dxrecord()
    dxrecord.close()
    prog_input = {"chromosomes": {"$dnanexus_link": dxrecord.get_id()},
                  "rowFetchChunk": 100}
    dxjob = dxapplet.run(applet_input=prog_input)

    results = list(dxpy.find_jobs(launched_by='user-000000000000000000000000',
                                  applet=dxapplet,
                                  project=dxapplet.get_proj_id(),
                                  origin_job=dxjob.get_id(),
                                  parent_job=None,
                                  modified_after=0,
                                  describe=True))
    self.assertEqual(len(results), 1)

    result = results[0]
    self.assertEqual(result["id"], dxjob.get_id())
    # assertIn / assertIsNone report the offending value on failure,
    # unlike assertTrue(x in y) and assertEqual(x, None).
    self.assertIn("describe", result)

    job_desc = result["describe"]
    self.assertEqual(job_desc["id"], dxjob.get_id())
    self.assertEqual(job_desc["class"], "job")
    self.assertEqual(job_desc["applet"], dxapplet.get_id())
    self.assertEqual(job_desc["project"], dxapplet.get_proj_id())
    self.assertEqual(job_desc["originJob"], dxjob.get_id())
    self.assertIsNone(job_desc["parentJob"])
def test_pacb_subjobs(self):
    """
    Make sure PacBio format subjobs work as expected by running the app
    with multiple file inputs.

    On any failure the temporary output folder is moved to
    ARTIFACTS_FOLDER before the exception is re-raised.
    """
    # Work on a shallow copy so the keys set below do not leak into the
    # shared self.base_input object.
    job_input = dict(self.base_input)
    # these are 10GB files
    job_input["reads"] = [
        {"$dnanexus_link": "file-FPY0BY80pbJvg49Z3k4zZp71"},
        {"$dnanexus_link": "file-FPY0BX802J4jybZVJf0gy272"},
    ]
    job_input["reads_indices"] = [
        {"$dnanexus_link": "file-FPY096j0VvqbBk3Y5367XbJp"},
        {"$dnanexus_link": "file-FPY096j0G03Gg49Z3k4zZg1Q"},
    ]
    job_input["datatype"] = "PacBio"
    job_input["chunk_size"] = 9

    job = dxpy.DXApplet(self.applet_id).run(job_input,
                                            folder=self.tempdirdx,
                                            name=self.testname,
                                            project=DX_PROJECT_ID)
    # Parenthesised form is valid under both Python 2 and Python 3.
    print("Waiting for %s to complete" % (job.get_id(), ))
    try:
        job.wait_on_done()
        output = job.describe()["output"]

        # check that more than one chunk was run, i.e. more than one file
        # was output
        mappings_files = output["bam_files"]
        self.assertGreater(len(mappings_files), 1)

        # check that two subjobs named "map_reads_pbmm2" were run; only the
        # part of the job name after the last ':' is compared
        subjobs = dxpy.find_jobs(parent_job=job.id)
        subjob_names = [
            dxpy.DXJob(subjob['id']).name.split(':')[-1]
            for subjob in subjobs
        ]
        mapping_jobs = [
            name for name in subjob_names if name == 'map_reads_pbmm2'
        ]
        self.assertEqual(len(mapping_jobs), 2)
    except Exception:
        DX_PROJ_OBJ.move_folder(self.tempdirdx, ARTIFACTS_FOLDER)
        raise
def pooled_controls(peaks_analysis, rep):
    """Infer whether pooled controls were used for replicate ``rep``.

    This is not surfaced explicitly, so it must be inferred by comparing
    the control file given to the "ENCODE Peaks" stage for this replicate
    with the control file its MACS2 child job actually ran with:

    1. Take the "ENCODE Peaks" stage of ``peaks_analysis``.
    2. Read that stage's experiment (``rep<N>_ta``) and control
       (``ctl<N>_ta``) inputs for the requested replicate.
    3. Among the "MACS2" child jobs of the stage's job, find the one whose
       ``experiment`` input equals that experiment file.
    4. Compare that child job's ``control`` input with the stage's control
       input.

    The log output of ENCODE Peaks could be searched for "Using pooled
    controls for replicate 1.", "Using pooled controls for replicate 2."
    and "Using pooled controls.", but there is no corresponding "Not
    pooling controls" message, so that approach is underdetermined.

    Args:
        peaks_analysis: mapping with an 'id' and a 'stages' list; each
            stage carries an 'execution' describe hash.
        rep: replicate number, substituted into the input field names.

    Returns:
        False if the two control files are equal, True otherwise.

    Raises:
        StopIteration: if there is no "ENCODE Peaks" stage, or no MACS2
            child job ran with this replicate's experiment file.
    """
    logger.debug('in pooled_controls with peaks_analysis %s; rep %s' % (peaks_analysis['id'], rep))

    peaks_execution = next(
        stage['execution']
        for stage in peaks_analysis.get('stages')
        if stage['execution']['name'] == "ENCODE Peaks")
    peaks_input = peaks_execution['input']
    declared_exp_file = peaks_input['rep%s_ta' % (rep)]
    declared_ctl_file = peaks_input['ctl%s_ta' % (rep)]

    macs2_jobs = dxpy.find_jobs(
        parent_job=peaks_execution['id'],
        name="MACS2",
        project=peaks_execution['project'],
        describe=True)
    rep_job_input = next(
        candidate['describe']['input']
        for candidate in macs2_jobs
        if candidate['describe']['input']['experiment'] == declared_exp_file)
    used_ctl_file = rep_job_input['control']

    logger.info("Rep%s input control file %s; actually used %s" % (rep, declared_ctl_file, used_ctl_file))

    controls_were_pooled = not (declared_ctl_file == used_ctl_file)
    if controls_were_pooled:
        logger.info('Inferred pooled controls for rep%s' % (rep))
    else:
        logger.info('Inferred controls not pooled for rep%s' % (rep))
    return controls_were_pooled
def dnanexus_workflows_get_status(event, context):
    """Handle GET /workflows/{workflow_id}/status.

    Args:
        event (dict): has a key "workflow_id" that's been taken from the
            URL. This is the id that was generated when the POST request
            was made, and the dnanexus executions we care about should be
            tagged with it. The "Authorization" entry of its "headers" is
            passed to ``set_dx_authorization``.
        context: an AWS context object; only its ``aws_request_id`` is
            read, and only when the workflow is not found.

    Returns:
        dict: ``{"workflow_id": ..., "state": ...}`` on success, or an
        error dict with "errorType", "httpStatus", "requestId" and
        "message" when no matching job exists.
    """
    wes_workflow_id = event["workflow_id"]
    set_dx_authorization(event["headers"]["Authorization"])

    # The base_job is the job that ran dxWDL and launched the workflow itself
    # as a subjob. We only need to query its status to find the status of the
    # whole execution since the success or failure of its child workflow will
    # be propagated to it.
    matching_jobs = dxpy.find_jobs(properties={"wes_id": wes_workflow_id},
                                   name="WES dxWDL Runner",
                                   return_handler=True)
    # Only the first match is used, so take it directly rather than
    # collecting every result into a list first.
    base_job = next(iter(matching_jobs), None)
    if base_job is None:
        return {
            "errorType": "NotFound",
            "httpStatus": "404",
            "requestId": context.aws_request_id,
            "message": "Workflow {} was not found".format(wes_workflow_id)
        }

    # Query the dnanexus state and translate that to a WES state
    dx_state = base_job.describe()["state"]
    wes_state = dx_to_wes_state(dx_state)

    return {"workflow_id": wes_workflow_id, "state": wes_state}
def interactive_help(in_class, param_desc, prompt):
    """Print help for an input parameter and interactively read its value.

    For data-object classes (members of ``dx_data_classes``) a menu is shown
    repeatedly until the user picks a data object, picks an output field of
    a previously-run job, or chooses to type an ID or path directly. For all
    other classes a short hint is printed and one line is read.

    Args:
        in_class: class name of the parameter's elements (e.g. 'file',
            'boolean', 'string', 'hash').
        param_desc (dict): parameter description; 'class' is read to detect
            array parameters and 'type' (optional) to filter data objects.
        prompt (str): prompt used when reading a value typed directly.

    Returns:
        list: one "project-id:object-id" string, one "job-id:field" string,
        the raw line for 'string' inputs, or the shell-split words of the
        line entered.
    """
    is_array = param_desc['class'].startswith("array:")
    print_param_help(param_desc)
    print()
    array_help_str = ', or <ENTER> to finish the list of inputs'

    if in_class not in dx_data_classes:
        if in_class == 'boolean':
            if is_array:
                print(fill('Enter "true", "false"' + array_help_str))
            else:
                print(fill('Enter "true" or "false"'))
        elif in_class == 'string' and is_array:
            print(fill('Enter a nonempty string' + array_help_str))
        elif (in_class == 'float' or in_class == 'int') and is_array:
            print(fill('Enter a number' + array_help_str))
        elif in_class == 'hash':
            print(fill('Enter a quoted JSON hash'))
        result = input(prompt)
        if in_class == 'string':
            return [result]
        return shlex.split(result)

    # Class is some sort of data object
    if dxpy.WORKSPACE_ID is not None:
        proj_name = None
        try:
            proj_name = dxpy.api.project_describe(dxpy.WORKSPACE_ID)['name']
        except Exception:
            # Best effort: the working-directory banner is simply skipped.
            # KeyboardInterrupt is deliberately no longer swallowed here.
            pass
        if proj_name is not None:
            print('Your current working directory is ' + proj_name + ':' + dxpy.config.get('DX_CLI_WD', '/'))

    while True:
        print('Pick an option to find input data:')
        try:
            opt_num = pick(['List and choose from available data in the current project',
                            'List and choose from available data in the DNAnexus Reference Genomes project',
                            'Select another project to list and choose available data',
                            'Select an output from a previously-run job (current project only)',
                            'Return to original prompt (specify an ID or path directly)'])
        except KeyboardInterrupt:
            opt_num = 4

        if opt_num in range(3):
            if opt_num == 0:
                query_project = dxpy.WORKSPACE_ID
            elif opt_num == 1:
                query_project = dxpy.find_one_project(name="Reference Genome Files",
                                                      public=True,
                                                      billed_to="org-dnanexus",
                                                      level="VIEW")['id']
            else:
                project_generator = dxpy.find_projects(level='VIEW', describe=True, explicit_perms=True)
                print('\nProjects to choose from:')
                project_choice = paginate_and_pick(project_generator,
                                                   (lambda result: result['describe']['name']))
                # paginate_and_pick may hand back a sentinel string instead
                # of a result; subscripting it with ['id'] would raise.
                if project_choice == 'none found':
                    print('No projects found')
                    continue
                elif project_choice == 'none picked':
                    continue
                query_project = project_choice['id']

            result_generator = dxpy.find_data_objects(classname=in_class,
                                                      typename=param_desc.get('type'),
                                                      describe=True,
                                                      project=query_project)
            print('\nAvailable data:')
            result_choice = paginate_and_pick(result_generator,
                                              (lambda result: get_ls_l_desc(result['describe'])))
            if result_choice == 'none found':
                print('No compatible data found')
                continue
            elif result_choice == 'none picked':
                continue
            else:
                return [result_choice['project'] + ':' + result_choice['id']]
        elif opt_num == 3:
            # Select from previous jobs in current project
            result_generator = dxpy.find_jobs(project=dxpy.WORKSPACE_ID,
                                              describe=True,
                                              parent_job="none")
            print()
            print('Previously-run jobs to choose from:')
            result_choice = paginate_and_pick(result_generator,
                                              (lambda result: get_find_executions_string(result['describe'],
                                                                                         has_children=False,
                                                                                         single_result=True)),
                                              filter_fn=(lambda result: result['describe']['state'] not in
                                                         ['unresponsive', 'terminating', 'terminated', 'failed']))
            if result_choice == 'none found':
                print('No jobs found')
                continue
            elif result_choice == 'none picked':
                continue

            job_desc = result_choice['describe']
            if job_desc.get('output') is not None:
                # list() so the field names can be indexed on Python 3 too.
                keys = list(job_desc['output'].keys())
            else:
                # The executable ID is part of the job's describe hash, not
                # of the top-level search result.
                exec_handler = dxpy.get_handler(job_desc.get('app') or job_desc.get('applet'))
                exec_desc = exec_handler.describe()
                if 'outputSpec' not in exec_desc:
                    # This if block will either continue or return
                    print('No output spec found for the executable')
                    try:
                        field = input('Output field to use (^C or <ENTER> to cancel): ')
                    except KeyboardInterrupt:
                        continue
                    if field == '':
                        continue
                    return [result_choice['id'] + ':' + field]
                # outputSpec is a list of {"name": ..., "class": ...} specs.
                keys = [output_field['name'] for output_field in exec_desc['outputSpec']]

            if len(keys) > 1:
                print('\nOutput fields to choose from:')
                field_choice = pick(keys)
                return [result_choice['id'] + ':' + keys[field_choice]]
            elif len(keys) == 1:
                print('Using the only output field: ' + keys[0])
                return [result_choice['id'] + ':' + keys[0]]
            else:
                # Nothing to choose from: fall through and show the menu again.
                print('No available output fields')
        else:
            print(fill('Enter an ID or path (<TAB> twice for compatible ' + in_class + 's in current directory)' + (array_help_str if is_array else '')))
            return shlex.split(input(prompt))
def interactive_help(in_class, param_desc, prompt):
    """Print help for an input parameter and interactively read its value.

    For data-object classes (members of ``dx_data_classes``) a menu is shown
    repeatedly until the user picks a data object, picks an output field of
    a previously-run job, or chooses to type an ID or path directly. For all
    other classes a short hint is printed and one line is read.

    Args:
        in_class: class name of the parameter's elements (e.g. 'file',
            'boolean', 'string', 'hash').
        param_desc (dict): parameter description; 'class' is read to detect
            array parameters and 'type' (optional) to filter data objects.
        prompt (str): prompt used when reading a value typed directly.

    Returns:
        list: one "project-id:object-id" string, one "job-id:field" string,
        the raw line for 'string' inputs, or the shell-split words of the
        line entered.
    """
    is_array = param_desc['class'].startswith("array:")
    print_param_help(param_desc)
    print()
    array_help_str = ', or <ENTER> to finish the list of inputs'

    if in_class not in dx_data_classes:
        if in_class == 'boolean':
            if is_array:
                print(fill('Enter "true", "false"' + array_help_str))
            else:
                print(fill('Enter "true" or "false"'))
        elif in_class == 'string' and is_array:
            print(fill('Enter a nonempty string' + array_help_str))
        elif (in_class == 'float' or in_class == 'int') and is_array:
            print(fill('Enter a number' + array_help_str))
        elif in_class == 'hash':
            print(fill('Enter a quoted JSON hash'))
        result = input(prompt)
        if in_class == 'string':
            return [result]
        return shlex.split(result)

    # Class is some sort of data object
    if dxpy.WORKSPACE_ID is not None:
        proj_name = None
        try:
            proj_name = dxpy.api.project_describe(dxpy.WORKSPACE_ID)['name']
        except Exception:
            # Best effort: the working-directory banner is simply skipped.
            # KeyboardInterrupt is deliberately no longer swallowed here.
            pass
        if proj_name is not None:
            print('Your current working directory is ' + proj_name + ':' + dxpy.config.get('DX_CLI_WD', '/'))

    while True:
        print('Pick an option to find input data:')
        try:
            opt_num = pick(['List and choose from available data in the current project',
                            'List and choose from available data in the DNAnexus Reference Genomes Files project',
                            'Select another project to list and choose available data',
                            'Select an output from a previously-run job (current project only)',
                            'Return to original prompt (specify an ID or path directly)'])
        except KeyboardInterrupt:
            opt_num = 4

        if opt_num in range(3):
            if opt_num == 0:
                query_project = dxpy.WORKSPACE_ID
            elif opt_num == 1:
                # Restrict the lookup to the current workspace's region
                # when there is a workspace to ask.
                region = None
                if dxpy.WORKSPACE_ID:
                    region = dxpy.describe(dxpy.WORKSPACE_ID).get("region")
                query_project = dxpy.find_one_project(name="Reference Genome Files:*",
                                                      public=True,
                                                      billed_to="org-dnanexus_apps",
                                                      level="VIEW",
                                                      name_mode="glob",
                                                      region=region)['id']
            else:
                project_generator = dxpy.find_projects(level='VIEW', describe=True, explicit_perms=True)
                print('\nProjects to choose from:')
                project_choice = paginate_and_pick(project_generator,
                                                   (lambda result: result['describe']['name']))
                # paginate_and_pick may hand back a sentinel string instead
                # of a result; subscripting it with ['id'] would raise.
                if project_choice == 'none found':
                    print('No projects found')
                    continue
                elif project_choice == 'none picked':
                    continue
                query_project = project_choice['id']

            result_generator = dxpy.find_data_objects(classname=in_class,
                                                      typename=param_desc.get('type'),
                                                      describe=dict(fields=get_ls_l_desc_fields()),
                                                      project=query_project)
            print('\nAvailable data:')
            result_choice = paginate_and_pick(result_generator,
                                              (lambda result: get_ls_l_desc(result['describe'])))
            if result_choice == 'none found':
                print('No compatible data found')
                continue
            elif result_choice == 'none picked':
                continue
            else:
                return [result_choice['project'] + ':' + result_choice['id']]
        elif opt_num == 3:
            # Select from previous jobs in current project
            result_generator = dxpy.find_jobs(project=dxpy.WORKSPACE_ID,
                                              describe=True,
                                              parent_job="none")
            print()
            print('Previously-run jobs to choose from:')
            result_choice = paginate_and_pick(result_generator,
                                              (lambda result: get_find_executions_string(result['describe'],
                                                                                         has_children=False,
                                                                                         single_result=True)),
                                              filter_fn=(lambda result: result['describe']['state'] not in
                                                         ['unresponsive', 'terminating', 'terminated', 'failed']))
            if result_choice == 'none found':
                print('No jobs found')
                continue
            elif result_choice == 'none picked':
                continue

            job_desc = result_choice['describe']
            if job_desc.get('output') is not None:
                # list() so the field names can be indexed on Python 3 too.
                keys = list(job_desc['output'].keys())
            else:
                # The executable ID is part of the job's describe hash, not
                # of the top-level search result.
                exec_handler = dxpy.get_handler(job_desc.get('app') or job_desc.get('applet'))
                exec_desc = exec_handler.describe()
                if 'outputSpec' not in exec_desc:
                    # This if block will either continue or return
                    print('No output spec found for the executable')
                    try:
                        field = input('Output field to use (^C or <ENTER> to cancel): ')
                    except KeyboardInterrupt:
                        continue
                    if field == '':
                        continue
                    return [result_choice['id'] + ':' + field]
                # outputSpec is a list of {"name": ..., "class": ...} specs.
                keys = [output_field['name'] for output_field in exec_desc['outputSpec']]

            if len(keys) > 1:
                print('\nOutput fields to choose from:')
                field_choice = pick(keys)
                return [result_choice['id'] + ':' + keys[field_choice]]
            elif len(keys) == 1:
                print('Using the only output field: ' + keys[0])
                return [result_choice['id'] + ':' + keys[0]]
            else:
                # Nothing to choose from: fall through and show the menu again.
                print('No available output fields')
        else:
            print(fill('Enter an ID or path (<TAB> twice for compatible ' + in_class + 's in current directory)' + (array_help_str if is_array else '')))
            return shlex.split(input(prompt))
def dnanexus_workflows_get(event, context):
    """Handle GET /workflows/{workflow_id}.

    Args:
        event (dict): has a key "workflow_id" that's been taken from the
            URL. This is the id that was generated when the POST request
            was made, and the dnanexus executions we care about should be
            tagged with it. The "Authorization" entry of its "headers" is
            passed to ``set_dx_authorization``.
        context: an AWS context object; only its ``aws_request_id`` is
            read, and only when the workflow is not found.

    Returns:
        dict: a response with "state", "workflow_id", a placeholder
        "workflow_log" and one "task_logs" entry per job found under the
        base job; or an error dict with "errorType", "httpStatus",
        "requestId" and "message" when no matching job exists.
    """
    auth_header = set_dx_authorization(event["headers"]["Authorization"])
    wes_workflow_id = event["workflow_id"]

    # First try to find the dxWDL job that's the parent job of everything.
    # Only the first match is used, so it is taken directly instead of
    # collecting every result into a list.
    matching_jobs = dxpy.find_jobs(properties={"wes_id": wes_workflow_id},
                                   name="WES dxWDL Runner",
                                   return_handler=True)
    base_job = next(iter(matching_jobs), None)
    if base_job is None:
        return {
            "errorType": "NotFound",
            "httpStatus": "404",
            "requestId": context.aws_request_id,
            "message": "Workflow {} was not found".format(wes_workflow_id)
        }

    child_jobs = list(
        dxpy.find_jobs(root_execution=base_job.get_id(), return_handler=True))
    child_job_ids = [j.get_id() for j in child_jobs]

    response = {
        "state": dx_to_wes_state(base_job.describe()["state"]),
        # Previously left as "" even though the id is known here.
        "workflow_id": wes_workflow_id,
        "workflow_log": {
            "start_time": "",
            "end_time": "",
            "stdout": "",
            "stderr": "",
            "exit_code": -1
        },
        "task_logs": []
    }

    def get_logs_for_job(dx_job_id):
        """Retrieve the logs for single DXJob."""
        dx_exe_path = os.path.abspath("bin/dx")
        cmd = [
            "dx", "watch", "-q", "--no-timestamps", "--get-streams",
            "--no-follow", dx_job_id
        ]
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            env={
                "DX_SECURITY_CONTEXT": auth_header,
                "PYTHONPATH": ':'.join([
                    os.environ.get("PYTHONPATH", ""),
                    os.path.dirname(os.path.dirname(dx_exe_path))
                ]),
                "PATH": ':'.join([os.environ["PATH"],
                                  os.path.dirname(dx_exe_path)])
            })
        # stderr is merged into stdout above, so only stdout carries data.
        stdout, _ = proc.communicate()
        return stdout

    def format_timestamp(job_desc, field):
        """Format a millisecond-epoch describe field, or "" if unusable."""
        try:
            return "{:%Y-%m-%dT%H:%M:%S}".format(
                datetime.datetime.fromtimestamp(job_desc[field] / 1000))
        except (KeyError, TypeError, ValueError, OverflowError, OSError):
            # Field absent, not a number, or out of range.
            return ""

    # NOTE(review): assumes Pool follows the multiprocessing pool API
    # (close()/join()) -- confirm against the file's import.
    pool = Pool(8)
    try:
        job_logs = pool.map(get_logs_for_job, child_job_ids)
    finally:
        # Release the workers instead of leaking them on every request.
        pool.close()
        pool.join()
    jobs_to_logs = dict(zip(child_job_ids, job_logs))

    # Reuse the handlers returned by the search rather than building a new
    # DXJob per id.
    for dx_job, job_id in zip(child_jobs, child_job_ids):
        job_desc = dx_job.describe()
        response["task_logs"].append({
            "name": job_desc["executableName"] + ":" + job_id,
            "start_time": format_timestamp(job_desc, "startedRunning"),
            "end_time": format_timestamp(job_desc, "stoppedRunning"),
            "stdout": jobs_to_logs.get(job_id, ""),
            "stderr": "",
            "exit_code": -1
        })

    return response