Ejemplo n.º 1
0
def get_jobs_per_project(projects):
    """
    Return dict of project2state2jobs

    Args:
        projects (list): List of project handler objects (each supports
            ``describe()`` returning at least ``id`` and ``name``)

    Returns:
        tuple: (dict mapping (project_name, project_id) -> job state -> list
            of job names created in the last 24h, list of project names with
            no jobs in that window)
    """

    project2jobs = defaultdict(lambda: defaultdict(list))
    project_no_run = []

    for project in projects:
        # describe() is an API round trip -- call it once and reuse the result
        project_desc = project.describe()
        project_id = project_desc["id"]
        project_name = project_desc["name"]

        log.info(f'Get job per {project_name} started')
        jobs = list(dx.find_jobs(project=project_id, created_after="-24h"))

        if jobs:
            for job in jobs:
                # Same here: one describe() per job instead of two
                job_desc = dx.DXJob(job["id"]).describe()
                project2jobs[(project_name,
                              project_id)][job_desc["state"]].append(
                                  job_desc["name"])
        else:
            project_no_run.append(project_name)

    return project2jobs, project_no_run
Ejemplo n.º 2
0
 def find_jobs(self):
     """End-to-end check that dxpy.find_jobs locates a job by its metadata."""
     # Create a minimal applet whose main() does nothing, just so a job can
     # be launched from it.
     dxapplet = dxpy.DXApplet()
     dxapplet.new(name="test_applet",
                  inputSpec=[{"name": "chromosomes", "class": "record"},
                             {"name": "rowFetchChunk", "class": "int"}
                             ],
                  outputSpec=[{"name": "mappings", "class": "record"}],
                  runSpec={"code": "def main(): pass",
                           "interpreter": "python2.7",
                           "execDepends": [{"name": "python-numpy"}]})
     # A closed record serves as the applet's "chromosomes" input.
     dxrecord = dxpy.new_dxrecord()
     dxrecord.close()
     prog_input = {"chromosomes": {"$dnanexus_link": dxrecord.get_id()},
                   "rowFetchChunk": 100}
     dxjob = dxapplet.run(applet_input=prog_input)
     # Search with every supported filter at once; exactly the one job just
     # launched should match.
     results = list(dxpy.find_jobs(launched_by='user-000000000000000000000000',
                                   applet=dxapplet,
                                   project=dxapplet.get_proj_id(),
                                   origin_job=dxjob.get_id(),
                                   parent_job=None,
                                   modified_after=0,
                                   describe=True))
     self.assertEqual(len(results), 1)
     result = results[0]
     # With describe=True the full job description is embedded in the result.
     self.assertEqual(result["id"], dxjob.get_id())
     self.assertTrue("describe" in result)
     self.assertEqual(result["describe"]["id"], dxjob.get_id())
     self.assertEqual(result["describe"]["class"], "job")
     self.assertEqual(result["describe"]["applet"], dxapplet.get_id())
     self.assertEqual(result["describe"]["project"], dxapplet.get_proj_id())
     self.assertEqual(result["describe"]["originJob"], dxjob.get_id())
     self.assertEqual(result["describe"]["parentJob"], None)
Ejemplo n.º 3
0
    def test_pacb_subjobs(self):
        """
        Make sure PacBio format subjobs work as expected by running the app
        with multiple file inputs
        """
        job_input = self.base_input
        # these are 10GB files
        job_input["reads"] = [{
            "$dnanexus_link": "file-FPY0BY80pbJvg49Z3k4zZp71"
        }, {
            "$dnanexus_link": "file-FPY0BX802J4jybZVJf0gy272"
        }]
        job_input["reads_indices"] = [{
            "$dnanexus_link":
            "file-FPY096j0VvqbBk3Y5367XbJp"
        }, {
            "$dnanexus_link":
            "file-FPY096j0G03Gg49Z3k4zZg1Q"
        }]
        job_input["datatype"] = "PacBio"
        job_input["chunk_size"] = 9
        job = dxpy.DXApplet(self.applet_id).run(job_input,
                                                folder=self.tempdirdx,
                                                name=self.testname,
                                                project=DX_PROJECT_ID)
        # print() call form: the original "print ..." statement is a
        # SyntaxError under Python 3; this spelling works on both 2 and 3.
        print("Waiting for %s to complete" % (job.get_id(), ))
        try:
            job.wait_on_done()
            output = job.describe()["output"]
            # check that 2 chunks were run and 2 files are output
            mappings_files = output["bam_files"]
            self.assertTrue(len(mappings_files) > 1)

            # check that two subjobs named "map_reads_pbmm2" were run
            subjobs = dxpy.find_jobs(parent_job=job.id)
            subjob_names = [
                dxpy.DXJob(subjob['id']).name for subjob in subjobs
            ]
            subjob_names = [s.split(':')[-1] for s in subjob_names]
            mapping_jobs = [
                s for s in subjob_names
                if s.split(':')[-1] == 'map_reads_pbmm2'
            ]
            self.assertTrue(len(mapping_jobs) == 2)
        except Exception:
            # Preserve the temp outputs for debugging, then re-raise.
            DX_PROJ_OBJ.move_folder(self.tempdirdx, ARTIFACTS_FOLDER)
            raise
def pooled_controls(peaks_analysis, rep):
    """Infer whether pooled controls were used for the given replicate.

    This is not surfaced explicitly by the analysis, so it must be inferred:
    starting from peaks_analysis, find the "ENCODE Peaks" stage, read the
    experiment and control file IDs for this rep, then find the MACS2 child
    job that was run with this rep's experiment file and read the control
    file ID it actually used.  If the rep's own control ID matches what MACS2
    used, controls were not pooled; otherwise they were.

    The "ENCODE Peaks" log could be double-checked for strings like
    "Using pooled controls for replicate 1." or "Using pooled controls.",
    but there is no corresponding "Not pooling controls" message, so that
    signal alone is underdetermined.

    Args:
        peaks_analysis (dict): DNAnexus analysis description with 'stages'
        rep: replicate number used to build the 'rep%s_ta'/'ctl%s_ta' keys

    Returns:
        bool: True if pooled controls were inferred for this rep

    Raises:
        StopIteration: if no "ENCODE Peaks" stage or no matching MACS2 child
            job is found (same as the original behavior).
    """
    # Lazy %-style logger args: formatting only happens if the level is on.
    logger.debug('in pooled_controls with peaks_analysis %s; rep %s',
                 peaks_analysis['id'], rep)
    peaks_stages = peaks_analysis.get('stages')
    ENCODE_Peaks_stage = next(stage for stage in peaks_stages
                              if stage['execution']['name'] == "ENCODE Peaks")
    ENCODE_Peaks_exp_file = ENCODE_Peaks_stage['execution']['input']['rep%s_ta' % (rep)]
    ENCODE_Peaks_ctl_file = ENCODE_Peaks_stage['execution']['input']['ctl%s_ta' % (rep)]
    child_jobs = dxpy.find_jobs(parent_job=ENCODE_Peaks_stage['execution']['id'],
                                name="MACS2",
                                project=ENCODE_Peaks_stage['execution']['project'],
                                describe=True)
    rep_job = next(job for job in child_jobs
                   if job['describe']['input']['experiment'] == ENCODE_Peaks_exp_file)
    rep_job_ctl_file = rep_job['describe']['input']['control']
    logger.info("Rep%s input control file %s; actually used %s",
                rep, ENCODE_Peaks_ctl_file, rep_job_ctl_file)
    if ENCODE_Peaks_ctl_file == rep_job_ctl_file:
        logger.info('Inferred controls not pooled for rep%s', rep)
        return False
    else:
        logger.info('Inferred pooled controls for rep%s', rep)
        return True
Ejemplo n.º 5
0
def dnanexus_workflows_get_status(event, context):
    """Handle GET /workflows/{workflow_id}/status.

    Args:
        event (dict): has a key "workflow_id" that's been taken from the URL.
            This is the id that was generated when the POST request was made,
            and the dnanexus executions we care about should be tagged with it.
        context: an AWS context object; only aws_request_id is read, and only
            when the workflow cannot be found.

    Returns:
        dict: {"workflow_id", "state"} on success, or a NotFound error dict.
    """

    wes_workflow_id = event["workflow_id"]
    set_dx_authorization(event["headers"]["Authorization"])

    # The base job ran dxWDL and launched the workflow itself as a subjob.
    # Its state alone reflects the whole execution, since success or failure
    # of its child workflow is propagated to it.
    matches = list(
        dxpy.find_jobs(properties={"wes_id": wes_workflow_id},
                       name="WES dxWDL Runner",
                       return_handler=True))
    if not matches:
        return {
            "errorType": "NotFound",
            "httpStatus": "404",
            "requestId": context.aws_request_id,
            "message": "Workflow {} was not found".format(wes_workflow_id)
        }

    # Query the dnanexus state and translate that to a WES state
    base_job = matches[0]
    wes_state = dx_to_wes_state(base_job.describe()["state"])
    return {"workflow_id": wes_workflow_id, "state": wes_state}
Ejemplo n.º 6
0
def interactive_help(in_class, param_desc, prompt):
    """Interactively prompt the user to supply a value for one input.

    Args:
        in_class (str): the input's class, e.g. "string", "int", or a data
            object class (array classes start with "array:")
        param_desc (dict): input spec entry for this parameter
        prompt (str): prompt text used when asking for a value directly

    Returns:
        list: string token(s) representing the chosen value(s)
    """
    is_array = param_desc['class'].startswith("array:")
    print_param_help(param_desc)
    print()
    array_help_str = ', or <ENTER> to finish the list of inputs'
    if in_class in dx_data_classes:
        # Class is some sort of data object
        if dxpy.WORKSPACE_ID is not None:
            proj_name = None
            try:
                proj_name = dxpy.api.project_describe(dxpy.WORKSPACE_ID)['name']
            except Exception:
                # Best effort only; the prompt still works without a name.
                pass
            if proj_name is not None:
                print('Your current working directory is ' + proj_name + ':' + dxpy.config.get('DX_CLI_WD', '/'))
        while True:
            print('Pick an option to find input data:')
            try:
                opt_num = pick(['List and choose from available data in the current project',
                                'List and choose from available data in the DNAnexus Reference Genomes project',
                                'Select another project to list and choose available data',
                                'Select an output from a previously-run job (current project only)',
                                'Return to original prompt (specify an ID or path directly)'])
            except KeyboardInterrupt:
                opt_num = 4
            if opt_num == 0:
                query_project = dxpy.WORKSPACE_ID
            elif opt_num == 1:
                query_project = dxpy.find_one_project(name="Reference Genome Files", public=True, billed_to="org-dnanexus", level="VIEW")['id']
            elif opt_num == 2:
                project_generator = dxpy.find_projects(level='VIEW', describe=True, explicit_perms=True)
                print('\nProjects to choose from:')
                query_project = paginate_and_pick(project_generator, (lambda result: result['describe']['name']))['id']
            if opt_num in range(3):
                result_generator = dxpy.find_data_objects(classname=in_class,
                                                          typename=param_desc.get('type'),
                                                          describe=True,
                                                          project=query_project)
                print('\nAvailable data:')
                result_choice = paginate_and_pick(result_generator,
                                                  (lambda result: get_ls_l_desc(result['describe'])))
                if result_choice == 'none found':
                    print('No compatible data found')
                    continue
                elif result_choice == 'none picked':
                    continue
                else:
                    return [result_choice['project'] + ':' + result_choice['id']]
            elif opt_num == 3:
                # Select from previous jobs in current project
                result_generator = dxpy.find_jobs(project=dxpy.WORKSPACE_ID,
                                                  describe=True,
                                                  parent_job="none")
                print()
                print('Previously-run jobs to choose from:')
                result_choice = paginate_and_pick(result_generator,
                                                  (lambda result: get_find_executions_string(result['describe'],
                                                                                             has_children=False,
                                                                                             single_result=True)),
                                                  filter_fn=(lambda result: result['describe']['state'] not in ['unresponsive', 'terminating', 'terminated', 'failed']))
                if result_choice == 'none found':
                    print('No jobs found')
                    continue
                elif result_choice == 'none picked':
                    continue
                else:
                    if 'output' in result_choice['describe'] and result_choice['describe']['output'] is not None:
                        # list() so keys can be indexed below; dict views are
                        # not subscriptable on Python 3
                        keys = list(result_choice['describe']['output'].keys())
                    else:
                        exec_handler = dxpy.get_handler(result_choice.get('app', result_choice['applet']))
                        exec_desc = exec_handler.describe()
                        if 'outputSpec' not in exec_desc:
                            # This if block will either continue, return, or raise
                            print('No output spec found for the executable')
                            try:
                                field = input('Output field to use (^C or <ENTER> to cancel): ')
                                if field == '':
                                    continue
                                else:
                                    return [result_choice['id'] + ':' + field]
                            except KeyboardInterrupt:
                                continue
                        else:
                            keys = list(exec_desc['outputSpec'].keys())
                    if len(keys) > 1:
                        print('\nOutput fields to choose from:')
                        field_choice = pick(keys)
                        return [result_choice['id'] + ':' + keys[field_choice]]
                    elif len(keys) == 1:
                        print('Using the only output field: ' + keys[0])
                        return [result_choice['id'] + ':' + keys[0]]
                    else:
                        print('No available output fields')
            else:
                print(fill('Enter an ID or path (<TAB> twice for compatible ' + in_class + 's in current directory)' + (array_help_str if is_array else '')))
                return shlex.split(input(prompt))
    else:
        if in_class == 'boolean':
            if is_array:
                print(fill('Enter "true", "false"' + array_help_str))
            else:
                print(fill('Enter "true" or "false"'))
        elif in_class == 'string' and is_array:
            print(fill('Enter a nonempty string' + array_help_str))
        elif (in_class == 'float' or in_class == 'int') and is_array:
            print(fill('Enter a number' + array_help_str))
        elif in_class == 'hash':
            print(fill('Enter a quoted JSON hash'))
        result = input(prompt)
        if in_class == 'string':
            return [result]
        else:
            return shlex.split(result)
Ejemplo n.º 7
0
def interactive_help(in_class, param_desc, prompt):
    """Interactively prompt the user to supply a value for one input.

    Args:
        in_class (str): the input's class, e.g. "string", "int", or a data
            object class (array classes start with "array:")
        param_desc (dict): input spec entry for this parameter
        prompt (str): prompt text used when asking for a value directly

    Returns:
        list: string token(s) representing the chosen value(s)
    """
    is_array = param_desc['class'].startswith("array:")
    print_param_help(param_desc)
    print()
    array_help_str = ', or <ENTER> to finish the list of inputs'
    if in_class in dx_data_classes:
        # Class is some sort of data object
        if dxpy.WORKSPACE_ID is not None:
            proj_name = None
            try:
                proj_name = dxpy.api.project_describe(dxpy.WORKSPACE_ID)['name']
            except Exception:
                # Best effort only; the prompt still works without a name.
                pass
            if proj_name is not None:
                print('Your current working directory is ' + proj_name + ':' + dxpy.config.get('DX_CLI_WD', '/'))
        while True:
            print('Pick an option to find input data:')
            try:
                opt_num = pick(['List and choose from available data in the current project',
                                'List and choose from available data in the DNAnexus Reference Genomes Files project',
                                'Select another project to list and choose available data',
                                'Select an output from a previously-run job (current project only)',
                                'Return to original prompt (specify an ID or path directly)'])
            except KeyboardInterrupt:
                opt_num = 4
            if opt_num == 0:
                query_project = dxpy.WORKSPACE_ID
            elif opt_num == 1:
                # Match the reference project in the workspace's own region
                region = None
                if dxpy.WORKSPACE_ID:
                    region = dxpy.describe(dxpy.WORKSPACE_ID).get("region")
                query_project = dxpy.find_one_project(name="Reference Genome Files:*", public=True, billed_to="org-dnanexus_apps", level="VIEW", name_mode="glob", region=region)['id']
            elif opt_num == 2:
                project_generator = dxpy.find_projects(level='VIEW', describe=True, explicit_perms=True)
                print('\nProjects to choose from:')
                query_project = paginate_and_pick(project_generator, (lambda result: result['describe']['name']))['id']
            if opt_num in range(3):
                result_generator = dxpy.find_data_objects(classname=in_class,
                                                          typename=param_desc.get('type'),
                                                          describe=dict(fields=get_ls_l_desc_fields()),
                                                          project=query_project)
                print('\nAvailable data:')
                result_choice = paginate_and_pick(result_generator,
                                                  (lambda result: get_ls_l_desc(result['describe'])))
                if result_choice == 'none found':
                    print('No compatible data found')
                    continue
                elif result_choice == 'none picked':
                    continue
                else:
                    return [result_choice['project'] + ':' + result_choice['id']]
            elif opt_num == 3:
                # Select from previous jobs in current project
                result_generator = dxpy.find_jobs(project=dxpy.WORKSPACE_ID,
                                                  describe=True,
                                                  parent_job="none")
                print()
                print('Previously-run jobs to choose from:')
                result_choice = paginate_and_pick(result_generator,
                                                  (lambda result: get_find_executions_string(result['describe'],
                                                                                             has_children=False,
                                                                                             single_result=True)),
                                                  filter_fn=(lambda result: result['describe']['state'] not in ['unresponsive', 'terminating', 'terminated', 'failed']))
                if result_choice == 'none found':
                    print('No jobs found')
                    continue
                elif result_choice == 'none picked':
                    continue
                else:
                    if 'output' in result_choice['describe'] and result_choice['describe']['output'] is not None:
                        # list() so keys can be indexed below; dict views are
                        # not subscriptable on Python 3
                        keys = list(result_choice['describe']['output'].keys())
                    else:
                        exec_handler = dxpy.get_handler(result_choice.get('app', result_choice['applet']))
                        exec_desc = exec_handler.describe()
                        if 'outputSpec' not in exec_desc:
                            # This if block will either continue, return, or raise
                            print('No output spec found for the executable')
                            try:
                                field = input('Output field to use (^C or <ENTER> to cancel): ')
                                if field == '':
                                    continue
                                else:
                                    return [result_choice['id'] + ':' + field]
                            except KeyboardInterrupt:
                                continue
                        else:
                            keys = list(exec_desc['outputSpec'].keys())
                    if len(keys) > 1:
                        print('\nOutput fields to choose from:')
                        field_choice = pick(keys)
                        return [result_choice['id'] + ':' + keys[field_choice]]
                    elif len(keys) == 1:
                        print('Using the only output field: ' + keys[0])
                        return [result_choice['id'] + ':' + keys[0]]
                    else:
                        print('No available output fields')
            else:
                print(fill('Enter an ID or path (<TAB> twice for compatible ' + in_class + 's in current directory)' + (array_help_str if is_array else '')))
                return shlex.split(input(prompt))
    else:
        if in_class == 'boolean':
            if is_array:
                print(fill('Enter "true", "false"' + array_help_str))
            else:
                print(fill('Enter "true" or "false"'))
        elif in_class == 'string' and is_array:
            print(fill('Enter a nonempty string' + array_help_str))
        elif (in_class == 'float' or in_class == 'int') and is_array:
            print(fill('Enter a number' + array_help_str))
        elif in_class == 'hash':
            print(fill('Enter a quoted JSON hash'))
        result = input(prompt)
        if in_class == 'string':
            return [result]
        else:
            return shlex.split(result)
Ejemplo n.º 8
0
def dnanexus_workflows_get(event, context):
    """Handle GET /workflows/{workflow_id}.

    Args:
        event (dict): has a key "workflow_id" that's been taken from the URL.
            This is the id that was generated when the POST request was made,
            and the dnanexus executions we care about should be tagged with it.
        context: an AWS context object; only aws_request_id is read, and only
            when the workflow cannot be found.

    Returns:
        dict: WES-style workflow description with per-task logs, or a
            NotFound error dict.
    """

    auth_header = set_dx_authorization(event["headers"]["Authorization"])
    wes_workflow_id = event["workflow_id"]

    # First try to find the dxWDL job that's the parent job of everything
    try:
        base_job = list(
            dxpy.find_jobs(properties={"wes_id": wes_workflow_id},
                           name="WES dxWDL Runner",
                           return_handler=True))[0]
    except IndexError:
        error_dict = {
            "errorType": "NotFound",
            "httpStatus": "404",
            "requestId": context.aws_request_id,
            "message": "Workflow {} was not found".format(wes_workflow_id)
        }
        return error_dict

    child_jobs = list(
        dxpy.find_jobs(root_execution=base_job.get_id(), return_handler=True))
    child_job_ids = [j.get_id() for j in child_jobs]

    response = {
        "state": "",
        "workflow_id": "",
        "workflow_log": {
            "start_time": "",
            "end_time": "",
            "stdout": "",
            "stderr": "",
            "exit_code": -1
        },
        "task_logs": []
    }

    # Query the dnanexus state and translate that to a WES state
    dx_state = base_job.describe()["state"]
    response["state"] = dx_to_wes_state(dx_state)

    def get_logs_for_job(dx_job_id):
        """Retrieve the logs for a single DXJob via `dx watch`."""

        dx_exe_path = os.path.abspath("bin/dx")
        cmd = [
            "dx", "watch", "-q", "--no-timestamps", "--get-streams",
            "--no-follow", dx_job_id
        ]
        # stderr folded into stdout so the whole log comes back as one blob
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            env={
                "DX_SECURITY_CONTEXT":
                auth_header,
                "PYTHONPATH":
                ':'.join([
                    os.environ.get("PYTHONPATH", ""),
                    os.path.dirname(os.path.dirname(dx_exe_path))
                ]),
                "PATH":
                ':'.join([os.environ["PATH"],
                          os.path.dirname(dx_exe_path)])
            })
        stdout, _ = proc.communicate()
        return stdout

    # Fetch all child-job logs concurrently; `dx watch` is I/O bound.
    pool = Pool(8)

    jobs_to_logs = dict(
        zip(child_job_ids, pool.map(get_logs_for_job, child_job_ids)))

    for job_id in child_job_ids:
        dx_job = dxpy.DXJob(job_id)
        job_desc = dx_job.describe()

        task_name = job_desc["executableName"]
        time_fmt = "{:%Y-%m-%dT%H:%M:%S}"
        # Timestamps are epoch milliseconds and may be absent (or None) while
        # a job is still queued; fall back to "" rather than failing.  The
        # narrowed except no longer swallows KeyboardInterrupt/SystemExit.
        try:
            start_time = time_fmt.format(
                datetime.datetime.fromtimestamp(job_desc["startedRunning"] /
                                                1000))
        except (KeyError, TypeError, ValueError, OverflowError, OSError):
            start_time = ""
        try:
            end_time = time_fmt.format(
                datetime.datetime.fromtimestamp(job_desc["stoppedRunning"] /
                                                1000))
        except (KeyError, TypeError, ValueError, OverflowError, OSError):
            end_time = ""

        # Every child id was zipped into jobs_to_logs; .get is belt-and-braces
        log = jobs_to_logs.get(job_id, "")

        response["task_logs"].append({
            "name": task_name + ":" + job_id,
            "start_time": start_time,
            "end_time": end_time,
            "stdout": log,
            "stderr": "",
            "exit_code": -1
        })

    return response