def load_job(workflow, job, cwl_args=None, cwd=None):
    """
    Tries to load a JSON object from "job". If that fails, assumes that
    "job" has already been parsed into an object. Inits the loaded
    "job_data" based on the "workflow" (mostly for setting defaults
    from the workflow inputs; never fails). "cwl_args" can be used to
    update parameters for the loading and runtime contexts. If "job"
    was a file, resolves relative paths based on the job file location.
    If "job" was already parsed into an object, resolves relative paths
    based on "cwd". If "cwd" was None, uses the "inputs_folder" value
    from "cwl_args" or its default value returned from the
    "get_default_cwl_args" function. Checking links after relative
    paths are resolved is disabled (checklinks is set to False in both
    places). This prevents schema salad from raising an exception in
    those cases when an input file will be created from the provided
    content during workflow execution.

    Always returns CommentedMap
    """

    cwl_args = {} if cwl_args is None else cwl_args

    default_cwl_args = get_default_cwl_args(cwl_args)

    cwd = default_cwl_args["inputs_folder"] if cwd is None else cwd

    loading_context = setup_loadingContext(
        LoadingContext(default_cwl_args),
        RuntimeContext(default_cwl_args),
        argparse.Namespace(**default_cwl_args)
    )

    job_copy = deepcopy(job)

    try:
        # "job" points to an existing file: let the loader resolve it
        job_data, _ = loading_context.loader.resolve_ref(
            job_copy, checklinks=False
        )
    except (FileNotFoundError, SchemaSaladException):
        # "job" was already a parsed object: anchor relative paths to "cwd"
        job_data = load_yaml(json.dumps(job_copy))
        job_data["id"] = file_uri(cwd) + "/"
        job_data, _ = loading_context.loader.resolve_all(
            job_data, job_data["id"], checklinks=False
        )

    initialized_job_data = init_job_order(
        job_order_object=job_data,
        args=argparse.Namespace(**default_cwl_args),
        process=slow_cwl_load(workflow=workflow, cwl_args=default_cwl_args),
        loader=loading_context.loader,
        stdout=os.devnull
    )

    return initialized_job_data
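
# A minimal usage sketch (the paths below are hypothetical, not files
# shipped with this module). Relative locations inside "job.json" are
# resolved against the job file itself; a pre-parsed dict would be
# resolved against "cwd" instead:
#
#     job_data = load_job(
#         workflow="/absolute/path/to/workflow.cwl",
#         job="/absolute/path/to/job.json"
#     )
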

def collect_reports(context, task_ids=None):
    """
    Collects reports from "context" for the specified "task_ids".
    If "task_ids" was not set, uses all tasks from the DAG.
    Loads and merges data from the collected reports.
    """

    task_ids = context["dag"].task_ids if task_ids is None else task_ids

    job_data = {}
    for report_location in context["ti"].xcom_pull(task_ids=task_ids):
        if report_location is not None:
            job_data = merge(job_data, load_yaml(report_location))

    return job_data
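
# A hedged sketch of a typical call site (the "context" argument follows
# Airflow's task context convention with "dag" and "ti" entries; the task
# ids below are hypothetical):
#
#     job_data = collect_reports(
#         context,                           # passed into the task callable
#         task_ids=["step_one", "step_two"]  # or None to pull from all tasks
#     )
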

def remove_tmp_data(dr):
    """
    Searches the collected reports of all task instances from the
    DagRun "dr" for "tmp_folder" fields and removes those folders.
    """

    logging.info(f"Searching tmp data for dag_id: {dr.dag_id}, run_id: {dr.run_id}")

    tmp_folder_set = set()
    for ti in dr.get_task_instances():
        logging.info(
            f"Task: {ti.task_id}, execution_date: {ti.execution_date}, "
            f"pid: {ti.pid}, state: {ti.state}"
        )
        try:
            logging.info(" - searching for tmp_folder in the report file")
            report_location = ti.xcom_pull(task_ids=ti.task_id)
            tmp_folder_set.add(load_yaml(report_location)["tmp_folder"])
        except Exception:
            logging.info(" - report file has already been deleted or is missing the tmp_folder field")

    for tmp_folder in tmp_folder_set:
        try:
            logging.info(f"Removing tmp data from {tmp_folder}")
            shutil.rmtree(tmp_folder)
        except Exception as ex:
            logging.error(f"Failed to delete {tmp_folder}\n{ex}")
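
# A minimal sketch, assuming "context" is the dictionary Airflow passes
# into a DAG-level callback, so that context["dag_run"] is a DagRun
# object (the callback name is hypothetical):
#
#     def clean_up_tmp_data(context):
#         remove_tmp_data(context["dag_run"])
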

def load_test_suite(args):
    """
    Loads tests from the provided --suite file. Selects tests based on
    the indices from --range. Updates tool locations to be absolute,
    loads jobs and updates all input file locations to be absolute too.
    Adds "outputs_folder" to the job, as well as the "index" to indicate
    which test case was used. Adds run_ids as keys for easy access and
    proper test identification when receiving results.
    """

    suite_data = load_yaml(args.suite)
    suite_dir = os.path.dirname(args.suite)
    suite_data_filtered = OrderedDict()  # use OrderedDict just to keep it similar to suite_data
    for i in args.range:
        test_data = suite_data[i]
        run_id = str(uuid.uuid4())
        tool_location = get_absolute_path(test_data["tool"], suite_dir)
        if "job" in test_data:
            # resolve the job location only when the test actually provides one
            job_location = get_absolute_path(test_data["job"], suite_dir)
            job_data = load_job(
                workflow=tool_location,
                job=job_location
            )
        else:
            job_location = None
            job_data = {}
        job_data["outputs_folder"] = get_dir(os.path.join(args.tmp, run_id))

        test_data.update({
            "job": job_data,                           # already parsed, includes "outputs_folder"
            "tool": tool_location,
            "dag_id": get_rootname(test_data["tool"]),
            "index": i + 1,                            # to know test case number, 1-based to correspond to --range
            "finished": False                          # to indicate whether the test was finished or not
        })
        logging.info(f"Load test case {i + 1} to run {tool_location} with {job_location} as {run_id}")
        suite_data_filtered[run_id] = test_data        # use "run_id" as a key for fast access when checking results
    return suite_data_filtered
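
# A hedged sketch of the "args" shape this function expects: an
# argparse.Namespace with "suite", "range" (already converted to 0-based
# indices by assert_and_fix_args_for_test below), and "tmp" fields.
# All values are hypothetical:
#
#     args = argparse.Namespace(
#         suite="/absolute/path/to/conformance_tests.yaml",
#         range=[0, 1, 2],
#         tmp="/tmp/cwl_airflow_tests"
#     )
#     suite_data = load_test_suite(args)  # OrderedDict keyed by run_id
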

def assert_and_fix_args_for_test(args):
    """
    Asserts, fixes and sets parameters from the test parser.
    Tries to convert the --range argument into a list of indices.
    If --range wasn't set or the indices are not valid, sets it to
    include all tests from --suite. Duplicates are removed.
    This function should never fail.
    """

    suite_data = load_yaml(args.suite)
    suite_len = len(suite_data)
    try:
        selected_indices = []
        for interval in args.range.split(","):
            parsed_interval = [int(i) - 1 for i in interval.split("-")]  # switch to 0-based coordinates
            if len(parsed_interval) == 2:
                # safety checks
                if parsed_interval[0] < 0:
                    raise ValueError
                if parsed_interval[0] >= parsed_interval[1]:
                    raise ValueError
                if parsed_interval[0] >= suite_len:
                    raise ValueError
                if parsed_interval[1] >= suite_len:
                    raise ValueError
                selected_indices.extend(
                    range(parsed_interval[0], parsed_interval[1] + 1)  # need closed interval
                )
            elif len(parsed_interval) == 1:
                # safety checks
                if parsed_interval[0] < 0:
                    raise ValueError
                if parsed_interval[0] >= suite_len:
                    raise ValueError
                selected_indices.append(parsed_interval[0])
            else:
                raise ValueError
    except (AttributeError, IndexError, ValueError):
        selected_indices = list(range(0, suite_len))
    args.range = sorted(set(selected_indices))  # convert to set to remove possible duplicates
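
# A sketch of how --range strings are interpreted (1-based, closed
# intervals on the command line; the suite path is hypothetical and is
# only read to find out how many tests exist):
#
#     args = argparse.Namespace(suite="tests.yaml", range="1-3,5")
#     assert_and_fix_args_for_test(args)
#     # args.range is now [0, 1, 2, 4]
#
#     args = argparse.Namespace(suite="tests.yaml", range="not-a-range")
#     assert_and_fix_args_for_test(args)
#     # falls back to all tests: args.range == list(range(len(suite_data)))
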

def test_load_yaml_from_str_should_fail(location):
    with pytest.raises((YAMLError, ValueError)):
        load_yaml(location)

def test_load_yaml_from_file_should_fail():
    location = path.join(DATA_FOLDER, "jobs", "dummy.json")
    with pytest.raises(ValueError):
        load_yaml(location)

def test_load_yaml_from_file():
    location = path.join(DATA_FOLDER, "jobs", "bam-bedgraph-bigwig.json")
    data = load_yaml(location)
    assert "bam_file" in data, "Failed to load yaml (json)"