def build_cwltool_cmd2(**kwargs):
    """Build the quoted cwltool command string for pipeline stage 2.

    Stage 2 consumes the OME-TIFF pyramids written by stage 1 into the
    run's temp directory; the workflow itself is pointed at the relative
    './ometiff-pyramids' path.
    """
    conf = kwargs['dag_run'].conf
    # Scratch directory for this run (lives under /hubmap-tmp)
    tmp_path = utils.get_tmp_dir_path(kwargs['run_id'])
    print('tmpdir: ', tmp_path)
    # Upstream dataset location, logged for traceability
    print('parent_data_dir: ', conf['parent_lz_path'])
    # Stage 2 reads the pyramids that stage 1 wrote into cwl_out
    stage1_output = os.path.join(tmp_path, 'cwl_out', 'ometiff-pyramids')
    print('data_dir: ', stage1_output)
    cmd = list(get_cwltool_base_cmd(tmp_path))
    cmd.append(cwl_workflows[1])
    cmd.extend(['--input_directory', './ometiff-pyramids'])
    return join_quote_command_str(cmd)
def build_cwltool_cmd2(**kwargs):
    """Build the quoted cwltool command string for pipeline stage 2.

    Runs the workflow at PIPELINE_BASE_DIR / cwl_workflow2 with
    --input_dir '.', with the cwltool bin directory prepended to PATH.
    Returns a single shell-safe command string.
    """
    ctx = kwargs['dag_run'].conf
    run_id = kwargs['run_id']
    tmpdir = utils.get_tmp_dir_path(run_id)
    print('tmpdir: ', tmpdir)
    data_dir = ctx['parent_lz_path']
    print('data_dir: ', data_dir)
    # Use the shared helper (also used by the other cmd builders) instead of
    # re-implementing the walk up from cwltool.__file__ to the 'lib' parent.
    # This also removes the bare 'assert', which python -O would strip.
    cwltool_dir = get_cwltool_bin_path()
    command = [
        'env',
        'PATH=%s:%s' % (cwltool_dir, os.environ['PATH']),
        'cwltool',
        os.fspath(PIPELINE_BASE_DIR / cwl_workflow2),
        '--input_dir',
        '.'
    ]
    command_str = ' '.join(shlex.quote(piece) for piece in command)
    print('final command_str: %s' % command_str)
    return command_str
def build_cwltool_cmd1(**kwargs):
    """Build the quoted cwltool command string for pipeline stage 1.

    Passes the parent dataset directory as --data_dir and returns a
    single shell-safe command string.
    """
    ctx = kwargs['dag_run'].conf
    run_id = kwargs['run_id']
    tmpdir = utils.get_tmp_dir_path(run_id)
    print('tmpdir: ', tmpdir)
    data_dir = ctx['parent_lz_path']
    print('data_dir: ', data_dir)
    cwltool_dir = get_cwltool_bin_path()
    # Prepend the cwltool bin directory so the expected cwltool is found
    command = [
        'env',
        'PATH=%s:%s' % (cwltool_dir, os.environ['PATH']),
        'cwltool',
        os.fspath(PIPELINE_BASE_DIR / cwl_workflow1),
        '--data_dir',
        data_dir,
    ]
    # (Removed a commented-out 'cp -R ... cwl_out' debugging fallback that
    # was dead code.)
    command_str = ' '.join(shlex.quote(piece) for piece in command)
    print('final command_str: %s' % command_str)
    return command_str
def build_cwltool_cmd1(**kwargs):
    """Build the quoted cwltool command string for pipeline stage 1.

    Accepts either a single FASTQ source directory or a list of them in
    the dag_run conf's 'parent_lz_path'; one --fastq_dir flag is emitted
    per source directory.
    """
    conf = kwargs["dag_run"].conf
    scratch = utils.get_tmp_dir_path(kwargs["run_id"])
    print("tmpdir: ", scratch)
    sources = conf["parent_lz_path"]
    # A bare string means a single source directory; normalize to a list
    if isinstance(sources, str):
        sources = [sources]
    print("data_dirs: ", sources)
    cmd = list(get_cwltool_base_cmd(scratch))
    cmd += [
        "--relax-path-checks",
        "--debug",
        "--outdir",
        scratch / "cwl_out",
        "--parallel",
        cwl_workflows[0],
        "--assay",
        params.assay,
        "--threads",
        THREADS,
    ]
    for src in sources:
        cmd.extend(["--fastq_dir", src])
    return join_quote_command_str(cmd)
def build_cwltool_cmd2(**kwargs):
    """Build the quoted cwltool command string for pipeline stage 2.

    The workflow's --input_dir is the current working directory.
    """
    conf = kwargs["dag_run"].conf
    scratch = utils.get_tmp_dir_path(kwargs["run_id"])
    print("tmpdir: ", scratch)
    parent_dir = conf["parent_lz_path"]
    print("data_dir: ", parent_dir)
    cmd = list(get_cwltool_base_cmd(scratch))
    cmd += [cwl_workflows[1], "--input_dir", "."]
    return join_quote_command_str(cmd)
def build_cwltool_cwl_cytokit(**kwargs):
    """Build the quoted cwltool command string for the Cytokit workflow.

    Pins the workflow to GPUs 0 and 1 and points --data_dir at the parent
    dataset directory.
    """
    conf = kwargs['dag_run'].conf
    scratch = utils.get_tmp_dir_path(kwargs['run_id'])
    print('tmpdir: ', scratch)
    dataset_dir = conf['parent_lz_path']
    print('data_dir: ', dataset_dir)
    cmd = list(get_cwltool_base_cmd(scratch))
    cmd.append(cwl_workflows['cytokit'])
    cmd.append('--gpus=0,1')
    cmd.extend(['--data_dir', dataset_dir])
    return join_quote_command_str(cmd)
def build_cwltool_cmd_sprm_to_anndata(**kwargs):
    """Build the quoted cwltool command converting SPRM outputs to AnnData.

    Reads the 'sprm_outputs' subdirectory of this run's cwl_out area,
    which an earlier stage populated.
    """
    conf = kwargs['dag_run'].conf
    scratch = utils.get_tmp_dir_path(kwargs['run_id'])
    print('tmpdir: ', scratch)
    print('parent_data_dir: ', conf['parent_lz_path'])
    # This stage reads input produced by the earlier pipeline stage
    stage_out = scratch / 'cwl_out'
    print('data_dir: ', stage_out)
    cmd = list(get_cwltool_base_cmd(scratch))
    cmd += [
        cwl_workflows['sprm_to_anndata'],
        '--input_dir',
        stage_out / 'sprm_outputs',
    ]
    return join_quote_command_str(cmd)
def build_cwltool_cmd_ome_tiff_offsets(**kwargs):
    """Build the quoted cwltool command for the OME-TIFF offsets workflow.

    Its input is the 'stitched/expressions' subdirectory of this run's
    cwl_out area.
    """
    conf = kwargs['dag_run'].conf
    scratch = utils.get_tmp_dir_path(kwargs['run_id'])
    print('tmpdir: ', scratch)
    print('parent_data_dir: ', conf['parent_lz_path'])
    stage_out = scratch / 'cwl_out'
    print('data_dir: ', stage_out)
    cmd = list(get_cwltool_base_cmd(scratch))
    cmd += [
        cwl_workflows['ome_tiff_offsets'],
        '--input_dir',
        stage_out / 'stitched/expressions',
    ]
    return join_quote_command_str(cmd)
def build_cwltool_cmd3(**kwargs):
    """Build the quoted cwltool command string for pipeline stage 3."""
    conf = kwargs["dag_run"].conf
    scratch = utils.get_tmp_dir_path(kwargs["run_id"])
    print("tmpdir: ", scratch)
    parent_dir = conf["parent_lz_path"]
    print("data_dir: ", parent_dir)
    cmd = list(get_cwltool_base_cmd(scratch))
    cmd.append(cwl_workflows[2])
    # This pipeline invocation runs in a 'hubmap_ui' subdirectory,
    # so use the parent directory as input
    cmd.extend(["--input_dir", ".."])
    return join_quote_command_str(cmd)
def build_cwltool_cmd1(**kwargs):
    """Build the quoted cwltool command string for pipeline stage 1.

    Points --fastq_dir at the parent dataset directory and writes outputs
    to the run's cwl_out scratch subdirectory.
    """
    conf = kwargs['dag_run'].conf
    scratch = utils.get_tmp_dir_path(kwargs['run_id'])
    fastq_dir = conf['parent_lz_path']
    cmd = list(get_cwltool_base_cmd(scratch))
    cmd += [
        '--outdir',
        scratch / 'cwl_out',
        '--parallel',
        cwl_workflows[0],
        '--fastq_dir',
        fastq_dir,
        '--threads',
        THREADS,
    ]
    return join_quote_command_str(cmd)
def build_cwltool_cmd1(**kwargs):
    """Build the quoted cwltool command string for pipeline stage 1.

    Runs the workflow at PIPELINE_BASE_DIR / cwl_workflow1 in parallel
    debug mode, writing to the run's cwl_out scratch subdirectory, and
    returns a single shell-safe command string.
    """
    ctx = kwargs['dag_run'].conf
    run_id = kwargs['run_id']
    tmpdir = utils.get_tmp_dir_path(run_id)
    print('tmpdir: ', tmpdir)
    data_dir = ctx['parent_lz_path']
    print('data_dir: ', data_dir)
    # Use the shared helper (as the other cmd builders do) instead of
    # re-implementing the walk up from cwltool.__file__ to the 'lib'
    # parent; this also removes the bare 'assert' that python -O strips.
    cwltool_dir = get_cwltool_bin_path()
    command = [
        'env',
        'PATH=%s:%s' % (cwltool_dir, os.environ['PATH']),
        'cwltool',
        '--debug',
        '--outdir',
        os.path.join(tmpdir, 'cwl_out'),
        '--parallel',
        os.fspath(PIPELINE_BASE_DIR / cwl_workflow1),
        '--fastq_dir',
        data_dir,
        '--threads',
        str(THREADS),
    ]
    # (Removed a commented-out 'cp -R ... cwl_out' debugging fallback that
    # was dead code.)
    command_str = ' '.join(shlex.quote(piece) for piece in command)
    print('final command_str: %s' % command_str)
    return command_str
def build_cwltool_cmd_create_vis_symlink_archive(**kwargs):
    """Build the quoted cwltool command that creates the visualization
    symlink archive from this run's stitched and SPRM outputs."""
    conf = kwargs['dag_run'].conf
    scratch = utils.get_tmp_dir_path(kwargs['run_id'])
    print('tmpdir: ', scratch)
    print('parent_data_dir: ', conf['parent_lz_path'])
    stage_out = scratch / 'cwl_out'
    print('data_dir: ', stage_out)
    cmd = list(get_cwltool_base_cmd(scratch))
    cmd += [
        cwl_workflows['create_vis_symlink_archive'],
        '--ometiff_dir',
        stage_out / 'stitched',
        '--sprm_output',
        stage_out / 'sprm_outputs',
    ]
    return join_quote_command_str(cmd)
def flex_maybe_spawn(**kwargs):
    """
    This is a generator which returns appropriate DagRunOrders.

    If both the metadata extraction and consistency-test tasks returned 0,
    it reads the scanned metadata from rslt.yml and yields one
    (dag_id, DagRunOrder) pair per downstream workflow matching the
    collectiontype/assay_type pulled from XCom; otherwise it yields
    nothing.
    """
    print('kwargs:')
    pprint(kwargs)
    print('dag_run conf:')
    ctx = kwargs['dag_run'].conf
    pprint(ctx)
    md_extract_retcode = int(
        kwargs['ti'].xcom_pull(task_ids="run_md_extract"))
    md_consistency_retcode = int(
        kwargs['ti'].xcom_pull(task_ids="md_consistency_tests"))
    if md_extract_retcode == 0 and md_consistency_retcode == 0:
        collectiontype = kwargs['ti'].xcom_pull(key='collectiontype',
                                                task_ids="send_status_msg")
        assay_type = kwargs['ti'].xcom_pull(key='assay_type',
                                            task_ids="send_status_msg")
        print('collectiontype: <{}>, assay_type: <{}>'.format(
            collectiontype, assay_type))
        md_fname = os.path.join(utils.get_tmp_dir_path(kwargs['run_id']),
                                'rslt.yml')
        with open(md_fname, 'r') as f:
            md = yaml.safe_load(f)
        # (Removed a dead 'payload = {k: conf[k] ...}' assignment that was
        # immediately overwritten by the dict below.)
        payload = {
            'ingest_id': ctx['run_id'],
            'crypt_auth_tok': ctx['crypt_auth_tok'],
            'parent_lz_path': ctx['lz_path'],
            'parent_submission_id': ctx['submission_id'],
            'metadata': md,
            'dag_provenance_list': utils.get_git_provenance_list(__file__)
        }
        for next_dag in utils.downstream_workflow_iter(
                collectiontype, assay_type):
            yield next_dag, DagRunOrder(payload=payload)
    else:
        return None
def build_cwltool_cmd_sprm(**kwargs):
    """Build the quoted cwltool command for the SPRM workflow.

    Reads the stitched expression images and masks that an earlier stage
    wrote into this run's cwl_out area.
    """
    conf = kwargs['dag_run'].conf
    scratch = utils.get_tmp_dir_path(kwargs['run_id'])
    print('tmpdir: ', scratch)
    print('parent_data_dir: ', conf['parent_lz_path'])
    stage_out = scratch / 'cwl_out'
    print('data_dir: ', stage_out)
    cmd = list(get_cwltool_base_cmd(scratch))
    cmd.append(cwl_workflows['sprm'])
    cmd.append('--enable_manhole')
    cmd.extend(['--image_dir', stage_out / 'stitched/expressions'])
    cmd.extend(['--mask_dir', stage_out / 'stitched/mask'])
    return join_quote_command_str(cmd)
def build_cwltool_cwl_ome_tiff_pyramid(**kwargs):
    """Build the quoted cwltool command for the OME-TIFF pyramid workflow.

    The workflow's --ometiff_directory is the current working directory;
    the parent dataset path is logged for traceability.
    """
    conf = kwargs['dag_run'].conf
    # Temp directory under /hubmap-tmp for this run
    scratch = utils.get_tmp_dir_path(kwargs['run_id'])
    print('tmpdir: ', scratch)
    dataset_dir = conf['parent_lz_path']
    print('data_dir: ', dataset_dir)
    cmd = list(get_cwltool_base_cmd(scratch))
    cmd.append("--relax-path-checks")
    cmd.append(cwl_workflows['ome_tiff_pyramid'])
    cmd.extend(['--ometiff_directory', '.'])
    return join_quote_command_str(cmd)
def build_cwltool_cmd1(**kwargs):
    """Build the quoted cwltool command string for pipeline stage 1.

    Points --ometiff_directory at the parent dataset directory.
    """
    conf = kwargs['dag_run'].conf
    # Temp directory under /hubmap-tmp for this run
    scratch = utils.get_tmp_dir_path(kwargs['run_id'])
    print('tmpdir: ', scratch)
    # Input directory under /hubmap-data
    dataset_dir = conf['parent_lz_path']
    print('data_dir: ', dataset_dir)
    cmd = list(get_cwltool_base_cmd(scratch))
    cmd.append(cwl_workflows[0])
    cmd.extend(['--ometiff_directory', dataset_dir])
    return join_quote_command_str(cmd)
def build_cwltool_cmd1(**kwargs):
    """Build the quoted cwltool command string for pipeline stage 1.

    Accepts one sequence directory or a list of them in the dag_run
    conf's 'parent_lz_path'; one --sequence_directory flag is emitted per
    directory.
    """
    conf = kwargs['dag_run'].conf
    scratch = utils.get_tmp_dir_path(kwargs['run_id'])
    seq_dirs = conf['parent_lz_path']
    # A bare string means a single source directory; normalize to a list
    if isinstance(seq_dirs, str):
        seq_dirs = [seq_dirs]
    cmd = list(get_cwltool_base_cmd(scratch))
    cmd += [
        '--outdir',
        scratch / 'cwl_out',
        '--parallel',
        cwl_workflows[0],
        '--threads',
        THREADS,
    ]
    for seq_dir in seq_dirs:
        cmd.extend(['--sequence_directory', seq_dir])
    return join_quote_command_str(cmd)
def build_cwltool_cmd3(**kwargs):
    """Build the quoted cwltool command string for pipeline stage 3.

    Its input is the 'sprm_outputs' subdirectory of the cwl_out area that
    stage 1 populated; the cwltool bin directory is prepended to PATH.
    """
    conf = kwargs['dag_run'].conf
    scratch = utils.get_tmp_dir_path(kwargs['run_id'])
    print('tmpdir: ', scratch)
    print('parent_data_dir: ', conf['parent_lz_path'])
    # This stage reads input from stage 1
    stage_out = os.path.join(scratch, 'cwl_out')
    print('data_dir: ', stage_out)
    bin_dir = get_cwltool_bin_path()
    cmd = [
        'env',
        'PATH=%s:%s' % (bin_dir, os.environ['PATH']),
        'cwltool',
        os.fspath(PIPELINE_BASE_DIR / cwl_workflow3),
        '--input_dir',
        os.path.join(stage_out, 'sprm_outputs')
    ]
    quoted = ' '.join(shlex.quote(piece) for piece in cmd)
    print('final command_str: %s' % quoted)
    return quoted
def build_cwltool_cmd1(**kwargs):
    """Build a multi-command shell string that stages files for this run.

    Emits 'tmp_dir=... ; sleep N ; cd <data_dir> ; mkdir -p <cwl_out>
    [; cp <files> <cwl_out>]' joined with ' ; ', each piece quoted by
    join_quote_command_str.
    """
    ctx = kwargs['dag_run'].conf
    run_id = kwargs['run_id']
    tmpdir = utils.get_tmp_dir_path(run_id)
    tmp_subdir = tmpdir / 'cwl_out'
    data_dir = ctx['parent_lz_path']
    try:
        delay_sec = int(ctx['metadata']['delay_sec'])
    except ValueError:
        # Unparseable delay_sec in the metadata falls back to 30 seconds
        print("Could not parse delay_sec "
              "{} ; defaulting to 30 sec".format(
                  ctx['metadata']['delay_sec']))
        delay_sec = 30
    # Log each file slated for copying
    for fname in ctx['metadata']['files_to_copy']:
        print(fname)
    # Each inner list becomes one shell command; they are joined with ';'
    # below.  NOTE(review): delay_sec is an int here — assumes
    # join_quote_command_str stringifies pieces; confirm against helper.
    commands = [
        [f'tmp_dir={tmpdir}'],
        ['sleep', delay_sec],
        ['cd', data_dir],
        ['mkdir', '-p', tmp_subdir],
    ]
    # Only add the cp command when there is actually something to copy
    if ctx['metadata']['files_to_copy']:
        commands.append(
            ['cp', *ctx['metadata']['files_to_copy'], tmp_subdir])
    print('command list:')
    pprint(commands)
    command_strs = [
        join_quote_command_str(command) for command in commands
    ]
    command_str = ' ; '.join(command_strs)
    print('overall command_str:', command_str)
    return command_str
def build_cwltool_cmd1(**kwargs):
    """Build a single shell string that stages files for this run.

    Produces 'tmp_dir="..." ; sleep N ; cd <data_dir> ; mkdir -p
    <cwl_out> ; [cp <files> <cwl_out>]' with every piece shell-quoted
    except the literal ';' separators.
    """
    ctx = kwargs['dag_run'].conf
    run_id = kwargs['run_id']
    tmpdir = utils.get_tmp_dir_path(run_id)
    print('tmpdir: ', tmpdir)
    tmp_subdir = os.path.join(tmpdir, 'cwl_out')
    print('tmp_subdir: ', tmp_subdir)
    data_dir = ctx['parent_lz_path']
    print('data_dir: ', data_dir)
    try:
        delay_sec = int(ctx['metadata']['delay_sec'])
    except ValueError:
        # Unparseable delay_sec in the metadata falls back to 30 seconds
        print("Could not parse delay_sec "
              "{} ; defaulting to 30 sec".format(
                  ctx['metadata']['delay_sec']))
        delay_sec = 30
    # Log each file slated for copying
    for fname in ctx['metadata']['files_to_copy']:
        print(fname)
    # Flat token list; bare ';' entries separate the shell commands
    command = [
        'sleep',
        '{}'.format(delay_sec),
        ';',
        'cd',
        data_dir,
        ';',
        'mkdir',
        '-p',
        '{}'.format(tmp_subdir),
        ';'
    ]
    # Only add the cp command when there is actually something to copy
    if ctx['metadata']['files_to_copy']:
        command.extend(['cp'])
        command.extend(ctx['metadata']['files_to_copy'])
        command.extend([tmp_subdir])
    print('command list: ', command)
    # Quote everything except the ';' separators so they stay shell syntax
    command_str = ' '.join(piece if piece == ';' else shlex.quote(piece)
                           for piece in command)
    # Prefix an (unquoted) tmp_dir variable assignment for the shell
    command_str = 'tmp_dir="{}" ; '.format(tmpdir) + command_str
    print('final command_str: %s' % command_str)
    return command_str
def send_status_msg(**kwargs):
    """Report this run's outcome to the ingest API via PUT /datasets/status.

    Pulls the return codes of 'run_md_extract' and
    'md_consistency_tests' from XCom.  On success, loads the scanned
    metadata (rslt.yml), validates it against
    'dataset_metadata_schema.yml', and sends status 'QA' (or 'Error' on a
    schema violation); it also pushes 'collectiontype' and 'assay_type'
    to XCom for downstream tasks.  On failure, sends status 'Invalid'
    with an error message derived from the failing task.
    """
    ctx = kwargs['dag_run'].conf
    retcode_ops = ['run_md_extract', 'md_consistency_tests']
    print('raw: ',
          [kwargs['ti'].xcom_pull(task_ids=op) for op in retcode_ops])
    retcodes = [
        int(kwargs['ti'].xcom_pull(task_ids=op)) for op in retcode_ops
    ]
    retcode_dct = {k: v for k, v in zip(retcode_ops, retcodes)}
    print('retcodes: ', retcode_dct)
    # Success only if every upstream task returned 0
    success = all([rc == 0 for rc in retcodes])
    ds_dir = ctx['lz_path']
    http_conn_id = 'ingest_api_connection'
    endpoint = '/datasets/status'
    method = 'PUT'
    headers = {
        'authorization': 'Bearer '
                         + utils.decrypt_tok(ctx['crypt_auth_tok'].encode()),
        'content-type': 'application/json'
    }
    print('headers:')
    pprint(headers)
    extra_options = []
    http = HttpHook(method, http_conn_id=http_conn_id)
    if success:
        # Load the metadata the extraction step wrote to the run's tmp dir
        md_fname = os.path.join(utils.get_tmp_dir_path(kwargs['run_id']),
                                'rslt.yml')
        with open(md_fname, 'r') as f:
            scanned_md = yaml.safe_load(f)
        dag_prv = utils.get_git_provenance_list([__file__])
        md = {'dag_provenance_list': dag_prv,
              'metadata': scanned_md}
        # Inclusion of files information in this message is getting
        # disabled due to size
        #md.update(utils.get_file_metadata_dict(ds_dir,
        #          utils.get_tmp_dir_path(kwargs['run_id'])))
        try:
            assert_json_matches_schema(md, 'dataset_metadata_schema.yml')
            data = {
                'dataset_id': ctx['submission_id'],
                'status': 'QA',
                'message': 'the process ran',
                'metadata': md
            }
        except AssertionError as e:
            # Schema violation: report the error instead of the metadata
            print('invalid metadata follows:')
            pprint(md)
            data = {
                'dataset_id': ctx['submission_id'],
                'status': 'Error',
                'message': 'internal error; schema violation: {}'.format(e),
                'metadata': {}
            }
        # Expose routing keys for the downstream spawn step
        kwargs['ti'].xcom_push(
            key='collectiontype',
            value=(scanned_md['collectiontype']
                   if 'collectiontype' in scanned_md
                   else None))
        kwargs['ti'].xcom_push(
            key='assay_type',
            value=(scanned_md['assay_type']
                   if 'assay_type' in scanned_md
                   else None))
    else:
        # Find the first failing task and derive an error message from it
        for op in retcode_ops:
            if retcode_dct[op]:
                if op == 'run_md_extract':
                    log_fname = os.path.join(
                        utils.get_tmp_dir_path(kwargs['run_id']),
                        'session.log')
                    # NOTE(review): readlines() keeps trailing '\n', so this
                    # join doubles line breaks — confirm intended.
                    with open(log_fname, 'r') as f:
                        err_txt = '\n'.join(f.readlines())
                else:
                    err_txt = kwargs['ti'].xcom_pull(task_ids=op,
                                                     key='err_msg')
                break
        else:
            err_txt = 'Unknown error'
        data = {
            'dataset_id': ctx['submission_id'],
            'status': 'Invalid',
            'message': err_txt
        }
        kwargs['ti'].xcom_push(key='collectiontype', value=None)
    print('data: ')
    pprint(data)
    response = http.run(endpoint, json.dumps(data), headers, extra_options)
    print('response: ')
    pprint(response.json())
def read_metadata_file(**kwargs):
    """Load and return the scanned metadata (rslt.yml) for this run."""
    run_tmp = utils.get_tmp_dir_path(kwargs['run_id'])
    metadata_path = os.path.join(run_tmp, 'rslt.yml')
    with open(metadata_path, 'r') as yml_file:
        return yaml.safe_load(yml_file)
def send_status_msg(**kwargs):
    """Report a derived dataset's outcome to the ingest API
    (PUT /datasets/status).

    Pulls the return codes of 'pipeline_exec' and 'move_data' from XCom.
    On success, assembles metadata (returned metadata, DAG provenance,
    and file metadata), validates it against
    'dataset_metadata_schema.yml', and sends status 'QA' (or 'Error' on a
    schema violation).  On failure, sends status 'Invalid' with the
    contents of session.log as the message.
    """
    ctx = kwargs['dag_run'].conf
    retcode_ops = ['pipeline_exec', 'move_data']
    retcodes = [
        int(kwargs['ti'].xcom_pull(task_ids=op)) for op in retcode_ops
    ]
    print('retcodes: ', {k: v for k, v in zip(retcode_ops, retcodes)})
    # Success only if every upstream task returned 0
    success = all([rc == 0 for rc in retcodes])
    derived_dataset_uuid = kwargs['ti'].xcom_pull(
        key='derived_dataset_uuid', task_ids="send_create_dataset")
    # Default (keyless) xcom of send_create_dataset is the dataset directory
    ds_dir = kwargs['ti'].xcom_pull(task_ids='send_create_dataset')
    if 'metadata_to_return' in ctx['metadata']:
        md_to_return = ctx['metadata']['metadata_to_return']
    else:
        md_to_return = {}
    http_conn_id = 'ingest_api_connection'
    endpoint = '/datasets/status'
    method = 'PUT'
    crypt_auth_tok = kwargs['dag_run'].conf['crypt_auth_tok']
    headers = {
        'authorization': 'Bearer ' + decrypt_tok(crypt_auth_tok.encode()),
        'content-type': 'application/json'
    }
    # print('headers:')
    # pprint(headers)  # reduce exposure of auth_tok
    extra_options = []
    http = HttpHook(method, http_conn_id=http_conn_id)
    if success:
        md = {'metadata': md_to_return}
        # Support both the legacy dict-style and the newer list-style
        # provenance keys in the incoming conf
        if 'dag_provenance' in kwargs['dag_run'].conf:
            md['dag_provenance'] = kwargs['dag_run'].conf[
                'dag_provenance'].copy()
            md['dag_provenance'].update(
                utils.get_git_provenance_dict([__file__]))
        else:
            dag_prv = (kwargs['dag_run'].conf['dag_provenance_list']
                       if 'dag_provenance_list' in kwargs['dag_run'].conf
                       else [])
            dag_prv.extend(utils.get_git_provenance_list([__file__]))
            md['dag_provenance_list'] = dag_prv
        md.update(
            utils.get_file_metadata_dict(
                ds_dir,
                utils.get_tmp_dir_path(kwargs['run_id']),
                []))
        try:
            assert_json_matches_schema(md, 'dataset_metadata_schema.yml')
            data = {
                'dataset_id': derived_dataset_uuid,
                'status': 'QA',
                'message': 'the process ran',
                'metadata': md
            }
        except AssertionError as e:
            # Schema violation: report the error instead of the metadata
            print('invalid metadata follows:')
            pprint(md)
            data = {
                'dataset_id': derived_dataset_uuid,
                'status': 'Error',
                'message': 'internal error; schema violation: {}'.format(e),
                'metadata': {}
            }
    else:
        log_fname = os.path.join(utils.get_tmp_dir_path(kwargs['run_id']),
                                 'session.log')
        # NOTE(review): readlines() keeps trailing '\n', so this join
        # doubles line breaks — confirm intended.
        with open(log_fname, 'r') as f:
            err_txt = '\n'.join(f.readlines())
        data = {
            'dataset_id': derived_dataset_uuid,
            'status': 'Invalid',
            'message': err_txt
        }
    print('data: ')
    pprint(data)
    response = http.run(endpoint, json.dumps(data), headers, extra_options)
    print('response: ')
    pprint(response.json())