def handler(event, context):
    '''
    this is to check if the task run is done:
    http://docs.sevenbridges.com/reference#get-task-execution-details
    '''
    # used to automatically determine the environment
    tibanna_settings = event.get('_tibanna', {})
    tibanna = Tibanna(**tibanna_settings)
    sbg = sbg_utils.create_sbg_workflow(token=tibanna.sbg_keys, **event.get('workflow'))
    ff_meta = ff_utils.create_ffmeta(sbg, **event.get('ff_meta'))

    # check status of workflow, error if not done
    status = sbg.check_task()
    LOG.info("status of sbg task is %s" % status)
    if status['status'] not in ['DONE', 'COMPLETED', 'FAILED']:
        data = {'workflow': sbg.as_dict(), 'status': status}
        raise sbg_utils.SBGStillRunningException('Task not finished => %s' % data)

    if status['status'] == 'FAILED':
        ff_meta.run_status = 'error'
        ff_meta.description = 'SBG task %s reported FAILED status' % sbg.task_id
        ff_meta.post(key=tibanna.ff_keys)

    # TODO: handle only specific errors so this can terminate the rest of the workflow
    return {'workflow': sbg.as_dict(),
            'run_response': status,
            'ff_meta': ff_meta.as_dict(),
            'pf_meta': event.get('pf_meta'),
            '_tibanna': tibanna.as_dict(),
            }
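# A minimal sketch of the event shape this check-task handler consumes, inferred from the
# event.get() calls above. The keys are the ones the handler actually reads; every value
# below is a hypothetical placeholder, not real data.
example_check_task_event = {
    '_tibanna': {'env': 'fourfront-webdev', 'settings': {}},  # unpacked into Tibanna(**...)
    'workflow': {'task_id': 'hypothetical-sbg-task-id'},      # unpacked into create_sbg_workflow()
    'ff_meta': {'uuid': 'hypothetical-wfr-uuid'},             # unpacked into create_ffmeta()
    'pf_meta': [],                                            # passed through untouched
}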
def test_proc_file_for_arg_name(run_awsem_event_data_processed_files, proc_file_in_webdev):
    of = [{"workflow_argument_name": "output_file1",
           "uuid": proc_file_in_webdev['uuid']},
          {"workflow_argument_name": "output_file2",
           "uuid": "f4864029-a8ad-4bb8-93e7-5108f46bbbbb"}]
    tibanna_settings = run_awsem_event_data_processed_files.get('_tibanna', {})
    # if they don't pass in env, guess it from output_bucket
    env = tibanna_settings.get('env')
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env,
                      ff_keys=run_awsem_event_data_processed_files.get('ff_keys'),
                      settings=tibanna_settings)
    file_with_type = proc_file_in_webdev.copy()
    file_with_type['@type'] = ['FileProcessed', 'Item', 'whatever']
    with mock.patch('core.utils.get_metadata', return_value=file_with_type):
        pf, resp = proc_file_for_arg_name(of, 'output_file1', tibanna)
        assert type(pf) == ProcessedFileMetadata
        assert pf.__dict__ == proc_file_in_webdev
def test_handle_processed_files(run_awsf_event_data_secondary_files):
    data = run_awsf_event_data_secondary_files
    tibanna_settings = data.get('_tibanna', {})
    # if they don't pass in env, guess it from output_bucket
    env = tibanna_settings.get('env')
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env,
                      s3_keys=data.get('s3_keys'),
                      ff_keys=data.get('ff_keys'),
                      settings=tibanna_settings)
    workflow_uuid = data['workflow_uuid']
    workflow_info = ff_utils.get_metadata(workflow_uuid, key=tibanna.ff_keys)

    output_files, pf_meta = handle_processed_files(workflow_info, tibanna)
    assert output_files
    assert len(output_files) == 3
    for of in output_files:
        if of['extension'] == '.pairs.gz':
            assert of['secondary_file_extensions'] == ['.pairs.gz.px2']
            assert of['secondary_file_formats'] == ['pairs_px2']
            assert of['extra_files']
        else:
            assert 'secondary_files_extension' not in of
            assert 'secondary_files_formats' not in of

    assert pf_meta
    assert len(pf_meta) == 3
    for pf in pf_meta:
        pdict = pf.__dict__
        if pdict['file_format'] == 'pairs':
            assert pdict['extra_files'] == [{'file_format': 'pairs_px2'}]
        else:
            assert 'extra_files' not in pdict
def test_add_secondary_files_to_args(run_awsem_event_data):
    input_file = {
        "bucket_name": "elasticbeanstalk-fourfront-webdev-wfoutput",
        "workflow_argument_name": "input_pairs",
        "uuid": ["d2c897ec-bdb2-47ce-b1b1-845daccaa571", "d2c897ec-bdb2-47ce-b1b1-845daccaa571"],
        "object_key": ["4DNFI25JXLLI.pairs.gz", "4DNFI25JXLLI.pairs.gz"]
    }
    args = {
        'input_files': {
            'input_pairs': {
                'bucket': 'elasticbeanstalk-fourfront-webdev-wfoutput',
                'object_key': [
                    'd2c897ec-bdb2-47ce-b1b1-845daccaa571/4DNFI25JXLLI.pairs.gz',
                    'd2c897ec-bdb2-47ce-b1b1-845daccaa571/4DNFI25JXLLI.pairs.gz'
                ]
            }
        }
    }
    data = run_awsem_event_data
    tibanna_settings = data.get('_tibanna', {})
    # if they don't pass in env, guess it from output_bucket
    env = tibanna_settings.get('env')
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env, ff_keys=data.get('ff_keys'), settings=tibanna_settings)
    add_secondary_files_to_args(input_file, tibanna.ff_keys, tibanna.env, args)
def run_md5(ctx, env, accession, uuid):
    tibanna = Tibanna(env=env)
    meta_data = get_metadata(accession, key=tibanna.ff_keys)
    file_name = meta_data['upload_key'].split('/')[-1]
    input_json = make_input(env=env, workflow='md5', object_key=file_name, uuid=uuid)
    return _run_workflow(input_json, accession)
def test_get_format_extension_map(run_awsem_event_data):
    tibanna_settings = run_awsem_event_data.get('_tibanna', {})
    # if they don't pass in env, guess it from output_bucket
    env = tibanna_settings.get('env')
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env,
                      ff_keys=run_awsem_event_data.get('ff_keys'),
                      settings=tibanna_settings)
    fe_map = get_format_extension_map(tibanna.ff_keys)
    assert fe_map
    assert 'pairs' in fe_map.keys()
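# For reference, a sketch of the mapping get_format_extension_map() is expected to return:
# file format names keyed to file extensions. This is inferred from the assertion above and
# from the extensions checked in test_handle_processed_files; the entries are illustrative
# placeholders, not an exhaustive or authoritative map.
example_fe_map = {
    'pairs': '.pairs.gz',
    'pairs_px2': '.pairs.gz.px2',
}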
def handler(event, context):
    '''
    export output files from sbg to our s3
    '''
    # get data
    # used to automatically determine the environment
    tibanna_settings = event.get('_tibanna', {})
    tibanna = Tibanna(**tibanna_settings)
    sbg = sbg_utils.create_sbg_workflow(token=tibanna.sbg_keys, **event.get('workflow'))
    run_response = event.get('run_response')
    ff_meta = event.get('ff_meta')
    uuid = ff_meta['uuid']
    pf_meta = event.get('pf_meta')
    if run_response in ['FAILED', 'ABORTED']:
        raise Exception("workflow run failed or aborted")

    sbg.export_all_output_files(run_response, ff_meta, base_dir=uuid)
    # creating after we export will add output file info to ff_meta
    ff_meta = ff_utils.create_ffmeta(sbg, **event.get('ff_meta'))
    ff_meta.run_status = "output_files_transferring"
    ff_meta.post(key=tibanna.ff_keys)
    for pf in pf_meta:
        pf['status'] = "uploading"
    # we still need code for patching.

    return {'workflow': sbg.as_dict(),
            'ff_meta': ff_meta.as_dict(),
            # 'pf_meta': [pf.as_dict() for pf in pf_meta]
            'pf_meta': pf_meta,
            '_tibanna': tibanna.as_dict()
            }
def batch_fastqc(ctx, env, batch_size=20):
    '''
    try to run fastqc on everything that needs it run
    '''
    files_processed = 0
    files_skipped = 0

    # handle ctrl-c
    import signal

    def report(signum, frame):
        print("Processed %s files, skipped %s files" % (files_processed, files_skipped))
        sys.exit(-1)

    signal.signal(signal.SIGINT, report)

    tibanna = Tibanna(env=env)
    uploaded_files = get_files_to_match(tibanna,
                                        "search/?type=File&status=uploaded&limit=%s" % batch_size,
                                        frame="embedded")

    # TODO: need to change submit 4dn to not overwrite my limit
    if len(uploaded_files['@graph']) > batch_size:
        limited_files = uploaded_files['@graph'][:batch_size]
    else:
        limited_files = uploaded_files['@graph']

    for ufile in limited_files:
        fastqc_run = False
        for wfrun in ufile.get('workflow_run_inputs', []):
            if 'fastqc' in wfrun:
                fastqc_run = True
        if not fastqc_run:
            print("running fastqc for %s" % ufile.get('accession'))
            run_fastqc(ctx, env, ufile.get('accession'), ufile.get('uuid'))
            files_processed += 1
        else:
            print("******** fastqc already run for %s skipping" % ufile.get('accession'))
            files_skipped += 1
        sleep(5)
        if files_processed % 10 == 0:
            sleep(60)

    print("Processed %s files, skipped %s files" % (files_processed, files_skipped))
def test_merge_source_experiment(run_awsem_event_data):
    input_file = {
        "bucket_name": "elasticbeanstalk-fourfront-webdev-wfoutput",
        "workflow_argument_name": "input_pairs",
        "uuid": ["d2c897ec-bdb2-47ce-b1b1-845daccaa571", "d2c897ec-bdb2-47ce-b1b1-845daccaa571"],
        "object_key": ["4DNFI25JXLLI.pairs.gz", "4DNFI25JXLLI.pairs.gz"]
    }
    data = run_awsem_event_data
    tibanna_settings = data.get('_tibanna', {})
    # if they don't pass in env, guess it from output_bucket
    env = tibanna_settings.get('env')
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env, ff_keys=data.get('ff_keys'), settings=tibanna_settings)
    res = merge_source_experiments(input_file['uuid'], tibanna.ff_keys, tibanna.env)
    LOG.info(res)
    assert 'fake_source_experiment' in res
def test_handle_processed_files2(run_awsem_event_data_processed_files2):
    data = run_awsem_event_data_processed_files2
    tibanna_settings = data.get('_tibanna', {})
    # if they don't pass in env, guess it from output_bucket
    env = tibanna_settings.get('env')
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env, ff_keys=data.get('ff_keys'), settings=tibanna_settings)
    workflow_uuid = data['workflow_uuid']
    workflow_info = ff_utils.get_metadata(workflow_uuid, key=tibanna.ff_keys)

    output_files, pf_meta = handle_processed_files(workflow_info, tibanna,
                                                   custom_fields=data.get('custom_pf_fields'))
    assert pf_meta
    assert output_files
    for pf in pf_meta:
        pdict = pf.__dict__
        assert 'genome_assembly' in pdict
        assert pdict['genome_assembly'] == 'GRCh38'
def is_status_uploading(event):
    print("is status uploading: %s" % event)
    upload_key = event['Records'][0]['s3']['object']['key']
    if upload_key.endswith('html'):
        return False

    uuid, object_key = upload_key.split('/')
    accession = object_key.split('.')[0]

    # guess env from bucket name
    bucket = event['Records'][0]['s3']['bucket']['name']
    env = '-'.join(bucket.split('-')[1:3])

    tibanna = Tibanna(env=env)
    meta = get_metadata(accession,
                        key=tibanna.ff_keys,
                        ff_env=env,
                        add_on='frame=object',
                        check_queue=True)
    if meta:
        return meta.get('status', '') == 'uploading'
    else:
        return False
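# A minimal sketch of the S3 trigger event is_status_uploading() parses, reconstructed from
# the fields it reads above. The uuid and accession in the object key are hypothetical
# placeholders; the bucket name follows the elasticbeanstalk-<env>-* pattern seen elsewhere
# in this code, from which env is guessed via '-'.join(bucket.split('-')[1:3]).
example_s3_event = {
    'Records': [{
        's3': {
            'bucket': {'name': 'elasticbeanstalk-fourfront-webdev-files'},
            'object': {'key': 'hypothetical-uuid/HYPOTHETICALACC.fastq.gz'},  # '<uuid>/<accession>.<ext>'
        }
    }]
}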
def test_process_input_file_info(run_awsem_event_data):
    input_file = {
        "bucket_name": "elasticbeanstalk-fourfront-webdev-wfoutput",
        "workflow_argument_name": "input_pairs",
        "uuid": ["d2c897ec-bdb2-47ce-b1b1-845daccaa571", "d2c897ec-bdb2-47ce-b1b1-845daccaa571"],
        "object_key": ["4DNFI25JXLLI.pairs.gz", "4DNFI25JXLLI.pairs.gz"]
    }
    args = {'input_files': {"some_input": {}, "some_other_input": {}}}
    data = run_awsem_event_data
    tibanna_settings = data.get('_tibanna', {})
    # if they don't pass in env, guess it from output_bucket
    env = tibanna_settings.get('env')
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env, ff_keys=data.get('ff_keys'), settings=tibanna_settings)
    process_input_file_info(input_file, tibanna.ff_keys, tibanna.env, args)
    assert len(args['input_files']) == 3
    assert 'secondary_files' in args
def batch_md5(ctx, env, batch_size=20):
    '''
    try to run md5 on everything that needs it run
    '''
    tibanna = Tibanna(env=env)
    file_bucket = tibanna.s3.outfile_bucket.replace('wfoutput', 'files')
    tibanna.s3.outfile_bucket = file_bucket
    uploaded_files = get_files_to_match(tibanna,
                                        "search/?type=File&status=uploading",
                                        frame="embedded")
    limited_files = uploaded_files['@graph']

    files_processed = 0
    total_files = len(limited_files)
    skipped_files = 0

    for ufile in limited_files:
        if files_processed >= batch_size:
            print("we have done enough here")
            sys.exit(0)
        if not tibanna.s3.does_key_exist(ufile.get('upload_key')):
            print("******** no file for %s on s3, can't run md5, skipping" % ufile.get('accession'))
            skipped_files += 1
            continue
        else:
            print("running md5 for %s" % ufile.get('accession'))
            run_md5(ctx, env, ufile.get('accession'), ufile.get('uuid'))
            files_processed += 1
        sleep(10)
        if files_processed % 10 == 0:
            sleep(60)

    print("Total Files: %s, Processed Files: %s, Skipped Files: %s" %
          (total_files, files_processed, skipped_files))
def real_handler(event, context):
    '''
    this is a generic function to run awsem workflow based on the data passed in

    workflow_uuid : for now, pass this on. Later we can add code to automatically retrieve
    this from app_name.
    Note multiple workflow_uuids can be available for an app_name
    (different versions of the same app could have a different uuid)
    '''
    # get incoming data
    input_file_list = event.get('input_files')
    for infile in input_file_list:
        if not infile:
            raise Exception("malformed input, check your input_files")
    app_name = event.get('app_name')
    print(app_name)
    workflow_uuid = event.get('workflow_uuid')
    output_bucket = event.get('output_bucket')
    parameters = ff_utils.convert_param(event.get('parameters'), True)
    tibanna_settings = event.get('_tibanna', {})
    tag = event.get('tag')
    # if they don't pass in env, guess it from output_bucket
    try:
        env = tibanna_settings.get('env', '-'.join(output_bucket.split('-')[1:-1]))
        # tibanna provides access to keys based on env and stuff like that
        tibanna = Tibanna(env, ff_keys=event.get('ff_keys'), settings=tibanna_settings)
    except Exception as e:
        raise TibannaStartException("%s" % e)

    args = dict()

    # get argument format & type info from workflow
    workflow_info = ff_utils.get_metadata(workflow_uuid,
                                          key=tibanna.ff_keys,
                                          ff_env=tibanna.env,
                                          add_on='frame=object')
    print("workflow info %s" % workflow_info)
    LOG.info("workflow info %s" % workflow_info)
    if 'error' in workflow_info.get('@type', []):
        raise Exception("FATAL, can't lookup workflow info for %s fourfront" % workflow_uuid)

    # get cwl info from workflow_info
    for k in ['app_name', 'app_version', 'cwl_directory_url',
              'cwl_main_filename', 'cwl_child_filenames']:
        print(workflow_info.get(k))
        LOG.info(workflow_info.get(k))
        args[k] = workflow_info.get(k)
    if not args['cwl_child_filenames']:
        args['cwl_child_filenames'] = []

    # switch to v1 if available
    if 'cwl_directory_url_v1' in workflow_info:  # use CWL v1
        args['cwl_directory_url'] = workflow_info['cwl_directory_url_v1']
        args['cwl_version'] = 'v1'
    else:
        args['cwl_version'] = 'draft3'

    # create the ff_meta output info
    input_files = []
    for input_file in input_file_list:
        for idx, uuid in enumerate(ensure_list(input_file['uuid'])):
            input_files.append({'workflow_argument_name': input_file['workflow_argument_name'],
                                'value': uuid, 'ordinal': idx + 1})
    print("input_files is %s" % input_files)
    LOG.info("input_files is %s" % input_files)

    # input file args for awsem
    for input_file in input_file_list:
        process_input_file_info(input_file, tibanna.ff_keys, tibanna.env, args)

    # source experiments
    input_file_uuids = [_['uuid'] for _ in input_file_list]
    pf_source_experiments = merge_source_experiments(input_file_uuids,
                                                     tibanna.ff_keys,
                                                     tibanna.env)

    # processed file metadata
    output_files, pf_meta = handle_processed_files(
        workflow_info, tibanna, pf_source_experiments,
        custom_fields=event.get('custom_pf_fields'),
        user_supplied_output_files=event.get('output_files'))
    print("output files= %s" % str(output_files))

    # 4DN dcic award and lab are used here, unless provided in wfr_meta
    ff_meta = create_ffmeta_awsem(workflow_uuid, app_name, input_files, tag=tag,
                                  run_url=tibanna.settings.get('url', ''),
                                  output_files=output_files, parameters=parameters,
                                  extra_meta=event.get('wfr_meta'))
    print("ff_meta is %s" % ff_meta.__dict__)
    LOG.info("ff_meta is %s" % ff_meta.__dict__)

    # store metadata so we know the run has started
    ff_meta.post(key=tibanna.ff_keys)

    # parameters
    args['input_parameters'] = event.get('parameters')

    # output target
    args['output_target'] = dict()
    args['secondary_output_target'] = dict()
    for of in ff_meta.output_files:
        arg_name = of.get('workflow_argument_name')
        if of.get('type') == 'Output processed file':
            args['output_target'][arg_name] = of.get('upload_key')
        else:
            args['output_target'][arg_name] = ff_meta.uuid + '/' + arg_name
        if 'secondary_file_formats' in of:
            # takes only the first secondary file.
            args['secondary_output_target'][arg_name] \
                = [_.get('upload_key') for _ in of.get('extra_files', [{}, ])]

    # output bucket
    args['output_S3_bucket'] = event.get('output_bucket')

    # initialize config parameters as null for benchmarking
    config = event['config']
    if 'instance_type' not in config:
        config['instance_type'] = ''
    if 'EBS_optimized' not in config:
        config['EBS_optimized'] = ''
    if 'ebs_size' not in config:
        config['ebs_size'] = 0

    event.update({"ff_meta": ff_meta.as_dict(),
                  'pf_meta': [meta.as_dict() for meta in pf_meta],
                  "_tibanna": tibanna.as_dict(),
                  "args": args
                  })
    return event
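# A minimal sketch of the input event real_handler() expects, assembled from the event.get()
# calls above. All uuids, keys, and names are hypothetical placeholders; optional keys the
# handler also reads are listed in the trailing comment.
example_start_run_event = {
    'input_files': [{'workflow_argument_name': 'input_pairs',
                     'bucket_name': 'elasticbeanstalk-fourfront-webdev-wfoutput',
                     'uuid': 'hypothetical-file-uuid',
                     'object_key': 'HYPOTHETICAL.pairs.gz'}],
    'app_name': 'hypothetical-app',
    'workflow_uuid': 'hypothetical-workflow-uuid',
    'output_bucket': 'elasticbeanstalk-fourfront-webdev-wfoutput',
    'parameters': {},
    'config': {},  # instance_type / EBS_optimized / ebs_size are filled with defaults if missing
    '_tibanna': {'env': 'fourfront-webdev', 'settings': {}},
    # optional: 'tag', 'custom_pf_fields', 'wfr_meta', 'output_files', 'ff_keys'
}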
def handler(event, context):
    '''
    this is a generic function to run sbg workflow based on the data passed in

    workflow_uuid : for now, pass this on. Later we can add code to automatically retrieve
    this from app_name.
    Note multiple workflow_uuids can be available for an app_name
    (different versions of the same app could have a different uuid)
    '''
    # get incoming data
    input_file_list = event.get('input_files')
    app_name = event.get('app_name')
    parameter_dict = event.get('parameters')
    workflow_uuid = event.get('workflow_uuid')
    output_bucket = event.get('output_bucket')
    tibanna_settings = event.get('_tibanna', {})
    # if they don't pass in env, guess it from output_bucket
    env = tibanna_settings.get('env', '-'.join(output_bucket.split('-')[1:-1]))
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env,
                      s3_keys=event.get('s3_keys'),
                      ff_keys=event.get('ff_keys'),
                      settings=tibanna_settings)

    LOG.info("input data is %s" % event)
    # represents the SBG info we need
    sbg = sbg_utils.create_sbg_workflow(app_name, tibanna.sbg_keys)
    LOG.info("sbg is %s" % sbg.__dict__)

    # represents the workflow metadata to be stored in fourfront
    parameters, _ = sbg_utils.to_sbg_workflow_args(parameter_dict, vals_as_string=True)

    # get argument format & type info from workflow
    workflow_info = ff_utils.get_metadata(workflow_uuid, key=tibanna.ff_keys)
    LOG.info("workflow info %s" % workflow_info)
    if 'error' in workflow_info.get('@type', []):
        raise Exception("FATAL, can't lookup workflow info for %s fourfront" % workflow_uuid)

    # This dictionary has a key 'arguments' with a value
    # { 'workflow_argument_name': ..., 'argument_type': ..., 'argument_format': ... }

    # get format-extension map
    try:
        fp_schema = ff_utils.get_metadata("profiles/file_processed.json", key=tibanna.ff_keys)
        fe_map = fp_schema.get('file_format_file_extension')
    except Exception as e:
        LOG.error("Can't get format-extension map from file_processed schema. %s\n" % e)

    # processed file metadata
    output_files = []
    try:
        if 'arguments' in workflow_info:
            pf_meta = []
            for arg in workflow_info.get('arguments'):
                if (arg.get('argument_type') in ['Output processed file',
                                                 'Output report file',
                                                 'Output QC file']):
                    of = dict()
                    of['workflow_argument_name'] = arg.get('workflow_argument_name')
                    of['type'] = arg.get('argument_type')
                    if 'argument_format' in arg:
                        # These are not processed files but report or QC files.
                        pf = ff_utils.ProcessedFileMetadata(file_format=arg.get('argument_format'))
                        try:
                            # actually post processed file metadata here
                            resp = pf.post(key=tibanna.ff_keys)
                            resp = resp.get('@graph')[0]
                            of['upload_key'] = resp.get('upload_key')
                            of['value'] = resp.get('uuid')
                        except Exception as e:
                            LOG.error("Failed to post Processed file metadata. %s\n" % e)
                            LOG.error("resp" + str(resp) + "\n")
                            raise e
                        of['format'] = arg.get('argument_format')
                        of['extension'] = fe_map.get(arg.get('argument_format'))
                        pf_meta.append(pf)
                    output_files.append(of)
    except Exception as e:
        LOG.error("output_files = " + str(output_files) + "\n")
        LOG.error("Can't prepare output_files information. %s\n" % e)
        raise e

    # create the ff_meta output info
    input_files = []
    for input_file in input_file_list:
        for idx, uuid in enumerate(ensure_list(input_file['uuid'])):
            input_files.append({'workflow_argument_name': input_file['workflow_argument_name'],
                                'value': uuid, 'ordinal': idx + 1})
    LOG.info("input_files is %s" % input_files)

    ff_meta = ff_utils.create_ffmeta(sbg, workflow_uuid, input_files, parameters,
                                     run_url=tibanna.settings.get('url', ''),
                                     output_files=output_files)
    LOG.info("ff_meta is %s" % ff_meta.__dict__)

    # store metadata so we know the run has started
    ff_meta.post(key=tibanna.ff_keys)

    # mount all input files to sbg; this will also update sbg to store the import_ids
    for infile in input_file_list:
        imps = mount_on_sbg(infile, tibanna.s3_keys, sbg)
        infile['import_ids'] = imps

    # create a link to the output directory as well
    if output_bucket:
        sbg_volume = sbg_utils.create_sbg_volume_details()
        res = sbg.create_volumes(sbg_volume, output_bucket,
                                 public_key=tibanna.s3_keys['key'],
                                 secret_key=tibanna.s3_keys['secret'])
        vol_id = res.get('id')
        if not vol_id:
            # we got an error
            raise Exception("Unable to mount output volume, error is %s " % res)
        sbg.output_volume_id = vol_id

    # let's not pass keys in plain text parameters
    return {"input_file_args": input_file_list,
            "workflow": sbg.as_dict(),
            "ff_meta": ff_meta.as_dict(),
            'pf_meta': [meta.as_dict() for meta in pf_meta],
            "_tibanna": tibanna.as_dict(),
            "parameter_dict": parameter_dict}
def test_tibanna():
    data = {'env': 'fourfront-webdev',
            'settings': {'1': '1'}}
    tibanna = Tibanna(**data)
    assert tibanna
    assert tibanna.as_dict() == data