Example 1
def handler(event, context):
    '''
    Check whether the SBG task run is done:
    http://docs.sevenbridges.com/reference#get-task-execution-details
    '''
    # used to automatically determine the environment
    tibanna_settings = event.get('_tibanna', {})
    tibanna = Tibanna(**tibanna_settings)
    sbg = sbg_utils.create_sbg_workflow(token=tibanna.sbg_keys, **event.get('workflow'))
    ff_meta = ff_utils.create_ffmeta(sbg, **event.get('ff_meta'))

    # check status of workflow, error if not done
    status = sbg.check_task()
    LOG.info("status of sbg task is %s" % status)

    if status['status'] not in ['DONE', 'COMPLETED', 'FAILED']:
        data = {'workflow': sbg.as_dict(),
                'status': status}
        raise sbg_utils.SBGStillRunningException('Task not finished => %s' % data)

    if status['status'] == 'FAILED':
        ff_meta.run_status = 'error'
        ff_meta.description = 'SBG task %s reported FAILED status' % sbg.task_id
        ff_meta.post(key=tibanna.ff_keys)

    # TODO: handle only specific errors so this can terminate the rest of the workflow

    return {'workflow': sbg.as_dict(),
            'run_response': status,
            'ff_meta': ff_meta.as_dict(),
            'pf_meta': event.get('pf_meta'),
            '_tibanna': tibanna.as_dict(),
            }
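For reference, a minimal sketch of the event this handler expects, inferred from the keys it reads above; all values are hypothetical placeholders, not real identifiers.

# hypothetical event sketch; keys inferred from the handler above
event = {
    'workflow': {'task_id': '<sbg-task-id>'},  # kwargs for create_sbg_workflow (assumed)
    'ff_meta': {},                             # kwargs for create_ffmeta (assumed)
    'pf_meta': [],
    '_tibanna': {'env': 'fourfront-webdev', 'settings': {}},  # kwargs for Tibanna
}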
Example 2
def test_proc_file_for_arg_name(run_awsem_event_data_processed_files,
                                proc_file_in_webdev):
    of = [{
        "workflow_argument_name": "output_file1",
        "uuid": proc_file_in_webdev['uuid']
    }, {
        "workflow_argument_name": "output_file2",
        "uuid": "f4864029-a8ad-4bb8-93e7-5108f46bbbbb"
    }]

    tibanna_settings = run_awsem_event_data_processed_files.get('_tibanna', {})
    # env is taken from the test event's tibanna settings
    env = tibanna_settings.get('env')
    # Tibanna provides access to keys based on env
    tibanna = Tibanna(
        env,
        ff_keys=run_awsem_event_data_processed_files.get('ff_keys'),
        settings=tibanna_settings)

    file_with_type = proc_file_in_webdev.copy()
    file_with_type['@type'] = ['FileProcessed', 'Item', 'whatever']
    with mock.patch('core.utils.get_metadata', return_value=file_with_type):
        pf, resp = proc_file_for_arg_name(of, 'output_file1', tibanna)
        assert type(pf) == ProcessedFileMetadata
        assert pf.__dict__ == proc_file_in_webdev
Example 3
def test_handle_processed_files(run_awsf_event_data_secondary_files):
    data = run_awsf_event_data_secondary_files
    tibanna_settings = data.get('_tibanna', {})
    # env is taken from the test event's tibanna settings
    env = tibanna_settings.get('env')
    # Tibanna provides access to keys based on env
    tibanna = Tibanna(env,
                      s3_keys=data.get('s3_keys'),
                      ff_keys=data.get('ff_keys'),
                      settings=tibanna_settings)
    workflow_uuid = data['workflow_uuid']
    workflow_info = ff_utils.get_metadata(workflow_uuid, key=tibanna.ff_keys)

    output_files, pf_meta = handle_processed_files(workflow_info, tibanna)
    assert output_files
    assert len(output_files) == 3
    for of in output_files:
        if of['extension'] == '.pairs.gz':
            assert of['secondary_file_extensions'] == ['.pairs.gz.px2']
            assert of['secondary_file_formats'] == ['pairs_px2']
            assert of['extra_files']
        else:
            assert 'secondary_file_extensions' not in of
            assert 'secondary_file_formats' not in of

    assert pf_meta
    assert len(pf_meta) == 3
    for pf in pf_meta:
        pdict = pf.__dict__
        if pdict['file_format'] == 'pairs':
            assert pdict['extra_files'] == [{'file_format': 'pairs_px2'}]
        else:
            assert 'extra_files' not in pdict
Example 4
def test_add_secondary_files_to_args(run_awsem_event_data):
    input_file = {
        "bucket_name": "elasticbeanstalk-fourfront-webdev-wfoutput",
        "workflow_argument_name": "input_pairs",
        "uuid": [
            "d2c897ec-bdb2-47ce-b1b1-845daccaa571",
            "d2c897ec-bdb2-47ce-b1b1-845daccaa571"
        ],
        "object_key": ["4DNFI25JXLLI.pairs.gz", "4DNFI25JXLLI.pairs.gz"]
    }
    args = {
        'input_files': {
            'input_pairs': {
                'bucket': 'elasticbeanstalk-fourfront-webdev-wfoutput',
                'object_key': [
                    'd2c897ec-bdb2-47ce-b1b1-845daccaa571/4DNFI25JXLLI.pairs.gz',
                    'd2c897ec-bdb2-47ce-b1b1-845daccaa571/4DNFI25JXLLI.pairs.gz'
                ]
            }
        }
    }
    data = run_awsem_event_data
    tibanna_settings = data.get('_tibanna', {})
    # env is taken from the test event's tibanna settings
    env = tibanna_settings.get('env')
    # Tibanna provides access to keys based on env
    tibanna = Tibanna(env,
                      ff_keys=data.get('ff_keys'),
                      settings=tibanna_settings)
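    # smoke test: the call should complete without raising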
    add_secondary_files_to_args(input_file, tibanna.ff_keys, tibanna.env, args)
Example 5
def run_md5(ctx, env, accession, uuid):
    tibanna = Tibanna(env=env)
    meta_data = get_metadata(accession, key=tibanna.ff_keys)
    file_name = meta_data['upload_key'].split('/')[-1]

    input_json = make_input(env=env,
                            workflow='md5',
                            object_key=file_name,
                            uuid=uuid)
    return _run_workflow(input_json, accession)
Example 6
def test_get_format_extension_map(run_awsem_event_data):
    tibanna_settings = run_awsem_event_data.get('_tibanna', {})
    # env is taken from the test event's tibanna settings
    env = tibanna_settings.get('env')
    # Tibanna provides access to keys based on env
    tibanna = Tibanna(env, ff_keys=run_awsem_event_data.get('ff_keys'),
                      settings=tibanna_settings)

    fe_map = get_format_extension_map(tibanna.ff_keys)
    assert fe_map
    assert 'pairs' in fe_map.keys()
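The contents of fe_map are not shown in the test; judging from the extensions used in Example 3, it plausibly maps file formats to extensions along these lines (an illustrative sketch, not the real map):

# assumed shape of fe_map; the two entries are inferred from Example 3
fe_map = {
    'pairs': '.pairs.gz',
    'pairs_px2': '.pairs.gz.px2',
}
assert 'pairs' in fe_map.keys()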
Example 7
def handler(event, context):
    '''
    export output files from sbg to our s3
    '''

    # get data
    # used to automatically determine the environment
    tibanna_settings = event.get('_tibanna', {})
    tibanna = Tibanna(**tibanna_settings)
    sbg = sbg_utils.create_sbg_workflow(token=tibanna.sbg_keys,
                                        **event.get('workflow'))
    run_response = event.get('run_response')
    ff_meta = event.get('ff_meta')
    uuid = ff_meta['uuid']
    pf_meta = event.get('pf_meta')

    if run_response in ['FAILED', 'ABORTED']:
        raise Exception("workflow run failed or aborted")

    sbg.export_all_output_files(run_response, ff_meta, base_dir=uuid)
    # creating after we export will add output file info to ff_meta
    ff_meta = ff_utils.create_ffmeta(sbg, **event.get('ff_meta'))
    ff_meta.run_status = "output_files_transferring"
    ff_meta.post(key=tibanna.ff_keys)

    for pf in pf_meta:
        pf['status'] = "uploading"
    # we still need code for patching.

    return {
        'workflow': sbg.as_dict(),
        'ff_meta': ff_meta.as_dict(),
        # 'pf_meta': [pf.as_dict() for pf in pf_meta]
        'pf_meta': pf_meta,
        '_tibanna': tibanna.as_dict()
    }
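The patching step flagged in the comment above is left unimplemented; a minimal sketch of what it could look like, assuming ff_utils exposes a patch_metadata helper and that each pf_meta dict carries a uuid (both assumptions, not confirmed by this snippet):

# hypothetical sketch; ff_utils.patch_metadata and pf['uuid'] are assumptions
for pf in pf_meta:
    ff_utils.patch_metadata({'status': pf['status']}, pf['uuid'],
                            key=tibanna.ff_keys)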
Example 8
def batch_fastqc(ctx, env, batch_size=20):
    '''
    try to run fastqc on everything that still needs it
    '''
    files_processed = 0
    files_skipped = 0

    # handle ctrl-c: report progress before exiting
    import signal
    import sys  # used by sys.exit in the handler below

    def report(signum, frame):
        print("Processed %s files, skipped %s files" %
              (files_processed, files_skipped))
        sys.exit(-1)

    signal.signal(signal.SIGINT, report)

    tibanna = Tibanna(env=env)
    uploaded_files = get_files_to_match(
        tibanna,
        "search/?type=File&status=uploaded&limit=%s" % batch_size,
        frame="embedded")

    # TODO: need to change submit 4dn to not overwrite my limit
    # slicing also covers the case of fewer than batch_size files
    limited_files = uploaded_files['@graph'][:batch_size]

    for ufile in limited_files:
        fastqc_run = False
        for wfrun in ufile.get('workflow_run_inputs', []):
            if 'fastqc' in wfrun:
                fastqc_run = True
        if not fastqc_run:
            print("running fastqc for %s" % ufile.get('accession'))
            run_fastqc(ctx, env, ufile.get('accession'), ufile.get('uuid'))
            files_processed += 1
        else:
            print("******** fastqc already run for %s skipping" %
                  ufile.get('accession'))
            files_skipped += 1
        sleep(5)
        if files_processed and files_processed % 10 == 0:
            sleep(60)

    print("Processed %s files, skipped %s files" %
          (files_processed, files_skipped))
Example 9
def test_merge_source_experiment(run_awsem_event_data):
    input_file = {
        "bucket_name": "elasticbeanstalk-fourfront-webdev-wfoutput",
        "workflow_argument_name": "input_pairs",
        "uuid": ["d2c897ec-bdb2-47ce-b1b1-845daccaa571", "d2c897ec-bdb2-47ce-b1b1-845daccaa571"],
        "object_key": ["4DNFI25JXLLI.pairs.gz", "4DNFI25JXLLI.pairs.gz"]
    }
    data = run_awsem_event_data
    tibanna_settings = data.get('_tibanna', {})
    # env is taken from the test event's tibanna settings
    env = tibanna_settings.get('env')
    # Tibanna provides access to keys based on env
    tibanna = Tibanna(env, ff_keys=data.get('ff_keys'),
                      settings=tibanna_settings)
    res = merge_source_experiments(input_file['uuid'], tibanna.ff_keys, tibanna.env)
    LOG.info(res)
    assert 'fake_source_experiment' in res
Example 10
def test_handle_processed_files2(run_awsem_event_data_processed_files2):
    data = run_awsem_event_data_processed_files2
    tibanna_settings = data.get('_tibanna', {})
    # env is taken from the test event's tibanna settings
    env = tibanna_settings.get('env')
    # Tibanna provides access to keys based on env
    tibanna = Tibanna(env,
                      ff_keys=data.get('ff_keys'),
                      settings=tibanna_settings)
    workflow_uuid = data['workflow_uuid']
    workflow_info = ff_utils.get_metadata(workflow_uuid, key=tibanna.ff_keys)

    output_files, pf_meta = handle_processed_files(
        workflow_info, tibanna, custom_fields=data.get('custom_pf_fields'))
    assert pf_meta
    assert output_files
    for pf in pf_meta:
        pdict = pf.__dict__
        assert 'genome_assembly' in pdict
        assert pdict['genome_assembly'] == 'GRCh38'
Example 11
def is_status_uploading(event):
    print("is status uploading: %s" % event)
    upload_key = event['Records'][0]['s3']['object']['key']
    if upload_key.endswith('html'):
        return False

    uuid, object_key = upload_key.split('/')
    accession = object_key.split('.')[0]

    # guess env from bucket name
    bucket = event['Records'][0]['s3']['bucket']['name']
    env = '-'.join(bucket.split('-')[1:3])

    tibanna = Tibanna(env=env)
    meta = get_metadata(accession,
                        key=tibanna.ff_keys,
                        ff_env=env,
                        add_on='frame=object',
                        check_queue=True)
    if meta:
        return meta.get('status', '') == 'uploading'
    else:
        return False
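The env guess relies on the bucket naming convention used throughout these examples; a small worked example:

# worked example of the env guess above, using a bucket name from these examples
bucket = 'elasticbeanstalk-fourfront-webdev-wfoutput'
env = '-'.join(bucket.split('-')[1:3])
assert env == 'fourfront-webdev'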
Example 12
def test_process_input_file_info(run_awsem_event_data):
    input_file = {
        "bucket_name": "elasticbeanstalk-fourfront-webdev-wfoutput",
        "workflow_argument_name": "input_pairs",
        "uuid": [
            "d2c897ec-bdb2-47ce-b1b1-845daccaa571",
            "d2c897ec-bdb2-47ce-b1b1-845daccaa571"
        ],
        "object_key": ["4DNFI25JXLLI.pairs.gz", "4DNFI25JXLLI.pairs.gz"]
    }
    args = {'input_files': {"some_input": {}, "some_other_input": {}}}
    data = run_awsem_event_data
    tibanna_settings = data.get('_tibanna', {})
    # env is taken from the test event's tibanna settings
    env = tibanna_settings.get('env')
    # Tibanna provides access to keys based on env
    tibanna = Tibanna(env,
                      ff_keys=data.get('ff_keys'),
                      settings=tibanna_settings)
    process_input_file_info(input_file, tibanna.ff_keys, tibanna.env, args)
    assert len(args['input_files']) == 3
    assert 'secondary_files' in args
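For reference, a sketch of the args shape those assertions imply, borrowing the input_pairs structure from Example 4; the exact contents of 'secondary_files' are an assumption:

# assumed post-call shape of args; the 'input_pairs' entry is borrowed from Example 4
args = {
    'input_files': {
        'some_input': {},
        'some_other_input': {},
        'input_pairs': {
            'bucket': 'elasticbeanstalk-fourfront-webdev-wfoutput',
            'object_key': [
                'd2c897ec-bdb2-47ce-b1b1-845daccaa571/4DNFI25JXLLI.pairs.gz',
                'd2c897ec-bdb2-47ce-b1b1-845daccaa571/4DNFI25JXLLI.pairs.gz'
            ]
        }
    },
    'secondary_files': {}  # present per the assertion; contents are a guess
}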
Example 13
def batch_md5(ctx, env, batch_size=20):
    '''
    try to run md5 on everything that still needs it
    '''
    tibanna = Tibanna(env=env)
    file_bucket = tibanna.s3.outfile_bucket.replace('wfoutput', 'files')
    tibanna.s3.outfile_bucket = file_bucket
    uploaded_files = get_files_to_match(tibanna,
                                        "search/?type=File&status=uploading",
                                        frame="embedded")

    limited_files = uploaded_files['@graph']

    files_processed = 0
    total_files = len(limited_files)
    skipped_files = 0
    for ufile in limited_files:
        if files_processed >= batch_size:
            print("we have done enough here")
            sys.exit(0)

        if not tibanna.s3.does_key_exist(ufile.get('upload_key')):
            print("******** no file for %s on s3, can't run md5, skipping" %
                  ufile.get('accession'))
            skipped_files += 1
            continue

        print("running md5 for %s" % ufile.get('accession'))
        run_md5(ctx, env, ufile.get('accession'), ufile.get('uuid'))
        files_processed += 1
        sleep(10)
        if files_processed % 10 == 0:
            sleep(60)

    print("Total Files: %s, Processed Files: %s, Skipped Files: %s" %
          (total_files, files_processed, skipped_files))
Example 14
def real_handler(event, context):
    '''
    This is a generic function to run an awsem workflow
    based on the data passed in.

    workflow_uuid : for now, pass this in. Later we can add code to
    automatically retrieve it from app_name.
    Note that multiple workflow_uuids can be available for an app_name
    (different versions of the same app could have different uuids).
    '''
    # get incoming data
    input_file_list = event.get('input_files')
    for infile in input_file_list:
        if not infile:
            raise Exception("malformed input, check your input_files")
    app_name = event.get('app_name')
    print(app_name)
    workflow_uuid = event.get('workflow_uuid')
    output_bucket = event.get('output_bucket')
    parameters = ff_utils.convert_param(event.get('parameters'), True)
    tibanna_settings = event.get('_tibanna', {})
    tag = event.get('tag')
    # if env is not passed in, guess it from output_bucket
    try:
        env = tibanna_settings.get('env',
                                   '-'.join(output_bucket.split('-')[1:-1]))
        # Tibanna provides access to keys based on env
        tibanna = Tibanna(env,
                          ff_keys=event.get('ff_keys'),
                          settings=tibanna_settings)
    except Exception as e:
        raise TibannaStartException("%s" % e)

    args = dict()

    # get argument format & type info from workflow
    workflow_info = ff_utils.get_metadata(workflow_uuid,
                                          key=tibanna.ff_keys,
                                          ff_env=tibanna.env,
                                          add_on='frame=object')
    print("workflow info  %s" % workflow_info)
    LOG.info("workflow info  %s" % workflow_info)
    if 'error' in workflow_info.get('@type', []):
        raise Exception("FATAL, can't lookup workflow info for %s fourfront" %
                        workflow_uuid)

    # get cwl info from workflow_info
    for k in [
            'app_name', 'app_version', 'cwl_directory_url',
            'cwl_main_filename', 'cwl_child_filenames'
    ]:
        print(workflow_info.get(k))
        LOG.info(workflow_info.get(k))
        args[k] = workflow_info.get(k)
    if not args['cwl_child_filenames']:
        args['cwl_child_filenames'] = []

    # switch to v1 if available
    if 'cwl_directory_url_v1' in workflow_info:  # use CWL v1
        args['cwl_directory_url'] = workflow_info['cwl_directory_url_v1']
        args['cwl_version'] = 'v1'
    else:
        args['cwl_version'] = 'draft3'

    # create the ff_meta output info
    input_files = []
    for input_file in input_file_list:
        for idx, uuid in enumerate(ensure_list(input_file['uuid'])):
            input_files.append({
                'workflow_argument_name': input_file['workflow_argument_name'],
                'value': uuid,
                'ordinal': idx + 1
            })
    print("input_files is %s" % input_files)
    LOG.info("input_files is %s" % input_files)

    # input file args for awsem
    for input_file in input_file_list:
        process_input_file_info(input_file, tibanna.ff_keys, tibanna.env, args)

    # source experiments
    input_file_uuids = [_['uuid'] for _ in input_file_list]
    pf_source_experiments = merge_source_experiments(input_file_uuids,
                                                     tibanna.ff_keys,
                                                     tibanna.env)

    # processed file metadata
    output_files, pf_meta = handle_processed_files(
        workflow_info,
        tibanna,
        pf_source_experiments,
        custom_fields=event.get('custom_pf_fields'),
        user_supplied_output_files=event.get('output_files'))
    print("output files= %s" % str(output_files))

    # 4DN dcic award and lab are used here, unless provided in wfr_meta
    ff_meta = create_ffmeta_awsem(
        workflow_uuid,
        app_name,
        input_files,
        tag=tag,
        run_url=tibanna.settings.get('url', ''),
        output_files=output_files,
        parameters=parameters,
        extra_meta=event.get('wfr_meta'),
    )

    print("ff_meta is %s" % ff_meta.__dict__)
    LOG.info("ff_meta is %s" % ff_meta.__dict__)

    # store metadata so we know the run has started
    ff_meta.post(key=tibanna.ff_keys)

    # parameters
    args['input_parameters'] = event.get('parameters')

    # output target
    args['output_target'] = dict()
    args['secondary_output_target'] = dict()
    for of in ff_meta.output_files:
        arg_name = of.get('workflow_argument_name')
        if of.get('type') == 'Output processed file':
            args['output_target'][arg_name] = of.get('upload_key')
        else:
            args['output_target'][arg_name] = ff_meta.uuid + '/' + arg_name
        if 'secondary_file_formats' in of:
            # collect the upload keys of all extra (secondary) files
            args['secondary_output_target'][arg_name] \
                = [_.get('upload_key') for _ in of.get('extra_files', [{}, ])]

    # output bucket
    args['output_S3_bucket'] = event.get('output_bucket')

    # initialize config parameters as null for benchmarking
    config = event['config']
    if 'instance_type' not in config:
        config['instance_type'] = ''
    if 'EBS_optimized' not in config:
        config['EBS_optimized'] = ''
    if 'ebs_size' not in config:
        config['ebs_size'] = 0

    event.update({
        "ff_meta": ff_meta.as_dict(),
        'pf_meta': [meta.as_dict() for meta in pf_meta],
        "_tibanna": tibanna.as_dict(),
        "args": args
    })
    return event
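To make the output_target rules above concrete, a small runnable sketch with two hypothetical output files:

# worked example of the output_target mapping above; values are hypothetical
ff_meta_uuid = 'run-uuid'
outputs = [
    {'workflow_argument_name': 'out_pairs', 'type': 'Output processed file',
     'upload_key': 'file-uuid/4DNFIEXAMPLE.pairs.gz'},
    {'workflow_argument_name': 'report', 'type': 'Output report file'},
]
output_target = {}
for of in outputs:
    arg_name = of.get('workflow_argument_name')
    if of.get('type') == 'Output processed file':
        output_target[arg_name] = of.get('upload_key')  # processed files keep their upload_key
    else:
        output_target[arg_name] = ff_meta_uuid + '/' + arg_name  # others go under the run uuid
assert output_target == {'out_pairs': 'file-uuid/4DNFIEXAMPLE.pairs.gz',
                         'report': 'run-uuid/report'}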
Example 15
def handler(event, context):
    '''
    This is a generic function to run an sbg workflow
    based on the data passed in.

    workflow_uuid : for now, pass this in. Later we can add code to
    automatically retrieve it from app_name.
    Note that multiple workflow_uuids can be available for an app_name
    (different versions of the same app could have different uuids).
    '''
    # get incoming data
    input_file_list = event.get('input_files')
    app_name = event.get('app_name')
    parameter_dict = event.get('parameters')
    workflow_uuid = event.get('workflow_uuid')
    output_bucket = event.get('output_bucket')
    tibanna_settings = event.get('_tibanna', {})
    # if env is not passed in, guess it from output_bucket
    env = tibanna_settings.get('env', '-'.join(output_bucket.split('-')[1:-1]))
    # Tibanna provides access to keys based on env
    tibanna = Tibanna(env,
                      s3_keys=event.get('s3_keys'),
                      ff_keys=event.get('ff_keys'),
                      settings=tibanna_settings)

    LOG.info("input data is %s" % event)
    # represents the SBG info we need
    sbg = sbg_utils.create_sbg_workflow(app_name, tibanna.sbg_keys)
    LOG.info("sbg is %s" % sbg.__dict__)

    # represents the workflow metadata to be stored in fourfront
    parameters, _ = sbg_utils.to_sbg_workflow_args(parameter_dict,
                                                   vals_as_string=True)

    # get argument format & type info from workflow
    workflow_info = ff_utils.get_metadata(workflow_uuid, key=tibanna.ff_keys)
    LOG.info("workflow info  %s" % workflow_info)
    if 'error' in workflow_info.get('@type', []):
        raise Exception("FATAL, can't lookupt workflow info for % fourfront" %
                        workflow_uuid)

    # workflow_info has a key 'arguments' whose value is a list of
    # { 'workflow_argument_name': ..., 'argument_type': ..., 'argument_format': ... }

    # get format-extension map; default to empty so a lookup failure below
    # doesn't leave fe_map unbound
    fe_map = {}
    try:
        fp_schema = ff_utils.get_metadata("profiles/file_processed.json",
                                          key=tibanna.ff_keys)
        fe_map = fp_schema.get('file_format_file_extension')
    except Exception as e:
        LOG.error(
            "Can't get format-extension map from file_processed schema. %s\n" %
            e)

    # processed file metadata
    output_files = []
    pf_meta = []  # initialized here so the return below works without 'arguments'
    try:
        if 'arguments' in workflow_info:
            for arg in workflow_info.get('arguments'):
                if (arg.get('argument_type') in [
                        'Output processed file', 'Output report file',
                        'Output QC file'
                ]):

                    of = dict()
                    of['workflow_argument_name'] = arg.get(
                        'workflow_argument_name')
                    of['type'] = arg.get('argument_type')
                    if 'argument_format' in arg:
                        # an argument_format means this is a processed file;
                        # report and QC files don't have one
                        pf = ff_utils.ProcessedFileMetadata(
                            file_format=arg.get('argument_format'))
                        resp = None
                        try:
                            resp = pf.post(
                                key=tibanna.ff_keys
                            )  # actually post processed file metadata here
                            resp = resp.get('@graph')[0]
                            of['upload_key'] = resp.get('upload_key')
                            of['value'] = resp.get('uuid')
                        except Exception as e:
                            LOG.error(
                                "Failed to post Processed file metadata. %s\n"
                                % e)
                            LOG.error("resp: " + str(resp) + "\n")
                            raise e
                        of['format'] = arg.get('argument_format')
                        of['extension'] = fe_map.get(
                            arg.get('argument_format'))
                        pf_meta.append(pf)
                    output_files.append(of)

    except Exception as e:
        LOG.error("output_files = " + str(output_files) + "\n")
        LOG.error("Can't prepare output_files information. %s\n" % e)
        raise e

    # create the ff_meta output info
    input_files = []
    for input_file in input_file_list:
        for idx, uuid in enumerate(ensure_list(input_file['uuid'])):
            input_files.append({
                'workflow_argument_name':
                input_file['workflow_argument_name'],
                'value':
                uuid,
                'ordinal':
                idx + 1
            })
    LOG.info("input_files is %s" % input_files)

    ff_meta = ff_utils.create_ffmeta(sbg,
                                     workflow_uuid,
                                     input_files,
                                     parameters,
                                     run_url=tibanna.settings.get('url', ''),
                                     output_files=output_files)
    LOG.info("ff_meta is %s" % ff_meta.__dict__)

    # store metadata so we know the run has started
    ff_meta.post(key=tibanna.ff_keys)

    # mount all input files to sbg; this also updates sbg to store the import_ids
    for infile in input_file_list:
        imps = mount_on_sbg(infile, tibanna.s3_keys, sbg)
        infile['import_ids'] = imps

    # create a link to the output directory as well
    if output_bucket:
        sbg_volume = sbg_utils.create_sbg_volume_details()
        res = sbg.create_volumes(sbg_volume,
                                 output_bucket,
                                 public_key=tibanna.s3_keys['key'],
                                 secret_key=tibanna.s3_keys['secret'])
        vol_id = res.get('id')
        if not vol_id:
            # we got an error
            raise Exception("Unable to mount output volume, error is %s " %
                            res)
        sbg.output_volume_id = vol_id

    # let's not pass keys in plain text parameters
    return {
        "input_file_args": input_file_list,
        "workflow": sbg.as_dict(),
        "ff_meta": ff_meta.as_dict(),
        'pf_meta': [meta.as_dict() for meta in pf_meta],
        "_tibanna": tibanna.as_dict(),
        "parameter_dict": parameter_dict
    }
Example 16
def test_tibanna():
    data = {'env': 'fourfront-webdev', 'settings': {'1': '1'}}
    tibanna = Tibanna(**data)
    assert tibanna
    assert tibanna.as_dict() == data