Example 1
def check_mismatch_and_update(x, original_x, fieldname):
    if check_mismatch(x, original_x):
        raise Exception(fieldname + " not matching the original one")
    if x and not original_x:
        new_content[fieldname] = x
    printlog("check_mismatch_and_update: new_content = %s" %
             str(new_content))
Example 2
def update_ffmeta_from_awsemfile(awsemfile,
                                 ff_meta,
                                 tibanna,
                                 custom_qc_fields=None):
    patch_meta = False
    upload_key = awsemfile.key
    status = awsemfile.status
    printlog("awsemfile res is %s" % status)
    if status == 'COMPLETED':
        patch_meta = OUTFILE_UPDATERS[awsemfile.argument_type](
            'uploaded',
            awsemfile,
            ff_meta,
            tibanna,
            other_fields=custom_qc_fields)
    elif status in ['FAILED']:
        patch_meta = OUTFILE_UPDATERS[awsemfile.argument_type](
            'upload failed',
            awsemfile,
            ff_meta,
            tibanna,
            other_fields=custom_qc_fields)
        ff_meta.run_status = 'error'
        ff_meta.patch(key=tibanna.ff_keys)
        raise Exception("Failed to export file %s" % (upload_key))
    return patch_meta
Example 3
def create_and_post_processed_file(ff_keys,
                                   file_format,
                                   secondary_file_formats,
                                   source_experiments=None,
                                   other_fields=None):
    printlog(file_format)
    if not file_format:
        raise Exception("file format for processed file must be provided")
    if secondary_file_formats:
        extra_files = [{
            "file_format": parse_formatstr(v)
        } for v in secondary_file_formats]
    else:
        extra_files = None
    pf = ProcessedFileMetadata(file_format=file_format,
                               extra_files=extra_files,
                               source_experiments=source_experiments,
                               other_fields=other_fields)
    # actually post processed file metadata here
    resp = pf.post(key=ff_keys)
    if resp and '@graph' in resp:
        resp = resp.get('@graph')[0]
    else:
        raise Exception("Failed to post Processed file metadata.\n")
    return pf, resp
Example 4
def handle_postrun_json(bucket_name,
                        jobid,
                        event,
                        raise_error=True,
                        filesystem=None):
    postrunjson = "%s.postrun.json" % jobid
    if not does_key_exist(bucket_name, postrunjson):
        if raise_error:
            postrunjson_location = "https://s3.amazonaws.com/%s/%s" % (
                bucket_name, postrunjson)
            raise Exception("Postrun json not found at %s" %
                            postrunjson_location)
        return None
    postrunjsoncontent = json.loads(read_s3(bucket_name, postrunjson))
    if 'instance_id' in event:
        update_postrun_json(postrunjsoncontent, event['instance_id'],
                            filesystem)
    printlog("inside funtion handle_postrun_json")
    printlog("content=\n" + json.dumps(postrunjsoncontent, indent=4))
    try:
        boto3.client('s3').put_object(Bucket=bucket_name,
                                      Key=postrunjson,
                                      Body=json.dumps(postrunjsoncontent,
                                                      indent=4).encode())
    except Exception as e:
        raise "error in updating postrunjson %s" % str(e)
    add_postrun_json(postrunjsoncontent, event,
                     RESPONSE_JSON_CONTENT_INCLUSION_LIMIT)
Example 5
def test_tmp(update_ffmeta_tmpdata, tibanna_env):
    update_ffmeta_tmpdata.update(tibanna_env)
    with mock.patch('core.pony_utils.patch_metadata') as mock_request:
        ret = real_handler(update_ffmeta_tmpdata, None)
        assert mock_request.call_count == 3
    printlog(ret)
    # once for patch pf once for workflow run
    assert ret
Example 6
def test_register_to_higlass3(used_env):
    bucket = 'elasticbeanstalk-fourfront-webdev-wfoutput'
    bigbed_key = 'a34d5ea5-eada-4def-a4a7-c227b0d32395/4DNFIC624FKJ.bb'
    tibanna = Tibanna(used_env)
    with mock.patch('requests.post') as mock_request:
        res = register_to_higlass(tibanna, bucket, bigbed_key, 'bigwig', 'vector')
        mock_request.assert_called_once()
    printlog(res)
    assert res
Example 7
def test_register_to_higlass2(used_env):
    bucket = 'elasticbeanstalk-fourfront-webdev-wfoutput'
    bigwig_key = 'a940cf00-6001-473e-80d1-1e4a43866863/4DNFI75GAT6T.bw'
    tibanna = Tibanna(used_env)
    with mock.patch('requests.post') as mock_request:
        res = register_to_higlass(tibanna, bucket, bigwig_key, 'bigwig', 'vector')
        mock_request.assert_called_once()
        printlog(res)
        assert res
Example 8
def output_target_for_input_extra(target_inf,
                                  of,
                                  tibanna,
                                  overwrite_input_extra=False):
    extrafileexists = False
    printlog("target_inf = %s" % str(target_inf))  # debugging
    target_inf_meta = ff_utils.get_metadata(target_inf.get('value'),
                                            key=tibanna.ff_keys,
                                            ff_env=tibanna.env,
                                            add_on='frame=object',
                                            check_queue=True)
    target_format = parse_formatstr(of.get('format'))
    if target_inf_meta.get('extra_files'):
        for exf in target_inf_meta.get('extra_files'):
            if parse_formatstr(exf.get('file_format')) == target_format:
                extrafileexists = True
                if overwrite_input_extra:
                    exf['status'] = 'to be uploaded by workflow'
                break
        if not extrafileexists:
            new_extra = {
                'file_format': target_format,
                'status': 'to be uploaded by workflow'
            }
            target_inf_meta['extra_files'].append(new_extra)
    else:
        new_extra = {
            'file_format': target_format,
            'status': 'to be uploaded by workflow'
        }
        target_inf_meta['extra_files'] = [new_extra]
    if overwrite_input_extra or not extrafileexists:
        # first patch metadata
        printlog("extra_files_to_patch: %s" %
                 str(target_inf_meta.get('extra_files')))  # debugging
        ff_utils.patch_metadata(
            {'extra_files': target_inf_meta.get('extra_files')},
            target_inf.get('value'),
            key=tibanna.ff_keys,
            ff_env=tibanna.env)
        # target key
        # NOTE: the target bucket is assumed to be the same as the output bucket,
        # i.e. the bucket for the input file should be the same as the output bucket,
        # which is true if both input and output are processed files.
        orgfile_key = target_inf_meta.get('upload_key')
        orgfile_format = parse_formatstr(target_inf_meta.get('file_format'))
        fe_map = FormatExtensionMap(tibanna.ff_keys)
        printlog("orgfile_key = %s" % orgfile_key)
        printlog("orgfile_format = %s" % orgfile_format)
        printlog("target_format = %s" % target_format)
        target_key = get_extra_file_key(orgfile_format, orgfile_key,
                                        target_format, fe_map)
        return target_key
    else:
        raise Exception(
            "input already has the extra file: use 'overwrite_input_extra': true to overwrite it")
Example 9
def as_dict(self):
    d = self.__dict__.copy()
    printlog(d)
    del d['client']
    del d['starttimes']
    del d['endtimes']
    del d['starttime']
    del d['endtime']
    del d['filesystem']
    del d['instance_id']
    return d
Example 10
def create_ffmeta_input_files_from_pony_input_file_list(input_file_list):
    input_files_for_ffmeta = []
    for input_file in input_file_list:
        dim = flatten(create_dim(input_file['uuid']))
        if not dim:  # singlet
            dim = '0'
        uuid = flatten(input_file['uuid'])
        ordinal = create_ordinal(uuid)
        for d, u, o in zip(aslist(dim), aslist(uuid), aslist(ordinal)):
            infileobj = InputFileForWFRMeta(
                input_file['workflow_argument_name'], u, o,
                input_file.get('format_if_extra', ''), d)
            input_files_for_ffmeta.append(infileobj.as_dict())
    printlog("input_files_for_ffmeta is %s" % input_files_for_ffmeta)
    return input_files_for_ffmeta
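A minimal sketch of the input this helper expects, inferred from the code above and from the test in Example 17; the uuid can be a single value or a (possibly nested) list, and the values here are placeholders:

input_file_list = [{
    'workflow_argument_name': 'input_pairs',           # argument name in the workflow
    'uuid': ['d2c897ec-bdb2-47ce-b1b1-845daccaa571'],   # one or more file uuids (may be nested)
    'format_if_extra': ''                               # set only when the file is an extra file
}]
# input_files_for_ffmeta = create_ffmeta_input_files_from_pony_input_file_list(input_file_list)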
Example 11
def __init__(self, ff_keys):
    try:
        printlog("Searching in server : " + ff_keys['server'])
        ffe_all = search_metadata("/search/?type=FileFormat&frame=object",
                                  key=ff_keys)
    except Exception as e:
        raise Exception("Can't get the list of FileFormat objects. %s\n" %
                        e)
    self.fe_dict = dict()
    printlog("**ffe_all = " + str(ffe_all))
    for k in ffe_all:
        file_format = k['file_format']
        self.fe_dict[file_format] = \
            {'standard_extension': k['standard_file_extension'],
             'other_allowed_extensions': k.get('other_allowed_extensions', []),
             'extrafile_formats': k.get('extrafile_formats', [])
             }
Example 12
def handler(event, context):
    # fix non json-serializable datetime eventTime
    if 'Records' in event and 'eventTime' in event['Records'][0]:
        event["Records"][0]["eventTime"] = str(event["Records"][0]["eventTime"])

    upload_key = event['Records'][0]['s3']['object']['key']
    accession = upload_key.split('/')[1].split('.')[0]
    if not accession.startswith('4DN'):
        printlog("Skipping trigger: not 4DN accession %s" % accession)
        return event
    client = boto3.client('stepfunctions', region_name=AWS_REGION)
    response = client.start_execution(
        stateMachineArn=STEP_FUNCTION_ARN(INITIATOR_STEP_FUNCTION_NAME),
        name=accession + '_' + str(uuid.uuid4()),
        input=json.dumps(event),
    )
    printlog(str(response))
    return event
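For reference, a hedged sketch of the S3 notification event this trigger reads; only the fields the handler actually touches are shown, and the key is a placeholder:

event = {
    'Records': [{
        'eventTime': '2019-01-01T00:00:00.000Z',                      # datetime stringified above
        's3': {'object': {'key': 'some-uuid/4DNFIXXXXXXX.pairs.gz'}}  # upload_key read by the handler
    }]
}
# upload_key -> 'some-uuid/4DNFIXXXXXXX.pairs.gz', accession -> '4DNFIXXXXXXX'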
Example 13
def add_md5_filesize_to_pf_extra(pf, awsemfile):
    printlog("awsemfile.is_extra=%s" % awsemfile.is_extra)
    if awsemfile.is_extra:
        for pfextra in pf.extra_files:
            printlog("pfextra : %s" % str(pfextra))
            printlog("awsemfile.format_if_extra : %s" %
                     awsemfile.format_if_extra)
            if pfextra.get('file_format') == awsemfile.format_if_extra:
                if awsemfile.md5:
                    pfextra['md5sum'] = awsemfile.md5
                if awsemfile.filesize:
                    pfextra['file_size'] = awsemfile.filesize
        printlog("add_md5_filesize_to_pf_extra: %s" % pf.extra_files)
Example 14
def get_file_format(event):
    '''if the file extension matches the regular file format,
    returns (format, None)
    if it matches one of the format of an extra file,
    returns (format (e.g. 'pairs_px2'), 'extra')
    '''
    # guess env from bucket name
    bucket = event['Records'][0]['s3']['bucket']['name']
    env = '-'.join(bucket.split('-')[1:3])
    if env == 'fourfront-webprod':
        env = 'data'
    upload_key = event['Records'][0]['s3']['object']['key']
    uuid, object_key = upload_key.split('/')
    accession = object_key.split('.')[0]
    extension = object_key.replace(accession + '.', '')

    try:
        tibanna = Tibanna(env=env)
    except Exception as e:
        raise TibannaStartException("%s" % e)
    file_format, extra_formats = get_fileformats_for_accession(
        accession, tibanna.ff_keys, env)
    if file_format:
        fe_map = FormatExtensionMap(tibanna.ff_keys)
        printlog(fe_map)
        if extension == fe_map.get_extension(file_format):
            return (file_format, None)
        elif extension in fe_map.get_other_extensions(file_format):
            return (file_format, None)
        else:
            for extra_format in extra_formats:
                if extension == fe_map.get_extension(extra_format):
                    return (extra_format, 'extra')
                elif extension in fe_map.get_other_extensions(extra_format):
                    return (extra_format, 'extra')
        raise Exception(
            "file extension not matching: %s vs %s (%s)" %
            (extension, fe_map.get_extension(file_format), file_format))
    else:
        raise Exception("Cannot get input metadata")
Example 15
def create_wfr_output_files_and_processed_files(
        wf_meta,
        tibanna,
        pf_source_experiments=None,
        custom_fields=None,
        user_supplied_output_files=None):
    output_files = []
    pf_meta = []
    arg_type_list = [
        'Output processed file', 'Output report file', 'Output QC file',
        'Output to-be-extra-input file'
    ]
    for arg in wf_meta.get('arguments', []):
        printlog("processing arguments %s" % str(arg))
        if arg.get('argument_type') in arg_type_list:
            if user_supplied_output_files:
                pf, resp = user_supplied_proc_file(
                    user_supplied_output_files,
                    arg.get('workflow_argument_name'), tibanna)
                printlog(
                    "proc_file_for_arg_name returned %s \nfrom ff result of\n %s"
                    % (str(pf.__dict__), str(resp)))
            else:
                if arg.get('argument_type', '') == 'Output processed file':
                    argname = arg.get('workflow_argument_name')
                    pf, resp = create_and_post_processed_file(
                        tibanna.ff_keys, arg.get('argument_format', ''),
                        arg.get('secondary_file_formats', []),
                        pf_source_experiments,
                        parse_custom_fields(custom_fields, argname))
                else:
                    pf = None
                    resp = dict()
            of = create_wfr_outputfiles(arg, resp)
            if pf:
                pf_meta.append(pf)
            if of:
                output_files.append(of.as_dict())
    return output_files, pf_meta
Example 16
def register_to_higlass(tibanna, awsemfile_bucket, awsemfile_key, filetype,
                        datatype):
    payload = {
        "filepath": awsemfile_bucket + "/" + awsemfile_key,
        "filetype": filetype,
        "datatype": datatype
    }
    higlass_keys = tibanna.s3.get_higlass_key()
    if not isinstance(higlass_keys, dict):
        raise Exception("Bad higlass keys found: %s" % higlass_keys)
    auth = (higlass_keys['key'], higlass_keys['secret'])
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    res = requests.post(higlass_keys['server'] + '/api/v1/link_tile/',
                        data=json.dumps(payload),
                        auth=auth,
                        headers=headers)
    printlog("LOG resiter_to_higlass(POST request response): " +
             str(res.json()))
    return res.json()['uuid']
Example 17
def test_merge_source_experiment(run_awsem_event_data):
    input_file = {
        "bucket_name":
        "elasticbeanstalk-fourfront-webdev-wfoutput",
        "workflow_argument_name":
        "input_pairs",
        "uuid": [
            "d2c897ec-bdb2-47ce-b1b1-845daccaa571",
            "d2c897ec-bdb2-47ce-b1b1-845daccaa571"
        ],
        "object_key": ["4DNFI25JXLLI.pairs.gz", "4DNFI25JXLLI.pairs.gz"]
    }
    data = run_awsem_event_data
    tibanna_settings = data.get('_tibanna', {})
    # if they don't pass in env guess it from output_bucket
    env = tibanna_settings.get('env')
    # tibanna provides access to keys based on env and stuff like that
    tibanna = Tibanna(env,
                      ff_keys=data.get('ff_keys'),
                      settings=tibanna_settings)
    res = merge_source_experiments(input_file['uuid'], tibanna.ff_keys,
                                   tibanna.env)
    printlog(res)
    assert 'fake_source_experiment' in res
Example 18
def user_supplied_proc_file(user_supplied_output_files, arg_name, tibanna):
    if not user_supplied_output_files:
        raise Exception("user supplied processed files missing\n")
    of = [
        output for output in user_supplied_output_files
        if output.get('workflow_argument_name') == arg_name
    ]
    if of:
        if len(of) > 1:
            raise Exception(
                "multiple output files supplied with same workflow_argument_name"
            )
        of = of[0]
        return ProcessedFileMetadata.get(of.get('uuid'),
                                         tibanna.ff_keys,
                                         tibanna.env,
                                         return_data=True)
    else:
        printlog("no output_files found in input_json matching arg_name")
        printlog("user_supplied_output_files: %s" %
                 str(user_supplied_output_files))
        printlog("arg_name: %s" % str(arg_name))
        printlog("tibanna is %s" % str(tibanna))
        raise Exception("user supplied processed files missing\n")
Example 19
def md5_updater(status, awsemfile, ff_meta, tibanna, **kwargs):
    # get key
    ff_key = tibanna.ff_keys
    # get metadata about original input file
    accession = awsemfile.runner.get_file_accessions('input_file')[0]
    format_if_extras = awsemfile.runner.get_format_if_extras('input_file')
    original_file = ff_utils.get_metadata(accession,
                                          key=ff_key,
                                          ff_env=tibanna.env,
                                          add_on='frame=object',
                                          check_queue=True)
    if status.lower() == 'uploaded':  # md5 report file is uploaded
        md5, content_md5 = parse_md5_report(awsemfile.read())
        # add file size to input file metadata
        input_file = awsemfile.runner.input_files()[0]
        file_size = boto3.client('s3').head_object(Bucket=input_file.bucket,
                                                   Key=input_file.key).get(
                                                       'ContentLength', '')
        new_file = None
        for format_if_extra in format_if_extras:
            printlog("format_if_extra : %s" % format_if_extra)
            new_file = _md5_updater(original_file, md5, content_md5,
                                    format_if_extra, file_size)
            if new_file:
                break
        printlog("new_file = %s" % str(new_file))
        if new_file:
            try:
                resp = ff_utils.patch_metadata(new_file, accession, key=ff_key)
                printlog(resp)
            except Exception as e:
                # TODO specific exception
                # if patch fails try to patch workflow status as failed
                raise e
    else:
        pass
    # nothing to patch to ff_meta
    return None
Example 20
def real_handler(event, context):
    # check the status and other details of import
    '''
    this is to check if the task run is done:
    http://docs.sevenbridges.com/reference#get-task-execution-details
    '''
    # get data
    # used to automatically determine the environment
    tibanna_settings = event.get('_tibanna', {})
    try:
        tibanna = Tibanna(tibanna_settings['env'], settings=tibanna_settings)
    except Exception as e:
        raise TibannaStartException("%s" % e)
    ff_meta = create_ffmeta_awsem(
        app_name=event.get('ff_meta').get('awsem_app_name'),
        **event.get('ff_meta'))

    if event.get('error', False):
        ff_meta.run_status = 'error'
        ff_meta.description = event.get('error')
        patch_res = ff_meta.patch(key=tibanna.ff_keys)
        printlog("patch response: " + str(patch_res))
        # sending a notification email before throwing error
        if 'email' in event['config'] and event['config']['email']:
            try:
                send_notification_email(
                    event['_tibanna']['settings']['run_name'], event['jobid'],
                    ff_meta.run_status, event['_tibanna']['settings']['url'])
            except Exception as e:
                printlog("Cannot send email: %s" % e)
        raise Exception(event.get('error'))

    metadata_only = event.get('metadata_only', False)

    pf_meta = [ProcessedFileMetadata(**pf) for pf in event.get('pf_meta')]
    custom_qc_fields = event.get('custom_qc_fields', None)

    # ensure this bad boy is always initialized
    awsem = Awsem(event)
    # go through this and replace awsemfile_report with awsf format
    # actually interface should be look through ff_meta files and call
    # give me the status of this thing from the runner, and runner.output_files.length
    # so we just build a runner with interface to sbg and awsem
    # runner.output_files.length()
    # runner.output_files.file.status
    # runner.output_files.file.loc
    # runner.output_files.file.get

    awsem_output = awsem.output_files()
    awsem_output_extra = awsem.secondary_output_files()
    ff_output = len(ff_meta.output_files)
    if len(awsem_output) != ff_output:
        ff_meta.run_status = 'error'
        ff_meta.description = "%d files output expected %s" % (
            ff_output, len(awsem_output))
        ff_meta.patch(key=tibanna.ff_keys)
        raise Exception(
            "Failing the workflow because output files = %d and ffmeta = %d"
            % (len(awsem_output), ff_output))

    def update_metadata_from_awsemfile_list(awsemfile_list):
        patch_meta = False
        for awsemfile in awsemfile_list:
            patch_meta = update_ffmeta_from_awsemfile(awsemfile, ff_meta,
                                                      tibanna,
                                                      custom_qc_fields)
            if not metadata_only:
                update_pfmeta_from_awsemfile(awsemfile, pf_meta, tibanna)
        # allow for a simple way for updater to add appropriate meta_data
        if patch_meta:
            ff_meta.__dict__.update(patch_meta)

    update_metadata_from_awsemfile_list(awsem_output)
    update_metadata_from_awsemfile_list(awsem_output_extra)

    # if we got all the awsemfiles let's go ahead and update our ff_metadata object
    ff_meta.run_status = "complete"

    # add postrunjson log file to ff_meta as a url
    ff_meta.awsem_postrun_json = get_postrunjson_url(event)

    # make all the file awsemfile meta-data stuff here
    # TODO: fix bugs with ff_meta mapping for output and input file
    try:
        ff_meta.patch(key=tibanna.ff_keys)
    except Exception as e:
        raise Exception("Failed to update run_status %s" % str(e))
    # patch processed files - update only status, extra_files, md5sum and file_size
    if pf_meta:
        patch_fields = [
            'uuid', 'status', 'extra_files', 'md5sum', 'file_size',
            'higlass_uid'
        ]
        try:
            for pf in pf_meta:
                printlog(pf.as_dict())
                pf.patch(key=tibanna.ff_keys, fields=patch_fields)
        except Exception as e:
            raise Exception("Failed to update processed metadata %s" % str(e))

    event['ff_meta'] = ff_meta.as_dict()
    event['pf_meta'] = [_.as_dict() for _ in pf_meta]

    # sending a notification email after the job finishes
    if 'email' in event['config'] and event['config']['email']:
        try:
            send_notification_email(event['_tibanna']['settings']['run_name'],
                                    event['jobid'],
                                    event['ff_meta']['run_status'],
                                    event['_tibanna']['settings']['url'])
        except Exception as e:
            printlog("Cannot send email: %s" % e)

    return event
Example 21
def launch_and_get_instance_id(launch_args,
                               jobid,
                               spot_instance=None,
                               spot_duration=None,
                               behavior_on_capacity_limit='fail'):
    try:  # capturing stdout from the launch command
        os.environ[
            'AWS_DEFAULT_REGION'] = 'us-east-1'  # necessary? not sure just put it in there
        ec2 = boto3.client('ec2')
    except Exception as e:
        raise Exception("Failed to create a client for EC2")

    if spot_instance:
        spot_options = {
            'SpotInstanceType': 'one-time',
            'InstanceInterruptionBehavior': 'terminate'
        }
        if spot_duration:
            spot_options['BlockDurationMinutes'] = spot_duration
        launch_args.update({
            'InstanceMarketOptions': {
                'MarketType': 'spot',
                'SpotOptions': spot_options
            }
        })
    try:
        res = 0
        res = ec2.run_instances(**launch_args)
    except Exception as e:
        if 'InsufficientInstanceCapacity' in str(
                e) or 'InstanceLimitExceeded' in str(e):
            if behavior_on_capacity_limit == 'fail':
                errmsg = "Instance limit exception - use 'behavior_on_capacity_limit' option"
                errmsg += "to change the behavior to wait_and_retry, or retry_without_spot. %s" % str(
                    e)
                raise EC2InstanceLimitException(errmsg)
            elif behavior_on_capacity_limit == 'wait_and_retry':
                errmsg = "Instance limit exception - wait and retry later: %s" % str(
                    e)
                raise EC2InstanceLimitWaitException(errmsg)
            elif behavior_on_capacity_limit == 'retry_without_spot':
                if not spot_instance:
                    errmsg = "'behavior_on_capacity_limit': 'retry_without_spot' works only with"
                    errmsg += "'spot_instance' : true. %s" % str(e)
                    raise Exception(errmsg)
                del (launch_args['InstanceMarketOptions'])
                try:
                    res = ec2.run_instances(**launch_args)
                    printlog("trying without spot : %s" % str(res))
                except Exception as e2:
                    errmsg = "Instance limit exception without spot instance %s" % str(
                        e2)
                    raise EC2InstanceLimitException(errmsg)
        else:
            raise Exception(
                "failed to launch instance for job {jobid}: {log}. {err}".format(
                    jobid=jobid, log=res, err=str(e)))

    try:
        instance_id = res['Instances'][0]['InstanceId']
    except Exception as e:
        raise Exception(
            "failed to retrieve instance ID for job {jobid}".format(
                jobid=jobid))

    return instance_id
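A minimal sketch of launch_args as accepted by boto3 run_instances; the AMI ID and instance type below are placeholders, not values used by Tibanna:

launch_args = {
    'ImageId': 'ami-0123456789abcdef0',   # placeholder AMI
    'InstanceType': 't3.micro',           # placeholder instance type
    'MinCount': 1,
    'MaxCount': 1
}
# instance_id = launch_and_get_instance_id(launch_args, jobid='some-job-id',
#                                          spot_instance=True,
#                                          behavior_on_capacity_limit='wait_and_retry')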
Example 22
def read_s3(bucket, object_name):
    response = boto3.client('s3').get_object(Bucket=bucket, Key=object_name)
    printlog(str(response))
    return response['Body'].read()
Example 23
def handler(event, context):
    '''
    somewhere in the event data should be a jobid
    '''

    # s3 bucket that stores the output
    bucket_name = event['config']['log_bucket']

    # info about the jobby job
    jobid = event['jobid']
    job_started = "%s.job_started" % jobid
    job_success = "%s.success" % jobid
    job_error = "%s.error" % jobid

    # check to see ensure this job has started else fail
    if not does_key_exist(bucket_name, job_started):
        raise EC2StartingException(
            "Failed to find jobid %s, ec2 is probably still booting" % jobid)

    # check to see if job has error, report if so
    if does_key_exist(bucket_name, job_error):
        handle_postrun_json(bucket_name, jobid, event, False)
        raise AWSEMJobErrorException(
            "Job encountered an error check log using invoke log --job-id=%s" %
            jobid)

    # check to see if job has completed
    if does_key_exist(bucket_name, job_success):
        handle_postrun_json(bucket_name, jobid, event)
        print("completed successfully")
        return event

    # checking if instance is terminated for no reason
    instance_id = event.get('instance_id', '')
    if instance_id:  # skip test for instance_id by not giving it to event
        try:
            res = boto3.client('ec2').describe_instances(
                InstanceIds=[instance_id])
        except Exception as e:
            if 'InvalidInstanceID.NotFound' in str(e):
                raise EC2UnintendedTerminationException(
                    "EC2 is no longer found for job %s - please rerun." %
                    jobid)
            else:
                raise e
        if not res['Reservations']:
            raise EC2UnintendedTerminationException(
                "EC2 is no longer found for job %s - please rerun." % jobid)
        else:
            ec2_state = res['Reservations'][0]['Instances'][0]['State']['Name']
            if ec2_state in ['stopped', 'shutting-down', 'terminated']:
                errmsg = "EC2 is terminated unintendedly for job %s - please rerun." % jobid
                printlog(errmsg)
                raise EC2UnintendedTerminationException(errmsg)

        # check CPU utilization for the past hour
        filesystem = '/dev/nvme1n1'  # doesn't matter for cpu utilization
        end = datetime.now(tzutc())
        start = end - timedelta(hours=1)
        jobstart_time = boto3.client('s3').get_object(
            Bucket=bucket_name, Key=job_started).get('LastModified')
        if jobstart_time + timedelta(hours=1) < end:
            cw_res = TibannaResource(instance_id, filesystem, start,
                                     end).as_dict()
            if 'max_cpu_utilization_percent' in cw_res:
                if not cw_res['max_cpu_utilization_percent'] or cw_res[
                        'max_cpu_utilization_percent'] < 1.0:
                    # the instance wasn't terminated - otherwise it would have been captured in the previous error.
                    try:
                        boto3.client('ec2').terminate_instances(
                            InstanceIds=[instance_id])
                    except Exception as e:
                        errmsg = "Nothing has been running for the past hour for job %s," + \
                                 "but cannot terminate the instance (cpu utilization (%s) : %s" % \
                                 jobid, str(cw_res['max_cpu_utilization_percent']), str(e)
                        printlog(errmsg)
                        raise EC2IdleException(errmsg)

    # if none of the above
    raise StillRunningException("job %s still running" % jobid)
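A minimal sketch of the event this status-check handler consumes, limited to the keys read above; bucket name, job id and instance id are placeholders:

event = {
    'config': {'log_bucket': 'my-tibanna-log-bucket'},   # where <jobid>.job_started etc. live
    'jobid': 'some-job-id',
    'instance_id': 'i-0123456789abcdef0'                 # optional; omit to skip the EC2 checks
}
# handler(event, None) raises StillRunningException until <jobid>.success
# or <jobid>.error appears in the log bucket.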
Example 24
def update_pfmeta_from_awsemfile(awsemfile, pf_meta, tibanna):
    status = awsemfile.status
    printlog("awsemfile res is %s" % status)
    if status == 'COMPLETED':
        if awsemfile.argument_type == 'Output processed file':
            update_processed_file(awsemfile, pf_meta, tibanna)
Example 25
def real_handler(event, context):
    '''
    this is generic function to run awsem workflow
    based on the data passed in

    workflow_uuid : for now, pass this on. Later we can add a code to automatically retrieve this from app_name.
    Note multiple workflow_uuids can be available for an app_name
    (different versions of the same app could have a different uuid)
    '''
    # keep the input json on s3
    logbucket = event.get('config', {}).get('log_bucket', '')
    jobid = event.get('jobid', '')
    if logbucket and jobid:
        boto3.client('s3').put_object(Body=json.dumps(
            event, indent=4).encode('ascii'),
                                      Key=jobid + '.input.json',
                                      Bucket=logbucket)

    # get incoming data
    input_file_list = event.get('input_files')
    for infile in input_file_list:
        if not infile:
            raise Exception("malformed input, check your input_files")
    workflow_uuid = event.get('workflow_uuid')
    output_bucket = event.get('output_bucket')
    parameters = ff_utils.convert_param(event.get('parameters'), True)
    tibanna_settings = event.get('_tibanna', {})
    if 'overwrite_input_extra' in event.get('config'):
        overwrite_input_extra = event.get('config')['overwrite_input_extra']
    else:
        overwrite_input_extra = event.get('overwrite_input_extra', False)
    tag = event.get('tag')
    # if they don't pass in env guess it from output_bucket
    try:
        env = tibanna_settings.get('env',
                                   '-'.join(output_bucket.split('-')[1:-1]))
        printlog("Tibanna setting : env= " + env)
        # tibanna provides access to keys based on env and stuff like that
        tibanna = Tibanna(env,
                          ff_keys=event.get('ff_keys'),
                          settings=tibanna_settings)
        printlog("Tibanna ff_keys url : " + tibanna.ff_keys['server'])
        printlog("Tibanna.s3.url: " + tibanna.s3.url)
    except Exception as e:
        raise TibannaStartException("%s" % e)

    args = dict()

    # get argument format & type info from workflow
    wf_meta = ff_utils.get_metadata(workflow_uuid,
                                    key=tibanna.ff_keys,
                                    ff_env=tibanna.env,
                                    add_on='frame=object')
    printlog("workflow info  %s" % wf_meta)
    if 'error' in wf_meta.get('@type', []):
        raise Exception("FATAL, can't lookup workflow info for %s fourfront" %
                        workflow_uuid)

    # get cwl info from wf_meta
    for k in [
            'app_name', 'app_version', 'cwl_directory_url',
            'cwl_main_filename', 'cwl_child_filenames', 'wdl_directory_url',
            'wdl_main_filename', 'wdl_child_filenames'
    ]:
        printlog(wf_meta.get(k))
        args[k] = wf_meta.get(k, '')
    if not args['cwl_child_filenames']:
        args['cwl_child_filenames'] = []
    if not args['wdl_child_filenames']:
        args['wdl_child_filenames'] = []

    if 'workflow_language' in wf_meta and wf_meta['workflow_language'] == 'WDL':
        args['language'] = 'wdl'
    else:
        # switch to v1 if available
        if 'cwl_directory_url_v1' in wf_meta:  # use CWL v1
            args['cwl_directory_url'] = wf_meta['cwl_directory_url_v1']
            args['cwl_version'] = 'v1'
        else:
            args['cwl_version'] = 'draft3'

    # input file args for awsem
    for input_file in input_file_list:
        process_input_file_info(input_file, tibanna.ff_keys, tibanna.env, args)

    # create the ff_meta output info
    input_files_for_ffmeta = create_ffmeta_input_files_from_pony_input_file_list(
        input_file_list)

    # source experiments
    input_file_uuids = [_['uuid'] for _ in input_file_list]
    pf_source_experiments = merge_source_experiments(input_file_uuids,
                                                     tibanna.ff_keys,
                                                     tibanna.env)

    # processed file metadata
    output_files, pf_meta = \
        create_wfr_output_files_and_processed_files(wf_meta, tibanna,
                                                    pf_source_experiments,
                                                    custom_fields=event.get('custom_pf_fields'),
                                                    user_supplied_output_files=event.get('output_files'))
    print("output files= %s" % str(output_files))

    # 4DN dcic award and lab are used here, unless provided in wfr_meta
    ff_meta = create_ffmeta_awsem(workflow_uuid,
                                  args['app_name'],
                                  args['app_version'],
                                  input_files_for_ffmeta,
                                  tag=tag,
                                  run_url=tibanna.settings.get('url', ''),
                                  output_files=output_files,
                                  parameters=parameters,
                                  extra_meta=event.get('wfr_meta'),
                                  jobid=jobid)

    printlog("ff_meta is %s" % ff_meta.__dict__)

    # store metadata so we know the run has started
    ff_meta.post(key=tibanna.ff_keys)

    # parameters
    args['input_parameters'] = event.get('parameters')

    # output target
    args['output_target'] = dict()
    args['secondary_output_target'] = dict()
    for of in ff_meta.output_files:
        arg_name = of.get('workflow_argument_name')
        if of.get('type') == 'Output processed file':
            args['output_target'][arg_name] = of.get('upload_key')
        elif of.get('type') == 'Output to-be-extra-input file':
            target_inf = input_files_for_ffmeta[
                0]  # assume only one input for now
            target_key = output_target_for_input_extra(target_inf, of, tibanna,
                                                       overwrite_input_extra)
            args['output_target'][arg_name] = target_key
        else:
            random_tag = str(int(random.random() * 1000000000000))
            # add a random tag at the end for non-processed file e.g. md5 report,
            # so that if two or more wfr are triggered (e.g. one with parent file, one with extra file)
            # it will create a different output. Not implemented for processed files -
            # it's tricky because processed files must have a specific name.
            args['output_target'][
                arg_name] = ff_meta.uuid + '/' + arg_name + random_tag
        if 'secondary_file_formats' in of and 'extra_files' in of and of[
                'extra_files']:
            for ext in of.get('extra_files'):
                if arg_name not in args['secondary_output_target']:
                    args['secondary_output_target'][arg_name] = [
                        ext.get('upload_key')
                    ]
                else:
                    args['secondary_output_target'][arg_name].append(
                        ext.get('upload_key'))

    # output bucket
    args['output_S3_bucket'] = event.get('output_bucket')

    # dependencies
    if 'dependency' in event:
        args['dependency'] = event['dependency']

    # initialize config parameters as null for benchmarking
    config = event['config']
    if 'instance_type' not in config:
        config['instance_type'] = ''
    if 'EBS_optimized' not in config:
        config['EBS_optimized'] = ''
    if 'ebs_size' not in config:
        config['ebs_size'] = 0
    if 'public_postrun_json' not in config:
        config['public_postrun_json'] = True

    event.update({
        "ff_meta": ff_meta.as_dict(),
        'pf_meta': [meta.as_dict() for meta in pf_meta],
        "_tibanna": tibanna.as_dict(),
        "args": args
    })
    return event
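A minimal sketch of the input json this start handler expects, limited to the main keys read above; uuids, bucket names and settings are placeholders:

event = {
    'workflow_uuid': 'placeholder-workflow-uuid',
    'output_bucket': 'elasticbeanstalk-fourfront-webdev-wfoutput',
    'input_files': [{
        'workflow_argument_name': 'input_pairs',
        'bucket_name': 'elasticbeanstalk-fourfront-webdev-wfoutput',
        'uuid': 'placeholder-file-uuid',
        'object_key': '4DNFIXXXXXXX.pairs.gz'
    }],
    'parameters': {},
    'config': {'log_bucket': 'my-tibanna-log-bucket'},
    '_tibanna': {'env': 'fourfront-webdev'},
    'jobid': 'some-job-id'
}
# event = real_handler(event, None)   # returns the event augmented with ff_meta, pf_meta and args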
Example 26
def _qc_updater(status,
                awsemfile,
                ff_meta,
                tibanna,
                quality_metric='quality_metric_fastqc',
                file_argument='input_fastq',
                report_html=None,
                datafiles=None,
                zipped=True,
                datajson_argument=None,
                other_fields=None):
    if datajson_argument == awsemfile.argument_name:
        return
    # avoid using [] as default argument
    if datafiles is None:
        datafiles = ['summary.txt', 'fastqc_data.txt']
    if status == 'uploading':
        # wait until this bad boy is finished
        return
    # keys
    ff_key = tibanna.ff_keys
    # move files to proper s3 location
    # need to remove sbg from this line
    accession = awsemfile.runner.get_file_accessions(file_argument)[0]
    zipped_report = awsemfile.key
    files_to_parse = datafiles
    if report_html:
        files_to_parse.append(report_html)
    printlog("accession is %s" % accession)
    jsondata = dict()
    if zipped:
        try:
            files = awsemfile.s3.unzip_s3_to_s3(zipped_report,
                                                accession,
                                                files_to_parse,
                                                acl='public-read')
        except Exception as e:
            printlog(tibanna.s3.__dict__)
            raise Exception("%s (key={})\n".format(zipped_report) % e)
        printlog("files : %s" % str(files))
        filedata = [files[_]['data'] for _ in datafiles]
    else:
        if datajson_argument:
            datajson_key = awsemfile.runner.get_file_key(datajson_argument)
            jsondata0 = [
                json.loads(awsemfile.s3.read_s3(_)) for _ in datajson_key
            ]
            for d in jsondata0:
                jsondata.update(d)
        filedata = [awsemfile.s3.read_s3(_) for _ in datafiles]
        reportdata = awsemfile.s3.read_s3(report_html)
        report_html = accession + 'qc_report.html'
        awsemfile.s3.s3_put(reportdata, report_html, acl='public-read')
        qc_url = 'https://s3.amazonaws.com/' + awsemfile.bucket + '/' + report_html
        files = {report_html: {'data': reportdata, 's3key': qc_url}}
    # schema. do not need to check_queue
    qc_schema = ff_utils.get_metadata("profiles/" + quality_metric + ".json",
                                      key=ff_key,
                                      ff_env=tibanna.env)
    # parse fastqc metadata
    if report_html in files:
        qc_url = files[report_html]['s3key']
    else:
        qc_url = None
    meta = parse_qc_table(filedata,
                          qc_schema=qc_schema.get('properties'),
                          url=qc_url)
    if jsondata:
        meta.update(jsondata)
    # custom fields
    if other_fields:
        meta.update(other_fields)
    printlog("qc meta is %s" % meta)
    # post fastq metadata
    qc_meta = ff_utils.post_metadata(meta, quality_metric, key=ff_key)
    if qc_meta.get('@graph'):
        qc_meta = qc_meta['@graph'][0]
    printlog("qc_meta is %s" % qc_meta)
    # update original file as well
    try:
        original_file = ff_utils.get_metadata(accession,
                                              key=ff_key,
                                              ff_env=tibanna.env,
                                              add_on='frame=object',
                                              check_queue=True)
        printlog("original_file is %s" % original_file)
    except Exception as e:
        raise Exception(
            "Couldn't get metadata for accession {} : ".format(accession) +
            str(e))
    patch_file = {'quality_metric': qc_meta['@id']}
    try:
        ff_utils.patch_metadata(patch_file, original_file['uuid'], key=ff_key)
    except Exception as e:
        raise Exception("patch_metadata failed in fastqc_updater." + str(e) +
                        "original_file ={}\n".format(str(original_file)))
    # patch the workflow run, value_qc is used to make drawing graphs easier.
    output_files = ff_meta.output_files
    output_files[0]['value_qc'] = qc_meta['@id']
    retval = {'output_files': output_files}
    printlog("retval is %s" % retval)
    return retval