Example No. 1
def main():

    rx = Reactor()
    m = rx.context.message_dict

    job = PipelineJob(
        rx, 'transcriptic', 'Yeast-Gates',
        'sample.transcriptic.aq1btsj94wghbk',
        'measurement.transcriptic.sample.transcriptic.aq1btsj94wghbk.2')

    job.setup(data=m)
    # Set up and launch Agave jobs with callbacks based on job.callback

    job_def = {
        'appId': 'hello-agave-cli-0.1.0u1',
        'notifications': [{
            'event': '*',
            'persistent': False,
            'url': job.callback + '&status=${STATUS}'
        }]
    }
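    # Note: ${STATUS} is a notification template macro that the Agave
    # notifications service expands when the callback fires, so each webhook
    # reports the job status current at that moment.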

    try:
        resp = rx.client.jobs.submit(body=job_def)

        job_id = None
        if 'id' in resp:
            job_id = resp['id']
            job.running({'launched': job_id})
        else:
            job.fail()
    except Exception as exc:
        job.cancel()
        rx.on_failure('Failed to launch pipeline', exc)
def test_files_get(system, path, file, willpass):
    r = Reactor()
    if willpass is True:
        f = agaveutils.files.get(r.client, path,
                                 system, file, retries=PASS_RETRIES)
        assert os.path.basename(f) == file
    else:
        with pytest.raises(Exception):
            agaveutils.files.get(r.client, path,
                                 system, file, retries=FAIL_RETRIES)
    try:
        os.unlink(file)
    except Exception:
        pass
def test_files_mkdir(system, basepath, willpass):
    r = Reactor()

    if willpass is True:
        made_dir = agaveutils.files.mkdir(r.client,
                                          'unit_test',
                                          system,
                                          basepath,
                                          retries=PASS_RETRIES)
        assert made_dir is True
    else:
        with pytest.raises(Exception):
            made_dir = agaveutils.files.mkdir(r.client,
                                              'unit_test',
                                              system,
                                              basepath,
                                              retries=FAIL_RETRIES)
Example No. 4
def main():
    """
    Exercise utility features in the Reactors base image
    """
    r = Reactor()

    r.logger.info("# Reactor attributes")

    r.logger.info("UUID: {}".format(r.uid))
    r.logger.info("Reactor logging nickname: {}".format(r.nickname))
    r.logger.info("API username: {}".format(r.username))

    # r.logger.info("# Reactor filesystem paths")
    # r.logger.info(r.storage.paths)

    r.logger.info("# Reactor context")
    r.logger.info(r.context)

    r.logger.info("# Actor database id")
    r.logger.info(r.context.actor_dbid)

    r.logger.info("# Reactor message")
    # If you are running this locally, try setting a MSG
    # environment variable to see it propagate into the
    # Reactor container environment
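    #   e.g. export MSG='{"greeting": "hello"}' in your shell before a local run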
    r.logger.info(r.context.raw_message)

    r.logger.info("# Value of 'key1' from settings")
    r.logger.info(r.settings.key1)

    r.logger.info("# Demonstrate logger")
    r.logger.info("Hello, world")

    r.logger.info("# UTC Timestamp")
    r.logger.info(utcnow())

    r.logger.info("# Call Agave profiles API")
    try:
        r.logger.info(r.client.profiles.get())
    except Exception as e:
        r.logger.error("Error calling API: {}".format(e))

    print(agaveutils.uri.to_agave_uri('data-sd2e-community', '/sample'))
Example No. 5
def main():
    r = Reactor()
    r.logger.info("Hello this is actor {}".format(r.uid))
    md = r.context.message_dict
    r.logger.info(json.dumps(md))

    # Extract config file from settings
    metrics_conf_file = r.settings.metrics.config
    metrics_config = load_metrics_config(metrics_conf_file)
    # Dig out csv_writer out_dir
    metrics_dir = metrics_config.get('csv_writer', 'out_dir')
    try:
        os.makedirs(metrics_dir)
    except FileExistsError:
        pass
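    # (os.makedirs(metrics_dir, exist_ok=True) is an equivalent one-liner for
    # this create-if-missing pattern)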

    # Get Agave info
    agave_uri = md.get('uri')
    r.logger.info('Agave URI: %s', agave_uri)
    agave_sys, agave_path, agave_file = agaveutils.from_agave_uri(agave_uri)

    # Download all files from agave to metrics_dir
    download_metrics_files(r.client, agave_sys, agave_path, metrics_dir,
                           r.logger)

    # SynBioHub settings (user, password) are passed via secrets.json
    r.logger.info('Getting synbiohub settings')
    sbh_settings = r.settings.get('sbh', {})
    sbh_user = sbh_settings['user']
    sbh_password = sbh_settings['password']
    r.logger.info('SynBioHub user: %s', sbh_user)
    r.logger.info('SynBioHub password is available')

    sbhm_args = ['-u', sbh_user, '-p', sbh_password, '/reactor.ini']
    # r.logger.info('Invoking sbh metrics with %r', sbhm_args)
    sbhmetrics.main(sbhm_args)

    # Upload the metrics files to Agave
    upload_metrics_files(r.client, agave_sys, agave_path, metrics_dir,
                         r.logger)
Example No. 6
def main():

    r = Reactor()
    m = AttrDict(r.context.message_dict)
    # ! This code fixes an edge case and will be moved lower in the stack
    if m == {}:
        try:
            jsonmsg = json.loads(r.context.raw_message)
            m = jsonmsg
        except Exception:
            pass

    if not r.validate_message(m):
        r.on_failure('Invalid message received', None)

    agave_uri = m.get('uri')
    generated_by = m.get('generated_by', [])
    r.logger.info('Indexing {}'.format(agave_uri))
    agave_sys, agave_path, agave_file = agaveutils.from_agave_uri(agave_uri)
    agave_full_path = os.path.join(agave_path, agave_file)
    agave_full_path = re.sub('^(/)+', '/', agave_full_path)

    ah = AgaveHelper(client=r.client)
    to_index = []
    if ah.isfile(agave_full_path):
        # INDEX THE FILE
        mgr = Indexer(mongodb=r.settings.mongodb, agave=r.client)
        try:
            mgr.index_if_exists(agave_full_path, storage_system=agave_sys)
        except Exception as exc:
            r.on_failure('Indexing failed for {}'.format(agave_uri), exc)
        # file_store = mgr.stores['file']
        # fixity_store = mgr.stores['fixity']

        # try:
        #     resp = fixity_store.index(agave_full_path, storage_system=agave_sys, generated_by=generated_by)
        #     r.logger.debug('Fixity indexed {} to uuid:{}'.format(
        #         os.path.basename(agave_uri), resp.get('uuid', None)))
        # except Exception as exc:
        #     r.on_failure('Indexing failed for {}'.format(agave_full_path), exc)
    else:
        # LIST DIR AND FIRE OFF INDEX TASKS
        r.logger.debug('Recursively listing {}'.format(agave_full_path))
        to_index = ah.listdir(agave_full_path,
                              recurse=True,
                              storage_system=agave_sys,
                              directories=False)

        r.logger.info('Found {} files to index'.format(len(to_index)))
        r.logger.debug('Messaging self with indexing targets')

        # Contents of to_index are likely to be in sorted order. Shuffling
        # spreads the indexing workload evenly over all indexing targets
        shuffle(to_index)
        batch_sub = 0
        for idxpath in to_index:
            try:
                r.logger.debug('Self, please index {}'.format(idxpath))
                if r.local is False:
                    actor_id = r.uid
                    message = {
                        'uri': 'agave://' + agave_sys + '/' + idxpath,
                        'generated_by': generated_by,
                        '__options': {
                            'parent': agave_uri
                        }
                    }
                    resp = r.send_message(actor_id,
                                          message,
                                          retryMaxAttempts=3)
                    batch_sub += 1
                    if batch_sub > r.settings.batch.size:
                        batch_sub = 0
                        if r.settings.batch.randomize_sleep:
                            sleep(random() * r.settings.batch.sleep_duration)
                        else:
                            sleep(r.settings.batch.sleep_duration)
                    if 'executionId' in resp:
                        r.logger.debug(
                            'Dispatched indexing task for {} in execution {}'.
                            format(idxpath, resp['executionId']))
            except Exception as exc:
                r.logger.critical(
                    'Failed to launch indexing task for {}: {}'.format(
                        agave_full_path, exc))
Example No. 7
def main():
    """Main function"""
    r = Reactor()
    r.logger.info("Hello this is actor {}".format(r.uid))
Example No. 8
def main():

    rx = Reactor()
    mes = AttrDict(rx.context.message_dict)
    rx.logger.info('raw_message: {}'.format(rx.context.raw_message))

    if mes == {}:
        try:
            jsonmsg = json.loads(rx.context.raw_message)
            mes = jsonmsg
        except Exception:
            pass

    #    ['event', 'agavejobs', 'index', 'indexed']
    action = "urlparams"
    try:
        for a in ["index", "indexed"]:
            try:
                rx.logger.debug("Checking against schema {}".format(a))
                rx.validate_message(mes,
                                    messageschema="/schemas/" + a +
                                    ".jsonschema",
                                    permissive=False)
                action = a
                break
            except Exception as exc:
                print("Validation error: {}".format(exc))
        if action is None:
            pprint(mes)
            raise ValidationError("Unknown schema")
    except Exception as vexc:
        rx.on_failure("Message was not processed", vexc)

    rx.logger.debug("Schema: {}".format(action))

    # for k, v in os.environ.items():
    #     rx.logger.debug("env:{}={}".format(k, v))

    PARAMS = [
        ("uuid", "uuid", None),
        ("token", "token", None),
        ("level", "level", "1"),
        ("filters", "filters", None),
    ]

    # Look in the message, then in context, then in environment for values
    cb = dict()
    try:
        for param, key, default in PARAMS:
            cb[key] = mes.get(
                param, rx.context.get(param, os.environ.get(param, default)))
            rx.logger.debug("param:{}={}".format(param, cb[key]))
    except Exception as exc:
        rx.on_failure('Failed to resolve parameters', exc)
    # Transform the JSON string representation of filters so they can be used
    # as Python regexes. This is sufficient for filters passed in the message
    # body, but not for a URL parameter.
    # TODO implement urldecode on the ?filters parameter
    parsed_filters = cb["filters"]
    # if cb["filters"] is not None:
    #     for f in cb["filters"]:
    #         parsed_filters.append(unquote(f))
    #     cb["filters"] = parsed_filters

    rx.logger.info('Processing event {0} for {1}'.format(action, cb['uuid']))

    # Simple case - we're just processing 'indexed'
    if action == "indexed":
        rx.logger.info('Indexed job {}'.format(cb['uuid']))
        try:
            store = ManagedPipelineJobInstance(rx.settings.mongodb,
                                               uuid=cb['uuid'],
                                               agave=rx.client)

            store_state = store.state
            last_event = store.last_event

            # notify events manager that we are planning to process an 'indexed' event
            if rx.settings.state_enter:
                forward_event(cb['uuid'], 'indexed', store_state,
                              {"last_event": last_event}, rx)

            # This is where the actual indexed event is handled
            # (Job.state is updated and history amended)
            resp = store.indexed(token=cb["token"])

            # notify events manager that we processed an 'indexed' event
            if rx.settings.state_exit:
                forward_event(cb['uuid'], 'indexed', resp['state'],
                              {"last_event": resp['last_event']}, rx)

            rx.on_success('Processed indexed event for {0}'.format(cb['uuid']))
        except Exception as mexc:
            rx.on_failure('Failed to handle indexed event', mexc)

    if action in ["index", "urlparams"]:
        rx.logger.info('Indexing job {}'.format(cb['uuid']))
        try:
            store = ManagedPipelineJobInstance(rx.settings.mongodb,
                                               agave=rx.client,
                                               uuid=cb['uuid'])
            # TODO - Pass in generated_by=config#pipelines.process_uuid

            # notify events manager that we got an 'index' event
            store_state = store.state
            last_event = store.last_event
            if rx.settings.state_enter:
                forward_event(cb['uuid'], 'index', 'INDEXING',
                              {"last_event": last_event}, rx)

            resp = store.index(
                token=cb["token"],
                transition=False,
                filters=cb["filters"],
                generated_by=[rx.settings.pipelines.process_uuid],
            )

            if rx.settings.state_exit:
                # because the index handler returns a list, we have to query the job in order to
                # know its state and last event
                updated_store = ManagedPipelineJobInstance(rx.settings.mongodb,
                                                           agave=rx.client,
                                                           uuid=cb['uuid'])
                forward_event(cb['uuid'], 'index', updated_store.state,
                              {"last_event": updated_store.last_event}, rx)

            # rx.logger.info('store.index response was: {}'.format(resp))

            if isinstance(resp, list):
                rx.logger.info(
                    "Indexed {} files to PipelineJob {}. ({} usec)".format(
                        len(resp), cb["uuid"], rx.elapsed()))

                # Send 'indexed' event to job via PipelineJobsManager (not PipelineJobsIndexer!)
                # This results in two messages required to move a job to FINISHED from INDEXING
                # but allows the jobs-manager to subscribe to and act on the indexed event
                try:
                    # resp = store.indexed(token=cb["token"])
                    if rx.settings['standalone'] is True:
                        job_manager_id = rx.uid
                        mgr_mes = {'uuid': cb['uuid'], 'name': 'indexed'}
                    else:
                        job_manager_id = rx.settings.pipelines.job_manager_id
                        mgr_mes = {
                            'uuid': cb['uuid'],
                            'name': 'indexed',
                            'data': {
                                'source': 'jobs-manager.prod'
                            }
                        }

                    rx.send_message(job_manager_id,
                                    mgr_mes,
                                    retryMaxAttempts=10)

                    rx.on_success('Sent indexed event for {0}'.format(
                        cb['uuid']))
                except Exception as mexc:
                    rx.on_failure('Failed to send indexed event', mexc)
            else:
                rx.logger.info("Indexed and transitioned to {0}".format(
                    resp.get("state", "Unknown")))
        except Exception as iexc:
            rx.on_failure("Failed to accomplish indexing", iexc)
    else:
        rx.on_failure("Failed to interpret indexing request")
Example No. 9
def main():

    rx = Reactor()
    m = AttrDict(rx.context.message_dict)

    if m == {}:
        try:
            jsonmsg = json.loads(rx.context.raw_message)
            m = jsonmsg
        except Exception:
            pass

    #    ['event', 'agavejobs', 'create', 'delete']
    action = "emptypost"
    try:
        for a in ["aloejobs", "event", "agavejobs"]:
            try:
                rx.logger.info("Testing against {} schema".format(a))
                rx.validate_message(m,
                                    messageschema="/schemas/" + a +
                                    ".jsonschema",
                                    permissive=False)
                action = a
                break
            except Exception as exc:
                print("Validation error: {}".format(exc))
        if action is None:
            pprint(m)
            raise ValidationError("Message did not a known schema")
    except Exception as vexc:
        rx.on_failure("Failed to process message", vexc)

    # rx.logger.debug("SCHEMA DETECTED: {}".format(action))

    # store = PipelineJobStore(mongodb=rx.settings.mongodb)
    # Process the event

    # Get URL params from Abaco context
    #
    # These can be overridden by the event body or custom
    # code implemented to process the message. This has a
    # side effect of allowing the manager to process empty
    # POST bodies so long as the right values are presented
    # as URL params.
    #
    # cb_* variables are always overridden by the contents of
    #   the POST body
    #
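    # For example, an empty-body POST such as
    #   POST /actors/v2/<actor_id>/messages?event=update&uuid=<job_uuid>&token=<token>
    # (placeholder values) carries enough information for the manager to act.
    #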
    cb_event_name = rx.context.get("event", None)
    cb_job_uuid = rx.context.get("uuid", None)
    cb_token = rx.context.get("token", "null")
    # Accept a 'note' as a URL parameter
    # TODO - urldecode the contents of 'note'
    cb_note = rx.context.get("note", "Event had no JSON payload")
    # NOTE - contents of cb_data will be overridden when handling 'event' and 'aloejobs' messages
    cb_data = {"note": cb_note}
    # Accept 'status', the Aloe-centric name for job.state
    # as well as 'state'
    cb_agave_status = rx.context.get("status", rx.context.get("state", None))

    # Prepare template PipelineJobsEvent
    event_dict = {
        "uuid": cb_job_uuid,
        "name": cb_event_name,
        "token": cb_token,
        "data": cb_data,
    }

    # This is the default message schema 'event'
    if action == "event":
        # Filter message and override values in event_dict with its contents
        for k in ["uuid", "name", "token", "data"]:
            event_dict[k] = m.get(k, event_dict.get(k))

    # AgaveJobs can update the status of an existing job but cannot
    # create one. To do so, an Agave job must be launched
    # using the PipelineJobsAgaveProxy resource.
    if action == "agavejobs":
        rx.on_failure("Agave job callbacks are no longer supported")
    elif action == "aloejobs":
        try:
            # Aloe jobs POST their current JSON representation to
            # callback URL targets. The POST body contains a 'status' key.
            # If for some reason it doesn't, job status is determined by
            # the 'state' or 'status' URL parameter.
            if cb_agave_status is None:
                cb_agave_status = m.get("status", None)
            # Agave job message bodies include 'id' which is the jobId
            mes_agave_job_id = m.get("id", None)
            rx.logger.debug("aloe_status: {}".format(cb_agave_status))
            if cb_agave_status is not None:
                cb_agave_status = cb_agave_status.upper()
        except Exception as exc:
            rx.on_failure(
                "Aloe callback POST and associated URL parameters were missing some required fields",
                exc,
            )

        # If the job status is 'RUNNING' then use a subset of the POST for
        # event.data. Otherwise, create an event.data from the most recent
        # entry in the Agave job history. One small detail to note is that
        # callbacks are sent at the beginning of event processing in the
        # Agave jobs service and so a handful of fields in the job record
        # that are late bound are not yet populated when the event is sent.
        if cb_agave_status == "RUNNING":
            cb_data = minify_job_dict(dict(m))
        else:
            cb_data = {"status": cb_agave_status}
            # Fetch latest history entry to put in event.data
            try:
                # Is there a better way than grabbing entire history that can
                # be implemented in a pure Agave call? Alternatively, we could
                # cache last offset for this job in rx.state but that will
                # limit our scaling to one worker
                #
                agave_job_latest_history = rx.client.jobs.getHistory(
                    jobId=mes_agave_job_id,
                    limit=100)[-1].get("description", None)
                if agave_job_latest_history is not None:
                    cb_data["description"] = agave_job_latest_history
            except Exception as agexc:
                rx.logger.warning("Failed to get history for {}: {}".format(
                    mes_agave_job_id, agexc))

        # Map the Agave job status to a PipelineJobsEvent name
        if cb_event_name is None and cb_agave_status is not None:
            cb_event_name = AgaveEvents.agavejobs.get(cb_agave_status,
                                                      "update")
            rx.logger.debug("Status: {} => Event: {}".format(
                cb_agave_status, cb_event_name))

        # Event name and data can be updated as part of processing an Agave POST
        # so apply the current values to event_dict here
        event_dict["name"] = cb_event_name
        event_dict["data"] = cb_data

    # Sanity check event_dict and token
    if event_dict["uuid"] is None or event_dict[
            "name"] is None or cb_token is None:
        rx.on_failure("No actionable event was received.")

    # Instantiate a job instance to leverage the MPJ framework
    store = ManagedPipelineJobInstance(rx.settings.mongodb,
                                       event_dict["uuid"],
                                       agave=rx.client)

    # Handle event...
    try:

        # First, proxy events. This code forwards index and indexed events to the jobs-indexer
        # Proxy 'index'
        if event_dict["name"] == "index":
            rx.logger.info("Forwarding 'index'")
            index_mes = {
                "name": "index",
                "uuid": event_dict["uuid"],
                "token": event_dict["token"],
            }
            rx.send_message(rx.settings.pipelines.job_indexer_id,
                            index_mes,
                            retryMaxAttempts=10)
            # Disable this since it should be picked up via events-manager subscription
            # message_control_annotator(up_job, ["INDEXING"], rx)

        # Proxy 'indexed'
        elif event_dict["name"] == "indexed":
            rx.logger.info("Forwarding 'indexed'")
            index_mes = {
                "name": "indexed",
                "uuid": event_dict["uuid"],
                "token": event_dict["token"],
            }
            rx.send_message(rx.settings.pipelines.job_indexer_id,
                            index_mes,
                            retryMaxAttempts=10)
            # Disable this since it should be picked up via events-manager subscription
            # message_control_annotator(up_job, ["FINISHED"], rx)

        # Handle all other events
        else:
            rx.logger.info("Handling '{}'".format(event_dict["name"]))
            # Get the current state of the MPJ. We use this to detect if
            # handling the event has resulted in a change of state
            store_state = store.state
            last_event = store.last_event

            # Send event at the beginning of state change so subscribers can pick
            # up, for instance, a case where the job receives an index event and
            # is in the FINISHED state.
            if rx.settings.state_enter:
                forward_event(event_dict["uuid"], event_dict['name'],
                              store_state, {'last_event': last_event}, rx)

            up_job = store.handle(event_dict, cb_token)
            if rx.settings.state_exit:
                forward_event(up_job["uuid"], event_dict['name'],
                              up_job["state"],
                              {"last_event": up_job["last_event"]}, rx)

    except Exception as exc:
        rx.on_failure("Event not processed", exc)

    rx.on_success("Processed event in {} usec".format(rx.elapsed()))
Example No. 10
def main():

    r = Reactor()
    m = AttrDict(r.context.message_dict)
    if m == {}:
        try:
            print(r.context.raw_message)
            jsonmsg = json.loads(r.context.raw_message)
            m = jsonmsg
        except Exception:
            pass

    # Allow passed vars to override
    token = os.environ.get('token', None)
    uuid = os.environ.get('uuid', None)
    if uuid:
        action = os.environ.get('action', 'show')
    else:
        action = None

    try:
        for a in ['create']:
            try:
                schema_file = '/schemas/' + a + '.jsonschema'
                r.validate_message(m,
                                   messageschema=schema_file,
                                   permissive=False)
                action = a
                break
            except Exception as exc:
                if action not in ('enable', 'disable'):
                    r.logger.debug('Validation to "{0}" failed: {1}\n'.format(
                        a, exc))
        if action is None:
            raise ValidationError('Message did not match any known schema')
    except Exception as vexc:
        r.on_failure('Failed to process message', vexc)

    r.logger.debug('Action selected: {}'.format(action))

    # Set up Store objects
    pipe_store = PipelineStore(mongodb=r.settings.mongodb)

    if action == 'get':
        resp = pipe_store.find_one_by_uuid(uuid)
        r.logger.info(resp)
        r.on_success('Exists and was printed to execution log')

    if action == 'create':
        create_dict = copy.deepcopy(m)
        try:
            if uuid and token:
                r.logger.info('Replacing {}'.format(uuid))
                pipeline = pipe_store.add_update_document(
                    create_dict,
                    uuid=uuid,
                    token=token,
                    strategy=strategies.REPLACE)
            else:
                r.logger.info('Creating pipeline...')
                pipeline = pipe_store.add_update_document(create_dict)
            r.on_success('Wrote pipeline {}; Update token: {}'.format(
                pipeline['uuid'], pipeline['_update_token']))
        except Exception as exc:
            r.on_failure('Write failed', exc)

    if action == 'disable':
        try:
            r.logger.info('Disabling pipeline {}'.format(uuid))
            resp = pipe_store.delete_document(uuid, token, force=False)
            r.on_success('Success')
        except Exception as exc:
            r.on_failure('Disable failed', exc)

    if action == 'enable':
        try:
            r.logger.info('Enabling pipeline {}'.format(uuid))
            resp = pipe_store.undelete(uuid, token)
            r.on_success('Success')
        except Exception as exc:
            r.on_failure('Enable failed', exc)
Example No. 11
def main():

    def on_failure(message, exception):
        # if r.settings.pipelines.active:
        #     job.fail(message)
        r.on_failure(message, exception)

    def on_success(message):
        # if r.settings.pipelines.active:
        #     job.finish(message)
        r.on_success(message)

    r = Reactor()
    m = AttrDict(r.context.message_dict)
    # ! This code fixes an edge case and will be moved lower in the stack
    if m == {}:
        try:
            jsonmsg = json.loads(r.context.raw_message)
            m = jsonmsg
        except Exception:
            pass

    # Use JSONschema-based message validator
    if not r.validate_message(m):
        r.on_failure('Invalid message received', ValueError())

    # Process options. Eventually move this into a Reactor method.
    # May need to add a filter to prevent some things from being overwritten
    options_settings = {}
    if '__options' in m:
        # allow override of settings
        try:
            options_settings = m.get('__options', {}).get('settings', {})
            if isinstance(options_settings, dict):
                options_settings = AttrDict(options_settings)
            r.settings = r.settings + options_settings
        except Exception as exc:
            on_failure('Failed to handle options', exc)

    agave_uri = m.get('uri')
    agave_sys, agave_path, agave_file = datacatalog.agavehelpers.from_agave_uri(agave_uri)
    agave_full_path = os.path.join(agave_path, agave_file)

    # if r.settings.pipelines.active:
    #     job = datacatalog.managers.pipelinejobs.ManagedPipelineJob(
    #         r.settings.mongodb,
    #         r.settings.pipelines,
    #         instanced=False,
    #         archive_path=agave_path
    #     )
    #     job.setup().run({'Processing': agave_uri})

    # r.logger.debug('Downloading file')
    # LOCALFILENAME = r.settings.downloaded
    # try:
    #     bacanora.download(r.client, agave_full_path, LOCALFILENAME, agave_sys)
    # except Exception as exc:
    #     # job.fail('Download failed')
    #     on_failure('Failed to download {}'.format(agave_file), exc)

    # TODO - Add optional validation of file references before loading data

    try:
        r.logger.debug(
            'Initializing SampleSetProcessor with {}'.format(r.client))
        db = datacatalog.managers.sampleset.SampleSetProcessor(
            r.settings.mongodb,
            agave=r.client,
            samples_uri=agave_uri,
            path_prefix=agave_path).setup()

        # Validate the downloaded file
        # (optional, controlled by config.yml#validate)
        if r.settings.validate:
            r.logger.debug('Validating {}'.format(agave_file))
            try:
                # resolve_remote() fetches and returns the remote schema document
                remote_schema = jsonschema.RefResolver('', '').resolve_remote(SCHEMA_URI)
                instance = json.load(open(agave_file, 'r'))
                assert jsonschema.validate(
                    instance, remote_schema,
                    format_checker=formatChecker()) is None
            except Exception as exc:
                on_failure(
                    'Failed to validate metadata file {}'.format(agave_file), exc)

        r.logger.debug('Now calling SampleSetProcessor.process()')
        dbp = db.process()
        assert dbp is True
    except Exception as exc:
        on_failure('Ingest failed for {}'.format(agave_file), exc)

    if not r.local:
        r.loggers.slack.info(
            ':mario_star: Ingested {} ({} usec)'.format(agave_uri, r.elapsed()))

    on_success('Ingest complete for {} ({} usec)'.format(agave_uri, r.elapsed()))
Example No. 12
def main():
    # Minimal Message Body:
    # { "uri": "s3://uploads/path/to/target.txt"}

    r = Reactor()
    m = AttrDict(r.context.message_dict)
    # ! This code fixes an edge case and will be moved lower in the stack
    if m == {}:
        try:
            jsonmsg = json.loads(r.context.raw_message)
            m = jsonmsg
        except Exception:
            pass

    # Use JSONschema-based message validator
    if not r.validate_message(m):
        r.on_failure('Message was invalid', None)

    # Rename m.Key so it makes semantic sense elsewhere in the code
    s3_uri = m.get('uri')
    if s3_uri.endswith('/'):
        s3_uri = s3_uri[:-1]
    only_sync = m.get('sync', True)
    generated_by = m.get('generated_by', [])
    r.logger.info('Received S3 URI {}'.format(s3_uri))

    sh = S3Helper()
    ah = AgaveHelper(r.client)

    # Map POSIX source and destination
    s3_bucket, srcpath, srcfile = sh.from_s3_uri(s3_uri)
    # print(s3_bucket, srcpath, srcfile)
    s3_full_relpath = os.path.join(s3_bucket, srcpath, srcfile)
    if r.settings.safen_paths:
        # Munge out unicode characters on upload. By default, safen_path
        # also transforms spaces into hyphens
        ag_full_relpath = safen_path(s3_full_relpath,
                                     no_unicode=True,
                                     no_spaces=True)
        if ag_full_relpath != s3_full_relpath:
            r.logger.warning('Safened path: {} => {}'.format(
                s3_full_relpath, ag_full_relpath))
    else:
        ag_full_relpath = s3_full_relpath

    ag_uri = 'agave://data-sd2e-community/' + ag_full_relpath
    r.logger.info('Generated Tapis resource: {}'.format(ag_uri))

    posix_src = sh.mapped_catalog_path(s3_full_relpath)
    posix_dest = ah.mapped_posix_path(os.path.join('/', ag_full_relpath))
    # agave_full_path = agave_dest
    r.logger.debug('POSIX src: {}'.format(posix_src))
    r.logger.debug('POSIX dst: {}'.format(posix_dest))

    def cmpfiles(posix_src, posix_dest, mtime=True, size=True, cksum=False):

        # Existence
        if not os.path.exists(posix_dest):
            return False

        if not os.path.exists(posix_src):
            return False

        # Both files exist, so read in POSIX stat
        stat_src = os.stat(posix_src)
        stat_dest = os.stat(posix_dest)

        # Modification time (conditional)
        if mtime:
            # Mtime on source should never be more recent than
            # destination, as destination is a result of a copy
            # operation. We might need to add ability to account
            # for clock skew but at present we assume source and
            # destination filesystems are managed by the same host
            if stat_src.st_mtime > stat_dest.st_mtime:
                return False
        # Size (conditional)
        if size:
            if stat_src.st_size != stat_dest.st_size:
                return False
        if cksum:
            # Not implemented
            # TODO Implement very fast hasher instead of sha256 for sync
            #      1. https://github.com/kalafut/py-imohash
            #      2. https://pypi.org/project/xxhash/
            raise NotImplementedError(
                'Checksum comparison is not yet implemented')

        # None of the False tests returned so we can safely return True
        return True
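
    # A minimal sketch of the fast-hash comparison described in the TODO above,
    # assuming the optional xxhash package were added to this reactor's image:
    #
    #     import xxhash
    #
    #     def cksum_equal(path_a, path_b, blocksize=2 ** 20):
    #         digests = []
    #         for path in (path_a, path_b):
    #             h = xxhash.xxh64()
    #             with open(path, 'rb') as handle:
    #                 for chunk in iter(lambda: handle.read(blocksize), b''):
    #                     h.update(chunk)
    #             digests.append(h.hexdigest())
    #         return digests[0] == digests[1]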

    to_process = list()
    # Is the source physically a FILE?
    if sh.isfile(posix_src):
        # If in sync mode, check if source and destination differ
        if only_sync is True and cmpfiles(posix_src, posix_dest, mtime=False):
            # if os.path.exists(posix_dest) and only_sync is True:
            r.logger.debug('Compared: src == dest {}, {}'.format(
                posix_src, posix_dest))
        else:
            # Sync mode is off or the files differ, so overwrite destination with source
            r.logger.debug('Compared: src != dest {}, {}'.format(
                posix_src, posix_dest))
            copyfile(r, posix_src, posix_dest, ag_uri)
            routemsg(r, ag_uri)
    elif sh.isdir(posix_src):
        # It's a directory. Recurse through it and launch file messages to self
        r.logger.debug('Directory found: {}'.format(posix_src))
        to_process = sh.listdir(posix_src,
                                recurse=True,
                                bucket=s3_bucket,
                                directories=False)
        pprint(to_process)
        r.logger.info('Sync tasks found: {}'.format(len(to_process)))

        # to_process is constructed in POSIX ls order. Shuffling it spreads
        # the processing evenly over all files
        shuffle(to_process)
        batch_sub = 0
        for procpath in to_process:
            try:
                r.logger.debug('Processing {}'.format(procpath))
                # Here is the meat of the directory syncing behavior
                posix_src = sh.mapped_catalog_path(procpath)
                posix_dest = ah.mapped_posix_path(os.path.join('/', procpath))
                if (only_sync is False or
                        cmpfiles(posix_src, posix_dest, mtime=False) is False):
                    r.logger.info('Copying {}'.format(procpath))
                    actor_id = r.uid
                    resp = dict()
                    s3_msg_uri = 's3://' + procpath
                    message = {
                        'uri': s3_msg_uri,
                        'generated_by': generated_by,
                        'sync': only_sync
                    }

                    if r.local is False:
                        try:
                            r.logger.debug(
                                'Messaging {} with copy request'.format(
                                    actor_id))
                            resp = r.send_message(actor_id,
                                                  message,
                                                  retryMaxAttempts=3,
                                                  ignoreErrors=False)
                            if 'executionId' in resp:
                                r.logger.info('Message response: {}'.format(
                                    resp['executionId']))
                            else:
                                raise AgaveError('Message failed')
                        except Exception:
                            raise
                    else:
                        r.logger.debug(message)

                    batch_sub += 1
                    # Always sleep a little bit between task submissions
                    sleep(random() * r.settings.batch.task_sleep_duration)
                    # Sleep a little longer every N submissions
                    if batch_sub > r.settings.batch.size:
                        batch_sub = 0
                        if r.settings.batch.randomize_sleep:
                            sleep(random() * r.settings.batch.sleep_duration)
                        else:
                            sleep(r.settings.batch.sleep_duration)
                else:
                    r.logger.debug('Copy not required for {}'.format(procpath))
            except Exception as exc:
                r.logger.error('Copy operation failed for {}: {}'.format(
                    ag_full_relpath, exc))
    else:
        r.on_failure('Process failed and {} was not synced'.format(posix_src))
Example No. 13
def main():
    rx = Reactor()
    mes = AttrDict(rx.context.message_dict)

    mongodb_conn = rx.settings.mongodb

    # ! This code fixes an edge case in JSON serialization
    if mes == {}:
        try:
            jsonmsg = json.loads(rx.context.raw_message)
            mes = AttrDict(jsonmsg)
        except Exception as exc:
            rx.on_failure('Failed to load JSON from message', exc)

    # Check incoming message against the default JSON schema
    try:
        rx.validate_message(mes, permissive=False)
    except Exception as exc:
        rx.on_failure('Failed to validate message to schema', exc)

    # Verify the appId is known to the Agave apps API. This requires that the
    # invoking user holds a tenant admin role unless the appId is public
    agave_job = mes.get('job_definition')
    agave_appid = agave_job.get('appId')
    agave_app_details = None
    job_params = mes.get('parameters')
    instanced_archive_path = mes.get('instanced', True)
    rx.logger.info(
        'Received request to manage execution of {}'.format(agave_appid))
    try:
        agave_app_details = rx.client.apps.get(appId=agave_appid)
    except HTTPError as http_err:
        rx.on_failure(
            '{} is not a known Agave application'.format(
                agave_appid), http_err)
    except Exception as generic_exception:
        rx.on_failure(
            'Failed to look up Agave application', generic_exception)

    # Look up the Pipeline record for this Agave appId.
    #
    # Note that this requires a convention where the standalone Agave app is
    # registered in the Pipelines system with pipeline.id == agave.app.id
    pipeline_uuid = None
    try:
        manager_stores = Manager.init_stores(mongodb_conn)
        pipeline_rec = manager_stores['pipeline'].find_one_by_id(id=agave_appid)
        if pipeline_rec is None:
            raise ValueError("No 'pipelines' record found in database")
        else:
            pipeline_uuid = pipeline_rec.get('uuid')
    except Exception as generic_exception:
        rx.on_failure('Failed to resolve appId {} to a Pipeline record'.format(
            agave_appid), generic_exception)

    def cancel_job(message='an error occurred', exception=None):
        """Helper function to cancel a failed job
        """
        fmt_message = 'PipelineJob {} canceled because {}'.format(
            job_uuid, message)
        try:
            job.cancel()
        except Exception as job_cancel_exception:
            rx.logger.warning(
                'Failed to cancel PipelineJob {} because {}'.format(
                    job_uuid, job_cancel_exception))

        rx.on_failure(fmt_message, exception)

    def fail_job(message='an error occurred', exception=None):
        """Helper function to fail a job
        """
        fmt_message = 'PipelineJob {} failed because {}'.format(
            job_uuid, message)
        try:
            job.fail(data={'message': message})
        except Exception as job_fail_exception:
            rx.logger.warning(
                'Unable to update PipelineJob state for {} because {}'.format(
                    job_uuid, job_fail_exception))

        rx.on_failure(fmt_message, exception)

    # Initialize the ManagedPipelineJob. It will be in the jobs collection
    # with a status of CREATED.
    job = None
    job_uuid = None

    rx.logger.info('Building initial job.data')
    init_data = agave_job
    mes_data = mes.get('data', {})
    for k, v in job_params.items():
        if v is not None and isinstance(v, str):
            init_data[k] = v
    # init_data = {**init_data, **mes_data}

    try:
        job = ManagedPipelineJob(rx.settings.mongodb,
                                 rx.settings.pipelines.job_manager_id,
                                 rx.settings.pipelines.updates_nonce,
                                 pipeline_uuid=pipeline_uuid,
                                 data=init_data,
                                 session=rx.nickname,
                                 agent=rx.uid,
                                 task=rx.execid,
                                 instanced=instanced_archive_path,
                                 archive_path_patterns=mes.get(
                                     'index_patterns', []),
                                 **job_params)

        job.setup(mes_data)

        job_uuid = job.uuid
    except Exception as generic_exception:
        if job is not None:
            cancel_job(message='Failed to set up ManagedPipelineJob',
                       exception=generic_exception)
        else:
            rx.on_failure('Failed to set up ManagedPipelineJob', generic_exception)

    # Extend the incoming Agave job definition to update the PipelineJob.
    # Set the archivePath and archiveSystem from the ManagedPipelineJob
    #
    # The former is accomplished by adding custom notifications built from
    # the job's 'callback' property, which was initialized on job.setup(). Any
    # pre-existing notifications (email, other callbacks) are preserved.
    try:
        if 'notifications' not in agave_job:
            agave_job['notifications'] = list()

        # Capture all Agave job states with a single wildcard notification
        # (rather than one per state such as SUBMITTING, STAGING_JOB, RUNNING,
        # ARCHIVING, FINISHED, FAILED)
        notification = {'event': '*',
                        'persistent': True,
                        'url': job.callback + '&status=${STATUS}&note=${JOB_ERROR}'}
        agave_job['notifications'].append(notification)

        # Fire the indexer callback exactly once, when the job finishes
        notification = {'event': 'FINISHED',
                        'persistent': False,
                        'url': job.indexer_callback}
        agave_job['notifications'].append(notification)

        agave_job['archiveSystem'] = job.archive_system
        agave_job['archivePath'] = job.archive_path
        agave_job['archive'] = True

    except Exception as generic_exception:
        cancel_job(
            message='Failed to prepare Agave job definition',
            exception=generic_exception)

    if rx.local:
        print(json.dumps(agave_job, indent=4))
        sys.exit(0)

    # Launch the Agave job
    agave_job_id = None
    try:
        resp = rx.client.jobs.submit(body=agave_job)
        agave_job_id = None
        if 'id' in resp:
            agave_job_id = resp['id']
        else:
            raise KeyError('Invalid response received from jobs.submit()')
    except HTTPError as h:
        http_err_resp = agaveutils.process_agave_httperror(h)
        fail_job(
            message='Encountered API error: {}'.format(http_err_resp),
            exception=h)
    except Exception as job_submit_exception:
        fail_job(message='Failed to launch {}'.format(
            agave_appid), exception=job_submit_exception)

    # Update the PipelineJob status
    #
    # This will create an entry in its history with an explicit link to
    # the job asset. If this doesn't succeed, we don't fail the job since
    # the expensive part (the Agave job) has been submitted.
    try:
        job_uri = job.canonicalize_job(agave_job_id)
        job.run(data={'job_link': job_uri})
    except Exception as job_update_exception:
        rx.logger.warning(
            'Unable to update status of job {} because {}'.format(
                job_uuid, job_update_exception))

    # If no other exit state has been encountered, report success
    rx.on_success('ManagedPipelineJob {} is managing Agave job {} ({} usec)'.format(
        job_uuid, agave_job_id, rx.elapsed()))