Example #1
0
# ------------------------------------------------------------------------------
#
# set up the connection to EC2
#

# fail early (via usage()) if any required EC2 setting is missing from the
# environment -- checked in the same order as before
for _key in ('EC2_URL', 'EC2_ACCESS_KEY', 'EC2_SECRET_KEY',
             'EC2_KEYPAIR_ID', 'EC2_KEYPAIR'):
    if _key not in os.environ:
        usage("no %s in environment" % _key)

server = saga.Url(os.environ['EC2_URL'])

# in order to connect to EC2, we need an EC2 ID and KEY
c1 = saga.Context('ec2')
c1.user_id = os.environ['EC2_ACCESS_KEY']
c1.user_key = os.environ['EC2_SECRET_KEY']
c1.server = server

# in order to access a created VM, we additionally need to point to the ssh
# key which is used for EC2 VM contextualization, i.e. as EC2 'keypair'.
# If the keypair is not yet registered on EC2, it will be registered by SAGA
# -- but then a user_key *must* be specified (only the public key is ever
# transferred to EC2).
c2 = saga.Context('ec2_keypair')
c2.token = os.environ['EC2_KEYPAIR_ID']
c2.user_cert = os.environ['EC2_KEYPAIR']
Example #2
0
    def _handle_pilot_input_staging(self, pilot, sds):
        """Execute the input staging directives `sds` for the given `pilot`.

        Only TRANSFER directives are valid on pilot level -- COPY, LINK and
        MOVE raise a ValueError (after profiling the failure).  On success
        each directive is marked `rps.DONE`, and the result is published on
        the control pubsub as 'pilot_staging_input_result'.
        """

        pid = pilot['uid']

        # NOTE: no unit sandboxes defined!
        # contexts used by complete_url() to expand relative source / target
        # URLs; sources are relative to the client sandbox, targets to the
        # pilot sandbox
        src_context = {
            'pwd': pilot['client_sandbox'],
            'pilot': pilot['pilot_sandbox'],
            'resource': pilot['resource_sandbox']
        }
        tgt_context = {
            'pwd': pilot['pilot_sandbox'],
            'pilot': pilot['pilot_sandbox'],
            'resource': pilot['resource_sandbox']
        }

        # Iterate over all directives
        for sd in sds:

            # TODO: respect flags in directive

            action = sd['action']
            did = sd['uid']
            src = sd['source']
            tgt = sd['target']

            assert (action in [COPY, LINK, MOVE, TRANSFER])

            self._prof.prof('staging_in_start', uid=pid, msg=did)

            src = complete_url(src, src_context, self._log)
            tgt = complete_url(tgt, tgt_context, self._log)

            # only TRANSFER makes sense against a (potentially remote)
            # pilot sandbox
            if action in [COPY, LINK, MOVE]:
                self._prof.prof('staging_in_fail', uid=pid, msg=did)
                raise ValueError("invalid action '%s' on pilot level" % action)

            self._log.info('transfer %s to %s', src, tgt)

            # FIXME: make sure that tgt URL points to the right resource
            # FIXME: honor sd flags if given (recursive...) -- the
            #        directive's own `sd['flags']` are currently ignored
            flags = rsfs.CREATE_PARENTS

            if os.path.isdir(src.path):
                flags |= rsfs.RECURSIVE

            # Define and open the staging directory for the pilot
            # We use the target dir construct here, so that we can create
            # the directory if it does not yet exist.

            # url used for cache (sandbox url w/o path), so that one
            # rs.filesystem.Directory handle is reused per endpoint
            tmp = rs.Url(pilot['pilot_sandbox'])
            tmp.path = '/'
            key = str(tmp)

            self._log.debug("rs.file.Directory ('%s')", key)

            with self._cache_lock:
                if key in self._saga_fs_cache:
                    fs = self._saga_fs_cache[key]

                else:
                    fs = rsfs.Directory(key, session=self._session)
                    self._saga_fs_cache[key] = fs

            fs.copy(src, tgt, flags=flags)

            sd['pmgr_state'] = rps.DONE

            self._prof.prof('staging_in_stop', uid=pid, msg=did)

        self.publish(
            rpc.CONTROL_PUBSUB, {
                'cmd': 'pilot_staging_input_result',
                'arg': {
                    'pilot': pilot,
                    'sds': sds
                }
            })
Example #3
0
    def run(self):
        """Starts the process when Process.start() is called.

        Main worker loop: polls MongoDB for ComputeUnits in
        PENDING_INPUT_STAGING, atomically claims one, executes its
        'FTW_Input_Directives' file transfers via SAGA, then pushes the
        unit on to AGENT_STAGING_INPUT_PENDING (or FAILED on error).
        Runs until the worker's stop event or the session terminates.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:

            logger.info("Starting InputFileTransferWorker")

            # Try to connect to the database and create a tailable cursor.
            try:
                db = self._session.get_db()
                um_col = db["%s.cu" % self._session.uid]
                logger.debug(
                    "Connected to MongoDB. Serving requests for UnitManager %s."
                    % self.unit_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                raise

            while not self._stop.is_set() and \
                  not self._session._terminate.is_set():

                # See if we can find a ComputeUnit that is waiting for
                # input file transfer.  find_and_modify atomically flips the
                # state to STAGING_INPUT, so no other worker claims the same
                # unit.
                ts = timestamp()
                compute_unit = um_col.find_and_modify(
                    query={
                        "unitmanager": self.unit_manager_id,
                        "state": PENDING_INPUT_STAGING,
                    },
                    update={
                        "$set": {
                            "state": STAGING_INPUT
                        },
                        "$push": {
                            "statehistory": {
                                "state": STAGING_INPUT,
                                "timestamp": ts
                            }
                        }
                    })

                if compute_unit is None:
                    # Sleep a bit if no new units are available.
                    time.sleep(IDLE_TIME)

                else:
                    compute_unit_id = None
                    state = STAGING_INPUT

                    try:
                        log_messages = []

                        # We have found a new CU. Now we can process the transfer
                        # directive(s) with SAGA.
                        compute_unit_id = str(compute_unit["_id"])

                        logger.debug("InputStagingController: unit found: %s" %
                                     compute_unit_id)
                        self._session.prof.prof('advance',
                                                uid=compute_unit_id,
                                                msg=state,
                                                state=state)

                        remote_sandbox = compute_unit["sandbox"]
                        input_staging = compute_unit.get(
                            "FTW_Input_Directives", [])

                        # if we do staging, create the CU's directory in case it doesn't exist yet.
                        if input_staging:
                            log_msg = "InputStagingController: Creating ComputeUnit sandbox directory %s." % remote_sandbox
                            log_messages.append(log_msg)
                            logger.info(log_msg)

                            # Creating/initialising the sandbox directory.
                            try:
                                logger.debug("saga.fs.Directory ('%s')" %
                                             remote_sandbox)

                                # url used for saga
                                remote_sandbox_url = saga.Url(remote_sandbox)

                                # keyurl and key used for cache -- one
                                # Directory handle is reused per endpoint
                                # (sandbox url with the path stripped)
                                remote_sandbox_keyurl = saga.Url(
                                    remote_sandbox)
                                remote_sandbox_keyurl.path = '/'
                                remote_sandbox_key = str(remote_sandbox_keyurl)

                                if remote_sandbox_key not in self._saga_dirs:
                                    self._saga_dirs[remote_sandbox_key] = \
                                            saga.filesystem.Directory(remote_sandbox_url,
                                                    flags=saga.filesystem.CREATE_PARENTS,
                                                    session=self._session)

                                saga_dir = self._saga_dirs[remote_sandbox_key]
                            except Exception as e:
                                logger.exception('Error: %s' % e)
                                raise

                            logger.info(
                                "InputStagingController: Processing input file transfers for ComputeUnit %s"
                                % compute_unit_id)

                        # Loop over all transfer directives and execute them.
                        for sd in input_staging:

                            logger.debug(
                                "InputStagingController: sd: %s : %s" %
                                (compute_unit_id, sd))

                            # Check if there was a cancel request
                            state_doc = um_col.find_one(
                                {"_id": compute_unit_id}, fields=["state"])
                            if state_doc['state'] == CANCELED:
                                self._session.prof.prof('advance',
                                                        uid=compute_unit_id,
                                                        msg=CANCELED,
                                                        state=CANCELED)
                                logger.info(
                                    "Compute Unit Canceled, interrupting input file transfers."
                                )
                                state = CANCELED
                                # Break out of the loop for this CU's SD's
                                break

                            # sources are local files; targets resolve
                            # relative to the remote sandbox (basename of
                            # the source if no target was given)
                            abs_src = os.path.abspath(sd['source'])
                            input_file_url = saga.Url("file://localhost%s" %
                                                      abs_src)
                            if not sd['target']:
                                target = '%s/%s' % (remote_sandbox,
                                                    os.path.basename(abs_src))
                            else:
                                target = "%s/%s" % (remote_sandbox,
                                                    sd['target'])

                            log_msg = "Transferring input file %s -> %s" % (
                                input_file_url, target)
                            log_messages.append(log_msg)
                            logger.debug(log_msg)

                            # Execute the transfer.
                            if CREATE_PARENTS in sd['flags']:
                                copy_flags = saga.filesystem.CREATE_PARENTS
                            else:
                                copy_flags = 0

                            try:
                                saga_dir.copy(input_file_url,
                                              target,
                                              flags=copy_flags)
                            except Exception as e:
                                logger.exception(e)
                                # NOTE(review): e.message is Python-2 only;
                                # under Python 3 this line itself raises
                                # AttributeError -- confirm intended runtime
                                raise Exception("copy failed(%s)" % e.message)

                        # If this CU was canceled we can skip the remainder of this loop,
                        # to process more CUs.
                        if state == CANCELED:
                            continue

                        # All IFTW staging done for this CU.  Push it out, by
                        # setting the state as 'AGENT_STAGING_INPUT_PENDING' and
                        # sending it to mongodb.  We mark the CU under 'umgr'
                        # control -- once the agent picks it up, it will be
                        # marked as under 'agent' control, before the
                        # agent_staging_output_component passes control back in
                        # a similar manner.
                        # NOTE(review): `ts` here is the timestamp taken when
                        # the unit was claimed, before the transfers ran --
                        # the 'log' entry below uses a fresh timestamp();
                        # confirm the stale `ts` in 'statehistory' is intended
                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {
                                'state': AGENT_STAGING_INPUT_PENDING,
                                'control': 'umgr'
                            },
                            '$push': {
                                'statehistory': {
                                    'state': AGENT_STAGING_INPUT_PENDING,
                                    'timestamp': ts
                                },
                                'log': {
                                    'timestamp':
                                    timestamp(),
                                    'message':
                                    'push unit to agent after ftw staging'
                                }
                            }
                        })
                        logger.debug(
                            "InputStagingController: %s : push to agent" %
                            compute_unit_id)
                        self._session.prof.prof(
                            'advance',
                            uid=compute_unit_id,
                            msg=AGENT_STAGING_INPUT_PENDING,
                            state=AGENT_STAGING_INPUT_PENDING)

                    except Exception as e:

                        # Update the CU's state to 'FAILED'.
                        ts = timestamp()
                        logentry = {
                            'message': "Input transfer failed: %s" % e,
                            'timestamp': ts
                        }

                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {
                                'state': FAILED
                            },
                            '$push': {
                                'statehistory': {
                                    'state': FAILED,
                                    'timestamp': ts
                                },
                                'log': logentry
                            }
                        })
                        self._session.prof.prof('advance',
                                                uid=compute_unit_id,
                                                msg=FAILED,
                                                state=FAILED)

                        logger.exception(str(logentry))
                        raise

        except SystemExit as e:
            # NOTE(review): `thread` is the Python-2 module name (`_thread`
            # in Python 3) -- confirm intended runtime
            logger.debug(
                "input file transfer thread caught system exit -- forcing application shutdown"
            )
            thread.interrupt_main()
Example #4
0
    def initialize(self,
                   url,
                   session=None,
                   prompt=None,
                   logger=None,
                   posix=True,
                   interactive=True):
        """Create -- or reuse -- a master shell connection for `url`.

        Masters are cached in `self.registry`, keyed by host, user and
        shell type.  A new master spawns a PTY process and initializes it;
        a cached one is health-checked (and recovered) before reuse.

        Returns the master's info dict.  Raises `se.NoSuccess` if a new
        shell cannot connect, `se.IncorrectState` if a cached one is lost.
        """

        with self.rlock:

            # make sure we have a valid url type
            url = saga.Url(url)

            if not prompt:
                # default prompt: line ending in one of $ # % > ]
                # (raw string -- avoids invalid-escape warnings)
                prompt = r"^(.*[\$#%>\]])\s*$"

            if not logger:
                logger = self.logger

            # collect all information we have/need about the requested master
            # connection
            info = self._create_master_entry(url, session, prompt, logger,
                                             posix, interactive)

            # we got master info - register the master, and create the instance!
            type_s = str(info['shell_type'])
            user_s = str(info['user'])
            host_s = str(info['host_str'])

            # Now, if we don't have that master, yet, we need to instantiate it
            if host_s not in self.registry: self.registry[host_s] = {}
            if user_s not in self.registry[host_s]:
                self.registry[host_s][user_s] = {}
            if type_s not in self.registry[host_s][user_s]:

                # new master: create an instance, and register it
                m_cmd = info['scripts'][info['shell_type']]['master'] % info

                logger.debug ("open master pty for [%s] [%s] %s: %s'" \
                                % (type_s, host_s, user_s, m_cmd))

                info['pty'] = supp.PTYProcess(m_cmd, logger=logger)
                if not info['pty'].alive():
                    raise se.NoSuccess._log (logger, \
                          "Shell not connected to %s" % info['host_str'])

                # authorization, prompt setup, etc.  Initialize as shell if not
                # explicitly marked as non-posix shell
                self._initialize_pty(info['pty'], info)

                # master was created - register it
                self.registry[host_s][user_s][type_s] = info

            else:
                # we already have a master: make sure it is alive, and restart as
                # needed
                info = self.registry[host_s][user_s][type_s]

                if not info['pty'].alive(recover=True):
                    raise se.IncorrectState._log (logger, \
                          "Lost shell connection to %s" % info['host_str'])

            return info
Example #5
0
def fetch_profiles(sid,
                   dburl=None,
                   client=None,
                   tgt=None,
                   access=None,
                   session=None,
                   skip_existing=False):
    '''
    sid: session for which all profiles are fetched
    client: dir to look for client session profiles
    tgt: dir to store the profile in

    returns list of file names
    '''

    ret = list()

    if not dburl:
        dburl = os.environ['RADICAL_PILOT_DBURL']

    if not dburl:
        raise RuntimeError('Please set RADICAL_PILOT_DBURL')

    if not client:
        client = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = saga.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    # first fetch session profile
    # FIXME: should we record pwd or profile location in db session?  Or create
    #        a sandbox like dir for storing profiles and logs?
    client_profile = "%s/%s.prof" % (client, sid)

    ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_profile)))
    ret.append("%s" % ftgt.path)

    if skip_existing and os.path.isfile(ftgt.path) \
            and os.stat(ftgt.path).st_size > 0:

        logger.report.info("\t- %s\n" % client_profile.split('/')[-1])

    else:

        logger.report.info("\t+ %s\n" % client_profile.split('/')[-1])
        prof_file = saga.filesystem.File(client_profile, session=session)
        prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
        prof_file.close()

    _, db, _, _, _ = ru.mongodb_connect(dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    #  print "Session: %s" % sid
    #  print "Number of pilots in session: %d" % num_pilots

    for pilot in pilots:

        # print "Processing pilot '%s'" % pilot['_id']

        sandbox_url = saga.Url(pilot['sandbox'])

        if access:
            # Allow to use a different access scheme than used for the the run.
            # Useful if you ran from the headnode, but would like to retrieve
            # the profiles to your desktop (Hello Titan).
            access_url = saga.Url(access)
            sandbox_url.schema = access_url.schema
            sandbox_url.host = access_url.host

        # print "Overriding remote sandbox: %s" % sandbox_url

        sandbox = saga.filesystem.Directory(sandbox_url, session=session)

        # Try to fetch a tarball of profiles, so that we can get them all in one (SAGA) go!
        PROFILES_TARBALL = '%s.prof.tgz' % pilot['_id']
        tarball_available = False
        try:
            if sandbox.is_file(PROFILES_TARBALL):
                print "Profiles tarball exists!"

                ftgt = saga.Url('%s/%s' % (tgt_url, PROFILES_TARBALL))

                if skip_existing and os.path.isfile(ftgt.path) \
                        and os.stat(ftgt.path).st_size > 0:

                    print "Skipping fetching of '%s/%s' to '%s'." % (
                        sandbox_url, PROFILES_TARBALL, tgt_url)
                    tarball_available = True
                else:

                    print "Fetching '%s%s' to '%s'." % (
                        sandbox_url, PROFILES_TARBALL, tgt_url)
                    prof_file = saga.filesystem.File(
                        "%s%s" % (sandbox_url, PROFILES_TARBALL),
                        session=session)
                    prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                    prof_file.close()

                    tarball_available = True
            else:
                print "Profiles tarball doesnt exists!"

        except saga.DoesNotExist:
            print "exception(TODO): profiles tarball doesnt exists!"

        try:
            os.mkdir("%s/%s" % (tgt_url.path, pilot['_id']))
        except OSError:
            pass

        # We now have a local tarball
        if tarball_available:
            print "Extracting tarball %s into '%s'." % (ftgt.path,
                                                        tgt_url.path)
            tarball = tarfile.open(ftgt.path)
            tarball.extractall("%s/%s" % (tgt_url.path, pilot['_id']))

            profiles = glob.glob("%s/*.prof" % tgt_url.path)
            print "Tarball %s extracted to '%s/%s/'." % (
                ftgt.path, tgt_url.path, pilot['_id'])
            ret.extend(profiles)

            # If extract succeeded, no need to fetch individual profiles
            continue

        # If we dont have a tarball (for whichever reason), fetch individual profiles
        profiles = sandbox.list('*.prof')

        for prof in profiles:

            ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['_id'], prof))
            ret.append("%s" % ftgt.path)

            if skip_existing and os.path.isfile(ftgt.path) \
                             and os.stat(ftgt.path).st_size > 0:

                logger.report.info("\t- %s\n" % str(prof).split('/')[-1])
                continue

            logger.report.info("\t+ %s\n" % str(prof).split('/')[-1])
            prof_file = saga.filesystem.File("%s%s" % (sandbox_url, prof),
                                             session=session)
            prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            prof_file.close()

    return ret
Example #6
0
def fetch_logfiles(sid,
                   dburl=None,
                   src=None,
                   tgt=None,
                   access=None,
                   session=None,
                   skip_existing=False,
                   fetch_client=False,
                   log=None):
    '''
    Fetch all logfiles of a session: optionally the client side logfile,
    plus the logfiles of every pilot sandbox (preferably via a per-pilot
    tarball, falling back to individual files), into `tgt`/`sid`/.

    sid: session for which all logfiles are fetched
    dburl: MongoDB URL (default: $RADICAL_PILOT_DBURL)
    src: dir to look for client session logfiles (default: cwd)
    tgt: dir to store the logfile in (default: cwd)
    access: alternate URL whose schema/host override the pilot sandbox URL
    session: saga session used for remote file access
    skip_existing: don't re-fetch files which already exist non-empty locally
    fetch_client: also fetch the client side session logfile
    log: logger to use (default: session logger or a fresh one)

    returns list of file names
    '''

    # make sure both `log` and `rep` are bound in every case -- previously
    # a caller-provided `log` left `rep` unbound (NameError on rep.ok/error)
    if not log and session:
        log = session._log
        rep = session._rep
    elif not log:
        log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')
    elif session:
        rep = session._rep
    else:
        rep = ru.Reporter('radical.pilot.utils')

    ret = list()

    if not dburl:
        dburl = os.environ['RADICAL_PILOT_DBURL']

    if not dburl:
        raise RuntimeError('Please set RADICAL_PILOT_DBURL')

    if not src:
        src = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    # resolve relative (non-URL) targets against the cwd
    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = saga.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    if fetch_client:
        # first fetch session logfile
        client_logfile = "%s/%s.log" % (src, sid)

        ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_logfile)))
        ret.append("%s" % ftgt.path)

        if skip_existing and os.path.isfile(ftgt.path) \
                and os.stat(ftgt.path).st_size > 0:
            pass
        else:
            log_file = saga.filesystem.File(client_logfile, session=session)
            log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            log_file.close()

    _, db, _, _, _ = ru.mongodb_connect(dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.info("Session: %s", sid)
    log.info("Number of pilots in session: %d", num_pilots)

    for pilot in pilots:

        try:
            sandbox_url = saga.Url(pilot['pilot_sandbox'])

            if access:
                # Allow to use a different access schema than used for the run.
                # Useful if you ran from the headnode, but would like to retrieve
                # the logfiles to your desktop (Hello Titan).
                access_url = saga.Url(access)
                sandbox_url.schema = access_url.schema
                sandbox_url.host = access_url.host

            sandbox = saga.filesystem.Directory(sandbox_url, session=session)

            # Try to fetch a tarball of logfiles, so that we can get them all in one (SAGA) go!
            LOGFILES_TARBALL = '%s.log.tgz' % pilot['uid']
            tarball_available = False
            try:
                if  sandbox.is_file(LOGFILES_TARBALL) and \
                    sandbox.get_size(LOGFILES_TARBALL):

                    log.info("logfiles tarball exists")
                    ftgt = saga.Url('%s/%s' % (tgt_url, LOGFILES_TARBALL))

                    if skip_existing and os.path.isfile(ftgt.path) \
                            and os.stat(ftgt.path).st_size > 0:

                        log.info("Skip fetching of '%s/%s' to '%s'.",
                                 sandbox_url, LOGFILES_TARBALL, tgt_url)
                        tarball_available = True
                    else:

                        log.info("Fetching '%s%s' to '%s'.", sandbox_url,
                                 LOGFILES_TARBALL, tgt_url)
                        log_file = saga.filesystem.File(
                            "%s%s" % (sandbox_url, LOGFILES_TARBALL),
                            session=session)
                        log_file.copy(ftgt,
                                      flags=saga.filesystem.CREATE_PARENTS)
                        log_file.close()

                        tarball_available = True
                else:
                    log.warn("logfiles tarball doesnt exists")

            except saga.DoesNotExist:
                log.warn("logfiles tarball doesnt exists")

            try:
                os.mkdir("%s/%s" % (tgt_url.path, pilot['uid']))
            except OSError:
                pass

            # We now have a local tarball
            if tarball_available:
                log.debug("Extract tarball %s to %s", ftgt.path, tgt_url.path)

                try:
                    tarball = tarfile.open(ftgt.path)
                    tarball.extractall("%s/%s" % (tgt_url.path, pilot['uid']))

                    logfiles = glob.glob("%s/%s/*.log" %
                                         (tgt_url.path, pilot['uid']))
                    log.info("tarball %s extracted to '%s/%s/'.", ftgt.path,
                             tgt_url.path, pilot['uid'])
                    ret.extend(logfiles)
                    os.unlink(ftgt.path)

                except Exception as e:
                    log.warn('could not extract tarball %s [%s]', ftgt.path, e)

                # If extract succeeded, no need to fetch individual logfiles
                rep.ok("+ %s (logfiles)\n" % pilot['uid'])
                continue

            # If we dont have a tarball (for whichever reason), fetch individual logfiles
            logfiles = sandbox.list('*.log')

            for logfile in logfiles:

                ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['uid'], logfile))
                ret.append("%s" % ftgt.path)

                if skip_existing and os.path.isfile(ftgt.path) \
                                 and os.stat(ftgt.path).st_size > 0:

                    continue

                log_file = saga.filesystem.File("%s%s" %
                                                (sandbox_url, logfile),
                                                session=session)
                log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                log_file.close()

            rep.ok("+ %s (logfiles)\n" % pilot['uid'])

        except Exception as e:
            # log the failure (was silently swallowed before), then carry on
            # with the remaining pilots
            log.exception('failed to fetch logfiles for %s: %s',
                          pilot['uid'], e)
            rep.error("- %s (logfiles)\n" % pilot['uid'])

    return ret
Example #7
0
    def stage_in(self, directives):
        """Stages the content of the staging directive into the pilot's
        staging area.

        Waits for the pilot directory to exist first.  Only TRANSFER
        directives are supported on pilot level; LINK, COPY and MOVE raise
        a ValueError, unknown actions raise a generic Exception.
        """

        # Wait until we can assume the pilot directory to be created
        if self.state == NEW:
            self.wait(
                state=[PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE, ACTIVE])
        elif self.state in [DONE, FAILED, CANCELED]:
            raise Exception(
                "Pilot already finished, no need to stage anymore!")

        # local filesystem actions make no sense against a (potentially
        # remote) pilot staging area -- map each to its exact error message
        unsupported = {
            LINK: "action 'LINK' not supported on pilot level staging",
            COPY: "action 'COPY' not supported on pilot level staging",
            MOVE: "action 'MOVE' not supported on pilot level staging",
        }

        # Iterate over all directives
        for directive in expand_staging_directive(directives):

            # TODO: respect flags in directive

            src_url = saga.Url(directive['source'])
            action = directive['action']

            # Convert the target url into a SAGA Url object
            tgt_url = saga.Url(directive['target'])
            # Create a pointer to the directory object that we will use
            # NOTE(review): this aliases tgt_url -- mutating tgt_dir_url.path
            # below also changes tgt_url; confirm that is intended
            tgt_dir_url = tgt_url

            if tgt_url.path.endswith('/'):
                # If the original target was a directory (ends with /),
                # we assume that the user wants the same filename as the source.
                tgt_filename = os.path.basename(src_url.path)
            else:
                # Otherwise, extract the filename and update the directory
                tgt_filename = os.path.basename(tgt_dir_url.path)
                tgt_dir_url.path = os.path.dirname(tgt_dir_url.path)

            # Handle special 'staging' scheme
            if tgt_dir_url.scheme == 'staging':

                # We expect a staging:///relative/path/file.txt URI,
                # as hostname would have unclear semantics currently.
                if tgt_dir_url.host:
                    raise Exception(
                        "hostname not supported with staging:// scheme")

                # Remove the leading slash to get a relative path from the staging area
                rel_path = os.path.relpath(tgt_dir_url.path, '/')

                # Now base the target directory relative of the sandbox and staging prefix
                tgt_dir_url = saga.Url(
                    os.path.join(self.sandbox, STAGING_AREA, rel_path))

            # Define and open the staging directory for the pilot
            # We use the target dir construct here, so that we can create
            # the directory if it does not yet exist.
            target_dir = saga.filesystem.Directory(
                tgt_dir_url, flags=saga.filesystem.CREATE_PARENTS)

            if action in unsupported:
                logger.error(unsupported[action])
                raise ValueError(unsupported[action])

            elif action == TRANSFER:
                log_message = 'Transferring %s to %s' % (
                    src_url, os.path.join(str(tgt_dir_url), tgt_filename))
                logger.info(log_message)
                # Transfer the source file to the target staging area
                target_dir.copy(src_url, tgt_filename)

            else:
                raise Exception('Action %s not supported' % action)
Example #8
0
    def _handle_unit(self, unit, actionables):
        """Execute the staging-out directives for one unit.

        Each directive's source/target URL is expanded against the unit's
        sandbox hierarchy and copied via a cached SAGA directory handle.
        Once all transfers are done, the unit is advanced to its final
        target state.
        """

        uid = unit['uid']

        # URL expansion contexts: sources resolve relative to the unit
        # sandbox, targets relative to the client working directory.
        src_context = {'pwd'     : unit['unit_sandbox'],      # !!!
                       'unit'    : unit['unit_sandbox'],
                       'pilot'   : unit['pilot_sandbox'],
                       'resource': unit['resource_sandbox']}
        tgt_context = {'pwd'     : os.getcwd(),               # !!!
                       'unit'    : unit['unit_sandbox'],
                       'pilot'   : unit['pilot_sandbox'],
                       'resource': unit['resource_sandbox']}

        # Directory handles are cached per filesystem endpoint: drop the
        # path from the sandbox URL and use what remains as cache key.
        fs_url      = rs.Url(unit["unit_sandbox"])
        fs_url.path = '/'
        cache_key   = str(fs_url)

        if cache_key not in self._cache:
            self._cache[cache_key] = rs.filesystem.Directory(
                fs_url, session=self._session)
        fs_dir = self._cache[cache_key]

        # Execute every transfer directive in turn.
        for directive in actionables:

            action = directive['action']
            flags  = directive['flags']
            did    = directive['uid']
            src    = directive['source']
            tgt    = directive['target']

            self._prof.prof('staging_out_start', uid=uid, msg=did)

            self._log.debug('src: %s', src)
            self._log.debug('tgt: %s', tgt)

            src = rpsd.complete_url(src, src_context, self._log)
            tgt = rpsd.complete_url(tgt, tgt_context, self._log)

            self._log.debug('src: %s', src)
            self._log.debug('tgt: %s', tgt)

            # Directory sources need a recursive copy ...
            if fs_dir.is_dir(src.path):
                flags |= rs.filesystem.RECURSIVE

            # ... and missing target directories are always created.
            flags |= rs.filesystem.CREATE_PARENTS

            fs_dir.copy(src, tgt, flags=flags)
            self._prof.prof('staging_out_stop', uid=uid, msg=did)

        # all staging is done -- at this point the unit is final
        unit['state'] = unit['target_state']
        self.advance(unit, publish=True, push=True)
    def run(self):
        """Main loop of the pilot launcher, started via Process.start().

        Connects to MongoDB, then repeatedly: (a) periodically verifies that
        launched SAGA jobs are still alive, and (b) claims one ComputePilot
        in PENDING_LAUNCH state, stages the bootstrapper (plus sdists and CA
        certs where needed) into the pilot sandbox, and submits the pilot
        agent as a SAGA job.  On submission failure the pilot is moved to
        FAILED.  Runs until self._stop is set; a SystemExit inside the loop
        forces an application shutdown via the main thread.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                pilot_col = db["%s.p" % self.db_connection_info.session_id]
                logger.debug(
                    "Connected to MongoDB. Serving requests for PilotManager %s."
                    % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            last_job_check = time.time()

            while not self._stop.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reasons and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()
                    self.check_pilot_states(pilot_col)

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description and a job that is then send to the local or remote
                # queueing system. If this succedes, we set the ComputePilot's
                # state to pending, otherwise to failed.
                compute_pilot = None

                # atomically claim one pilot: flip PENDING_LAUNCH -> LAUNCHING
                ts = datetime.datetime.utcnow()
                compute_pilot = pilot_col.find_and_modify(
                    query={
                        "pilotmanager": self.pilot_manager_id,
                        "state": PENDING_LAUNCH
                    },
                    update={
                        "$set": {
                            "state": LAUNCHING
                        },
                        "$push": {
                            "statehistory": {
                                "state": LAUNCHING,
                                "timestamp": ts
                            }
                        }
                    })

                if not compute_pilot:
                    time.sleep(IDLE_TIMER)

                else:
                    try:
                        # ------------------------------------------------------
                        #
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        #
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_uid = self.db_connection_info.session_id
                        database_url = self.db_connection_info.dburl
                        database_name = self.db_connection_info.dbname
                        database_auth = self.db_connection_info.dbauth

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot['description']['cores']
                        runtime = compute_pilot['description']['runtime']
                        queue = compute_pilot['description']['queue']
                        project = compute_pilot['description']['project']
                        cleanup = compute_pilot['description']['cleanup']
                        resource_key = compute_pilot['description']['resource']
                        schema = compute_pilot['description']['access_schema']
                        memory = compute_pilot['description']['memory']
                        pilot_sandbox = compute_pilot['sandbox']
                        global_sandbox = compute_pilot['global_sandbox']

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(
                            resource_key, schema)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_mongodb_endpoint = resource_cfg.get(
                            'agent_mongodb_endpoint', database_url)
                        agent_spawner = resource_cfg.get(
                            'agent_spawner', DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get('agent_type',
                                                      DEFAULT_AGENT_TYPE)
                        agent_scheduler = resource_cfg.get('agent_scheduler')
                        tunnel_bind_device = resource_cfg.get(
                            'tunnel_bind_device')
                        default_queue = resource_cfg.get('default_queue')
                        forward_tunnel_endpoint = resource_cfg.get(
                            'forward_tunnel_endpoint')
                        js_endpoint = resource_cfg.get('job_manager_endpoint')
                        lrms = resource_cfg.get('lrms')
                        mpi_launch_method = resource_cfg.get(
                            'mpi_launch_method')
                        pre_bootstrap = resource_cfg.get('pre_bootstrap')
                        python_interpreter = resource_cfg.get(
                            'python_interpreter')
                        spmd_variation = resource_cfg.get('spmd_variation')
                        task_launch_method = resource_cfg.get(
                            'task_launch_method')
                        rp_version = resource_cfg.get('rp_version',
                                                      DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get(
                            'virtenv_mode', DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get('virtenv', DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get(
                            'stage_cacerts', 'False')

                        # the config value may be a string or a bool --
                        # normalize via str() before comparing
                        if str(stage_cacerts).lower() == 'true':
                            stage_cacerts = True
                        else:
                            stage_cacerts = False

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            'pilot_sandbox': saga.Url(pilot_sandbox).path,
                            'global_sandbox': saga.Url(global_sandbox).path
                        }

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get('global_virtenv')
                        if global_virtenv:
                            logger.warn(
                                "'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'"
                            )
                            virtenv = global_virtenv
                            virtenv_mode = 'use'

                        # set default scheme, host, port and dbname if not set
                        db_url = saga.Url(agent_mongodb_endpoint)
                        if not db_url.scheme: db_url.scheme = 'mongodb'
                        if not db_url.host: db_url.host = 'localhost'
                        if not db_url.port: db_url.port = 27017
                        if not database_name: database_name = 'radicalpilot'

                        # Create a host:port string for use by the bootstrapper.
                        database_hostport = "%s:%d" % (db_url.host,
                                                       db_url.port)

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.  This also creates
                        # the sandbox. We use always "default_bootstrapper.sh"
                        bootstrapper = 'default_bootstrapper.sh'
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" \
                                % (mod_dir, bootstrapper))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("file://localhost/%s" %
                                                 bootstrapper_path)
                        bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" %
                                                 pilot_sandbox)

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                                % (bs_script_url, bs_script_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        bs_script = saga.filesystem.File(bs_script_url,
                                                         session=self._session)
                        bs_script.copy(bs_script_tgt,
                                       flags=saga.filesystem.CREATE_PARENTS)
                        bs_script.close()

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        #
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        #
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        #
                        # examples   :
                        #   [email protected]
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        #
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then the 'virtenv:tag'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        #
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        #
                        # A rp_version which does not adhere to the
                        # above syntax is ignored, and the fallback stage@local
                        # is used.

                        if  not rp_version.startswith('@') and \
                            rp_version not in ['installed', 'local', 'debug']:
                            raise ValueError("invalid rp_version '%s'" %
                                             rp_version)

                        # NOTE(review): 'release' is checked below but is not
                        # accepted by the validation above -- confirm intent.
                        stage_sdist = True
                        if rp_version in ['installed', 'release']:
                            stage_sdist = False

                        if rp_version.startswith('@'):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for path in [
                                    ru.sdist_path, saga.sdist_path, sdist_path
                            ]:

                                sdist_url = saga.Url("file://localhost/%s" %
                                                     path)
                                msg = "Copying sdist '%s' to sdist sandbox (%s)." % (
                                    sdist_url, pilot_sandbox)
                                logentries.append(
                                    Logentry(msg, logger=logger.debug))

                                sdist_file = saga.filesystem.File(sdist_url)
                                sdist_file.copy("%s/" % (str(pilot_sandbox)))
                                sdist_file.close()

                        # ------------------------------------------------------
                        # some machines cannot run pip due to outdated ca certs.
                        # For those, we also stage an updated cert bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" \
                                    % (mod_dir, 'cacert.pem.gz'))

                            cc_script_url = saga.Url("file://localhost/%s" %
                                                     cc_path)
                            cc_script_tgt = saga.Url("%s/cacert.pem.gz" %
                                                     pilot_sandbox)

                            cc_script = saga.filesystem.File(
                                cc_script_url, session=self._session)
                            cc_script.copy(
                                cc_script_tgt,
                                flags=saga.filesystem.CREATE_PARENTS)
                            cc_script.close()

                        # ------------------------------------------------------
                        # sanity checks
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms: raise RuntimeError("missing LRMS")
                        if not mpi_launch_method:
                            raise RuntimeError("missing mpi launch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values
                        debug_level = os.environ.get(
                            'RADICAL_PILOT_AGENT_VERBOSE', logger.level)
                        try:
                            debug_level = int(debug_level)
                        except ValueError:
                            # symbolic level names map to numeric levels
                            debug_level = {
                                'CRITICAL': 1,
                                'ERROR': 2,
                                'WARNING': 3,
                                'WARN': 3,
                                'INFO': 4,
                                'DEBUG': 5
                            }.get(debug_level, 0)

                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            cleanup = 'luve'  #  l : log files
                            #  u : unit work dirs
                            #  v : virtualenv
                            #  e : everything (== pilot sandbox)
                            #
                            # we never cleanup virtenvs which are not private
                            # (fix: use '!=' -- 'is not' compared string
                            # identity and effectively always stripped 'v')
                            if virtenv_mode != 'private':
                                cleanup = cleanup.replace('v', '')

                        sdists = ':'.join(
                            [ru.sdist_name, saga.sdist_name, sdist_name])

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -b '%s'" % sdists
                        bootstrap_args += " -c '%s'" % number_cores
                        bootstrap_args += " -d '%s'" % debug_level
                        bootstrap_args += " -g '%s'" % virtenv
                        bootstrap_args += " -j '%s'" % task_launch_method
                        bootstrap_args += " -k '%s'" % mpi_launch_method
                        bootstrap_args += " -l '%s'" % lrms
                        bootstrap_args += " -m '%s'" % database_hostport
                        bootstrap_args += " -n '%s'" % database_name
                        bootstrap_args += " -o '%s'" % agent_spawner
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -q '%s'" % agent_scheduler
                        bootstrap_args += " -r '%s'" % runtime
                        bootstrap_args += " -s '%s'" % session_uid
                        bootstrap_args += " -t '%s'" % agent_type
                        bootstrap_args += " -u '%s'" % virtenv_mode
                        bootstrap_args += " -v '%s'" % rp_version

                        # set optional args
                        if database_auth:
                            bootstrap_args += " -a '%s'" % database_auth
                        if tunnel_bind_device:
                            bootstrap_args += " -D '%s'" % tunnel_bind_device
                        if pre_bootstrap:
                            bootstrap_args += " -e '%s'" % "' -e '".join(
                                pre_bootstrap)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # ------------------------------------------------------
                        # now that the script is in place and we know where it is,
                        # we can launch the agent.  Job services are cached per
                        # endpoint URL.
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data['job_services']:
                            js = self._shared_worker_data['job_services'][
                                js_url]
                        else:
                            js = saga.job.Service(js_url,
                                                  session=self._session)
                            self._shared_worker_data['job_services'][
                                js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = [
                            "-l pilot_bootstrapper.sh", bootstrap_args
                        ]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "agent.out"
                        jd.error = "agent.err"
                        jd.total_cpu_count = number_cores
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if 'RADICAL_PILOT_PROFILE' in os.environ:
                            jd.environment = {'RADICAL_PILOT_PROFILE': 'TRUE'}

                        logger.debug("Bootstrap command line: %s %s" %
                                     (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(
                            jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        pilotjob = js.create_job(jd)
                        pilotjob.run()

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data['job_ids'][pilot_id] = [
                            saga_job_id, js_url
                        ]

                        msg = "SAGA job submitted with job id %s" % str(
                            saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        #
                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        # NOTE(review): the query uses the literal 'Launching'
                        # where LAUNCHING is used elsewhere -- confirm both
                        # spell the same state string.
                        ts = datetime.datetime.utcnow()
                        ret = pilot_col.update(
                            {
                                "_id": pilot_id,
                                "state": 'Launching'
                            }, {
                                "$set": {
                                    "state": PENDING_ACTIVE,
                                    "saga_job_id": saga_job_id
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })

                        if ret['n'] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update({"_id": pilot_id}, {
                                "$set": {
                                    "saga_job_id": saga_job_id
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })

                    except Exception as e:
                        # Update the Pilot's state 'FAILED'.
                        out, err, log = self._get_pilot_logs(
                            pilot_col, pilot_id)
                        ts = datetime.datetime.utcnow()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e
                        logentries.append(Logentry(msg))

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())
                            log_messages.append(le.message)

                        pilot_col.update(
                            {
                                "_id": pilot_id,
                                "state": {
                                    "$ne": FAILED
                                }
                            }, {
                                "$set": {
                                    "state": FAILED,
                                    "stdout": out,
                                    "stderr": err,
                                    "logfile": log
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": FAILED,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })
                        logger.exception('\n'.join(log_messages))

        except SystemExit as e:
            logger.exception(
                "pilot launcher thread caught system exit -- forcing application shutdown"
            )
            import thread
            thread.interrupt_main()
Example #10
0
    def _handle_unit(self, unit, actionables):
        """Stage input files for one unit and hand it over to the agent.

        Creates the unit sandbox, bundles all TARBALL directives into a
        single tar file (staged with one TRANSFER directive), executes the
        remaining TRANSFER directives, and finally advances the unit to
        AGENT_STAGING_INPUT_PENDING.
        """

        # FIXME: we should create unit sandboxes in a bulk

        uid = unit['uid']

        self._prof.prof("create_sandbox_start", uid=uid)

        # URL expansion contexts: sources resolve relative to the client
        # working directory, targets relative to the unit sandbox.
        src_context = {
            'pwd': os.getcwd(),  # !!!
            'unit': unit['unit_sandbox'],
            'pilot': unit['pilot_sandbox'],
            'resource': unit['resource_sandbox']
        }
        tgt_context = {
            'pwd': unit['unit_sandbox'],  # !!!
            'unit': unit['unit_sandbox'],
            'pilot': unit['pilot_sandbox'],
            'resource': unit['resource_sandbox']
        }

        # we have actionable staging directives, and thus we need a unit
        # sandbox.
        sandbox = rs.Url(unit["unit_sandbox"])
        tmp = rs.Url(unit["unit_sandbox"])

        # url used for cache (sandbox url w/o path)
        tmp.path = '/'
        key = str(tmp)
        self._log.debug('key %s / %s', key, tmp)

        if key not in self._fs_cache:
            self._fs_cache[key] = rs.filesystem.Directory(
                tmp, session=self._session)

        saga_dir = self._fs_cache[key]
        saga_dir.make_dir(sandbox, flags=rs.filesystem.CREATE_PARENTS)
        self._prof.prof("create_sandbox_stop", uid=uid)

        # Loop over all transfer directives and filter out tarball staging
        # directives.  Those files are added into a tarball, and a single
        # actionable to stage that tarball replaces the original actionables.

        # create a new actionable list during the filtering
        new_actionables = list()
        tar_file = None

        for sd in actionables:

            # don't touch non-tar SDs
            if sd['action'] != rpc.TARBALL:
                new_actionables.append(sd)

            else:

                # NOTE: the directive's flags are ignored for tar members
                did = sd['uid']
                src = complete_url(sd['source'], src_context, self._log)
                tgt = complete_url(sd['target'], tgt_context, self._log)

                self._prof.prof('staging_in_tar_start', uid=uid, msg=did)

                # create a tarfile on the first match, and register for transfer
                if not tar_file:
                    tmp_file = tempfile.NamedTemporaryFile(
                        prefix='rp_usi_%s.' % uid, suffix='.tar', delete=False)
                    tar_path = tmp_file.name
                    tar_file = tarfile.open(fileobj=tmp_file, mode='w')
                    tar_src = ru.Url('file://localhost/%s' % tar_path)
                    tar_tgt = ru.Url('unit:////%s.tar' % uid)
                    tar_did = ru.generate_id('sd')
                    tar_sd = {
                        'action': rpc.TRANSFER,
                        'flags': rpc.DEFAULT_FLAGS,
                        'uid': tar_did,
                        'source': str(tar_src),
                        'target': str(tar_tgt),
                    }
                    new_actionables.append(tar_sd)

                # add the src file
                tar_file.add(src.path, arcname=tgt.path)

                self._prof.prof('staging_in_tar_stop', uid=uid, msg=did)

        # make sure tarball is flushed to disk
        if tar_file:
            tar_file.close()

        # work on the filtered TRANSFER actionables
        for sd in new_actionables:

            if sd['action'] == rpc.TRANSFER:

                flags = sd['flags']
                did = sd['uid']

                # complete the URLs exactly once (fix: the original ran
                # complete_url a second time on already-completed URLs)
                src = complete_url(sd['source'], src_context, self._log)
                tgt = complete_url(sd['target'], tgt_context, self._log)

                # Check if the src is a folder, if true
                # add recursive flag if not already specified
                if os.path.isdir(src.path):
                    flags |= rs.filesystem.RECURSIVE

                # Always set CREATE_PARENTS
                flags |= rs.filesystem.CREATE_PARENTS

                self._prof.prof('staging_in_start', uid=uid, msg=did)
                saga_dir.copy(src, tgt, flags=flags)
                self._prof.prof('staging_in_stop', uid=uid, msg=did)

        if tar_file:

            # some tarball staging was done.  Add a staging directive for the
            # agent to untar the tarball, and clean up the local temp file.
            tar_sd['action'] = rpc.TARBALL
            unit['description']['input_staging'].append(tar_sd)
            os.remove(tar_path)

        # staging is done, we can advance the unit at last
        self.advance(unit,
                     rps.AGENT_STAGING_INPUT_PENDING,
                     publish=True,
                     push=True)
Example #11
0
def main():

    tmp_dir = None

    try:

        tmp_dir = tempfile.mkdtemp(prefix='saga-test-', suffix='-%s' % TEST_NAME,
                                   dir=os.path.expanduser('~/tmp'))

        print 'tmpdir: %s' % tmp_dir

        ctx = saga.Context("x509")
        ctx.user_proxy = '/Users/mark/proj/myproxy/xsede.x509'

        session = saga.Session()
        session.add_context(ctx)

        source_url = saga.Url()
        source_url.schema = 'go'
        source_url.host = SOURCE
        source_url.path = tmp_dir

        target_url = saga.Url()
        target_url.schema = 'go'
        target_url.host = TARGET
        target_url.path = os.path.join('~/saga-tests/', os.path.basename(tmp_dir))

        print "Point to local Directory through GO ..."
        d = saga.filesystem.Directory(source_url)
        print "And check ..."
        assert d.is_dir() == True
        assert d.is_file() == False
        assert d.is_link() == False
        d.close()
        print "Point to remote Directory through GO ..."
        d = saga.filesystem.Directory(target_url, flags=saga.filesystem.CREATE_PARENTS)
        print "And check ..."
        assert d.is_dir() == True
        assert d.is_file() == False
        assert d.is_link() == False
        d.close()

        print "Point to local file through GO, before creation ..."
        caught = False
        try:
            saga.filesystem.File(os.path.join(str(source_url), FILE_A_level_0))
        except saga.DoesNotExist:
            caught = True
        assert caught == True

        print "Create actual file ..."
        touch(tmp_dir, FILE_A_level_0)
        print "Try again ..."
        f = saga.filesystem.File(os.path.join(str(source_url), FILE_A_level_0))
        assert f.is_file() == True
        assert f.is_dir() == False
        assert f.is_link() == False
        f.close()

        print "Copy local file to remote, using different filename ..."
        d = saga.filesystem.Directory(target_url, flags=saga.filesystem.CREATE_PARENTS)
        d.copy(os.path.join(str(source_url), FILE_A_level_0), FILE_A_level_0+COPIED_SUFFIX)
        d.close()
        f = saga.filesystem.File(os.path.join(str(target_url), FILE_A_level_0+COPIED_SUFFIX))
        assert f.is_file() == True
        assert f.is_dir() == False
        assert f.is_link() == False
        f.close()

        print "Copy local file to remote, keeping filename in tact ..."
        d = saga.filesystem.Directory(target_url, flags=saga.filesystem.CREATE_PARENTS)
        d.copy(os.path.join(str(source_url), FILE_A_level_0), FILE_A_level_0)
        d.close()
        f = saga.filesystem.File(os.path.join(str(target_url), FILE_A_level_0))
        assert f.is_file() == True
        assert f.is_dir() == False
        assert f.is_link() == False
        f.close()

        print 'Create file in level 1 ...'
        tree = LEVEL_1
        os.mkdir(os.path.join(tmp_dir, tree))
        touch(os.path.join(tmp_dir, tree), FILE_A_level_1)
        print "Test local file ..."
        f = saga.filesystem.File(os.path.join(str(source_url), tree, FILE_A_level_1))
        assert f.is_file() == True
        assert f.is_dir() == False
        assert f.is_link() == False
        f.close()

        print "Copy local file to remote, keeping filename in tact ..."
        d = saga.filesystem.Directory(os.path.join(str(target_url), tree), flags=saga.filesystem.CREATE_PARENTS)
        d.copy(os.path.join(str(source_url), tree, FILE_A_level_1), FILE_A_level_1)
        d.close()

        print "Test file after transfer ..."
        f = saga.filesystem.File(os.path.join(str(target_url), tree, FILE_A_level_1))
        assert f.is_file() == True
        assert f.is_dir() == False
        assert f.is_link() == False
        f.close()

        print "Copy non-existent local file to remote, keeping filename in tact ..."
        d = saga.filesystem.Directory(str(target_url), flags=saga.filesystem.CREATE_PARENTS)
        try:
            d.copy(os.path.join(str(source_url), NON_EXISTING_FILE), NON_EXISTING_FILE)
        except saga.DoesNotExist:
            caught = True
        assert caught == True

        print "Test file after (non-)transfer ..."
        caught = False
        try:
            saga.filesystem.File(os.path.join(str(target_url), NON_EXISTING_FILE))
        except saga.DoesNotExist:
            caught = True
        assert caught == True

        # destination = "go://gridftp.stampede.tacc.xsede.org/~/tmp/"
        # #destination = "go://oasis-dm.sdsc.xsede.org/~/tmp/"
        # #destination = "go://ncsa#BlueWaters/~/tmp/"
        # #destination = "go://marksant#netbook/Users/mark/tmp/go/"
        # src_filename = "my_file"
        # dst_filename = "my_file_"
        # rt_filename = "my_file__"
        #
        # # open home directory on a remote machine
        # source_dir = saga.filesystem.Directory(source)
        #
        # # copy .bash_history to /tmp/ on the local machine
        # source_dir.copy(src_filename, os.path.join(destination, dst_filename))
        #
        # # list 'm*' in local /tmp/ directory
        # dest_dir = saga.filesystem.Directory(destination)
        # for entry in dest_dir.list(pattern='%s*' % src_filename[0]):
        #     print entry
        #
        # dest_file = saga.filesystem.File(os.path.join(destination, dst_filename))
        # assert dest_file.is_file() == True
        # assert dest_file.is_link() == False
        # assert dest_file.is_dir() == False
        # print 'Size: %d' % dest_file.get_size()
        #
        # dest_file.copy(source)
        #
        # dest_file.copy(os.path.join(source+'broken', rt_filename))

        print "Before return 0"
        return 0

    except saga.SagaException as ex:
        # Catch all saga exceptions
        print "An exception occurred: (%s) %s " % (ex.type, (str(ex)))
        # Trace back the exception. That can be helpful for debugging.
        print " \n*** Backtrace:\n %s" % ex.traceback

        print "before return -1"
        return -1

    finally:

        print "and finally ..."

        if CLEANUP and tmp_dir:
            shutil.rmtree(tmp_dir)
Example #12
0
__author__    = "Andre Merzky"
__copyright__ = "Copyright 2012-2013, The SAGA Project"
__license__   = "MIT"


import re
import time
import saga
import saga.utils.pty_shell as sups

try :
    shell = sups.PTYShell (saga.Url ("fork://localhost"), [])
    shell.run_async ("(sftp -b - localhost || (printf \"SFTP_ABORT\n\"; false)) <<EOT")
    shell.send ("progress\nput /home/merzky/downloads/totalview*.sh /tmp/t\nEOT\n")

  # pat_bof = re.compile ("(?P<perc>\d+\%).*(?P<time>--:--)\s*ETA")
    pat_bof = re.compile ("(?P<perc>\d+)\%\s+(?P<size>.+?)\s+(?P<perf>.+?)\s+(?P<time>--:--)\s*ETA")
    pat_eta = re.compile ("(?P<perc>\d+)\%\s+(?P<size>.+?)\s+(?P<perf>.+?)\s+(?P<time>\d\d:\d\d)\s*ETA")
    pat_eof = re.compile ("(?P<perc>\d+)\%\s+(?P<size>.+?)\s+(?P<perf>.+?)\s+(?P<time>\d\d:\d\d)\s*\n")
    pat_def = re.compile ("^sftp>.*\n")

    begin = True
    error   = ""

    while True :
        ret, out = shell.find (['ETA$', 'SFTP_ABORT\n', '\n'])
        progress    = None

        # ----------------------------------------------------------------------
        # found ETA - transfer is in progress
    def run(self):
        """
        Main loop of the input file transfer worker process (entered via
        Process.start()).

        Repeatedly claims ComputeUnits in FTW_Input_Status PENDING from
        MongoDB, creates the CU's remote sandbox, executes the CU's FTW input
        staging directives via SAGA, and records success or failure back into
        the database.  Only the first worker (self._worker_number == 1)
        additionally performs the bookkeeping that advances fully staged CUs
        to PENDING_EXECUTION.  SystemExit is caught to force an application
        shutdown via thread.interrupt_main().
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try :

            logger.info("Starting InputFileTransferWorker")

            # Try to connect to the database and create a tailable cursor.
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                um_col = db["%s.cu" % self.db_connection_info.session_id]
                logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." % self.unit_manager_id)

            except Exception as e :
                logger.exception("Connection error: %s" % e)
                raise

            try :
                while not self._stop.is_set():
                    # See if we can find a ComputeUnit that is waiting for
                    # input file transfer.
                    compute_unit = None

                    # atomically claim one pending CU: flip its FTW input
                    # status to EXECUTING and push the state transition
                    ts = datetime.datetime.utcnow()
                    compute_unit = um_col.find_and_modify(
                        query={"unitmanager": self.unit_manager_id,
                               "FTW_Input_Status": PENDING},
                        update={"$set" : {"FTW_Input_Status": EXECUTING,
                                          "state": STAGING_INPUT},
                                "$push": {"statehistory": {"state": STAGING_INPUT, "timestamp": ts}}},
                        limit=BULK_LIMIT # TODO: bulklimit is probably not the best way to ensure there is just one
                    )
                    # FIXME: AM: find_and_modify is not bulkable!
                    state = STAGING_INPUT

                    if compute_unit is None:
                        # Sleep a bit if no new units are available.
                        time.sleep(IDLE_TIME) 

                    else:
                        compute_unit_id = None
                        try:
                            log_messages = []

                            # We have found a new CU. Now we can process the transfer
                            # directive(s) wit SAGA.
                            compute_unit_id = str(compute_unit["_id"])
                            remote_sandbox = compute_unit["sandbox"]
                            input_staging = compute_unit["FTW_Input_Directives"]

                            # We need to create the CU's directory in case it doesn't exist yet.
                            log_msg = "Creating ComputeUnit sandbox directory %s." % remote_sandbox
                            log_messages.append(log_msg)
                            logger.info(log_msg)

                            # Creating the sandbox directory.
                            try:
                                logger.debug ("saga.fs.Directory ('%s')" % remote_sandbox)

                                # cache one Directory handle per filesystem
                                # root URL, to avoid re-connect overhead
                                remote_sandbox_keyurl = saga.Url (remote_sandbox)
                                remote_sandbox_keyurl.path = '/'
                                remote_sandbox_key = str(remote_sandbox_keyurl)

                                if  remote_sandbox_key not in self._saga_dirs :
                                    self._saga_dirs[remote_sandbox_key] = \
                                            saga.filesystem.Directory (remote_sandbox_key,
                                                    flags=saga.filesystem.CREATE_PARENTS,
                                                    session=self._session)

                                saga_dir = self._saga_dirs[remote_sandbox_key]
                                saga_dir.make_dir (remote_sandbox, 
                                                   flags=saga.filesystem.CREATE_PARENTS)
                            except Exception as e :
                                logger.exception('Error: %s' % e)
                                # FIXME: why is this exception ignored?  AM


                            logger.info("Processing input file transfers for ComputeUnit %s" % compute_unit_id)
                            # Loop over all transfer directives and execute them.
                            for sd in input_staging:

                                # re-check the CU state before each transfer,
                                # so a cancellation interrupts the staging loop
                                state_doc = um_col.find_one(
                                    {"_id": compute_unit_id},
                                    fields=["state"]
                                )
                                if state_doc['state'] == CANCELED:
                                    logger.info("Compute Unit Canceled, interrupting input file transfers.")
                                    state = CANCELED
                                    break

                                abs_src = os.path.abspath(sd['source'])
                                input_file_url = saga.Url("file://localhost/%s" % abs_src)
                                if not sd['target']:
                                    target = remote_sandbox
                                else:
                                    target = "%s/%s" % (remote_sandbox, sd['target'])

                                log_msg = "Transferring input file %s -> %s" % (input_file_url, target)
                                log_messages.append(log_msg)
                                logger.debug(log_msg)

                                # Execute the transfer.
                                logger.debug ("saga.fs.File ('%s')" % input_file_url)
                                input_file = saga.filesystem.File(
                                    input_file_url,
                                    session=self._session
                                )

                                if CREATE_PARENTS in sd['flags']:
                                    copy_flags = saga.filesystem.CREATE_PARENTS
                                else:
                                    copy_flags = 0

                                try :
                                    input_file.copy(target, flags=copy_flags)
                                except Exception as e :
                                    logger.exception (e)
                                input_file.close()

                                # If all went fine, update the state of this StagingDirective to Done
                                um_col.find_and_modify(
                                    query={"_id" : compute_unit_id,
                                           'FTW_Input_Status': EXECUTING,
                                           'FTW_Input_Directives.state': PENDING,
                                           'FTW_Input_Directives.source': sd['source'],
                                           'FTW_Input_Directives.target': sd['target'],
                                           },
                                    update={'$set': {'FTW_Input_Directives.$.state': 'Done'},
                                            '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(), 
                                                'message'  : log_msg}}
                                    }
                                )

                        except Exception as e :
                            # Update the CU's state 'FAILED'.
                            ts = datetime.datetime.utcnow()
                            logentry = {'message'  : "Input transfer failed: %s" % e,
                                        'timestamp': ts}

                            um_col.update({'_id': compute_unit_id}, {
                                '$set': {'state': FAILED},
                                '$push': {
                                    'statehistory': {'state': FAILED, 'timestamp': ts},
                                    'log': logentry
                                }
                            })

                            logger.exception(str(logentry))

                    # Code below is only to be run by the "first" or only worker
                    if self._worker_number > 1:
                        continue

                    # If the CU was canceled we can skip the remainder of this loop.
                    if state == CANCELED:
                        continue

                    #
                    # Check to see if there are more pending Directives, if not, we are Done
                    #
                    cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                            "$or": [ {"Agent_Input_Status": EXECUTING},
                                                     {"FTW_Input_Status": EXECUTING}
                                                   ]
                                            }
                                           )
                    # Iterate over all the returned CUs (if any)
                    for cu in cursor_w:
                        # See if there are any FTW Input Directives still pending
                        if cu['FTW_Input_Status'] == EXECUTING and \
                                not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['FTW_Input_Directives']):
                            # All Input Directives for this FTW are done, mark the CU accordingly
                            um_col.update({"_id": cu["_id"]},
                                          {'$set': {'FTW_Input_Status': DONE},
                                           '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(),
                                                'message'  : 'All FTW Input Staging Directives done - %d.' % self._worker_number}}
                                           }
                            )

                        # See if there are any Agent Input Directives still pending or executing,
                        # if not, mark it DONE.
                        if cu['Agent_Input_Status'] == EXECUTING and \
                                not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['Agent_Input_Directives']):
                            # All Input Directives for this Agent are done, mark the CU accordingly
                            um_col.update({"_id": cu["_id"]},
                                           {'$set': {'Agent_Input_Status': DONE},
                                            '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(), 
                                                'message'  : 'All Agent Input Staging Directives done - %d.' % self._worker_number}}
                                           }
                            )

                    #
                    # Check for all CUs if both Agent and FTW staging is done, we can then mark the CU PendingExecution
                    #
                    ts = datetime.datetime.utcnow()
                    um_col.find_and_modify(
                        query={"unitmanager": self.unit_manager_id,
                               "Agent_Input_Status": { "$in": [ None, DONE ] },
                               "FTW_Input_Status": { "$in": [ None, DONE ] },
                               "state": STAGING_INPUT
                        },
                        update={"$set": {
                                    "state": PENDING_EXECUTION
                                },
                                "$push": {
                                    "statehistory": {"state": PENDING_EXECUTION, "timestamp": ts}
                                }
                        }
                    )

            except Exception as e :

                logger.exception("transfer worker error: %s" % e)
                self._session.close (cleanup=False)
                raise

        except SystemExit as e :
            logger.debug("input file transfer thread caught system exit -- forcing application shutdown")
            import thread
            thread.interrupt_main ()
Example #14
0
    def _start_pilot_bulk(self, resource, schema, pilots):
        """
        For each pilot, we prepare by determining what files need to be staged,
        and what job description needs to be submitted.

        We expect `_prepare_pilot(resource, rcfg, pilot)` to return a dict with:

            {
              'jd' : saga.job.Description,
              'ft' : [
                { 'src' : string  # absolute source file name
                  'tgt' : string  # relative target file name
                  'rem' : bool    # shall we remove src?
                },
                ... ]
            }

        When transfering data, we'll ensure that each src is only transferred
        once (in fact, we put all src files into a tarball and unpack that on
        the target side).

        The returned dicts are expected to only contain files which actually
        need staging, ie. which have not been staged during a previous pilot
        submission.  That implies one of two things: either this component is
        stateful, and remembers what has been staged -- which makes it difficult
        to use multiple component instances; or the component inspects the
        target resource for existing files -- which involves additional
        expensive remote hops.
        FIXME: since neither is implemented at this point we won't discuss the
               tradeoffs further -- right now files are unique per pilot bulk.

        Once all dicts are collected, we create one additional file which
        contains the staging information, and then pack all src files into
        a tarball for staging.  We transfer the tarball, and *immediately*
        trigger the untaring on the target resource, which is thus *not* part of
        the bootstrapping process.
        NOTE: this is to avoid untaring race conditions for multiple pilots, and
              also to simplify bootstrapping dependencies -- the bootstrappers
              are likely within the tarball after all...
        """

        rcfg = self._session.get_resource_config(resource, schema)
        sid = self._session.uid

        # we create a fake session_sandbox with all pilot_sandboxes in /tmp, and
        # then tar it up.  Once we untar that tarball on the target machine, we
        # should have all sandboxes and all files required to bootstrap the
        # pilots
        # FIXME: on untar, there is a race between multiple launcher components
        #        within the same session toward the same target resource.
        tmp_dir = os.path.abspath(tempfile.mkdtemp(prefix='rp_agent_tar_dir'))
        tar_name = '%s.%s.tgz' % (sid, self.uid)
        tar_tgt = '%s/%s' % (tmp_dir, tar_name)
        tar_url = rs.Url('file://localhost/%s' % tar_tgt)

        # we need the session sandbox url, but that is (at least in principle)
        # dependent on the schema to use for pilot startup.  So we confirm here
        # that the bulk is consistent wrt. to the schema.
        # FIXME: if it is not, it needs to be splitted into schema-specific
        # sub-bulks
        schema = pilots[0]['description'].get('access_schema')
        for pilot in pilots[1:]:
            assert(schema == pilot['description'].get('access_schema')), \
                    'inconsistent scheme on launch / staging'

        session_sandbox = self._session._get_session_sandbox(pilots[0]).path

        # we will create the session sandbox before we untar, so we can use that
        # as workdir, and pack all paths relative to that session sandbox.  That
        # implies that we have to recheck that all URLs in fact do point into
        # the session sandbox.

        ft_list = list()  # files to stage
        jd_list = list()  # jobs  to submit
        for pilot in pilots:
            info = self._prepare_pilot(resource, rcfg, pilot)
            ft_list += info['ft']
            jd_list.append(info['jd'])
            self._prof.prof('staging_in_start', uid=pilot['uid'])

        for ft in ft_list:
            src = os.path.abspath(ft['src'])
            tgt = os.path.relpath(os.path.normpath(ft['tgt']), session_sandbox)
            # src_dir = os.path.dirname(src)
            tgt_dir = os.path.dirname(tgt)

            if tgt_dir.startswith('..'):
                raise ValueError('staging target %s outside of pilot sandbox' %
                                 ft['tgt'])

            if not os.path.isdir('%s/%s' % (tmp_dir, tgt_dir)):
                os.makedirs('%s/%s' % (tmp_dir, tgt_dir))

            if src == '/dev/null':
                # we want an empty file -- touch it (tar will refuse to
                # handle a symlink to /dev/null)
                open('%s/%s' % (tmp_dir, tgt), 'a').close()
            else:
                os.symlink(src, '%s/%s' % (tmp_dir, tgt))

        # tar.  If any command fails, this will raise.
        cmd = "cd %s && tar zchf %s *" % (tmp_dir, tar_tgt)
        self._log.debug('cmd: %s', cmd)
        try:
            out = sp.check_output(["/bin/sh", "-c", cmd], stderr=sp.STDOUT)
        except sp.CalledProcessError as e:
            # BUGFIX: `out` is never assigned when check_output raises, so it
            # must not be referenced here -- log the captured output from the
            # exception object instead
            self._log.exception('callout failed: %s', e.output)
            raise
        except Exception:
            self._log.exception('callout failed')
            raise
        else:
            self._log.debug('out: %s', out)

        # remove all files marked for removal-after-pack
        for ft in ft_list:
            if ft['rem']:
                os.unlink(ft['src'])

        fs_endpoint = rcfg['filesystem_endpoint']
        fs_url = rs.Url(fs_endpoint)

        self._log.debug("rs.file.Directory ('%s')", fs_url)

        with self._cache_lock:
            if fs_url in self._saga_fs_cache:
                fs = self._saga_fs_cache[fs_url]
            else:
                fs = rsfs.Directory(fs_url, session=self._session)
                self._saga_fs_cache[fs_url] = fs

        tar_rem = rs.Url(fs_url)
        tar_rem.path = "%s/%s" % (session_sandbox, tar_name)

        fs.copy(tar_url, tar_rem, flags=rsfs.CREATE_PARENTS)

        shutil.rmtree(tmp_dir)

        # we now need to untar on the target machine.
        js_url = ru.Url(pilots[0]['js_url'])

        # well, we actually don't need to talk to the lrms, but only need
        # a shell on the headnode.  That seems true for all LRMSs we use right
        # now.  So, lets convert the URL:
        if '+' in js_url.scheme:
            parts = js_url.scheme.split('+')
            if 'gsissh' in parts: js_url.scheme = 'gsissh'
            elif 'ssh' in parts: js_url.scheme = 'ssh'
        else:
            # In the non-combined '+' case we need to distinguish between
            # a url that was the result of a hop or a local lrms.
            if js_url.scheme not in ['ssh', 'gsissh']:
                js_url.scheme = 'fork'

        with self._cache_lock:
            if js_url in self._saga_js_cache:
                js_tmp = self._saga_js_cache[js_url]
            else:
                js_tmp = rs.job.Service(js_url, session=self._session)
                self._saga_js_cache[js_url] = js_tmp

      # cmd = "tar zmxvf %s/%s -C / ; rm -f %s" % \
        cmd = "tar zmxvf %s/%s -C %s" % \
                (session_sandbox, tar_name, session_sandbox)
        j = js_tmp.run_job(cmd)
        j.wait()

        self._log.debug('tar cmd : %s', cmd)
        self._log.debug('tar done: %s, %s, %s', j.state, j.stdout, j.stderr)

        for pilot in pilots:
            self._prof.prof('staging_in_stop', uid=pilot['uid'])
            self._prof.prof('submission_start', uid=pilot['uid'])

        # look up or create JS for actual pilot submission.  This might result
        # in the same js url as above, or not.
        js_ep = rcfg['job_manager_endpoint']
        with self._cache_lock:
            if js_ep in self._saga_js_cache:
                js = self._saga_js_cache[js_ep]
            else:
                js = rs.job.Service(js_ep, session=self._session)
                self._saga_js_cache[js_ep] = js

        # now that the scripts are in place and configured,
        # we can launch the agent
        jc = rs.job.Container()

        for jd in jd_list:
            self._log.debug('jd: %s', pprint.pformat(jd.as_dict()))
            jc.add(js.create_job(jd))

        jc.run()

        # we assume here that the tasks arrive in the same order as the job
        # descriptions.  For uniform sets of pilots the order does not matter
        # much though.  Either way, this needs confirming on SAGA level
        # FIXME
        for j, jd in zip(jc.get_tasks(), jd_list):

            # do a quick error check
            if j.state == rs.FAILED:
                self._log.error('%s: %s : %s : %s', j.id, j.state, j.stderr,
                                j.stdout)
                raise RuntimeError("SAGA Job state is FAILED. (%s)" % jd.name)

            pilot = None
            pid = jd.name
            for p in pilots:
                if p['uid'] == pid:
                    pilot = p
                    break

            assert (pilot)

            # Update the Pilot's state to 'PMGR_ACTIVE_PENDING' if SAGA job
            # submission was successful.  Since the pilot leaves the scope of
            # the PMGR for the time being, we update the complete DB document
            pilot['$all'] = True

            # FIXME: update the right pilot
            with self._pilots_lock:

                self._pilots[pid] = dict()
                self._pilots[pid]['pilot'] = pilot
                self._pilots[pid]['job'] = j

            # make sure we watch that pilot
            with self._check_lock:
                self._checking.append(pid)

        for pilot in pilots:
            self._prof.prof('submission_stop', uid=pilot['uid'])
Example #15
0
    def _get_resource_sandbox(self, pilot):
        """
        Determine the global RP sandbox URL for the resource targeted by the
        given pilot dict (based on the pilot description's 'resource'
        attribute).  Results are cached per resource.
        """

        self.is_valid()

        # FIXME: this should get 'resource, schema=None' as parameters

        resource = pilot['description'].get('resource')
        schema   = pilot['description'].get('access_schema')

        if not resource:
            raise ValueError('Cannot get pilot sandbox w/o resource target')

        # the sandbox is identical for every pilot on a given resource, so
        # it is computed at most once and afterwards served from the cache
        with self._cache_lock:

            cache = self._cache['resource_sandbox']

            if resource not in cache:

                # cache miss: determine the sandbox URL and store it
                rcfg   = self.get_resource_config(resource, schema)
                fs_url = rs.Url(rcfg['filesystem_endpoint'])

                # the base workdir comes from the pilot description, falling
                # back to the resource configuration
                sandbox_raw = pilot['description'].get('sandbox') \
                           or rcfg.get('default_remote_workdir', "$PWD")

                if '$' in sandbox_raw or '`' in sandbox_raw:

                    # the path contains shell expandables, which must be
                    # resolved on the target resource itself.
                    # NOTE: this only works for (gsi)ssh or shell based
                    #       access mechanisms
                    js_url = rs.Url(rcfg['job_manager_endpoint'])

                    schema_parts = js_url.schema.split('+')
                    for access in ['ssh', 'gsissh', 'fork']:
                        if access in schema_parts:
                            js_url.schema = access
                            break
                    else:
                        if '+' in js_url.schema:
                            raise Exception("unsupported access schema: %s"
                                            % js_url.schema)
                        # for local access to queueing systems use fork
                        js_url.schema = 'fork'

                    self._log.debug("rsup.PTYShell('%s')", js_url)
                    shell = rsup.PTYShell(js_url, self)

                    ret, out, _ = shell.run_sync(' echo "WORKDIR: %s"'
                                                 % sandbox_raw)
                    if ret != 0 or 'WORKDIR:' not in out:
                        raise RuntimeError("Couldn't get remote working directory.")

                    sandbox_base = out.split(":")[1].strip()
                    self._log.debug("sandbox base %s: '%s'", js_url, sandbox_base)

                else:
                    # no expandables -- use the raw path as is
                    sandbox_base = sandbox_raw

                # the global sandbox lives relative to the remote 'pwd'
                fs_url.path = "%s/radical.pilot.sandbox" % sandbox_base

                cache[resource] = fs_url

            return cache[resource]
Example #16
0
    def _prepare_pilot(self, resource, rcfg, pilot):
        """
        Assemble everything needed to launch one pilot on `resource`.

        This collects values from the pilot description and the resource
        configuration, writes the agent config file, and builds (a) the list
        of file staging directives and (b) the SAGA job description for the
        bootstrapper.

        Parameters:
            resource (str) : resource label (key into the resource configs)
            rcfg     (dict): resource configuration for `resource`
            pilot    (dict): pilot document; `pilot['description']` carries
                             the user-supplied pilot description

        Returns:
            dict: {'ft': list of {'src', 'tgt', 'rem'} transfer directives,
                   'jd': saga.job.Description for the bootstrapper}

        Raises:
            ValueError  : missing mandatory description attribute, or an
                          invalid `rp_version` setting
            RuntimeError: incomplete resource configuration, or use of the
                          deprecated `global_virtenv` setting
            TypeError   : agent config is neither a dict nor a config name
        """

        pid = pilot["uid"]
        ret = {'ft': list(), 'jd': None}

        # ----------------------------------------------------------------------
        # Database connection parameters
        sid = self._session.uid
        database_url = self._session.dburl

        # some default values are determined at runtime
        default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \
                          (resource, self._rp_version)

        # ----------------------------------------------------------------------
        # pilot description and resource configuration
        number_cores = pilot['description']['cores']
        number_gpus = pilot['description']['gpus']
        runtime = pilot['description']['runtime']
        queue = pilot['description']['queue']
        project = pilot['description']['project']
        cleanup = pilot['description']['cleanup']
        memory = pilot['description']['memory']
        candidate_hosts = pilot['description']['candidate_hosts']

        # ----------------------------------------------------------------------
        # get parameters from resource cfg, set defaults where needed
        agent_launch_method = rcfg.get('agent_launch_method')
        agent_dburl = rcfg.get('agent_mongodb_endpoint', database_url)
        agent_spawner = rcfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER)
        rc_agent_config = rcfg.get('agent_config', DEFAULT_AGENT_CONFIG)
        agent_scheduler = rcfg.get('agent_scheduler')
        tunnel_bind_device = rcfg.get('tunnel_bind_device')
        default_queue = rcfg.get('default_queue')
        forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint')
        lrms = rcfg.get('lrms')
        mpi_launch_method = rcfg.get('mpi_launch_method', '')
        pre_bootstrap_0 = rcfg.get('pre_bootstrap_0', [])
        pre_bootstrap_1 = rcfg.get('pre_bootstrap_1', [])
        python_interpreter = rcfg.get('python_interpreter')
        task_launch_method = rcfg.get('task_launch_method')
        rp_version = rcfg.get('rp_version', DEFAULT_RP_VERSION)
        virtenv_mode = rcfg.get('virtenv_mode', DEFAULT_VIRTENV_MODE)
        virtenv = rcfg.get('virtenv', default_virtenv)
        cores_per_node = rcfg.get('cores_per_node', 0)
        gpus_per_node = rcfg.get('gpus_per_node', 0)
        lfs_path_per_node = rcfg.get('lfs_path_per_node', None)
        lfs_size_per_node = rcfg.get('lfs_size_per_node', 0)
        python_dist = rcfg.get('python_dist')
        virtenv_dist = rcfg.get('virtenv_dist', DEFAULT_VIRTENV_DIST)
        cu_tmp = rcfg.get('cu_tmp')
        spmd_variation = rcfg.get('spmd_variation')
        shared_filesystem = rcfg.get('shared_filesystem', True)
        stage_cacerts = rcfg.get('stage_cacerts', False)
        cu_pre_exec = rcfg.get('cu_pre_exec')
        cu_post_exec = rcfg.get('cu_post_exec')
        export_to_cu = rcfg.get('export_to_cu')
        mandatory_args = rcfg.get('mandatory_args', [])
        saga_jd_supplement = rcfg.get('saga_jd_supplement', {})

        # function-local import: pprint is only needed for debug output
        import pprint
        self._log.debug(cores_per_node)
        self._log.debug(pprint.pformat(rcfg))

        # make sure that mandatory args are known
        for ma in mandatory_args:
            if pilot['description'].get(ma) is None:
                raise ValueError('attribute "%s" is required for "%s"' %
                                 (ma, resource))

        # get pilot and global sandbox
        resource_sandbox = self._session._get_resource_sandbox(pilot).path
        session_sandbox = self._session._get_session_sandbox(pilot).path
        pilot_sandbox = self._session._get_pilot_sandbox(pilot).path

        pilot['resource_sandbox'] = str(
            self._session._get_resource_sandbox(pilot))
        pilot['pilot_sandbox'] = str(self._session._get_pilot_sandbox(pilot))
        pilot['client_sandbox'] = str(self._session._get_client_sandbox())

        # Agent configuration that is not part of the public API.
        # The agent config can either be a config dict, or
        # a string pointing to a configuration name.  If neither
        # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
        # set.  The last fallback is 'agent_default'
        agent_config = pilot['description'].get('_config')
        if not agent_config:
            agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG')
        if not agent_config:
            agent_config = rc_agent_config

        if isinstance(agent_config, dict):

            # use dict as is
            agent_cfg = agent_config

        elif isinstance(agent_config, basestring):
            try:
                # interpret as a config name
                agent_cfg_file = os.path.join(self._conf_dir,
                                              "agent_%s.json" % agent_config)

                self._log.info("Read agent config file: %s", agent_cfg_file)
                agent_cfg = ru.read_json(agent_cfg_file)

                # allow for user level overload
                user_cfg_file = '%s/.radical/pilot/config/%s' \
                              % (os.environ['HOME'], os.path.basename(agent_cfg_file))

                if os.path.exists(user_cfg_file):
                    self._log.info("merging user config: %s" % user_cfg_file)
                    user_cfg = ru.read_json(user_cfg_file)
                    ru.dict_merge(agent_cfg, user_cfg, policy='overwrite')

            except Exception as e:
                self._log.exception("Error reading agent config file: %s" % e)
                raise

        else:
            # we can't handle this type
            raise TypeError(
                'agent config must be string (config name) or dict')

        # expand variables in virtenv string
        virtenv = virtenv % {
            'pilot_sandbox': pilot_sandbox,
            'session_sandbox': session_sandbox,
            'resource_sandbox': resource_sandbox
        }

        # Check for deprecated global_virtenv
        if 'global_virtenv' in rcfg:
            raise RuntimeError("'global_virtenv' is deprecated (%s)" %
                               resource)

        # Create a host:port string for use by the bootstrap_0.
        db_url = rs.Url(agent_dburl)
        if db_url.port:
            db_hostport = "%s:%d" % (db_url.host, db_url.port)
        else:
            db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

        # ----------------------------------------------------------------------
        # the version of the agent is derived from
        # rp_version, which has the following format
        # and interpretation:
        #
        # case rp_version:
        #   @<token>:
        #   @tag/@branch/@commit: # no sdist staging
        #       git clone $github_base radical.pilot.src
        #       (cd radical.pilot.src && git checkout token)
        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
        #       rm -rf radical.pilot.src
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   release: # no sdist staging
        #       pip install -t $VIRTENV/rp_install radical.pilot
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   local: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $VIRTENV/rp_install $sdist/
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   debug: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $SANDBOX/rp_install $sdist/
        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
        #
        #   installed: # no sdist staging
        #       true
        # esac
        #
        # virtenv_mode
        #   private : error  if ve exists, otherwise create, then use
        #   update  : update if ve exists, otherwise create, then use
        #   create  : use    if ve exists, otherwise create, then use
        #   use     : use    if ve exists, otherwise error,  then exit
        #   recreate: delete if ve exists, otherwise create, then use
        #
        # examples   :
        #   [email protected]
        #   virtenv@devel
        #   virtenv@release
        #   virtenv@installed
        #   stage@local
        #   stage@/tmp/my_agent.py
        #
        # Note that some combinations may be invalid,
        # specifically in the context of virtenv_mode.  If, for
        # example, virtenv_mode is 'use', then the 'virtenv:tag'
        # will not make sense, as the virtenv is not updated.
        # In those cases, the virtenv_mode is honored, and
        # a warning is printed.
        #
        # Also, the 'stage' mode can only be combined with the
        # 'local' source, or with a path to the agent (relative
        # to root_dir, or absolute).
        #
        # A rp_version which does not adhere to the
        # above syntax is ignored, and the fallback stage@local
        # is used.

        if  not rp_version.startswith('@') and \
            rp_version not in ['installed', 'local', 'debug', 'release']:
            raise ValueError("invalid rp_version '%s'" % rp_version)

        if rp_version.startswith('@'):
            rp_version = rp_version[1:]  # strip '@'

        # ----------------------------------------------------------------------
        # sanity checks
        if not python_dist: raise RuntimeError("missing python distribution")
        if not virtenv_dist:
            raise RuntimeError("missing virtualenv distribution")
        if not agent_spawner: raise RuntimeError("missing agent spawner")
        if not agent_scheduler: raise RuntimeError("missing agent scheduler")
        if not lrms: raise RuntimeError("missing LRMS")
        if not agent_launch_method:
            raise RuntimeError("missing agent launch method")
        if not task_launch_method:
            raise RuntimeError("missing task launch method")

        # massage some values
        if not queue:
            queue = default_queue

        if cleanup and isinstance(cleanup, bool):
            #  l : log files
            #  u : unit work dirs
            #  v : virtualenv
            #  e : everything (== pilot sandbox)
            if shared_filesystem:
                cleanup = 'luve'
            else:
                # we cannot clean the sandbox from within the agent, as the hop
                # staging would then fail, and we'd get nothing back.
                # FIXME: cleanup needs to be done by the pmgr.launcher, or
                #        someone else, really, after fetching all logs and
                #        profiles.
                cleanup = 'luv'

            # we never cleanup virtenvs which are not private
            # (use '!=', not 'is not': string identity is an interning
            # accident, not a guarantee)
            if virtenv_mode != 'private':
                cleanup = cleanup.replace('v', '')

        # add dists to staging files, if needed
        if rp_version in ['local', 'debug']:
            sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name]
            sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path]
        else:
            sdist_names = list()
            sdist_paths = list()

        # if cores_per_node is set (!= None), then we need to
        # allocation full nodes, and thus round up
        if cores_per_node:
            cores_per_node = int(cores_per_node)
            number_cores = int(cores_per_node *
                               math.ceil(float(number_cores) / cores_per_node))

        # if gpus_per_node is set (!= None), then we need to
        # allocation full nodes, and thus round up
        if gpus_per_node:
            gpus_per_node = int(gpus_per_node)
            number_gpus = int(gpus_per_node *
                              math.ceil(float(number_gpus) / gpus_per_node))

        # set mandatory args
        bootstrap_args = ""
        bootstrap_args += " -d '%s'" % ':'.join(sdist_names)
        bootstrap_args += " -p '%s'" % pid
        bootstrap_args += " -s '%s'" % sid
        bootstrap_args += " -m '%s'" % virtenv_mode
        bootstrap_args += " -r '%s'" % rp_version
        bootstrap_args += " -b '%s'" % python_dist
        bootstrap_args += " -g '%s'" % virtenv_dist
        bootstrap_args += " -v '%s'" % virtenv
        bootstrap_args += " -y '%d'" % runtime

        # set optional args
        if lrms == "CCM": bootstrap_args += " -c"
        if forward_tunnel_endpoint:
            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
        if forward_tunnel_endpoint: bootstrap_args += " -h '%s'" % db_hostport
        if python_interpreter:
            bootstrap_args += " -i '%s'" % python_interpreter
        if tunnel_bind_device:
            bootstrap_args += " -t '%s'" % tunnel_bind_device
        if cleanup: bootstrap_args += " -x '%s'" % cleanup

        for arg in pre_bootstrap_0:
            bootstrap_args += " -e '%s'" % arg
        for arg in pre_bootstrap_1:
            bootstrap_args += " -w '%s'" % arg

        agent_cfg['owner'] = 'agent_0'
        agent_cfg['cores'] = number_cores
        agent_cfg['gpus'] = number_gpus
        agent_cfg['lrms'] = lrms
        agent_cfg['spawner'] = agent_spawner
        agent_cfg['scheduler'] = agent_scheduler
        agent_cfg['runtime'] = runtime
        agent_cfg['dburl'] = str(database_url)
        agent_cfg['session_id'] = sid
        agent_cfg['pilot_id'] = pid
        agent_cfg['logdir'] = '.'
        agent_cfg['pilot_sandbox'] = pilot_sandbox
        agent_cfg['session_sandbox'] = session_sandbox
        agent_cfg['resource_sandbox'] = resource_sandbox
        agent_cfg['agent_launch_method'] = agent_launch_method
        agent_cfg['task_launch_method'] = task_launch_method
        agent_cfg['mpi_launch_method'] = mpi_launch_method
        agent_cfg['cores_per_node'] = cores_per_node
        agent_cfg['gpus_per_node'] = gpus_per_node
        agent_cfg['lfs_path_per_node'] = lfs_path_per_node
        agent_cfg['lfs_size_per_node'] = lfs_size_per_node
        agent_cfg['cu_tmp'] = cu_tmp
        agent_cfg['export_to_cu'] = export_to_cu
        agent_cfg['cu_pre_exec'] = cu_pre_exec
        agent_cfg['cu_post_exec'] = cu_post_exec
        agent_cfg['resource_cfg'] = copy.deepcopy(rcfg)
        agent_cfg['debug'] = self._log.getEffectiveLevel()

        # we'll also push the agent config into MongoDB
        pilot['cfg'] = agent_cfg

        # ----------------------------------------------------------------------
        # Write agent config dict to a json file in pilot sandbox.

        agent_cfg_name = 'agent_0.cfg'
        cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.')
        os.close(cfg_tmp_handle)  # file exists now

        # Convert dict to json file
        self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file)
        self._log.debug(pprint.pformat(agent_cfg))
        ru.write_json(agent_cfg, cfg_tmp_file)

        ret['ft'].append({
            'src': cfg_tmp_file,
            'tgt': '%s/%s' % (pilot_sandbox, agent_cfg_name),
            'rem': True
        })  # purge the tmp file after packing

        # ----------------------------------------------------------------------
        # we also touch the log and profile tarballs in the target pilot sandbox
        ret['ft'].append({
            'src': '/dev/null',
            'tgt': '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid),
            'rem': False
        })  # don't remove /dev/null
        # only stage profiles if we profile
        if self._prof.enabled:
            ret['ft'].append({
                'src': '/dev/null',
                'tgt': '%s/%s' % (pilot_sandbox, '%s.prof.tgz' % pid),
                'rem': False
            })  # don't remove /dev/null

        # check if we have a sandbox cached for that resource.  If so, we have
        # nothing to do.  Otherwise we create the sandbox and stage the RP
        # stack etc.
        # NOTE: this will race when multiple pilot launcher instances are used!
        with self._cache_lock:

            if resource not in self._sandboxes:

                for sdist in sdist_paths:
                    base = os.path.basename(sdist)
                    ret['ft'].append({
                        'src': sdist,
                        'tgt': '%s/%s' % (session_sandbox, base),
                        'rem': False
                    })

                # Copy the bootstrap shell script.
                bootstrapper_path = os.path.abspath(
                    "%s/agent/%s" % (self._root_dir, BOOTSTRAPPER_0))
                self._log.debug("use bootstrapper %s", bootstrapper_path)

                ret['ft'].append({
                    'src': bootstrapper_path,
                    'tgt': '%s/%s' % (session_sandbox, BOOTSTRAPPER_0),
                    'rem': False
                })

                # Some machines cannot run pip due to outdated CA certs.
                # For those, we also stage an updated certificate bundle
                # TODO: use booleans all the way?
                if stage_cacerts:

                    cc_name = 'cacert.pem.gz'
                    cc_path = os.path.abspath("%s/agent/%s" %
                                              (self._root_dir, cc_name))
                    self._log.debug("use CAs %s", cc_path)

                    ret['ft'].append({
                        'src': cc_path,
                        'tgt': '%s/%s' % (session_sandbox, cc_name),
                        'rem': False
                    })

                self._sandboxes[resource] = True

        # ----------------------------------------------------------------------
        # Create SAGA Job description and submit the pilot job

        jd = rs.job.Description()

        if shared_filesystem:
            bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0)
        else:
            bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0)

        jd.name = pid
        jd.executable = "/bin/bash"
        jd.arguments = ['-l %s' % bootstrap_tgt, bootstrap_args]
        jd.working_directory = pilot_sandbox
        jd.project = project
        jd.output = "bootstrap_0.out"
        jd.error = "bootstrap_0.err"
        jd.total_cpu_count = number_cores
        jd.total_gpu_count = number_gpus
        jd.processes_per_host = cores_per_node
        jd.spmd_variation = spmd_variation
        jd.wall_time_limit = runtime
        jd.total_physical_memory = memory
        jd.queue = queue
        jd.candidate_hosts = candidate_hosts
        jd.environment = dict()

        # we set any saga_jd_supplement keys which are not already set above
        for key, val in saga_jd_supplement.iteritems():
            if not jd[key]:
                self._log.debug('supplement %s: %s', key, val)
                jd[key] = val

        if 'RADICAL_PILOT_PROFILE' in os.environ:
            jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE'

        # for condor backends and the like which do not have shared FSs, we add
        # additional staging directives so that the backend system binds the
        # files from the session and pilot sandboxes to the pilot job.
        jd.file_transfer = list()
        if not shared_filesystem:

            jd.file_transfer.extend([
                'site:%s/%s > %s' %
                (session_sandbox, BOOTSTRAPPER_0, BOOTSTRAPPER_0),
                'site:%s/%s > %s' %
                (pilot_sandbox, agent_cfg_name, agent_cfg_name),
                'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid),
                'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid)
            ])

            if 'RADICAL_PILOT_PROFILE' in os.environ:
                jd.file_transfer.extend([
                    'site:%s/%s.prof.tgz > %s.prof.tgz' %
                    (pilot_sandbox, pid, pid),
                    'site:%s/%s.prof.tgz < %s.prof.tgz' %
                    (pilot_sandbox, pid, pid)
                ])

            for sdist in sdist_names:
                jd.file_transfer.extend(
                    ['site:%s/%s > %s' % (session_sandbox, sdist, sdist)])

            if stage_cacerts:
                jd.file_transfer.extend(
                    ['site:%s/%s > %s' % (session_sandbox, cc_name, cc_name)])

        self._log.debug("Bootstrap command line: %s %s", jd.executable,
                        jd.arguments)

        ret['jd'] = jd
        return ret
Example #17
0
def fetch_profiles (sid, dburl=None, src=None, tgt=None, access=None,
        session=None, skip_existing=False, fetch_client=False, log=None):
    '''
    Fetch all profiles for a session: optionally the client-side profiles,
    and the profiles from every pilot sandbox (preferring a single tarball
    per pilot over individual file transfers).

    sid          : session for which all profiles are fetched
    dburl        : MongoDB URL (default: $RADICAL_PILOT_DBURL)
    src          : dir to look for client session profiles ($src/$sid/*.prof)
    tgt          : dir to store the profile in
                   - $tgt/$sid/*.prof,
                   - $tgt/$sid/$pilot_id/*.prof)
    access       : access schema override for the pilot sandboxes
    session      : session instance (used for SAGA transfers and logging)
    skip_existing: don't re-fetch profiles which already exist locally
    fetch_client : also fetch the client-side profiles
    log          : logger to use (default: a new 'radical.pilot.utils' logger)

    returns list of file names
    '''

    if not log and session:
        log = session._log
        rep = session._rep
    elif not log:
        log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')
    else:
        # a logger was passed in explicitly -- we still need a reporter
        # (previously `rep` remained undefined on this path, causing a
        # NameError on the first `rep.ok()` call)
        rep = ru.Reporter('radical.pilot.utils')

    ret = list()

    if not dburl:
        dburl = os.environ['RADICAL_PILOT_DBURL']

    if not dburl:
        raise ValueError('RADICAL_PILOT_DBURL is not set')

    if not src:
        src = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    # make relative targets absolute (unless they carry a schema)
    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = rs.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    # first fetch session profile
    if fetch_client:
        client_profiles = glob.glob("%s/%s/*.prof" % (src, sid))
        if not client_profiles:
            raise RuntimeError('no client profiles in %s/%s' % (src, sid))

        for client_profile in client_profiles:

            ftgt = rs.Url('%s/%s' % (tgt_url, os.path.basename(client_profile)))
            ret.append("%s" % ftgt.path)

            # skip files which already exist (and are non-empty), if requested
            if skip_existing and os.path.isfile(ftgt.path) \
                    and os.stat(ftgt.path).st_size > 0:
                pass
            else:
                prof_file = rs.fs.File(client_profile, session=session)
                prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                prof_file.close()

            if not os.path.isfile(client_profile):
                raise RuntimeError('client profile file %s does not exist' % client_profile)

    _, db, _, _, _ = ru.mongodb_connect(dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.debug("Session: %s", sid)
    log.debug("Number of pilots in session: %d", num_pilots)

    for pilot in pilots:

        try:
            log.debug("processing pilot '%s'", pilot['uid'])

            sandbox_url = rs.Url(pilot['pilot_sandbox'])

            if access:
                # Allow to use a different access schema than used for the run.
                # Useful if you ran from the headnode, but would like to
                # retrieve the profiles to your desktop (Hello Titan).
                access_url = rs.Url(access)
                sandbox_url.schema = access_url.schema
                sandbox_url.host   = access_url.host

            sandbox = rs.fs.Directory(sandbox_url, session=session)

            # Try to fetch a tarball of profiles, so that we can get them
            # all in one (SAGA) go!
            PROFILES_TARBALL = '%s.prof.tgz' % pilot['uid']
            tarball_available = False
            try:
                if  sandbox.is_file(PROFILES_TARBALL) and \
                    sandbox.get_size(PROFILES_TARBALL):

                    log.info("profiles tarball exists")
                    ftgt = rs.Url('%s/%s' % (tgt_url, PROFILES_TARBALL))

                    if skip_existing and os.path.isfile(ftgt.path) \
                            and os.stat(ftgt.path).st_size > 0:

                        log.info("skip fetching of '%s/%s' to '%s'.",
                                 sandbox_url, PROFILES_TARBALL, tgt_url)
                        tarball_available = True
                    else:

                        log.info("fetch '%s%s' to '%s'.", sandbox_url,
                                 PROFILES_TARBALL, tgt_url)

                        prof_file = rs.fs.File("%s%s" % (sandbox_url,
                                            PROFILES_TARBALL), session=session)
                        prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                        prof_file.close()

                        tarball_available = True
                else:
                    log.warning("profiles tarball does not exist!")

            except rs.DoesNotExist:
                log.exception("exception(TODO): profiles tarball does not exist!")

            try:
                os.mkdir("%s/%s" % (tgt_url.path, pilot['uid']))
            except OSError:
                # target dir already exists -- that is fine
                pass

            # We now have a local tarball
            if tarball_available:
                log.info("Extract tarball %s to '%s'.", ftgt.path, tgt_url.path)
                try:
                    tarball = tarfile.open(ftgt.path, mode='r:gz')
                    tarball.extractall("%s/%s" % (tgt_url.path, pilot['uid']))

                    profiles = glob.glob("%s/%s/*.prof" % (tgt_url.path, pilot['uid']))
                    ret.extend(profiles)
                    os.unlink(ftgt.path)

                    # If extract succeeded, no need to fetch individual profiles
                    rep.ok("+ %s (profiles)\n" % pilot['uid'])
                    continue

                except Exception as e:
                    log.warning('could not extract tarball %s [%s]', ftgt.path, e)

            # If we dont have a tarball (for whichever reason), fetch
            # individual profiles
            profiles = sandbox.list('*.prof')
            for prof in profiles:

                ftgt = rs.Url('%s/%s/%s' % (tgt_url, pilot['uid'], prof))
                ret.append("%s" % ftgt.path)

                if skip_existing and os.path.isfile(ftgt.path) \
                                 and os.stat(ftgt.path).st_size > 0:
                    pass
                else:
                    prof_file = rs.fs.File("%s%s" % (sandbox_url, prof), session=session)
                    prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                    prof_file.close()

            rep.ok("+ %s (profiles)\n" % pilot['uid'])

        except Exception:
            rep.error("- %s (profiles)\n" % pilot['uid'])
            log.exception('failed to fetch profile for %s', pilot['uid'])

    return ret
    def run(self):
        """Starts the process when Process.start() is called.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:

            # Try to connect to the database and create a tailable cursor.
            try:
                db = self._session.get_db()
                um_col = db["%s.cu" % self._session.uid]
                logger.debug(
                    "Connected to MongoDB. Serving requests for UnitManager %s."
                    % self.unit_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            while not self._stop.is_set() and \
                  not self._session._terminate.is_set():

                # See if we can find a ComputeUnit that is waiting for client output file transfer.
                # FIXME: this method is not bulkable.  See agent pulling for
                #        units for an approach to split the call into two bulkable
                #        ones.
                ts = timestamp()
                compute_unit = um_col.find_and_modify(
                    query={
                        "unitmanager": self.unit_manager_id,
                        "state": PENDING_OUTPUT_STAGING,
                        "control": 'agent'
                    },
                    update={
                        "$set": {
                            "state": STAGING_OUTPUT,
                            "control": 'umgr'
                        },
                        "$push": {
                            "statehistory": {
                                "state": STAGING_OUTPUT,
                                "timestamp": ts
                            }
                        }
                    })

                if compute_unit is None:
                    # Sleep a bit if no new units are available.
                    time.sleep(IDLE_TIME)
                else:
                    logger.info("OFTW CU found, progressing ...")
                    state = STAGING_OUTPUT
                    compute_unit_id = None
                    try:
                        log_messages = []

                        # We have found a new CU. Now we can process the transfer
                        # directive(s) with SAGA.
                        compute_unit_id = str(compute_unit["_id"])

                        self._session.prof.prof('advance',
                                                uid=compute_unit_id,
                                                msg=STAGING_OUTPUT,
                                                state=STAGING_OUTPUT)
                        logger.debug(
                            "OutputStagingController: unit found: %s" %
                            compute_unit_id)

                        remote_sandbox = compute_unit["sandbox"]
                        output_staging = compute_unit.get(
                            "FTW_Output_Directives", [])

                        logger.info(
                            "OutputStagingController: Processing output file transfers for ComputeUnit %s"
                            % compute_unit_id)
                        # Loop over all staging directives and execute them.
                        for sd in output_staging:

                            logger.debug(
                                "OutputStagingController: sd: %s : %s" %
                                (compute_unit_id, sd))

                            # Check if there was a cancel request for this CU
                            # TODO: Can't these cancel requests come from a central place?
                            state_doc = um_col.find_one(
                                {"_id": compute_unit_id}, fields=["state"])
                            if state_doc['state'] == CANCELED:
                                logger.info(
                                    "Compute Unit Canceled, interrupting output file transfers."
                                )
                                self._session.prof.prof('advance',
                                                        uid=compute_unit_id,
                                                        msg=CANCELED,
                                                        state=CANCELED)
                                state = CANCELED
                                # Break out of the loop over all SD's, into the loop over CUs
                                break

                            abs_src = "%s/%s" % (remote_sandbox, sd['source'])

                            if os.path.basename(sd['target']) == sd['target']:
                                abs_target = "file://localhost%s" % os.path.join(
                                    os.getcwd(), sd['target'])
                            else:
                                abs_target = "file://localhost%s" % os.path.abspath(
                                    sd['target'])

                            log_msg = "Transferring output file %s -> %s" % (
                                abs_src, abs_target)
                            log_messages.append(log_msg)
                            logger.debug(log_msg)

                            output_file = saga.filesystem.File(
                                saga.Url(abs_src), session=self._session)

                            if CREATE_PARENTS in sd['flags']:
                                copy_flags = saga.filesystem.CREATE_PARENTS
                            else:
                                copy_flags = 0

                            try:
                                output_file.copy(saga.Url(abs_target),
                                                 flags=copy_flags)
                                output_file.close()
                            except Exception as e:
                                logger.exception(e)
                                raise Exception("copy failed(%s)" % e.message)

                        # If the CU was canceled we can skip the remainder of this loop,
                        # and return to the CU loop
                        if state == CANCELED:
                            continue

                        # Update the CU's state to 'DONE'.
                        ts = timestamp()
                        log_message = "Output transfer completed."
                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {
                                'state': DONE
                            },
                            '$push': {
                                'statehistory': {
                                    'state': DONE,
                                    'timestamp': ts
                                },
                                'log': {
                                    'message': log_message,
                                    'timestamp': ts
                                }
                            }
                        })
                        self._session.prof.prof('advance',
                                                uid=compute_unit_id,
                                                msg=DONE,
                                                state=DONE)

                    except Exception as e:
                        # Update the CU's state to 'FAILED'.
                        ts = timestamp()
                        log_message = "Output transfer failed: %s" % e
                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {
                                'state': FAILED
                            },
                            '$push': {
                                'statehistory': {
                                    'state': FAILED,
                                    'timestamp': ts
                                },
                                'log': {
                                    'message': log_message,
                                    'timestamp': ts
                                }
                            }
                        })
                        logger.exception(log_message)
                        self._session.prof.prof('advance',
                                                uid=compute_unit_id,
                                                msg=FAILED,
                                                state=FAILED)
                        raise

        except SystemExit as e:
            logger.exception(
                "output file transfer thread caught system exit -- forcing application shutdown"
            )
            thread.interrupt_main()
Example #19
0
    def execute_pattern(self, pattern, resource):

        pattern_start_time = datetime.datetime.now()

        def get_input_data(kernel, instance=None, iteration=None, ktype=None):
            """Assemble the input staging directives for *kernel*.

            Collects, in order: ``upload_input_data`` (plain transfer, no
            action key), ``link_input_data`` (LINK action),
            ``copy_input_data`` (COPY action), and ``download_input_data``
            (used verbatim).  Each path entry may have the form
            ``"source > target"``; without ``>`` the target defaults to the
            basename of the source.

            NOTE: fixes the original dead code — the accumulator was
            initialized to ``[]`` and then repeatedly tested with
            ``if ip_list is None``, a branch that could never be taken.
            """

            def _resolve(path):
                # instance/iteration placeholders only apply to simulation
                # and analysis kernels; other kernel types resolve against
                # the working dirs alone.
                if ktype in ('simulation', 'analysis'):
                    return resolve_placeholder_vars(
                        working_dirs=self.working_dirs,
                        path=path,
                        instance=instance,
                        iteration=iteration,
                        type=ktype)
                return resolve_placeholder_vars(
                    working_dirs=self.working_dirs, path=path)

            def _directives(paths, action=None):
                # Parse 'source > target' entries into staging directive
                # dicts, attaching the staging action when one is given.
                sds = []
                for path in paths:
                    var = _resolve(path)
                    parts = var.split('>')
                    source = parts[0].strip()
                    if len(parts) > 1:
                        target = parts[1].strip()
                    else:
                        target = os.path.basename(source)
                    sd = {'source': source, 'target': target}
                    if action is not None:
                        sd['action'] = action
                    sds.append(sd)
                return sds

            ip_list = []

            # upload_input_data: transferred from the client machine.
            # The single-item -> list normalization is done in place so that
            # repeated calls see a list (preserves original side effect).
            if kernel._kernel._upload_input_data is not None:
                if not isinstance(kernel._kernel._upload_input_data, list):
                    kernel._kernel._upload_input_data = [
                        kernel._kernel._upload_input_data
                    ]
                ip_list += _directives(kernel._kernel._upload_input_data)

            # link_input_data: symlinked within the resource
            if kernel._kernel._link_input_data is not None:
                if not isinstance(kernel._kernel._link_input_data, list):
                    kernel._kernel._link_input_data = [
                        kernel._kernel._link_input_data
                    ]
                ip_list += _directives(kernel._kernel._link_input_data,
                                       action=radical.pilot.LINK)

            # copy_input_data: copied within the resource
            if kernel._kernel._copy_input_data is not None:
                if not isinstance(kernel._kernel._copy_input_data, list):
                    kernel._kernel._copy_input_data = [
                        kernel._kernel._copy_input_data
                    ]
                ip_list += _directives(kernel._kernel._copy_input_data,
                                       action=radical.pilot.COPY)

            # download_input_data: already directive-shaped, used verbatim
            if kernel.download_input_data is not None:
                ip_list += kernel.download_input_data

            return ip_list

        def get_output_data(kernel, instance=None, iteration=None, ktype=None):
            """Assemble the output staging directives for *kernel*.

            Collects, in order: ``copy_output_data`` (COPY action) and
            ``download_output_data`` (plain transfer, no action key).  Each
            path entry may have the form ``"source > target"``; without
            ``>`` the target defaults to the basename of the source.

            NOTE: fixes the original dead code — the accumulator was
            initialized to ``[]`` and then repeatedly tested with
            ``if op_list is None``, a branch that could never be taken.
            """

            def _resolve(path):
                # instance/iteration placeholders only apply to simulation
                # and analysis kernels; other kernel types resolve against
                # the working dirs alone.
                if ktype in ('simulation', 'analysis'):
                    return resolve_placeholder_vars(
                        working_dirs=self.working_dirs,
                        path=path,
                        instance=instance,
                        iteration=iteration,
                        type=ktype)
                return resolve_placeholder_vars(
                    working_dirs=self.working_dirs, path=path)

            def _directives(paths, action=None):
                # Parse 'source > target' entries into staging directive
                # dicts, attaching the staging action when one is given.
                sds = []
                for path in paths:
                    var = _resolve(path)
                    parts = var.split('>')
                    source = parts[0].strip()
                    if len(parts) > 1:
                        target = parts[1].strip()
                    else:
                        target = os.path.basename(source)
                    sd = {'source': source, 'target': target}
                    if action is not None:
                        sd['action'] = action
                    sds.append(sd)
                return sds

            op_list = []

            # copy_output_data: copied within the resource after execution.
            # The single-item -> list normalization is done in place so that
            # repeated calls see a list (preserves original side effect).
            if kernel._kernel._copy_output_data is not None:
                if not isinstance(kernel._kernel._copy_output_data, list):
                    kernel._kernel._copy_output_data = [
                        kernel._kernel._copy_output_data
                    ]
                op_list += _directives(kernel._kernel._copy_output_data,
                                       action=radical.pilot.COPY)

            # download_output_data: transferred back to the client machine
            if kernel._kernel._download_output_data is not None:
                if not isinstance(kernel._kernel._download_output_data, list):
                    kernel._kernel._download_output_data = [
                        kernel._kernel._download_output_data
                    ]
                op_list += _directives(kernel._kernel._download_output_data)

            return op_list

        #-----------------------------------------------------------------------
        #
        def unit_state_cb(unit, state):
            # UnitManager state callback: abort the whole pattern run as
            # soon as any ComputeUnit fails.
            if state == radical.pilot.FAILED:
                # BUG FIX: the original format string used {0} twice
                # ("STDERR: {0}, STDOUT: {0}"), so the unit's stdout was
                # never shown in the error log.
                self.get_logger().error(
                    "ComputeUnit error: STDERR: {0}, STDOUT: {1}".format(
                        unit.stderr, unit.stdout))
                self.get_logger().error("Pattern execution FAILED.")
                sys.exit(1)

        #-----------------------------------------------------------------------
        #
        def create_filecheck_command(files_list):
            # For each expected remote file, emit a shell snippet suitable
            # as a CU post_exec: succeed if the file exists, otherwise
            # report the missing file on stderr and fail the unit.
            template = ('if [ -f "{0}" ]; then exit 0; '
                        'else echo "File {0} does not exist" >&2; exit 1; fi;')
            return [template.format(path) for path in files_list]

        self._reporter.ok('>>ok')
        self.get_logger().info(
            "Executing simulation-analysis loop with {0} iterations on {1} allocated core(s) on '{2}'"
            .format(pattern.iterations, resource._cores,
                    resource._resource_key))

        self._reporter.header(
            "Executing simulation-analysis loop with {0} iterations on {1} allocated core(s) on '{2}'"
            .format(pattern.iterations, resource._cores,
                    resource._resource_key))

        all_cus = []

        #print resource._pilot.description['cores']

        self.get_logger().info("Waiting for pilot on {0} to go Active".format(
            resource._resource_key))
        self._reporter.info("Job waiting on queue...".format(
            resource._resource_key))
        resource._pmgr.wait_pilots(resource._pilot.uid, 'Active')
        self._reporter.ok("\nJob is now running !".format(
            resource._resource_key))

        profiling = int(os.environ.get('RADICAL_ENMD_PROFILING', 0))

        if profiling == 1:
            from collections import OrderedDict as od
            pattern._execution_profile = []
            enmd_overhead_dict = od()
            cu_dict = od()

        try:

            start_now = datetime.datetime.now()

            resource._umgr.register_callback(unit_state_cb)

            ########################################################################
            # execute pre_loop
            #

            ################################################################
            # EXECUTE PRE-LOOP

            if profiling == 1:
                probe_preloop_start = datetime.datetime.now()
                enmd_overhead_dict['preloop'] = od()
                enmd_overhead_dict['preloop'][
                    'start_time'] = probe_preloop_start

            pre_loop = pattern.pre_loop()

            if pre_loop is not None:
                pre_loop._bind_to_resource(resource._resource_key)

                cud = radical.pilot.ComputeUnitDescription()
                cud.name = "pre_loop"

                cud.pre_exec = pre_loop._cu_def_pre_exec
                cud.executable = pre_loop._cu_def_executable
                cud.arguments = pre_loop.arguments
                cud.mpi = pre_loop.uses_mpi
                cud.input_staging = get_input_data(kernel=pre_loop)
                cud.output_staging = get_output_data(kernel=pre_loop)

                if pre_loop.exists_remote is not None:
                    cud.post_exec = create_filecheck_command(
                        pre_loop.exists_remote)

                self.get_logger().debug("Created pre_loop CU: {0}.".format(
                    cud.as_dict()))

                self.get_logger().info(
                    "Submitted ComputeUnit(s) for pre_loop step.")
                self._reporter.info("\nWaiting for pre_loop step to complete.")
                if profiling == 1:
                    probe_preloop_wait = datetime.datetime.now()
                    enmd_overhead_dict['preloop'][
                        'wait_time'] = probe_preloop_wait

                unit = resource._umgr.submit_units(cud)
                all_cus.append(unit)
                resource._umgr.wait_units(unit.uid)

                if profiling == 1:
                    probe_preloop_res = datetime.datetime.now()
                    enmd_overhead_dict['preloop'][
                        'res_time'] = probe_preloop_res

                self.get_logger().info("Pre_loop completed.")

                if unit.state != radical.pilot.DONE:
                    raise EnsemblemdError(
                        "Pre-loop CU failed with error: {0}".format(
                            unit.stdout))

                self.working_dirs["pre_loop"] = saga.Url(
                    unit.working_directory).path

                # Process CU information and append it to the dictionary
                if profiling == 1:
                    probe_preloop_done = datetime.datetime.now()
                    enmd_overhead_dict['preloop'][
                        'stop_time'] = probe_preloop_done
                    cu_dict['pre_loop'] = unit

                self._reporter.ok('>> done')
            else:
                self.get_logger().info("No pre_loop stage.")

            ########################################################################
            # execute simulation analysis loop
            #
            for iteration in range(1, pattern.iterations + 1):

                self.working_dirs['iteration_{0}'.format(iteration)] = {}

                ################################################################
                # EXECUTE SIMULATION STEPS

                if profiling == 1:
                    enmd_overhead_dict['iter_{0}'.format(iteration)] = od()
                    cu_dict['iter_{0}'.format(iteration)] = od()

                if isinstance(
                        pattern.simulation_step(iteration=iteration,
                                                instance=1), list):
                    num_sim_kerns = len(
                        pattern.simulation_step(iteration=iteration,
                                                instance=1))
                else:
                    num_sim_kerns = 1
                #print num_sim_kerns

                all_sim_cus = []
                if profiling == 1:
                    enmd_overhead_dict['iter_{0}'.format(
                        iteration)]['sim'] = od()
                    cu_dict['iter_{0}'.format(iteration)]['sim'] = list()

                for kern_step in range(0, num_sim_kerns):

                    if profiling == 1:
                        probe_sim_start = datetime.datetime.now()

                        enmd_overhead_dict['iter_{0}'.format(iteration)][
                            'sim']['kernel_{0}'.format(kern_step)] = od()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['sim']['kernel_{0}'.format(
                                kern_step)]['start_time'] = probe_sim_start

                    s_units = []
                    for s_instance in range(1,
                                            pattern._simulation_instances + 1):

                        if isinstance(
                                pattern.simulation_step(iteration=iteration,
                                                        instance=s_instance),
                                list):
                            sim_step = pattern.simulation_step(
                                iteration=iteration,
                                instance=s_instance)[kern_step]
                        else:
                            sim_step = pattern.simulation_step(
                                iteration=iteration, instance=s_instance)

                        sim_step._bind_to_resource(resource._resource_key)

                        # Resolve all placeholders
                        #if sim_step.link_input_data is not None:
                        #    for i in range(len(sim_step.link_input_data)):
                        #        sim_step.link_input_data[i] = resolve_placeholder_vars(working_dirs, s_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "simulation", sim_step.link_input_data[i])

                        cud = radical.pilot.ComputeUnitDescription()
                        cud.name = "sim ;{iteration} ;{instance}".format(
                            iteration=iteration, instance=s_instance)

                        cud.pre_exec = sim_step._cu_def_pre_exec
                        cud.executable = sim_step._cu_def_executable
                        cud.arguments = sim_step.arguments
                        cud.mpi = sim_step.uses_mpi
                        cud.input_staging = get_input_data(kernel=sim_step,
                                                           instance=s_instance,
                                                           iteration=iteration,
                                                           ktype='simulation')
                        cud.output_staging = get_output_data(
                            kernel=sim_step,
                            instance=s_instance,
                            iteration=iteration,
                            ktype='simulation')

                        if sim_step.cores is not None:
                            cud.cores = sim_step.cores

                        if sim_step.exists_remote is not None:
                            cud.post_exec = create_filecheck_command(
                                sim_step.exists_remote)

                        s_units.append(cud)

                        if sim_step.get_instance_type == 'single':
                            break

                    self.get_logger().debug(
                        "Created simulation CU: {0}.".format(cud.as_dict()))

                    self.get_logger().info(
                        "Submitted tasks for simulation iteration {0}.".format(
                            iteration))
                    self.get_logger().info(
                        "Waiting for {3} simulations in iteration {0}/ kernel {1}: {2} to complete."
                        .format(iteration, kern_step + 1, sim_step.name,
                                pattern._simulation_instances))

                    self._reporter.info(
                        "\nIteration {0}: Waiting for {2} simulation tasks: {1} to complete"
                        .format(iteration, sim_step.name,
                                pattern._simulation_instances))
                    if profiling == 1:
                        probe_sim_wait = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['sim']['kernel_{0}'.format(
                                kern_step)]['wait_time'] = probe_sim_wait

                    s_cus = resource._umgr.submit_units(s_units)
                    all_cus.extend(s_cus)
                    all_sim_cus.extend(s_cus)

                    uids = [cu.uid for cu in s_cus]
                    resource._umgr.wait_units(uids)

                    if profiling == 1:
                        probe_sim_res = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['sim']['kernel_{0}'.format(
                                kern_step)]['res_time'] = probe_sim_res

                    self.get_logger().info(
                        "Simulations in iteration {0}/ kernel {1}: {2} completed."
                        .format(iteration, kern_step + 1, sim_step.name))

                    failed_units = ""
                    for unit in s_cus:
                        if unit.state != radical.pilot.DONE:
                            failed_units += " * Simulation task {0} failed with an error: {1}\n".format(
                                unit.uid, unit.stderr)

                    if profiling == 1:
                        probe_sim_done = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['sim']['kernel_{0}'.format(
                                kern_step)]['stop_time'] = probe_sim_done

                    self._reporter.ok('>> done')

                if profiling == 1:
                    probe_post_sim_start = datetime.datetime.now()
                    enmd_overhead_dict['iter_{0}'.format(
                        iteration)]['sim']['post'] = od()
                    enmd_overhead_dict['iter_{0}'.format(iteration)]['sim'][
                        'post']['start_time'] = probe_post_sim_start

                # TODO: ensure working_dir <-> instance mapping
                i = 0
                for cu in s_cus:
                    i += 1
                    self.working_dirs['iteration_{0}'.format(iteration)][
                        'simulation_{0}'.format(i)] = saga.Url(
                            cu.working_directory).path

                if profiling == 1:
                    probe_post_sim_end = datetime.datetime.now()
                    enmd_overhead_dict['iter_{0}'.format(iteration)]['sim'][
                        'post']['stop_time'] = probe_post_sim_end
                    cu_dict['iter_{0}'.format(iteration)]['sim'] = all_sim_cus

                ################################################################
                # EXECUTE ANALYSIS STEPS

                if isinstance(
                        pattern.analysis_step(iteration=iteration, instance=1),
                        list):
                    num_ana_kerns = len(
                        pattern.analysis_step(iteration=iteration, instance=1))
                else:
                    num_ana_kerns = 1
                #print num_ana_kerns

                # ------------------------------------------------------------
                # Analysis phase of the current iteration: for every analysis
                # kernel step, build one ComputeUnit per analysis instance,
                # submit the batch, and block until it completes.
                all_ana_cus = []
                if profiling == 1:
                    # Per-iteration profiling buckets for the analysis phase.
                    enmd_overhead_dict['iter_{0}'.format(
                        iteration)]['ana'] = od()
                    cu_dict['iter_{0}'.format(iteration)]['ana'] = list()

                for kern_step in range(0, num_ana_kerns):

                    if profiling == 1:
                        # Timestamp: start of CU construction for this kernel.
                        probe_ana_start = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(iteration)][
                            'ana']['kernel_{0}'.format(kern_step)] = od()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['ana']['kernel_{0}'.format(
                                kern_step)]['start_time'] = probe_ana_start

                    a_units = []
                    # Analysis instances are 1-based in the pattern API.
                    for a_instance in range(1,
                                            pattern._analysis_instances + 1):

                        # analysis_step() may return either a single kernel or
                        # a list of kernels; for a list, pick this kernel step.
                        if isinstance(
                                pattern.analysis_step(iteration=iteration,
                                                      instance=a_instance),
                                list):
                            ana_step = pattern.analysis_step(
                                iteration=iteration,
                                instance=a_instance)[kern_step]
                        else:
                            ana_step = pattern.analysis_step(
                                iteration=iteration, instance=a_instance)

                        # Resolve the abstract kernel against the concrete
                        # resource (fills in executable, pre_exec, ...).
                        ana_step._bind_to_resource(resource._resource_key)

                        # Resolve all placeholders
                        #if ana_step.link_input_data is not None:
                        #    for i in range(len(ana_step.link_input_data)):
                        #        ana_step.link_input_data[i] = resolve_placeholder_vars(working_dirs, a_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "analysis", ana_step.link_input_data[i])

                        # Translate the bound kernel into a RADICAL-Pilot CU
                        # description.
                        cud = radical.pilot.ComputeUnitDescription()
                        cud.name = "ana ; {iteration}; {instance}".format(
                            iteration=iteration, instance=a_instance)

                        cud.pre_exec = ana_step._cu_def_pre_exec
                        cud.executable = ana_step._cu_def_executable
                        cud.arguments = ana_step.arguments
                        cud.mpi = ana_step.uses_mpi
                        cud.input_staging = get_input_data(kernel=ana_step,
                                                           instance=a_instance,
                                                           iteration=iteration,
                                                           ktype='analysis')
                        cud.output_staging = get_output_data(
                            kernel=ana_step,
                            instance=a_instance,
                            iteration=iteration,
                            ktype='analysis')

                        if ana_step.cores is not None:
                            cud.cores = ana_step.cores

                        if ana_step.exists_remote is not None:
                            # Post-exec check that the expected remote files
                            # were actually produced.
                            cud.post_exec = create_filecheck_command(
                                ana_step.exists_remote)

                        a_units.append(cud)

                        # A 'single' kernel is shared by all instances -- one
                        # CU is enough, so stop after the first.
                        if ana_step.get_instance_type == 'single':
                            break

                    # NOTE(review): only the *last* CU built above is logged.
                    self.get_logger().debug("Created analysis CU: {0}.".format(
                        cud.as_dict()))

                    self.get_logger().info(
                        "Submitted tasks for analysis iteration {0}.".format(
                            iteration))
                    self.get_logger().info(
                        "Waiting for analysis tasks in iteration {0}/kernel {1}: {2} to complete."
                        .format(iteration, kern_step + 1, ana_step.name))

                    self._reporter.info(
                        "\nIteration {0}: Waiting for analysis tasks: {1} to complete"
                        .format(iteration, ana_step.name))
                    if profiling == 1:
                        # Timestamp: just before submission / wait.
                        probe_ana_wait = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['ana']['kernel_{0}'.format(
                                kern_step)]['wait_time'] = probe_ana_wait

                    # Submit the batch and block until every unit of this
                    # kernel step reaches a final state.
                    a_cus = resource._umgr.submit_units(a_units)
                    all_cus.extend(a_cus)
                    all_ana_cus.extend(a_cus)

                    uids = [cu.uid for cu in a_cus]
                    resource._umgr.wait_units(uids)

                    if profiling == 1:
                        # Timestamp: wait_units() returned.
                        probe_ana_res = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['ana']['kernel_{0}'.format(
                                kern_step)]['res_time'] = probe_ana_res

                    self.get_logger().info(
                        "Analysis in iteration {0}/kernel {1}: {2} completed.".
                        format(iteration, kern_step + 1, ana_step.name))

                    # Collect error summaries for units that did not finish in
                    # DONE state.
                    # NOTE(review): 'failed_units' is built but never used or
                    # logged in this visible scope -- confirm whether it
                    # should be reported to the user.
                    failed_units = ""
                    for unit in a_cus:
                        if unit.state != radical.pilot.DONE:
                            failed_units += " * Analysis task {0} failed with an error: {1}\n".format(
                                unit.uid, unit.stderr)

                    if profiling == 1:
                        # Timestamp: end of this kernel step.
                        probe_ana_done = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['ana']['kernel_{0}'.format(
                                kern_step)]['stop_time'] = probe_ana_done

                    self._reporter.ok('>> done')

                # Post-analysis bookkeeping for this iteration.
                if profiling == 1:
                    probe_post_ana_start = datetime.datetime.now()
                    enmd_overhead_dict['iter_{0}'.format(
                        iteration)]['ana']['post'] = od()
                    enmd_overhead_dict['iter_{0}'.format(iteration)]['ana'][
                        'post']['start_time'] = probe_post_ana_start

                # Adaptive patterns derive the next iteration's simulation
                # count from the stdout of the first analysis CU.
                # NOTE(review): 'a_cus' here is the batch of the *last* kernel
                # step of the loop above, and is undefined if num_ana_kerns
                # was 0 -- confirm that num_ana_kerns >= 1 is guaranteed.
                if (pattern.adaptive_simulation == False):
                    pass
                else:
                    pattern._simulation_instances = pattern.get_new_simulation_instances(
                        a_cus[0].stdout)

                # Record each analysis CU's remote working directory (1-based
                # index) for placeholder resolution in later iterations.
                i = 0
                for cu in a_cus:
                    i += 1
                    self.working_dirs['iteration_{0}'.format(iteration)][
                        'analysis_{0}'.format(i)] = saga.Url(
                            cu.working_directory).path

                if profiling == 1:
                    probe_post_ana_end = datetime.datetime.now()
                    enmd_overhead_dict['iter_{0}'.format(iteration)]['ana'][
                        'post']['stop_time'] = probe_post_ana_end
                    cu_dict['iter_{0}'.format(iteration)]['ana'] = all_ana_cus

            self._reporter.header('Pattern execution successfully finished')

            # ONLY PROFILING SECTION BELOW
            if profiling == 1:

                # Dump the EnMD pattern-overhead probes collected above into a
                # flat CSV: one row per (iteration, step, kernel, probe).
                #Pattern overhead logging
                title = "iteration,step,kernel,probe,timestamp"
                f1 = open('enmd_pat_overhead.csv', 'w')
                # NOTE(review): "\n\n" leaves a blank line after the header
                # row -- confirm downstream parsers expect that.
                f1.write(title + "\n\n")
                # NOTE(review): 'iter' shadows the Python builtin.
                iter = 'None'
                step = 'pre_loop'
                kern = 'None'
                # Pre-loop probes first (flat dict: probe name -> timestamp).
                for key, val in enmd_overhead_dict['preloop'].items():
                    probe = key
                    timestamp = val
                    entry = '{0},{1},{2},{3},{4}\n'.format(
                        iter, step, kern, probe, timestamp)
                    f1.write(entry)

                iters = pattern.iterations

                # Then the per-iteration probes, nested as
                # [iter][step][kernel][probe] -> timestamp.
                for i in range(1, iters + 1):
                    iter = 'iter_{0}'.format(i)
                    for key1, val1 in enmd_overhead_dict[iter].items():
                        step = key1
                        for key2, val2 in val1.items():
                            kern = key2
                            for key3, val3 in val2.items():
                                probe = key3
                                timestamp = val3
                                entry = '{0},{1},{2},{3},{4}\n'.format(
                                    iter.split('_')[1], step, kern, probe,
                                    timestamp)
                                f1.write(entry)

                f1.close()

                # Dump per-CU state-transition timestamps to a second CSV,
                # starting with the (optional) pre_loop CU.
                #CU data logging
                title = "uid, iter, step, Scheduling, StagingInput, AgentStagingInputPending, AgentStagingInput, AllocatingPending, Allocating, ExecutingPending, Executing, AgentStagingOutputPending, AgentStagingOutput, PendingOutputStaging, StagingOutput, Done"
                f2 = open(
                    "execution_profile_{mysession}.csv".format(
                        mysession=resource._session.uid), 'w')
                f2.write(title + "\n\n")
                iter = 'None'
                step = 'pre_loop'

                if step in cu_dict:
                    cu = cu_dict['pre_loop']

                    # Map each state name from the CU's state history to its
                    # timestamp.
                    st_data = {}
                    for st in cu.state_history:
                        st_dict = st.as_dict()
                        st_data["{0}".format(st_dict["state"])] = {}
                        st_data["{0}".format(
                            st_dict["state"])] = st_dict["timestamp"]

                    # BUG(review): missing comma after 'Scheduling,' -- the
                    # two adjacent string literals are implicitly concatenated
                    # into the single element 'Scheduling,StagingInput', so
                    # 'Scheduling' and 'StagingInput' are never defaulted to
                    # None below, and the .format() lookups can raise KeyError
                    # when either state is absent from the CU history.
                    states = [
                        'Scheduling,'
                        'StagingInput', 'AgentStagingInputPending',
                        'AgentStagingInput', 'AllocatingPending', 'Allocating',
                        'ExecutingPending', 'Executing',
                        'AgentStagingOutputPending', 'AgentStagingOutput',
                        'PendingOutputStaging', 'StagingOutput', 'Done'
                    ]

                    # States the CU never reached are reported as None.
                    for state in states:
                        if (state in st_data) is False:
                            st_data[state] = None

                    line = "{uid}, {iter}, {step}, {Scheduling}, {StagingInput}, {AgentStagingInputPending}, {AgentStagingInput}, {AllocatingPending}, {Allocating}, {ExecutingPending},{Executing}, {AgentStagingOutputPending}, {AgentStagingOutput}, {PendingOutputStaging}, {StagingOutput}, {Done}".format(
                        uid=cu.uid,
                        iter=0,
                        step='pre_loop',
                        Scheduling=(st_data['Scheduling']),
                        StagingInput=(st_data['StagingInput']),
                        AgentStagingInputPending=(
                            st_data['AgentStagingInputPending']),
                        AgentStagingInput=(st_data['AgentStagingInput']),
                        AllocatingPending=(st_data['AllocatingPending']),
                        Allocating=(st_data['Allocating']),
                        ExecutingPending=(st_data['ExecutingPending']),
                        Executing=(st_data['Executing']),
                        AgentStagingOutputPending=(
                            st_data['AgentStagingOutputPending']),
                        AgentStagingOutput=(st_data['AgentStagingOutput']),
                        PendingOutputStaging=(st_data['PendingOutputStaging']),
                        StagingOutput=(st_data['StagingOutput']),
                        Done=(st_data['Done']))
                    f2.write(line + '\n')
                else:
                    print 'No pre_loop step in the pattern'

                # One CSV row per CU for every iteration's 'sim' and 'ana'
                # steps.  The two branches below are structurally identical;
                # only the step label differs.
                for i in range(1, iters + 1):
                    iter = 'iter_{0}'.format(i)
                    for key, val in cu_dict[iter].items():
                        step = key
                        cus = val

                        if step == 'sim':
                            for cu in cus:
                                # state name -> timestamp, from the CU's
                                # state history.
                                st_data = {}
                                for st in cu.state_history:
                                    st_dict = st.as_dict()
                                    st_data["{0}".format(
                                        st_dict["state"])] = {}
                                    st_data["{0}".format(
                                        st_dict["state"]
                                    )] = st_dict["timestamp"]

                                # BUG(review): missing comma after
                                # 'Scheduling,' -- implicit literal
                                # concatenation yields one element
                                # 'Scheduling,StagingInput'; 'Scheduling' and
                                # 'StagingInput' are never defaulted to None
                                # and the .format() below can raise KeyError.
                                states = [
                                    'Scheduling,'
                                    'StagingInput', 'AgentStagingInputPending',
                                    'AgentStagingInput', 'AllocatingPending',
                                    'Allocating', 'ExecutingPending',
                                    'Executing', 'AgentStagingOutputPending',
                                    'AgentStagingOutput',
                                    'PendingOutputStaging', 'StagingOutput',
                                    'Done'
                                ]

                                # States never reached are reported as None.
                                for state in states:
                                    if (state in st_data) is False:
                                        st_data[state] = None

                                line = "{uid}, {iter}, {step}, {Scheduling}, {StagingInput}, {AgentStagingInputPending}, {AgentStagingInput}, {AllocatingPending}, {Allocating}, {ExecutingPending},{Executing}, {AgentStagingOutputPending}, {AgentStagingOutput}, {PendingOutputStaging}, {StagingOutput}, {Done}".format(
                                    uid=cu.uid,
                                    iter=iter.split('_')[1],
                                    step=step,
                                    Scheduling=(st_data['Scheduling']),
                                    StagingInput=(st_data['StagingInput']),
                                    AgentStagingInputPending=(
                                        st_data['AgentStagingInputPending']),
                                    AgentStagingInput=(
                                        st_data['AgentStagingInput']),
                                    AllocatingPending=(
                                        st_data['AllocatingPending']),
                                    Allocating=(st_data['Allocating']),
                                    ExecutingPending=(
                                        st_data['ExecutingPending']),
                                    Executing=(st_data['Executing']),
                                    AgentStagingOutputPending=(
                                        st_data['AgentStagingOutputPending']),
                                    AgentStagingOutput=(
                                        st_data['AgentStagingOutput']),
                                    PendingOutputStaging=(
                                        st_data['PendingOutputStaging']),
                                    StagingOutput=(st_data['StagingOutput']),
                                    Done=(st_data['Done']))

                                f2.write(line + '\n')

                        elif step == 'ana':
                            # Same per-CU dump as the 'sim' branch above.
                            for cu in cus:
                                st_data = {}
                                for st in cu.state_history:
                                    st_dict = st.as_dict()
                                    st_data["{0}".format(
                                        st_dict["state"])] = {}
                                    st_data["{0}".format(
                                        st_dict["state"]
                                    )] = st_dict["timestamp"]

                                # BUG(review): same missing comma after
                                # 'Scheduling,' as in the 'sim' branch -- the
                                # list has a single fused first element and
                                # KeyError is possible in the .format() below.
                                states = [
                                    'Scheduling,'
                                    'StagingInput', 'AgentStagingInputPending',
                                    'AgentStagingInput', 'AllocatingPending',
                                    'Allocating', 'ExecutingPending',
                                    'Executing', 'AgentStagingOutputPending',
                                    'AgentStagingOutput',
                                    'PendingOutputStaging', 'StagingOutput',
                                    'Done'
                                ]

                                # States never reached are reported as None.
                                for state in states:
                                    if (state in st_data) is False:
                                        st_data[state] = None

                                line = "{uid}, {iter}, {step}, {Scheduling}, {StagingInput}, {AgentStagingInputPending}, {AgentStagingInput}, {AllocatingPending}, {Allocating}, {ExecutingPending},{Executing}, {AgentStagingOutputPending}, {AgentStagingOutput}, {PendingOutputStaging}, {StagingOutput}, {Done}".format(
                                    uid=cu.uid,
                                    iter=iter.split('_')[1],
                                    step=step,
                                    Scheduling=(st_data['Scheduling']),
                                    StagingInput=(st_data['StagingInput']),
                                    AgentStagingInputPending=(
                                        st_data['AgentStagingInputPending']),
                                    AgentStagingInput=(
                                        st_data['AgentStagingInput']),
                                    AllocatingPending=(
                                        st_data['AllocatingPending']),
                                    Allocating=(st_data['Allocating']),
                                    ExecutingPending=(
                                        st_data['ExecutingPending']),
                                    Executing=(st_data['Executing']),
                                    AgentStagingOutputPending=(
                                        st_data['AgentStagingOutputPending']),
                                    AgentStagingOutput=(
                                        st_data['AgentStagingOutput']),
                                    PendingOutputStaging=(
                                        st_data['PendingOutputStaging']),
                                    StagingOutput=(st_data['StagingOutput']),
                                    Done=(st_data['Done']))

                                f2.write(line + '\n')

                f2.close()

        except KeyboardInterrupt:

            self._reporter.error('Execution interupted')
            traceback.print_exc()