Example #1
0
# ------------------------------------------------------------------------------
#
# set up the connection to EC2
#

# fail early (via usage()) if any required EC2 setting is missing from the
# environment -- checked in the same order as before
for _key in ('EC2_URL', 'EC2_ACCESS_KEY', 'EC2_SECRET_KEY',
             'EC2_KEYPAIR_ID', 'EC2_KEYPAIR'):
    if _key not in os.environ:
        usage("no %s in environment" % _key)

server = saga.Url(os.environ['EC2_URL'])

# in order to connect to EC2, we need an EC2 ID and KEY
c1 = saga.Context('ec2')
c1.user_id = os.environ['EC2_ACCESS_KEY']
c1.user_key = os.environ['EC2_SECRET_KEY']
c1.server = server

# in order to access a created VM, we additionally need to point to the ssh
# key which is used for EC2 VM contextualization, i.e. as EC2 'keypair'.
# If the keypair is not yet registered on EC2, it will be registered by SAGA
# -- but then a user_key *must* be specified (only the public key is ever
# transferred to EC2).
c2 = saga.Context('ec2_keypair')
c2.token = os.environ['EC2_KEYPAIR_ID']
c2.user_cert = os.environ['EC2_KEYPAIR']
Example #2
0
    def _handle_pilot_input_staging(self, pilot, sds):
        """Execute the input staging directives `sds` for the given `pilot`.

        Only TRANSFER directives are valid on pilot level -- COPY, LINK and
        MOVE raise a ValueError (after profiling the failure).  On success
        each directive is marked `rps.DONE`, and the result is published on
        the control pubsub as 'pilot_staging_input_result'.
        """

        pid = pilot['uid']

        # NOTE: no unit sandboxes defined!
        # contexts used by complete_url() to expand relative source / target
        # URLs; sources are relative to the client sandbox, targets to the
        # pilot sandbox
        src_context = {
            'pwd': pilot['client_sandbox'],
            'pilot': pilot['pilot_sandbox'],
            'resource': pilot['resource_sandbox']
        }
        tgt_context = {
            'pwd': pilot['pilot_sandbox'],
            'pilot': pilot['pilot_sandbox'],
            'resource': pilot['resource_sandbox']
        }

        # Iterate over all directives
        for sd in sds:

            # TODO: respect flags in directive

            action = sd['action']
            did = sd['uid']
            src = sd['source']
            tgt = sd['target']

            assert (action in [COPY, LINK, MOVE, TRANSFER])

            self._prof.prof('staging_in_start', uid=pid, msg=did)

            src = complete_url(src, src_context, self._log)
            tgt = complete_url(tgt, tgt_context, self._log)

            # only TRANSFER makes sense against a (potentially remote)
            # pilot sandbox
            if action in [COPY, LINK, MOVE]:
                self._prof.prof('staging_in_fail', uid=pid, msg=did)
                raise ValueError("invalid action '%s' on pilot level" % action)

            self._log.info('transfer %s to %s', src, tgt)

            # FIXME: make sure that tgt URL points to the right resource
            # FIXME: honor sd flags if given (recursive...) -- the
            #        directive's own `sd['flags']` are currently ignored
            flags = rsfs.CREATE_PARENTS

            if os.path.isdir(src.path):
                flags |= rsfs.RECURSIVE

            # Define and open the staging directory for the pilot
            # We use the target dir construct here, so that we can create
            # the directory if it does not yet exist.

            # url used for cache (sandbox url w/o path), so that one
            # rs.filesystem.Directory handle is reused per endpoint
            tmp = rs.Url(pilot['pilot_sandbox'])
            tmp.path = '/'
            key = str(tmp)

            self._log.debug("rs.file.Directory ('%s')", key)

            with self._cache_lock:
                if key in self._saga_fs_cache:
                    fs = self._saga_fs_cache[key]

                else:
                    fs = rsfs.Directory(key, session=self._session)
                    self._saga_fs_cache[key] = fs

            fs.copy(src, tgt, flags=flags)

            sd['pmgr_state'] = rps.DONE

            self._prof.prof('staging_in_stop', uid=pid, msg=did)

        self.publish(
            rpc.CONTROL_PUBSUB, {
                'cmd': 'pilot_staging_input_result',
                'arg': {
                    'pilot': pilot,
                    'sds': sds
                }
            })
Example #3
0
    def run(self):
        """Starts the process when Process.start() is called.

        Main worker loop: polls MongoDB for ComputeUnits in
        PENDING_INPUT_STAGING, atomically claims one, executes its
        'FTW_Input_Directives' file transfers via SAGA, then pushes the
        unit on to AGENT_STAGING_INPUT_PENDING (or FAILED on error).
        Runs until the worker's stop event or the session terminates.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:

            logger.info("Starting InputFileTransferWorker")

            # Try to connect to the database and create a tailable cursor.
            try:
                db = self._session.get_db()
                um_col = db["%s.cu" % self._session.uid]
                logger.debug(
                    "Connected to MongoDB. Serving requests for UnitManager %s."
                    % self.unit_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                raise

            while not self._stop.is_set() and \
                  not self._session._terminate.is_set():

                # See if we can find a ComputeUnit that is waiting for
                # input file transfer.  find_and_modify atomically flips the
                # state to STAGING_INPUT, so no other worker claims the same
                # unit.
                ts = timestamp()
                compute_unit = um_col.find_and_modify(
                    query={
                        "unitmanager": self.unit_manager_id,
                        "state": PENDING_INPUT_STAGING,
                    },
                    update={
                        "$set": {
                            "state": STAGING_INPUT
                        },
                        "$push": {
                            "statehistory": {
                                "state": STAGING_INPUT,
                                "timestamp": ts
                            }
                        }
                    })

                if compute_unit is None:
                    # Sleep a bit if no new units are available.
                    time.sleep(IDLE_TIME)

                else:
                    compute_unit_id = None
                    state = STAGING_INPUT

                    try:
                        log_messages = []

                        # We have found a new CU. Now we can process the transfer
                        # directive(s) with SAGA.
                        compute_unit_id = str(compute_unit["_id"])

                        logger.debug("InputStagingController: unit found: %s" %
                                     compute_unit_id)
                        self._session.prof.prof('advance',
                                                uid=compute_unit_id,
                                                msg=state,
                                                state=state)

                        remote_sandbox = compute_unit["sandbox"]
                        input_staging = compute_unit.get(
                            "FTW_Input_Directives", [])

                        # if we do staging, create the CU's directory in case it doesn't exist yet.
                        if input_staging:
                            log_msg = "InputStagingController: Creating ComputeUnit sandbox directory %s." % remote_sandbox
                            log_messages.append(log_msg)
                            logger.info(log_msg)

                            # Creating/initialising the sandbox directory.
                            try:
                                logger.debug("saga.fs.Directory ('%s')" %
                                             remote_sandbox)

                                # url used for saga
                                remote_sandbox_url = saga.Url(remote_sandbox)

                                # keyurl and key used for cache -- one
                                # Directory handle is reused per endpoint
                                # (sandbox url with the path stripped)
                                remote_sandbox_keyurl = saga.Url(
                                    remote_sandbox)
                                remote_sandbox_keyurl.path = '/'
                                remote_sandbox_key = str(remote_sandbox_keyurl)

                                if remote_sandbox_key not in self._saga_dirs:
                                    self._saga_dirs[remote_sandbox_key] = \
                                            saga.filesystem.Directory(remote_sandbox_url,
                                                    flags=saga.filesystem.CREATE_PARENTS,
                                                    session=self._session)

                                saga_dir = self._saga_dirs[remote_sandbox_key]
                            except Exception as e:
                                logger.exception('Error: %s' % e)
                                raise

                            logger.info(
                                "InputStagingController: Processing input file transfers for ComputeUnit %s"
                                % compute_unit_id)

                        # Loop over all transfer directives and execute them.
                        for sd in input_staging:

                            logger.debug(
                                "InputStagingController: sd: %s : %s" %
                                (compute_unit_id, sd))

                            # Check if there was a cancel request
                            state_doc = um_col.find_one(
                                {"_id": compute_unit_id}, fields=["state"])
                            if state_doc['state'] == CANCELED:
                                self._session.prof.prof('advance',
                                                        uid=compute_unit_id,
                                                        msg=CANCELED,
                                                        state=CANCELED)
                                logger.info(
                                    "Compute Unit Canceled, interrupting input file transfers."
                                )
                                state = CANCELED
                                # Break out of the loop for this CU's SD's
                                break

                            # sources are local files; targets resolve
                            # relative to the remote sandbox (basename of
                            # the source if no target was given)
                            abs_src = os.path.abspath(sd['source'])
                            input_file_url = saga.Url("file://localhost%s" %
                                                      abs_src)
                            if not sd['target']:
                                target = '%s/%s' % (remote_sandbox,
                                                    os.path.basename(abs_src))
                            else:
                                target = "%s/%s" % (remote_sandbox,
                                                    sd['target'])

                            log_msg = "Transferring input file %s -> %s" % (
                                input_file_url, target)
                            log_messages.append(log_msg)
                            logger.debug(log_msg)

                            # Execute the transfer.
                            if CREATE_PARENTS in sd['flags']:
                                copy_flags = saga.filesystem.CREATE_PARENTS
                            else:
                                copy_flags = 0

                            try:
                                saga_dir.copy(input_file_url,
                                              target,
                                              flags=copy_flags)
                            except Exception as e:
                                logger.exception(e)
                                # NOTE(review): e.message is Python-2 only;
                                # under Python 3 this line itself raises
                                # AttributeError -- confirm intended runtime
                                raise Exception("copy failed(%s)" % e.message)

                        # If this CU was canceled we can skip the remainder of this loop,
                        # to process more CUs.
                        if state == CANCELED:
                            continue

                        # All IFTW staging done for this CU.  Push it out, by
                        # setting the state as 'AGENT_STAGING_INPUT_PENDING' and
                        # sending it to mongodb.  We mark the CU under 'umgr'
                        # control -- once the agent picks it up, it will be
                        # marked as under 'agent' control, before the
                        # agent_staging_output_component passes control back in
                        # a similar manner.
                        # NOTE(review): `ts` here is the timestamp taken when
                        # the unit was claimed, before the transfers ran --
                        # the 'log' entry below uses a fresh timestamp();
                        # confirm the stale `ts` in 'statehistory' is intended
                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {
                                'state': AGENT_STAGING_INPUT_PENDING,
                                'control': 'umgr'
                            },
                            '$push': {
                                'statehistory': {
                                    'state': AGENT_STAGING_INPUT_PENDING,
                                    'timestamp': ts
                                },
                                'log': {
                                    'timestamp':
                                    timestamp(),
                                    'message':
                                    'push unit to agent after ftw staging'
                                }
                            }
                        })
                        logger.debug(
                            "InputStagingController: %s : push to agent" %
                            compute_unit_id)
                        self._session.prof.prof(
                            'advance',
                            uid=compute_unit_id,
                            msg=AGENT_STAGING_INPUT_PENDING,
                            state=AGENT_STAGING_INPUT_PENDING)

                    except Exception as e:

                        # Update the CU's state to 'FAILED'.
                        ts = timestamp()
                        logentry = {
                            'message': "Input transfer failed: %s" % e,
                            'timestamp': ts
                        }

                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {
                                'state': FAILED
                            },
                            '$push': {
                                'statehistory': {
                                    'state': FAILED,
                                    'timestamp': ts
                                },
                                'log': logentry
                            }
                        })
                        self._session.prof.prof('advance',
                                                uid=compute_unit_id,
                                                msg=FAILED,
                                                state=FAILED)

                        logger.exception(str(logentry))
                        raise

        except SystemExit as e:
            # NOTE(review): `thread` is the Python-2 module name (`_thread`
            # in Python 3) -- confirm intended runtime
            logger.debug(
                "input file transfer thread caught system exit -- forcing application shutdown"
            )
            thread.interrupt_main()
Example #4
0
    def initialize(self,
                   url,
                   session=None,
                   prompt=None,
                   logger=None,
                   posix=True,
                   interactive=True):
        """Create -- or reuse -- a master shell connection for `url`.

        Masters are cached in `self.registry`, keyed by host, user and
        shell type.  A new master spawns a PTY process and initializes it;
        a cached one is health-checked (and recovered) before reuse.

        Returns the master's info dict.  Raises `se.NoSuccess` if a new
        shell cannot connect, `se.IncorrectState` if a cached one is lost.
        """

        with self.rlock:

            # make sure we have a valid url type
            url = saga.Url(url)

            if not prompt:
                # default prompt: line ending in one of $ # % > ]
                # (raw string -- avoids invalid-escape warnings)
                prompt = r"^(.*[\$#%>\]])\s*$"

            if not logger:
                logger = self.logger

            # collect all information we have/need about the requested master
            # connection
            info = self._create_master_entry(url, session, prompt, logger,
                                             posix, interactive)

            # we got master info - register the master, and create the instance!
            type_s = str(info['shell_type'])
            user_s = str(info['user'])
            host_s = str(info['host_str'])

            # Now, if we don't have that master, yet, we need to instantiate it
            if host_s not in self.registry: self.registry[host_s] = {}
            if user_s not in self.registry[host_s]:
                self.registry[host_s][user_s] = {}
            if type_s not in self.registry[host_s][user_s]:

                # new master: create an instance, and register it
                m_cmd = info['scripts'][info['shell_type']]['master'] % info

                logger.debug ("open master pty for [%s] [%s] %s: %s'" \
                                % (type_s, host_s, user_s, m_cmd))

                info['pty'] = supp.PTYProcess(m_cmd, logger=logger)
                if not info['pty'].alive():
                    raise se.NoSuccess._log (logger, \
                          "Shell not connected to %s" % info['host_str'])

                # authorization, prompt setup, etc.  Initialize as shell if not
                # explicitly marked as non-posix shell
                self._initialize_pty(info['pty'], info)

                # master was created - register it
                self.registry[host_s][user_s][type_s] = info

            else:
                # we already have a master: make sure it is alive, and restart as
                # needed
                info = self.registry[host_s][user_s][type_s]

                if not info['pty'].alive(recover=True):
                    raise se.IncorrectState._log (logger, \
                          "Lost shell connection to %s" % info['host_str'])

            return info
Example #5
0
def fetch_profiles(sid,
                   dburl=None,
                   client=None,
                   tgt=None,
                   access=None,
                   session=None,
                   skip_existing=False):
    '''
    sid: session for which all profiles are fetched
    client: dir to look for client session profiles
    tgt: dir to store the profile in

    returns list of file names
    '''

    ret = list()

    if not dburl:
        dburl = os.environ['RADICAL_PILOT_DBURL']

    if not dburl:
        raise RuntimeError('Please set RADICAL_PILOT_DBURL')

    if not client:
        client = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = saga.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    # first fetch session profile
    # FIXME: should we record pwd or profile location in db session?  Or create
    #        a sandbox like dir for storing profiles and logs?
    client_profile = "%s/%s.prof" % (client, sid)

    ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_profile)))
    ret.append("%s" % ftgt.path)

    if skip_existing and os.path.isfile(ftgt.path) \
            and os.stat(ftgt.path).st_size > 0:

        logger.report.info("\t- %s\n" % client_profile.split('/')[-1])

    else:

        logger.report.info("\t+ %s\n" % client_profile.split('/')[-1])
        prof_file = saga.filesystem.File(client_profile, session=session)
        prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
        prof_file.close()

    _, db, _, _, _ = ru.mongodb_connect(dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    #  print "Session: %s" % sid
    #  print "Number of pilots in session: %d" % num_pilots

    for pilot in pilots:

        # print "Processing pilot '%s'" % pilot['_id']

        sandbox_url = saga.Url(pilot['sandbox'])

        if access:
            # Allow to use a different access scheme than used for the the run.
            # Useful if you ran from the headnode, but would like to retrieve
            # the profiles to your desktop (Hello Titan).
            access_url = saga.Url(access)
            sandbox_url.schema = access_url.schema
            sandbox_url.host = access_url.host

        # print "Overriding remote sandbox: %s" % sandbox_url

        sandbox = saga.filesystem.Directory(sandbox_url, session=session)

        # Try to fetch a tarball of profiles, so that we can get them all in one (SAGA) go!
        PROFILES_TARBALL = '%s.prof.tgz' % pilot['_id']
        tarball_available = False
        try:
            if sandbox.is_file(PROFILES_TARBALL):
                print "Profiles tarball exists!"

                ftgt = saga.Url('%s/%s' % (tgt_url, PROFILES_TARBALL))

                if skip_existing and os.path.isfile(ftgt.path) \
                        and os.stat(ftgt.path).st_size > 0:

                    print "Skipping fetching of '%s/%s' to '%s'." % (
                        sandbox_url, PROFILES_TARBALL, tgt_url)
                    tarball_available = True
                else:

                    print "Fetching '%s%s' to '%s'." % (
                        sandbox_url, PROFILES_TARBALL, tgt_url)
                    prof_file = saga.filesystem.File(
                        "%s%s" % (sandbox_url, PROFILES_TARBALL),
                        session=session)
                    prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                    prof_file.close()

                    tarball_available = True
            else:
                print "Profiles tarball doesnt exists!"

        except saga.DoesNotExist:
            print "exception(TODO): profiles tarball doesnt exists!"

        try:
            os.mkdir("%s/%s" % (tgt_url.path, pilot['_id']))
        except OSError:
            pass

        # We now have a local tarball
        if tarball_available:
            print "Extracting tarball %s into '%s'." % (ftgt.path,
                                                        tgt_url.path)
            tarball = tarfile.open(ftgt.path)
            tarball.extractall("%s/%s" % (tgt_url.path, pilot['_id']))

            profiles = glob.glob("%s/*.prof" % tgt_url.path)
            print "Tarball %s extracted to '%s/%s/'." % (
                ftgt.path, tgt_url.path, pilot['_id'])
            ret.extend(profiles)

            # If extract succeeded, no need to fetch individual profiles
            continue

        # If we dont have a tarball (for whichever reason), fetch individual profiles
        profiles = sandbox.list('*.prof')

        for prof in profiles:

            ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['_id'], prof))
            ret.append("%s" % ftgt.path)

            if skip_existing and os.path.isfile(ftgt.path) \
                             and os.stat(ftgt.path).st_size > 0:

                logger.report.info("\t- %s\n" % str(prof).split('/')[-1])
                continue

            logger.report.info("\t+ %s\n" % str(prof).split('/')[-1])
            prof_file = saga.filesystem.File("%s%s" % (sandbox_url, prof),
                                             session=session)
            prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            prof_file.close()

    return ret
Example #6
0
def fetch_logfiles(sid,
                   dburl=None,
                   src=None,
                   tgt=None,
                   access=None,
                   session=None,
                   skip_existing=False,
                   fetch_client=False,
                   log=None):
    '''
    Fetch all logfiles of a session: optionally the client side logfile,
    plus the logfiles of every pilot sandbox (preferably via a per-pilot
    tarball, falling back to individual files), into `tgt`/`sid`/.

    sid: session for which all logfiles are fetched
    dburl: MongoDB URL (default: $RADICAL_PILOT_DBURL)
    src: dir to look for client session logfiles (default: cwd)
    tgt: dir to store the logfile in (default: cwd)
    access: alternate URL whose schema/host override the pilot sandbox URL
    session: saga session used for remote file access
    skip_existing: don't re-fetch files which already exist non-empty locally
    fetch_client: also fetch the client side session logfile
    log: logger to use (default: session logger or a fresh one)

    returns list of file names
    '''

    # make sure both `log` and `rep` are bound in every case -- previously
    # a caller-provided `log` left `rep` unbound (NameError on rep.ok/error)
    if not log and session:
        log = session._log
        rep = session._rep
    elif not log:
        log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')
    elif session:
        rep = session._rep
    else:
        rep = ru.Reporter('radical.pilot.utils')

    ret = list()

    if not dburl:
        dburl = os.environ['RADICAL_PILOT_DBURL']

    if not dburl:
        raise RuntimeError('Please set RADICAL_PILOT_DBURL')

    if not src:
        src = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    # resolve relative (non-URL) targets against the cwd
    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = saga.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    if fetch_client:
        # first fetch session logfile
        client_logfile = "%s/%s.log" % (src, sid)

        ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_logfile)))
        ret.append("%s" % ftgt.path)

        if skip_existing and os.path.isfile(ftgt.path) \
                and os.stat(ftgt.path).st_size > 0:
            pass
        else:
            log_file = saga.filesystem.File(client_logfile, session=session)
            log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            log_file.close()

    _, db, _, _, _ = ru.mongodb_connect(dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.info("Session: %s", sid)
    log.info("Number of pilots in session: %d", num_pilots)

    for pilot in pilots:

        try:
            sandbox_url = saga.Url(pilot['pilot_sandbox'])

            if access:
                # Allow to use a different access schema than used for the run.
                # Useful if you ran from the headnode, but would like to retrieve
                # the logfiles to your desktop (Hello Titan).
                access_url = saga.Url(access)
                sandbox_url.schema = access_url.schema
                sandbox_url.host = access_url.host

            sandbox = saga.filesystem.Directory(sandbox_url, session=session)

            # Try to fetch a tarball of logfiles, so that we can get them all in one (SAGA) go!
            LOGFILES_TARBALL = '%s.log.tgz' % pilot['uid']
            tarball_available = False
            try:
                if  sandbox.is_file(LOGFILES_TARBALL) and \
                    sandbox.get_size(LOGFILES_TARBALL):

                    log.info("logfiles tarball exists")
                    ftgt = saga.Url('%s/%s' % (tgt_url, LOGFILES_TARBALL))

                    if skip_existing and os.path.isfile(ftgt.path) \
                            and os.stat(ftgt.path).st_size > 0:

                        log.info("Skip fetching of '%s/%s' to '%s'.",
                                 sandbox_url, LOGFILES_TARBALL, tgt_url)
                        tarball_available = True
                    else:

                        log.info("Fetching '%s%s' to '%s'.", sandbox_url,
                                 LOGFILES_TARBALL, tgt_url)
                        log_file = saga.filesystem.File(
                            "%s%s" % (sandbox_url, LOGFILES_TARBALL),
                            session=session)
                        log_file.copy(ftgt,
                                      flags=saga.filesystem.CREATE_PARENTS)
                        log_file.close()

                        tarball_available = True
                else:
                    log.warn("logfiles tarball doesnt exists")

            except saga.DoesNotExist:
                log.warn("logfiles tarball doesnt exists")

            try:
                os.mkdir("%s/%s" % (tgt_url.path, pilot['uid']))
            except OSError:
                pass

            # We now have a local tarball
            if tarball_available:
                log.debug("Extract tarball %s to %s", ftgt.path, tgt_url.path)

                try:
                    tarball = tarfile.open(ftgt.path)
                    tarball.extractall("%s/%s" % (tgt_url.path, pilot['uid']))

                    logfiles = glob.glob("%s/%s/*.log" %
                                         (tgt_url.path, pilot['uid']))
                    log.info("tarball %s extracted to '%s/%s/'.", ftgt.path,
                             tgt_url.path, pilot['uid'])
                    ret.extend(logfiles)
                    os.unlink(ftgt.path)

                except Exception as e:
                    log.warn('could not extract tarball %s [%s]', ftgt.path, e)

                # If extract succeeded, no need to fetch individual logfiles
                rep.ok("+ %s (logfiles)\n" % pilot['uid'])
                continue

            # If we dont have a tarball (for whichever reason), fetch individual logfiles
            logfiles = sandbox.list('*.log')

            for logfile in logfiles:

                ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['uid'], logfile))
                ret.append("%s" % ftgt.path)

                if skip_existing and os.path.isfile(ftgt.path) \
                                 and os.stat(ftgt.path).st_size > 0:

                    continue

                log_file = saga.filesystem.File("%s%s" %
                                                (sandbox_url, logfile),
                                                session=session)
                log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                log_file.close()

            rep.ok("+ %s (logfiles)\n" % pilot['uid'])

        except Exception as e:
            # log the failure (was silently swallowed before), then carry on
            # with the remaining pilots
            log.exception('failed to fetch logfiles for %s: %s',
                          pilot['uid'], e)
            rep.error("- %s (logfiles)\n" % pilot['uid'])

    return ret
Example #7
0
    def stage_in(self, directives):
        """Stages the content of the staging directive into the pilot's
        staging area.

        Waits for the pilot directory to exist first.  Only TRANSFER
        directives are supported on pilot level; LINK, COPY and MOVE raise
        a ValueError, unknown actions raise a generic Exception.
        """

        # Wait until we can assume the pilot directory to be created
        if self.state == NEW:
            self.wait(
                state=[PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE, ACTIVE])
        elif self.state in [DONE, FAILED, CANCELED]:
            raise Exception(
                "Pilot already finished, no need to stage anymore!")

        # local filesystem actions make no sense against a (potentially
        # remote) pilot staging area -- map each to its exact error message
        unsupported = {
            LINK: "action 'LINK' not supported on pilot level staging",
            COPY: "action 'COPY' not supported on pilot level staging",
            MOVE: "action 'MOVE' not supported on pilot level staging",
        }

        # Iterate over all directives
        for directive in expand_staging_directive(directives):

            # TODO: respect flags in directive

            src_url = saga.Url(directive['source'])
            action = directive['action']

            # Convert the target url into a SAGA Url object
            tgt_url = saga.Url(directive['target'])
            # Create a pointer to the directory object that we will use
            # NOTE(review): this aliases tgt_url -- mutating tgt_dir_url.path
            # below also changes tgt_url; confirm that is intended
            tgt_dir_url = tgt_url

            if tgt_url.path.endswith('/'):
                # If the original target was a directory (ends with /),
                # we assume that the user wants the same filename as the source.
                tgt_filename = os.path.basename(src_url.path)
            else:
                # Otherwise, extract the filename and update the directory
                tgt_filename = os.path.basename(tgt_dir_url.path)
                tgt_dir_url.path = os.path.dirname(tgt_dir_url.path)

            # Handle special 'staging' scheme
            if tgt_dir_url.scheme == 'staging':

                # We expect a staging:///relative/path/file.txt URI,
                # as hostname would have unclear semantics currently.
                if tgt_dir_url.host:
                    raise Exception(
                        "hostname not supported with staging:// scheme")

                # Remove the leading slash to get a relative path from the staging area
                rel_path = os.path.relpath(tgt_dir_url.path, '/')

                # Now base the target directory relative of the sandbox and staging prefix
                tgt_dir_url = saga.Url(
                    os.path.join(self.sandbox, STAGING_AREA, rel_path))

            # Define and open the staging directory for the pilot
            # We use the target dir construct here, so that we can create
            # the directory if it does not yet exist.
            target_dir = saga.filesystem.Directory(
                tgt_dir_url, flags=saga.filesystem.CREATE_PARENTS)

            if action in unsupported:
                logger.error(unsupported[action])
                raise ValueError(unsupported[action])

            elif action == TRANSFER:
                log_message = 'Transferring %s to %s' % (
                    src_url, os.path.join(str(tgt_dir_url), tgt_filename))
                logger.info(log_message)
                # Transfer the source file to the target staging area
                target_dir.copy(src_url, tgt_filename)

            else:
                raise Exception('Action %s not supported' % action)
Example #8
0
    def _handle_unit(self, unit, actionables):
        """Execute the staging-out directives for one unit.

        Each directive's source/target URL is expanded against the unit's
        sandbox hierarchy and copied via a cached SAGA directory handle.
        Once all transfers are done, the unit is advanced to its final
        target state.
        """

        uid = unit['uid']

        # URL expansion contexts: sources resolve relative to the unit
        # sandbox, targets relative to the client working directory.
        src_context = {'pwd'     : unit['unit_sandbox'],      # !!!
                       'unit'    : unit['unit_sandbox'],
                       'pilot'   : unit['pilot_sandbox'],
                       'resource': unit['resource_sandbox']}
        tgt_context = {'pwd'     : os.getcwd(),               # !!!
                       'unit'    : unit['unit_sandbox'],
                       'pilot'   : unit['pilot_sandbox'],
                       'resource': unit['resource_sandbox']}

        # Directory handles are cached per filesystem endpoint: drop the
        # path from the sandbox URL and use what remains as cache key.
        fs_url      = rs.Url(unit["unit_sandbox"])
        fs_url.path = '/'
        cache_key   = str(fs_url)

        if cache_key not in self._cache:
            self._cache[cache_key] = rs.filesystem.Directory(
                fs_url, session=self._session)
        fs_dir = self._cache[cache_key]

        # Execute every transfer directive in turn.
        for directive in actionables:

            action = directive['action']
            flags  = directive['flags']
            did    = directive['uid']
            src    = directive['source']
            tgt    = directive['target']

            self._prof.prof('staging_out_start', uid=uid, msg=did)

            self._log.debug('src: %s', src)
            self._log.debug('tgt: %s', tgt)

            src = rpsd.complete_url(src, src_context, self._log)
            tgt = rpsd.complete_url(tgt, tgt_context, self._log)

            self._log.debug('src: %s', src)
            self._log.debug('tgt: %s', tgt)

            # Directory sources need a recursive copy ...
            if fs_dir.is_dir(src.path):
                flags |= rs.filesystem.RECURSIVE

            # ... and missing target directories are always created.
            flags |= rs.filesystem.CREATE_PARENTS

            fs_dir.copy(src, tgt, flags=flags)
            self._prof.prof('staging_out_stop', uid=uid, msg=did)

        # all staging is done -- at this point the unit is final
        unit['state'] = unit['target_state']
        self.advance(unit, publish=True, push=True)
    def run(self):
        """Main loop of the pilot launcher, started via Process.start().

        Connects to MongoDB, then repeatedly: (a) periodically verifies that
        launched SAGA jobs are still alive, and (b) claims one ComputePilot
        in PENDING_LAUNCH state, stages the bootstrapper (plus sdists and CA
        certs where needed) into the pilot sandbox, and submits the pilot
        agent as a SAGA job.  On submission failure the pilot is moved to
        FAILED.  Runs until self._stop is set; a SystemExit inside the loop
        forces an application shutdown via the main thread.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                pilot_col = db["%s.p" % self.db_connection_info.session_id]
                logger.debug(
                    "Connected to MongoDB. Serving requests for PilotManager %s."
                    % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            last_job_check = time.time()

            while not self._stop.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reasons and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()
                    self.check_pilot_states(pilot_col)

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description and a job that is then send to the local or remote
                # queueing system. If this succedes, we set the ComputePilot's
                # state to pending, otherwise to failed.
                compute_pilot = None

                # atomically claim one pilot: flip PENDING_LAUNCH -> LAUNCHING
                ts = datetime.datetime.utcnow()
                compute_pilot = pilot_col.find_and_modify(
                    query={
                        "pilotmanager": self.pilot_manager_id,
                        "state": PENDING_LAUNCH
                    },
                    update={
                        "$set": {
                            "state": LAUNCHING
                        },
                        "$push": {
                            "statehistory": {
                                "state": LAUNCHING,
                                "timestamp": ts
                            }
                        }
                    })

                if not compute_pilot:
                    time.sleep(IDLE_TIMER)

                else:
                    try:
                        # ------------------------------------------------------
                        #
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        #
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_uid = self.db_connection_info.session_id
                        database_url = self.db_connection_info.dburl
                        database_name = self.db_connection_info.dbname
                        database_auth = self.db_connection_info.dbauth

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot['description']['cores']
                        runtime = compute_pilot['description']['runtime']
                        queue = compute_pilot['description']['queue']
                        project = compute_pilot['description']['project']
                        cleanup = compute_pilot['description']['cleanup']
                        resource_key = compute_pilot['description']['resource']
                        schema = compute_pilot['description']['access_schema']
                        memory = compute_pilot['description']['memory']
                        pilot_sandbox = compute_pilot['sandbox']
                        global_sandbox = compute_pilot['global_sandbox']

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(
                            resource_key, schema)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_mongodb_endpoint = resource_cfg.get(
                            'agent_mongodb_endpoint', database_url)
                        agent_spawner = resource_cfg.get(
                            'agent_spawner', DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get('agent_type',
                                                      DEFAULT_AGENT_TYPE)
                        agent_scheduler = resource_cfg.get('agent_scheduler')
                        tunnel_bind_device = resource_cfg.get(
                            'tunnel_bind_device')
                        default_queue = resource_cfg.get('default_queue')
                        forward_tunnel_endpoint = resource_cfg.get(
                            'forward_tunnel_endpoint')
                        js_endpoint = resource_cfg.get('job_manager_endpoint')
                        lrms = resource_cfg.get('lrms')
                        mpi_launch_method = resource_cfg.get(
                            'mpi_launch_method')
                        pre_bootstrap = resource_cfg.get('pre_bootstrap')
                        python_interpreter = resource_cfg.get(
                            'python_interpreter')
                        spmd_variation = resource_cfg.get('spmd_variation')
                        task_launch_method = resource_cfg.get(
                            'task_launch_method')
                        rp_version = resource_cfg.get('rp_version',
                                                      DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get(
                            'virtenv_mode', DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get('virtenv', DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get(
                            'stage_cacerts', 'False')

                        # the config value may be a string or a bool --
                        # normalize via str() before comparing
                        if str(stage_cacerts).lower() == 'true':
                            stage_cacerts = True
                        else:
                            stage_cacerts = False

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            'pilot_sandbox': saga.Url(pilot_sandbox).path,
                            'global_sandbox': saga.Url(global_sandbox).path
                        }

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get('global_virtenv')
                        if global_virtenv:
                            logger.warn(
                                "'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'"
                            )
                            virtenv = global_virtenv
                            virtenv_mode = 'use'

                        # set default scheme, host, port and dbname if not set
                        db_url = saga.Url(agent_mongodb_endpoint)
                        if not db_url.scheme: db_url.scheme = 'mongodb'
                        if not db_url.host: db_url.host = 'localhost'
                        if not db_url.port: db_url.port = 27017
                        if not database_name: database_name = 'radicalpilot'

                        # Create a host:port string for use by the bootstrapper.
                        database_hostport = "%s:%d" % (db_url.host,
                                                       db_url.port)

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.  This also creates
                        # the sandbox. We use always "default_bootstrapper.sh"
                        bootstrapper = 'default_bootstrapper.sh'
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" \
                                % (mod_dir, bootstrapper))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("file://localhost/%s" %
                                                 bootstrapper_path)
                        bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" %
                                                 pilot_sandbox)

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                                % (bs_script_url, bs_script_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        bs_script = saga.filesystem.File(bs_script_url,
                                                         session=self._session)
                        bs_script.copy(bs_script_tgt,
                                       flags=saga.filesystem.CREATE_PARENTS)
                        bs_script.close()

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        #
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        #
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        #
                        # examples   :
                        #   [email protected]
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        #
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then the 'virtenv:tag'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        #
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        #
                        # A rp_version which does not adhere to the
                        # above syntax is ignored, and the fallback stage@local
                        # is used.

                        if  not rp_version.startswith('@') and \
                            rp_version not in ['installed', 'local', 'debug']:
                            raise ValueError("invalid rp_version '%s'" %
                                             rp_version)

                        # NOTE(review): 'release' is checked below but is not
                        # accepted by the validation above -- confirm intent.
                        stage_sdist = True
                        if rp_version in ['installed', 'release']:
                            stage_sdist = False

                        if rp_version.startswith('@'):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for path in [
                                    ru.sdist_path, saga.sdist_path, sdist_path
                            ]:

                                sdist_url = saga.Url("file://localhost/%s" %
                                                     path)
                                msg = "Copying sdist '%s' to sdist sandbox (%s)." % (
                                    sdist_url, pilot_sandbox)
                                logentries.append(
                                    Logentry(msg, logger=logger.debug))

                                sdist_file = saga.filesystem.File(sdist_url)
                                sdist_file.copy("%s/" % (str(pilot_sandbox)))
                                sdist_file.close()

                        # ------------------------------------------------------
                        # some machines cannot run pip due to outdated ca certs.
                        # For those, we also stage an updated cert bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" \
                                    % (mod_dir, 'cacert.pem.gz'))

                            cc_script_url = saga.Url("file://localhost/%s" %
                                                     cc_path)
                            cc_script_tgt = saga.Url("%s/cacert.pem.gz" %
                                                     pilot_sandbox)

                            cc_script = saga.filesystem.File(
                                cc_script_url, session=self._session)
                            cc_script.copy(
                                cc_script_tgt,
                                flags=saga.filesystem.CREATE_PARENTS)
                            cc_script.close()

                        # ------------------------------------------------------
                        # sanity checks
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms: raise RuntimeError("missing LRMS")
                        if not mpi_launch_method:
                            raise RuntimeError("missing mpi launch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values
                        debug_level = os.environ.get(
                            'RADICAL_PILOT_AGENT_VERBOSE', logger.level)
                        try:
                            debug_level = int(debug_level)
                        except ValueError:
                            # symbolic level names map to numeric levels
                            debug_level = {
                                'CRITICAL': 1,
                                'ERROR': 2,
                                'WARNING': 3,
                                'WARN': 3,
                                'INFO': 4,
                                'DEBUG': 5
                            }.get(debug_level, 0)

                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            cleanup = 'luve'  #  l : log files
                            #  u : unit work dirs
                            #  v : virtualenv
                            #  e : everything (== pilot sandbox)
                            #
                            # we never cleanup virtenvs which are not private
                            # (fix: use '!=' -- 'is not' compared string
                            # identity and effectively always stripped 'v')
                            if virtenv_mode != 'private':
                                cleanup = cleanup.replace('v', '')

                        sdists = ':'.join(
                            [ru.sdist_name, saga.sdist_name, sdist_name])

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -b '%s'" % sdists
                        bootstrap_args += " -c '%s'" % number_cores
                        bootstrap_args += " -d '%s'" % debug_level
                        bootstrap_args += " -g '%s'" % virtenv
                        bootstrap_args += " -j '%s'" % task_launch_method
                        bootstrap_args += " -k '%s'" % mpi_launch_method
                        bootstrap_args += " -l '%s'" % lrms
                        bootstrap_args += " -m '%s'" % database_hostport
                        bootstrap_args += " -n '%s'" % database_name
                        bootstrap_args += " -o '%s'" % agent_spawner
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -q '%s'" % agent_scheduler
                        bootstrap_args += " -r '%s'" % runtime
                        bootstrap_args += " -s '%s'" % session_uid
                        bootstrap_args += " -t '%s'" % agent_type
                        bootstrap_args += " -u '%s'" % virtenv_mode
                        bootstrap_args += " -v '%s'" % rp_version

                        # set optional args
                        if database_auth:
                            bootstrap_args += " -a '%s'" % database_auth
                        if tunnel_bind_device:
                            bootstrap_args += " -D '%s'" % tunnel_bind_device
                        if pre_bootstrap:
                            bootstrap_args += " -e '%s'" % "' -e '".join(
                                pre_bootstrap)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # ------------------------------------------------------
                        # now that the script is in place and we know where it is,
                        # we can launch the agent.  Job services are cached per
                        # endpoint URL.
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data['job_services']:
                            js = self._shared_worker_data['job_services'][
                                js_url]
                        else:
                            js = saga.job.Service(js_url,
                                                  session=self._session)
                            self._shared_worker_data['job_services'][
                                js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = [
                            "-l pilot_bootstrapper.sh", bootstrap_args
                        ]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "agent.out"
                        jd.error = "agent.err"
                        jd.total_cpu_count = number_cores
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if 'RADICAL_PILOT_PROFILE' in os.environ:
                            jd.environment = {'RADICAL_PILOT_PROFILE': 'TRUE'}

                        logger.debug("Bootstrap command line: %s %s" %
                                     (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(
                            jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        pilotjob = js.create_job(jd)
                        pilotjob.run()

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data['job_ids'][pilot_id] = [
                            saga_job_id, js_url
                        ]

                        msg = "SAGA job submitted with job id %s" % str(
                            saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        #
                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        # NOTE(review): the query uses the literal 'Launching'
                        # where LAUNCHING is used elsewhere -- confirm both
                        # spell the same state string.
                        ts = datetime.datetime.utcnow()
                        ret = pilot_col.update(
                            {
                                "_id": pilot_id,
                                "state": 'Launching'
                            }, {
                                "$set": {
                                    "state": PENDING_ACTIVE,
                                    "saga_job_id": saga_job_id
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })

                        if ret['n'] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update({"_id": pilot_id}, {
                                "$set": {
                                    "saga_job_id": saga_job_id
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })

                    except Exception as e:
                        # Update the Pilot's state 'FAILED'.
                        out, err, log = self._get_pilot_logs(
                            pilot_col, pilot_id)
                        ts = datetime.datetime.utcnow()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e
                        logentries.append(Logentry(msg))

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())
                            log_messages.append(le.message)

                        pilot_col.update(
                            {
                                "_id": pilot_id,
                                "state": {
                                    "$ne": FAILED
                                }
                            }, {
                                "$set": {
                                    "state": FAILED,
                                    "stdout": out,
                                    "stderr": err,
                                    "logfile": log
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": FAILED,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })
                        logger.exception('\n'.join(log_messages))

        except SystemExit as e:
            logger.exception(
                "pilot launcher thread caught system exit -- forcing application shutdown"
            )
            import thread
            thread.interrupt_main()
Example #10
0
    def _handle_unit(self, unit, actionables):
        """Stage input files for one unit and hand it over to the agent.

        Creates the unit sandbox, bundles all TARBALL directives into a
        single tar file (staged with one TRANSFER directive), executes the
        remaining TRANSFER directives, and finally advances the unit to
        AGENT_STAGING_INPUT_PENDING.
        """

        # FIXME: we should create unit sandboxes in a bulk

        uid = unit['uid']

        self._prof.prof("create_sandbox_start", uid=uid)

        # URL expansion contexts: sources resolve relative to the client
        # working directory, targets relative to the unit sandbox.
        src_context = {
            'pwd': os.getcwd(),  # !!!
            'unit': unit['unit_sandbox'],
            'pilot': unit['pilot_sandbox'],
            'resource': unit['resource_sandbox']
        }
        tgt_context = {
            'pwd': unit['unit_sandbox'],  # !!!
            'unit': unit['unit_sandbox'],
            'pilot': unit['pilot_sandbox'],
            'resource': unit['resource_sandbox']
        }

        # we have actionable staging directives, and thus we need a unit
        # sandbox.
        sandbox = rs.Url(unit["unit_sandbox"])
        tmp = rs.Url(unit["unit_sandbox"])

        # url used for cache (sandbox url w/o path)
        tmp.path = '/'
        key = str(tmp)
        self._log.debug('key %s / %s', key, tmp)

        if key not in self._fs_cache:
            self._fs_cache[key] = rs.filesystem.Directory(
                tmp, session=self._session)

        saga_dir = self._fs_cache[key]
        saga_dir.make_dir(sandbox, flags=rs.filesystem.CREATE_PARENTS)
        self._prof.prof("create_sandbox_stop", uid=uid)

        # Loop over all transfer directives and filter out tarball staging
        # directives.  Those files are added into a tarball, and a single
        # actionable to stage that tarball replaces the original actionables.

        # create a new actionable list during the filtering
        new_actionables = list()
        tar_file = None

        for sd in actionables:

            # don't touch non-tar SDs
            if sd['action'] != rpc.TARBALL:
                new_actionables.append(sd)

            else:

                # NOTE: the directive's flags are ignored for tar members
                did = sd['uid']
                src = complete_url(sd['source'], src_context, self._log)
                tgt = complete_url(sd['target'], tgt_context, self._log)

                self._prof.prof('staging_in_tar_start', uid=uid, msg=did)

                # create a tarfile on the first match, and register for transfer
                if not tar_file:
                    tmp_file = tempfile.NamedTemporaryFile(
                        prefix='rp_usi_%s.' % uid, suffix='.tar', delete=False)
                    tar_path = tmp_file.name
                    tar_file = tarfile.open(fileobj=tmp_file, mode='w')
                    tar_src = ru.Url('file://localhost/%s' % tar_path)
                    tar_tgt = ru.Url('unit:////%s.tar' % uid)
                    tar_did = ru.generate_id('sd')
                    tar_sd = {
                        'action': rpc.TRANSFER,
                        'flags': rpc.DEFAULT_FLAGS,
                        'uid': tar_did,
                        'source': str(tar_src),
                        'target': str(tar_tgt),
                    }
                    new_actionables.append(tar_sd)

                # add the src file
                tar_file.add(src.path, arcname=tgt.path)

                self._prof.prof('staging_in_tar_stop', uid=uid, msg=did)

        # make sure tarball is flushed to disk
        if tar_file:
            tar_file.close()

        # work on the filtered TRANSFER actionables
        for sd in new_actionables:

            if sd['action'] == rpc.TRANSFER:

                flags = sd['flags']
                did = sd['uid']

                # complete the URLs exactly once (fix: the original ran
                # complete_url a second time on already-completed URLs)
                src = complete_url(sd['source'], src_context, self._log)
                tgt = complete_url(sd['target'], tgt_context, self._log)

                # Check if the src is a folder, if true
                # add recursive flag if not already specified
                if os.path.isdir(src.path):
                    flags |= rs.filesystem.RECURSIVE

                # Always set CREATE_PARENTS
                flags |= rs.filesystem.CREATE_PARENTS

                self._prof.prof('staging_in_start', uid=uid, msg=did)
                saga_dir.copy(src, tgt, flags=flags)
                self._prof.prof('staging_in_stop', uid=uid, msg=did)

        if tar_file:

            # some tarball staging was done.  Add a staging directive for the
            # agent to untar the tarball, and clean up the local temp file.
            tar_sd['action'] = rpc.TARBALL
            unit['description']['input_staging'].append(tar_sd)
            os.remove(tar_path)

        # staging is done, we can advance the unit at last
        self.advance(unit,
                     rps.AGENT_STAGING_INPUT_PENDING,
                     publish=True,
                     push=True)
Example #11
0
def main():

    tmp_dir = None

    try:

        tmp_dir = tempfile.mkdtemp(prefix='saga-test-', suffix='-%s' % TEST_NAME,
                                   dir=os.path.expanduser('~/tmp'))

        print 'tmpdir: %s' % tmp_dir

        ctx = saga.Context("x509")
        ctx.user_proxy = '/Users/mark/proj/myproxy/xsede.x509'

        session = saga.Session()
        session.add_context(ctx)

        source_url = saga.Url()
        source_url.schema = 'go'
        source_url.host = SOURCE
        source_url.path = tmp_dir

        target_url = saga.Url()
        target_url.schema = 'go'
        target_url.host = TARGET
        target_url.path = os.path.join('~/saga-tests/', os.path.basename(tmp_dir))

        print "Point to local Directory through GO ..."
        d = saga.filesystem.Directory(source_url)
        print "And check ..."
        assert d.is_dir() == True
        assert d.is_file() == False
        assert d.is_link() == False
        d.close()
        print "Point to remote Directory through GO ..."
        d = saga.filesystem.Directory(target_url, flags=saga.filesystem.CREATE_PARENTS)
        print "And check ..."
        assert d.is_dir() == True
        assert d.is_file() == False
        assert d.is_link() == False
        d.close()

        print "Point to local file through GO, before creation ..."
        caught = False
        try:
            saga.filesystem.File(os.path.join(str(source_url), FILE_A_level_0))
        except saga.DoesNotExist:
            caught = True
        assert caught == True

        print "Create actual file ..."
        touch(tmp_dir, FILE_A_level_0)
        print "Try again ..."
        f = saga.filesystem.File(os.path.join(str(source_url), FILE_A_level_0))
        assert f.is_file() == True
        assert f.is_dir() == False
        assert f.is_link() == False
        f.close()

        print "Copy local file to remote, using different filename ..."
        d = saga.filesystem.Directory(target_url, flags=saga.filesystem.CREATE_PARENTS)
        d.copy(os.path.join(str(source_url), FILE_A_level_0), FILE_A_level_0+COPIED_SUFFIX)
        d.close()
        f = saga.filesystem.File(os.path.join(str(target_url), FILE_A_level_0+COPIED_SUFFIX))
        assert f.is_file() == True
        assert f.is_dir() == False
        assert f.is_link() == False
        f.close()

        print "Copy local file to remote, keeping filename in tact ..."
        d = saga.filesystem.Directory(target_url, flags=saga.filesystem.CREATE_PARENTS)
        d.copy(os.path.join(str(source_url), FILE_A_level_0), FILE_A_level_0)
        d.close()
        f = saga.filesystem.File(os.path.join(str(target_url), FILE_A_level_0))
        assert f.is_file() == True
        assert f.is_dir() == False
        assert f.is_link() == False
        f.close()

        print 'Create file in level 1 ...'
        tree = LEVEL_1
        os.mkdir(os.path.join(tmp_dir, tree))
        touch(os.path.join(tmp_dir, tree), FILE_A_level_1)
        print "Test local file ..."
        f = saga.filesystem.File(os.path.join(str(source_url), tree, FILE_A_level_1))
        assert f.is_file() == True
        assert f.is_dir() == False
        assert f.is_link() == False
        f.close()

        print "Copy local file to remote, keeping filename in tact ..."
        d = saga.filesystem.Directory(os.path.join(str(target_url), tree), flags=saga.filesystem.CREATE_PARENTS)
        d.copy(os.path.join(str(source_url), tree, FILE_A_level_1), FILE_A_level_1)
        d.close()

        print "Test file after transfer ..."
        f = saga.filesystem.File(os.path.join(str(target_url), tree, FILE_A_level_1))
        assert f.is_file() == True
        assert f.is_dir() == False
        assert f.is_link() == False
        f.close()

        print "Copy non-existent local file to remote, keeping filename in tact ..."
        d = saga.filesystem.Directory(str(target_url), flags=saga.filesystem.CREATE_PARENTS)
        try:
            d.copy(os.path.join(str(source_url), NON_EXISTING_FILE), NON_EXISTING_FILE)
        except saga.DoesNotExist:
            caught = True
        assert caught == True

        print "Test file after (non-)transfer ..."
        caught = False
        try:
            saga.filesystem.File(os.path.join(str(target_url), NON_EXISTING_FILE))
        except saga.DoesNotExist:
            caught = True
        assert caught == True

        # destination = "go://gridftp.stampede.tacc.xsede.org/~/tmp/"
        # #destination = "go://oasis-dm.sdsc.xsede.org/~/tmp/"
        # #destination = "go://ncsa#BlueWaters/~/tmp/"
        # #destination = "go://marksant#netbook/Users/mark/tmp/go/"
        # src_filename = "my_file"
        # dst_filename = "my_file_"
        # rt_filename = "my_file__"
        #
        # # open home directory on a remote machine
        # source_dir = saga.filesystem.Directory(source)
        #
        # # copy .bash_history to /tmp/ on the local machine
        # source_dir.copy(src_filename, os.path.join(destination, dst_filename))
        #
        # # list 'm*' in local /tmp/ directory
        # dest_dir = saga.filesystem.Directory(destination)
        # for entry in dest_dir.list(pattern='%s*' % src_filename[0]):
        #     print entry
        #
        # dest_file = saga.filesystem.File(os.path.join(destination, dst_filename))
        # assert dest_file.is_file() == True
        # assert dest_file.is_link() == False
        # assert dest_file.is_dir() == False
        # print 'Size: %d' % dest_file.get_size()
        #
        # dest_file.copy(source)
        #
        # dest_file.copy(os.path.join(source+'broken', rt_filename))

        print "Before return 0"
        return 0

    except saga.SagaException as ex:
        # Catch all saga exceptions
        print "An exception occurred: (%s) %s " % (ex.type, (str(ex)))
        # Trace back the exception. That can be helpful for debugging.
        print " \n*** Backtrace:\n %s" % ex.traceback

        print "before return -1"
        return -1

    finally:

        print "and finally ..."

        if CLEANUP and tmp_dir:
            shutil.rmtree(tmp_dir)
Example #12
0
__author__    = "Andre Merzky"
__copyright__ = "Copyright 2012-2013, The SAGA Project"
__license__   = "MIT"


import re
import time
import saga
import saga.utils.pty_shell as sups

try :
    shell = sups.PTYShell (saga.Url ("fork://localhost"), [])
    shell.run_async ("(sftp -b - localhost || (printf \"SFTP_ABORT\n\"; false)) <<EOT")
    shell.send ("progress\nput /home/merzky/downloads/totalview*.sh /tmp/t\nEOT\n")

  # pat_bof = re.compile ("(?P<perc>\d+\%).*(?P<time>--:--)\s*ETA")
    pat_bof = re.compile ("(?P<perc>\d+)\%\s+(?P<size>.+?)\s+(?P<perf>.+?)\s+(?P<time>--:--)\s*ETA")
    pat_eta = re.compile ("(?P<perc>\d+)\%\s+(?P<size>.+?)\s+(?P<perf>.+?)\s+(?P<time>\d\d:\d\d)\s*ETA")
    pat_eof = re.compile ("(?P<perc>\d+)\%\s+(?P<size>.+?)\s+(?P<perf>.+?)\s+(?P<time>\d\d:\d\d)\s*\n")
    pat_def = re.compile ("^sftp>.*\n")

    begin = True
    error   = ""

    while True :
        ret, out = shell.find (['ETA$', 'SFTP_ABORT\n', '\n'])
        progress    = None

        # ----------------------------------------------------------------------
        # found ETA - transfer is in progress
    def run(self):
        """
        Main loop of the input file transfer worker process (entered via
        Process.start()).

        Repeatedly claims ComputeUnits in FTW_Input_Status PENDING from
        MongoDB, creates the CU's remote sandbox, executes the CU's FTW input
        staging directives via SAGA, and records success or failure back into
        the database.  Only the first worker (self._worker_number == 1)
        additionally performs the bookkeeping that advances fully staged CUs
        to PENDING_EXECUTION.  SystemExit is caught to force an application
        shutdown via thread.interrupt_main().
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try :

            logger.info("Starting InputFileTransferWorker")

            # Try to connect to the database and create a tailable cursor.
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                um_col = db["%s.cu" % self.db_connection_info.session_id]
                logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." % self.unit_manager_id)

            except Exception as e :
                logger.exception("Connection error: %s" % e)
                raise

            try :
                while not self._stop.is_set():
                    # See if we can find a ComputeUnit that is waiting for
                    # input file transfer.
                    compute_unit = None

                    # atomically claim one pending CU: flip its FTW input
                    # status to EXECUTING and push the state transition
                    ts = datetime.datetime.utcnow()
                    compute_unit = um_col.find_and_modify(
                        query={"unitmanager": self.unit_manager_id,
                               "FTW_Input_Status": PENDING},
                        update={"$set" : {"FTW_Input_Status": EXECUTING,
                                          "state": STAGING_INPUT},
                                "$push": {"statehistory": {"state": STAGING_INPUT, "timestamp": ts}}},
                        limit=BULK_LIMIT # TODO: bulklimit is probably not the best way to ensure there is just one
                    )
                    # FIXME: AM: find_and_modify is not bulkable!
                    state = STAGING_INPUT

                    if compute_unit is None:
                        # Sleep a bit if no new units are available.
                        time.sleep(IDLE_TIME) 

                    else:
                        compute_unit_id = None
                        try:
                            log_messages = []

                            # We have found a new CU. Now we can process the transfer
                            # directive(s) wit SAGA.
                            compute_unit_id = str(compute_unit["_id"])
                            remote_sandbox = compute_unit["sandbox"]
                            input_staging = compute_unit["FTW_Input_Directives"]

                            # We need to create the CU's directory in case it doesn't exist yet.
                            log_msg = "Creating ComputeUnit sandbox directory %s." % remote_sandbox
                            log_messages.append(log_msg)
                            logger.info(log_msg)

                            # Creating the sandbox directory.
                            try:
                                logger.debug ("saga.fs.Directory ('%s')" % remote_sandbox)

                                # cache one Directory handle per filesystem
                                # root URL, to avoid re-connect overhead
                                remote_sandbox_keyurl = saga.Url (remote_sandbox)
                                remote_sandbox_keyurl.path = '/'
                                remote_sandbox_key = str(remote_sandbox_keyurl)

                                if  remote_sandbox_key not in self._saga_dirs :
                                    self._saga_dirs[remote_sandbox_key] = \
                                            saga.filesystem.Directory (remote_sandbox_key,
                                                    flags=saga.filesystem.CREATE_PARENTS,
                                                    session=self._session)

                                saga_dir = self._saga_dirs[remote_sandbox_key]
                                saga_dir.make_dir (remote_sandbox, 
                                                   flags=saga.filesystem.CREATE_PARENTS)
                            except Exception as e :
                                logger.exception('Error: %s' % e)
                                # FIXME: why is this exception ignored?  AM


                            logger.info("Processing input file transfers for ComputeUnit %s" % compute_unit_id)
                            # Loop over all transfer directives and execute them.
                            for sd in input_staging:

                                # re-check the CU state before each transfer,
                                # so a cancellation interrupts the staging loop
                                state_doc = um_col.find_one(
                                    {"_id": compute_unit_id},
                                    fields=["state"]
                                )
                                if state_doc['state'] == CANCELED:
                                    logger.info("Compute Unit Canceled, interrupting input file transfers.")
                                    state = CANCELED
                                    break

                                abs_src = os.path.abspath(sd['source'])
                                input_file_url = saga.Url("file://localhost/%s" % abs_src)
                                if not sd['target']:
                                    target = remote_sandbox
                                else:
                                    target = "%s/%s" % (remote_sandbox, sd['target'])

                                log_msg = "Transferring input file %s -> %s" % (input_file_url, target)
                                log_messages.append(log_msg)
                                logger.debug(log_msg)

                                # Execute the transfer.
                                logger.debug ("saga.fs.File ('%s')" % input_file_url)
                                input_file = saga.filesystem.File(
                                    input_file_url,
                                    session=self._session
                                )

                                if CREATE_PARENTS in sd['flags']:
                                    copy_flags = saga.filesystem.CREATE_PARENTS
                                else:
                                    copy_flags = 0

                                try :
                                    input_file.copy(target, flags=copy_flags)
                                except Exception as e :
                                    logger.exception (e)
                                input_file.close()

                                # If all went fine, update the state of this StagingDirective to Done
                                um_col.find_and_modify(
                                    query={"_id" : compute_unit_id,
                                           'FTW_Input_Status': EXECUTING,
                                           'FTW_Input_Directives.state': PENDING,
                                           'FTW_Input_Directives.source': sd['source'],
                                           'FTW_Input_Directives.target': sd['target'],
                                           },
                                    update={'$set': {'FTW_Input_Directives.$.state': 'Done'},
                                            '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(), 
                                                'message'  : log_msg}}
                                    }
                                )

                        except Exception as e :
                            # Update the CU's state 'FAILED'.
                            ts = datetime.datetime.utcnow()
                            logentry = {'message'  : "Input transfer failed: %s" % e,
                                        'timestamp': ts}

                            um_col.update({'_id': compute_unit_id}, {
                                '$set': {'state': FAILED},
                                '$push': {
                                    'statehistory': {'state': FAILED, 'timestamp': ts},
                                    'log': logentry
                                }
                            })

                            logger.exception(str(logentry))

                    # Code below is only to be run by the "first" or only worker
                    if self._worker_number > 1:
                        continue

                    # If the CU was canceled we can skip the remainder of this loop.
                    if state == CANCELED:
                        continue

                    #
                    # Check to see if there are more pending Directives, if not, we are Done
                    #
                    cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                            "$or": [ {"Agent_Input_Status": EXECUTING},
                                                     {"FTW_Input_Status": EXECUTING}
                                                   ]
                                            }
                                           )
                    # Iterate over all the returned CUs (if any)
                    for cu in cursor_w:
                        # See if there are any FTW Input Directives still pending
                        if cu['FTW_Input_Status'] == EXECUTING and \
                                not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['FTW_Input_Directives']):
                            # All Input Directives for this FTW are done, mark the CU accordingly
                            um_col.update({"_id": cu["_id"]},
                                          {'$set': {'FTW_Input_Status': DONE},
                                           '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(),
                                                'message'  : 'All FTW Input Staging Directives done - %d.' % self._worker_number}}
                                           }
                            )

                        # See if there are any Agent Input Directives still pending or executing,
                        # if not, mark it DONE.
                        if cu['Agent_Input_Status'] == EXECUTING and \
                                not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['Agent_Input_Directives']):
                            # All Input Directives for this Agent are done, mark the CU accordingly
                            um_col.update({"_id": cu["_id"]},
                                           {'$set': {'Agent_Input_Status': DONE},
                                            '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(), 
                                                'message'  : 'All Agent Input Staging Directives done - %d.' % self._worker_number}}
                                           }
                            )

                    #
                    # Check for all CUs if both Agent and FTW staging is done, we can then mark the CU PendingExecution
                    #
                    ts = datetime.datetime.utcnow()
                    um_col.find_and_modify(
                        query={"unitmanager": self.unit_manager_id,
                               "Agent_Input_Status": { "$in": [ None, DONE ] },
                               "FTW_Input_Status": { "$in": [ None, DONE ] },
                               "state": STAGING_INPUT
                        },
                        update={"$set": {
                                    "state": PENDING_EXECUTION
                                },
                                "$push": {
                                    "statehistory": {"state": PENDING_EXECUTION, "timestamp": ts}
                                }
                        }
                    )

            except Exception as e :

                logger.exception("transfer worker error: %s" % e)
                self._session.close (cleanup=False)
                raise

        except SystemExit as e :
            logger.debug("input file transfer thread caught system exit -- forcing application shutdown")
            import thread
            thread.interrupt_main ()
Example #14
0
    def _start_pilot_bulk(self, resource, schema, pilots):
        """
        For each pilot, we prepare by determining what files need to be staged,
        and what job description needs to be submitted.

        We expect `_prepare_pilot(resource, rcfg, pilot)` to return a dict with:

            {
              'jd' : saga.job.Description,
              'ft' : [
                { 'src' : string  # absolute source file name
                  'tgt' : string  # relative target file name
                  'rem' : bool    # shall we remove src?
                },
                ... ]
            }

        When transfering data, we'll ensure that each src is only transferred
        once (in fact, we put all src files into a tarball and unpack that on
        the target side).

        The returned dicts are expected to only contain files which actually
        need staging, ie. which have not been staged during a previous pilot
        submission.  That implies one of two things: either this component is
        stateful, and remembers what has been staged -- which makes it difficult
        to use multiple component instances; or the component inspects the
        target resource for existing files -- which involves additional
        expensive remote hops.
        FIXME: since neither is implemented at this point we won't discuss the
               tradeoffs further -- right now files are unique per pilot bulk.

        Once all dicts are collected, we create one additional file which
        contains the staging information, and then pack all src files into
        a tarball for staging.  We transfer the tarball, and *immediately*
        trigger the untaring on the target resource, which is thus *not* part of
        the bootstrapping process.
        NOTE: this is to avoid untaring race conditions for multiple pilots, and
              also to simplify bootstrapping dependencies -- the bootstrappers
              are likely within the tarball after all...
        """

        rcfg = self._session.get_resource_config(resource, schema)
        sid = self._session.uid

        # we create a fake session_sandbox with all pilot_sandboxes in /tmp, and
        # then tar it up.  Once we untar that tarball on the target machine, we
        # should have all sandboxes and all files required to bootstrap the
        # pilots
        # FIXME: on untar, there is a race between multiple launcher components
        #        within the same session toward the same target resource.
        tmp_dir = os.path.abspath(tempfile.mkdtemp(prefix='rp_agent_tar_dir'))
        tar_name = '%s.%s.tgz' % (sid, self.uid)
        tar_tgt = '%s/%s' % (tmp_dir, tar_name)
        tar_url = rs.Url('file://localhost/%s' % tar_tgt)

        # we need the session sandbox url, but that is (at least in principle)
        # dependent on the schema to use for pilot startup.  So we confirm here
        # that the bulk is consistent wrt. to the schema.
        # FIXME: if it is not, it needs to be splitted into schema-specific
        # sub-bulks
        schema = pilots[0]['description'].get('access_schema')
        for pilot in pilots[1:]:
            assert(schema == pilot['description'].get('access_schema')), \
                    'inconsistent scheme on launch / staging'

        session_sandbox = self._session._get_session_sandbox(pilots[0]).path

        # we will create the session sandbox before we untar, so we can use that
        # as workdir, and pack all paths relative to that session sandbox.  That
        # implies that we have to recheck that all URLs in fact do point into
        # the session sandbox.

        ft_list = list()  # files to stage
        jd_list = list()  # jobs  to submit
        for pilot in pilots:
            info = self._prepare_pilot(resource, rcfg, pilot)
            ft_list += info['ft']
            jd_list.append(info['jd'])
            self._prof.prof('staging_in_start', uid=pilot['uid'])

        for ft in ft_list:
            src = os.path.abspath(ft['src'])
            tgt = os.path.relpath(os.path.normpath(ft['tgt']), session_sandbox)
            # src_dir = os.path.dirname(src)
            tgt_dir = os.path.dirname(tgt)

            if tgt_dir.startswith('..'):
                raise ValueError('staging target %s outside of pilot sandbox' %
                                 ft['tgt'])

            if not os.path.isdir('%s/%s' % (tmp_dir, tgt_dir)):
                os.makedirs('%s/%s' % (tmp_dir, tgt_dir))

            if src == '/dev/null':
                # we want an empty file -- touch it (tar will refuse to
                # handle a symlink to /dev/null)
                open('%s/%s' % (tmp_dir, tgt), 'a').close()
            else:
                os.symlink(src, '%s/%s' % (tmp_dir, tgt))

        # tar.  If any command fails, this will raise.
        cmd = "cd %s && tar zchf %s *" % (tmp_dir, tar_tgt)
        self._log.debug('cmd: %s', cmd)
        try:
            out = sp.check_output(["/bin/sh", "-c", cmd], stderr=sp.STDOUT)
        except sp.CalledProcessError as e:
            # BUGFIX: `out` is never assigned when check_output raises, so it
            # must not be referenced here -- log the captured output from the
            # exception object instead
            self._log.exception('callout failed: %s', e.output)
            raise
        except Exception:
            self._log.exception('callout failed')
            raise
        else:
            self._log.debug('out: %s', out)

        # remove all files marked for removal-after-pack
        for ft in ft_list:
            if ft['rem']:
                os.unlink(ft['src'])

        fs_endpoint = rcfg['filesystem_endpoint']
        fs_url = rs.Url(fs_endpoint)

        self._log.debug("rs.file.Directory ('%s')", fs_url)

        with self._cache_lock:
            if fs_url in self._saga_fs_cache:
                fs = self._saga_fs_cache[fs_url]
            else:
                fs = rsfs.Directory(fs_url, session=self._session)
                self._saga_fs_cache[fs_url] = fs

        tar_rem = rs.Url(fs_url)
        tar_rem.path = "%s/%s" % (session_sandbox, tar_name)

        fs.copy(tar_url, tar_rem, flags=rsfs.CREATE_PARENTS)

        shutil.rmtree(tmp_dir)

        # we now need to untar on the target machine.
        js_url = ru.Url(pilots[0]['js_url'])

        # well, we actually don't need to talk to the lrms, but only need
        # a shell on the headnode.  That seems true for all LRMSs we use right
        # now.  So, lets convert the URL:
        if '+' in js_url.scheme:
            parts = js_url.scheme.split('+')
            if 'gsissh' in parts: js_url.scheme = 'gsissh'
            elif 'ssh' in parts: js_url.scheme = 'ssh'
        else:
            # In the non-combined '+' case we need to distinguish between
            # a url that was the result of a hop or a local lrms.
            if js_url.scheme not in ['ssh', 'gsissh']:
                js_url.scheme = 'fork'

        with self._cache_lock:
            if js_url in self._saga_js_cache:
                js_tmp = self._saga_js_cache[js_url]
            else:
                js_tmp = rs.job.Service(js_url, session=self._session)
                self._saga_js_cache[js_url] = js_tmp

      # cmd = "tar zmxvf %s/%s -C / ; rm -f %s" % \
        cmd = "tar zmxvf %s/%s -C %s" % \
                (session_sandbox, tar_name, session_sandbox)
        j = js_tmp.run_job(cmd)
        j.wait()

        self._log.debug('tar cmd : %s', cmd)
        self._log.debug('tar done: %s, %s, %s', j.state, j.stdout, j.stderr)

        for pilot in pilots:
            self._prof.prof('staging_in_stop', uid=pilot['uid'])
            self._prof.prof('submission_start', uid=pilot['uid'])

        # look up or create JS for actual pilot submission.  This might result
        # in the same js url as above, or not.
        js_ep = rcfg['job_manager_endpoint']
        with self._cache_lock:
            if js_ep in self._saga_js_cache:
                js = self._saga_js_cache[js_ep]
            else:
                js = rs.job.Service(js_ep, session=self._session)
                self._saga_js_cache[js_ep] = js

        # now that the scripts are in place and configured,
        # we can launch the agent
        jc = rs.job.Container()

        for jd in jd_list:
            self._log.debug('jd: %s', pprint.pformat(jd.as_dict()))
            jc.add(js.create_job(jd))

        jc.run()

        # we assume here that the tasks arrive in the same order as the job
        # descriptions.  For uniform sets of pilots the order does not matter
        # much though.  Either way, this needs confirming on SAGA level
        # FIXME
        for j, jd in zip(jc.get_tasks(), jd_list):

            # do a quick error check
            if j.state == rs.FAILED:
                self._log.error('%s: %s : %s : %s', j.id, j.state, j.stderr,
                                j.stdout)
                raise RuntimeError("SAGA Job state is FAILED. (%s)" % jd.name)

            pilot = None
            pid = jd.name
            for p in pilots:
                if p['uid'] == pid:
                    pilot = p
                    break

            assert (pilot)

            # Update the Pilot's state to 'PMGR_ACTIVE_PENDING' if SAGA job
            # submission was successful.  Since the pilot leaves the scope of
            # the PMGR for the time being, we update the complete DB document
            pilot['$all'] = True

            # FIXME: update the right pilot
            with self._pilots_lock:

                self._pilots[pid] = dict()
                self._pilots[pid]['pilot'] = pilot
                self._pilots[pid]['job'] = j

            # make sure we watch that pilot
            with self._check_lock:
                self._checking.append(pid)

        for pilot in pilots:
            self._prof.prof('submission_stop', uid=pilot['uid'])
Example #15
0
    def _get_resource_sandbox(self, pilot):
        """
        Determine the global RP sandbox URL for the resource targeted by the
        given pilot dict (based on the pilot description's 'resource'
        attribute).  Results are cached per resource.
        """

        self.is_valid()

        # FIXME: this should get 'resource, schema=None' as parameters

        resource = pilot['description'].get('resource')
        schema   = pilot['description'].get('access_schema')

        if not resource:
            raise ValueError('Cannot get pilot sandbox w/o resource target')

        # the sandbox is identical for every pilot on a given resource, so
        # it is computed at most once and afterwards served from the cache
        with self._cache_lock:

            cache = self._cache['resource_sandbox']

            if resource not in cache:

                # cache miss: determine the sandbox URL and store it
                rcfg   = self.get_resource_config(resource, schema)
                fs_url = rs.Url(rcfg['filesystem_endpoint'])

                # the base workdir comes from the pilot description, falling
                # back to the resource configuration
                sandbox_raw = pilot['description'].get('sandbox') \
                           or rcfg.get('default_remote_workdir', "$PWD")

                if '$' in sandbox_raw or '`' in sandbox_raw:

                    # the path contains shell expandables, which must be
                    # resolved on the target resource itself.
                    # NOTE: this only works for (gsi)ssh or shell based
                    #       access mechanisms
                    js_url = rs.Url(rcfg['job_manager_endpoint'])

                    schema_parts = js_url.schema.split('+')
                    for access in ['ssh', 'gsissh', 'fork']:
                        if access in schema_parts:
                            js_url.schema = access
                            break
                    else:
                        if '+' in js_url.schema:
                            raise Exception("unsupported access schema: %s"
                                            % js_url.schema)
                        # for local access to queueing systems use fork
                        js_url.schema = 'fork'

                    self._log.debug("rsup.PTYShell('%s')", js_url)
                    shell = rsup.PTYShell(js_url, self)

                    ret, out, _ = shell.run_sync(' echo "WORKDIR: %s"'
                                                 % sandbox_raw)
                    if ret != 0 or 'WORKDIR:' not in out:
                        raise RuntimeError("Couldn't get remote working directory.")

                    sandbox_base = out.split(":")[1].strip()
                    self._log.debug("sandbox base %s: '%s'", js_url, sandbox_base)

                else:
                    # no expandables -- use the raw path as is
                    sandbox_base = sandbox_raw

                # the global sandbox lives relative to the remote 'pwd'
                fs_url.path = "%s/radical.pilot.sandbox" % sandbox_base

                cache[resource] = fs_url

            return cache[resource]
Example #16
0
    def _prepare_pilot(self, resource, rcfg, pilot):
        """
        Assemble everything needed to launch one pilot on `resource`.

        This collects values from the pilot description and the resource
        configuration, writes the agent config file, and builds (a) the list
        of file staging directives and (b) the SAGA job description for the
        bootstrapper.

        Parameters:
            resource (str) : resource label (key into the resource configs)
            rcfg     (dict): resource configuration for `resource`
            pilot    (dict): pilot document; `pilot['description']` carries
                             the user-supplied pilot description

        Returns:
            dict: {'ft': list of {'src', 'tgt', 'rem'} transfer directives,
                   'jd': saga.job.Description for the bootstrapper}

        Raises:
            ValueError  : missing mandatory description attribute, or an
                          invalid `rp_version` setting
            RuntimeError: incomplete resource configuration, or use of the
                          deprecated `global_virtenv` setting
            TypeError   : agent config is neither a dict nor a config name
        """

        pid = pilot["uid"]
        ret = {'ft': list(), 'jd': None}

        # ----------------------------------------------------------------------
        # Database connection parameters
        sid = self._session.uid
        database_url = self._session.dburl

        # some default values are determined at runtime
        default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \
                          (resource, self._rp_version)

        # ----------------------------------------------------------------------
        # pilot description and resource configuration
        number_cores = pilot['description']['cores']
        number_gpus = pilot['description']['gpus']
        runtime = pilot['description']['runtime']
        queue = pilot['description']['queue']
        project = pilot['description']['project']
        cleanup = pilot['description']['cleanup']
        memory = pilot['description']['memory']
        candidate_hosts = pilot['description']['candidate_hosts']

        # ----------------------------------------------------------------------
        # get parameters from resource cfg, set defaults where needed
        agent_launch_method = rcfg.get('agent_launch_method')
        agent_dburl = rcfg.get('agent_mongodb_endpoint', database_url)
        agent_spawner = rcfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER)
        rc_agent_config = rcfg.get('agent_config', DEFAULT_AGENT_CONFIG)
        agent_scheduler = rcfg.get('agent_scheduler')
        tunnel_bind_device = rcfg.get('tunnel_bind_device')
        default_queue = rcfg.get('default_queue')
        forward_tunnel_endpoint = rcfg.get('forward_tunnel_endpoint')
        lrms = rcfg.get('lrms')
        mpi_launch_method = rcfg.get('mpi_launch_method', '')
        pre_bootstrap_0 = rcfg.get('pre_bootstrap_0', [])
        pre_bootstrap_1 = rcfg.get('pre_bootstrap_1', [])
        python_interpreter = rcfg.get('python_interpreter')
        task_launch_method = rcfg.get('task_launch_method')
        rp_version = rcfg.get('rp_version', DEFAULT_RP_VERSION)
        virtenv_mode = rcfg.get('virtenv_mode', DEFAULT_VIRTENV_MODE)
        virtenv = rcfg.get('virtenv', default_virtenv)
        cores_per_node = rcfg.get('cores_per_node', 0)
        gpus_per_node = rcfg.get('gpus_per_node', 0)
        lfs_path_per_node = rcfg.get('lfs_path_per_node', None)
        lfs_size_per_node = rcfg.get('lfs_size_per_node', 0)
        python_dist = rcfg.get('python_dist')
        virtenv_dist = rcfg.get('virtenv_dist', DEFAULT_VIRTENV_DIST)
        cu_tmp = rcfg.get('cu_tmp')
        spmd_variation = rcfg.get('spmd_variation')
        shared_filesystem = rcfg.get('shared_filesystem', True)
        stage_cacerts = rcfg.get('stage_cacerts', False)
        cu_pre_exec = rcfg.get('cu_pre_exec')
        cu_post_exec = rcfg.get('cu_post_exec')
        export_to_cu = rcfg.get('export_to_cu')
        mandatory_args = rcfg.get('mandatory_args', [])
        saga_jd_supplement = rcfg.get('saga_jd_supplement', {})

        # function-local import: pprint is only needed for debug output
        import pprint
        self._log.debug(cores_per_node)
        self._log.debug(pprint.pformat(rcfg))

        # make sure that mandatory args are known
        for ma in mandatory_args:
            if pilot['description'].get(ma) is None:
                raise ValueError('attribute "%s" is required for "%s"' %
                                 (ma, resource))

        # get pilot and global sandbox
        resource_sandbox = self._session._get_resource_sandbox(pilot).path
        session_sandbox = self._session._get_session_sandbox(pilot).path
        pilot_sandbox = self._session._get_pilot_sandbox(pilot).path

        pilot['resource_sandbox'] = str(
            self._session._get_resource_sandbox(pilot))
        pilot['pilot_sandbox'] = str(self._session._get_pilot_sandbox(pilot))
        pilot['client_sandbox'] = str(self._session._get_client_sandbox())

        # Agent configuration that is not part of the public API.
        # The agent config can either be a config dict, or
        # a string pointing to a configuration name.  If neither
        # is given, check if 'RADICAL_PILOT_AGENT_CONFIG' is
        # set.  The last fallback is 'agent_default'
        agent_config = pilot['description'].get('_config')
        if not agent_config:
            agent_config = os.environ.get('RADICAL_PILOT_AGENT_CONFIG')
        if not agent_config:
            agent_config = rc_agent_config

        if isinstance(agent_config, dict):

            # use dict as is
            agent_cfg = agent_config

        elif isinstance(agent_config, basestring):
            try:
                # interpret as a config name
                agent_cfg_file = os.path.join(self._conf_dir,
                                              "agent_%s.json" % agent_config)

                self._log.info("Read agent config file: %s", agent_cfg_file)
                agent_cfg = ru.read_json(agent_cfg_file)

                # allow for user level overload
                user_cfg_file = '%s/.radical/pilot/config/%s' \
                              % (os.environ['HOME'], os.path.basename(agent_cfg_file))

                if os.path.exists(user_cfg_file):
                    self._log.info("merging user config: %s" % user_cfg_file)
                    user_cfg = ru.read_json(user_cfg_file)
                    ru.dict_merge(agent_cfg, user_cfg, policy='overwrite')

            except Exception as e:
                self._log.exception("Error reading agent config file: %s" % e)
                raise

        else:
            # we can't handle this type
            raise TypeError(
                'agent config must be string (config name) or dict')

        # expand variables in virtenv string
        virtenv = virtenv % {
            'pilot_sandbox': pilot_sandbox,
            'session_sandbox': session_sandbox,
            'resource_sandbox': resource_sandbox
        }

        # Check for deprecated global_virtenv
        if 'global_virtenv' in rcfg:
            raise RuntimeError("'global_virtenv' is deprecated (%s)" %
                               resource)

        # Create a host:port string for use by the bootstrap_0.
        db_url = rs.Url(agent_dburl)
        if db_url.port:
            db_hostport = "%s:%d" % (db_url.host, db_url.port)
        else:
            db_hostport = "%s:%d" % (db_url.host, 27017)  # mongodb default

        # ----------------------------------------------------------------------
        # the version of the agent is derived from
        # rp_version, which has the following format
        # and interpretation:
        #
        # case rp_version:
        #   @<token>:
        #   @tag/@branch/@commit: # no sdist staging
        #       git clone $github_base radical.pilot.src
        #       (cd radical.pilot.src && git checkout token)
        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
        #       rm -rf radical.pilot.src
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   release: # no sdist staging
        #       pip install -t $VIRTENV/rp_install radical.pilot
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   local: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $VIRTENV/rp_install $sdist/
        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
        #
        #   debug: # needs sdist staging
        #       tar zxf $sdist.tgz
        #       pip install -t $SANDBOX/rp_install $sdist/
        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
        #
        #   installed: # no sdist staging
        #       true
        # esac
        #
        # virtenv_mode
        #   private : error  if ve exists, otherwise create, then use
        #   update  : update if ve exists, otherwise create, then use
        #   create  : use    if ve exists, otherwise create, then use
        #   use     : use    if ve exists, otherwise error,  then exit
        #   recreate: delete if ve exists, otherwise create, then use
        #
        # examples   :
        #   [email protected]
        #   virtenv@devel
        #   virtenv@release
        #   virtenv@installed
        #   stage@local
        #   stage@/tmp/my_agent.py
        #
        # Note that some combinations may be invalid,
        # specifically in the context of virtenv_mode.  If, for
        # example, virtenv_mode is 'use', then the 'virtenv:tag'
        # will not make sense, as the virtenv is not updated.
        # In those cases, the virtenv_mode is honored, and
        # a warning is printed.
        #
        # Also, the 'stage' mode can only be combined with the
        # 'local' source, or with a path to the agent (relative
        # to root_dir, or absolute).
        #
        # A rp_version which does not adhere to the
        # above syntax is ignored, and the fallback stage@local
        # is used.

        if  not rp_version.startswith('@') and \
            rp_version not in ['installed', 'local', 'debug', 'release']:
            raise ValueError("invalid rp_version '%s'" % rp_version)

        if rp_version.startswith('@'):
            rp_version = rp_version[1:]  # strip '@'

        # ----------------------------------------------------------------------
        # sanity checks
        if not python_dist: raise RuntimeError("missing python distribution")
        if not virtenv_dist:
            raise RuntimeError("missing virtualenv distribution")
        if not agent_spawner: raise RuntimeError("missing agent spawner")
        if not agent_scheduler: raise RuntimeError("missing agent scheduler")
        if not lrms: raise RuntimeError("missing LRMS")
        if not agent_launch_method:
            raise RuntimeError("missing agent launch method")
        if not task_launch_method:
            raise RuntimeError("missing task launch method")

        # massage some values
        if not queue:
            queue = default_queue

        if cleanup and isinstance(cleanup, bool):
            #  l : log files
            #  u : unit work dirs
            #  v : virtualenv
            #  e : everything (== pilot sandbox)
            if shared_filesystem:
                cleanup = 'luve'
            else:
                # we cannot clean the sandbox from within the agent, as the hop
                # staging would then fail, and we'd get nothing back.
                # FIXME: cleanup needs to be done by the pmgr.launcher, or
                #        someone else, really, after fetching all logs and
                #        profiles.
                cleanup = 'luv'

            # we never cleanup virtenvs which are not private
            # (use '!=', not 'is not': string identity is an interning
            # accident, not a guarantee)
            if virtenv_mode != 'private':
                cleanup = cleanup.replace('v', '')

        # add dists to staging files, if needed
        if rp_version in ['local', 'debug']:
            sdist_names = [ru.sdist_name, rs.sdist_name, self._rp_sdist_name]
            sdist_paths = [ru.sdist_path, rs.sdist_path, self._rp_sdist_path]
        else:
            sdist_names = list()
            sdist_paths = list()

        # if cores_per_node is set (!= None), then we need to
        # allocation full nodes, and thus round up
        if cores_per_node:
            cores_per_node = int(cores_per_node)
            number_cores = int(cores_per_node *
                               math.ceil(float(number_cores) / cores_per_node))

        # if gpus_per_node is set (!= None), then we need to
        # allocation full nodes, and thus round up
        if gpus_per_node:
            gpus_per_node = int(gpus_per_node)
            number_gpus = int(gpus_per_node *
                              math.ceil(float(number_gpus) / gpus_per_node))

        # set mandatory args
        bootstrap_args = ""
        bootstrap_args += " -d '%s'" % ':'.join(sdist_names)
        bootstrap_args += " -p '%s'" % pid
        bootstrap_args += " -s '%s'" % sid
        bootstrap_args += " -m '%s'" % virtenv_mode
        bootstrap_args += " -r '%s'" % rp_version
        bootstrap_args += " -b '%s'" % python_dist
        bootstrap_args += " -g '%s'" % virtenv_dist
        bootstrap_args += " -v '%s'" % virtenv
        bootstrap_args += " -y '%d'" % runtime

        # set optional args
        if lrms == "CCM": bootstrap_args += " -c"
        if forward_tunnel_endpoint:
            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
        if forward_tunnel_endpoint: bootstrap_args += " -h '%s'" % db_hostport
        if python_interpreter:
            bootstrap_args += " -i '%s'" % python_interpreter
        if tunnel_bind_device:
            bootstrap_args += " -t '%s'" % tunnel_bind_device
        if cleanup: bootstrap_args += " -x '%s'" % cleanup

        for arg in pre_bootstrap_0:
            bootstrap_args += " -e '%s'" % arg
        for arg in pre_bootstrap_1:
            bootstrap_args += " -w '%s'" % arg

        agent_cfg['owner'] = 'agent_0'
        agent_cfg['cores'] = number_cores
        agent_cfg['gpus'] = number_gpus
        agent_cfg['lrms'] = lrms
        agent_cfg['spawner'] = agent_spawner
        agent_cfg['scheduler'] = agent_scheduler
        agent_cfg['runtime'] = runtime
        agent_cfg['dburl'] = str(database_url)
        agent_cfg['session_id'] = sid
        agent_cfg['pilot_id'] = pid
        agent_cfg['logdir'] = '.'
        agent_cfg['pilot_sandbox'] = pilot_sandbox
        agent_cfg['session_sandbox'] = session_sandbox
        agent_cfg['resource_sandbox'] = resource_sandbox
        agent_cfg['agent_launch_method'] = agent_launch_method
        agent_cfg['task_launch_method'] = task_launch_method
        agent_cfg['mpi_launch_method'] = mpi_launch_method
        agent_cfg['cores_per_node'] = cores_per_node
        agent_cfg['gpus_per_node'] = gpus_per_node
        agent_cfg['lfs_path_per_node'] = lfs_path_per_node
        agent_cfg['lfs_size_per_node'] = lfs_size_per_node
        agent_cfg['cu_tmp'] = cu_tmp
        agent_cfg['export_to_cu'] = export_to_cu
        agent_cfg['cu_pre_exec'] = cu_pre_exec
        agent_cfg['cu_post_exec'] = cu_post_exec
        agent_cfg['resource_cfg'] = copy.deepcopy(rcfg)
        agent_cfg['debug'] = self._log.getEffectiveLevel()

        # we'll also push the agent config into MongoDB
        pilot['cfg'] = agent_cfg

        # ----------------------------------------------------------------------
        # Write agent config dict to a json file in pilot sandbox.

        agent_cfg_name = 'agent_0.cfg'
        cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.')
        os.close(cfg_tmp_handle)  # file exists now

        # Convert dict to json file
        self._log.debug("Write agent cfg to '%s'.", cfg_tmp_file)
        self._log.debug(pprint.pformat(agent_cfg))
        ru.write_json(agent_cfg, cfg_tmp_file)

        ret['ft'].append({
            'src': cfg_tmp_file,
            'tgt': '%s/%s' % (pilot_sandbox, agent_cfg_name),
            'rem': True
        })  # purge the tmp file after packing

        # ----------------------------------------------------------------------
        # we also touch the log and profile tarballs in the target pilot sandbox
        ret['ft'].append({
            'src': '/dev/null',
            'tgt': '%s/%s' % (pilot_sandbox, '%s.log.tgz' % pid),
            'rem': False
        })  # don't remove /dev/null
        # only stage profiles if we profile
        if self._prof.enabled:
            ret['ft'].append({
                'src': '/dev/null',
                'tgt': '%s/%s' % (pilot_sandbox, '%s.prof.tgz' % pid),
                'rem': False
            })  # don't remove /dev/null

        # check if we have a sandbox cached for that resource.  If so, we have
        # nothing to do.  Otherwise we create the sandbox and stage the RP
        # stack etc.
        # NOTE: this will race when multiple pilot launcher instances are used!
        with self._cache_lock:

            if resource not in self._sandboxes:

                for sdist in sdist_paths:
                    base = os.path.basename(sdist)
                    ret['ft'].append({
                        'src': sdist,
                        'tgt': '%s/%s' % (session_sandbox, base),
                        'rem': False
                    })

                # Copy the bootstrap shell script.
                bootstrapper_path = os.path.abspath(
                    "%s/agent/%s" % (self._root_dir, BOOTSTRAPPER_0))
                self._log.debug("use bootstrapper %s", bootstrapper_path)

                ret['ft'].append({
                    'src': bootstrapper_path,
                    'tgt': '%s/%s' % (session_sandbox, BOOTSTRAPPER_0),
                    'rem': False
                })

                # Some machines cannot run pip due to outdated CA certs.
                # For those, we also stage an updated certificate bundle
                # TODO: use booleans all the way?
                if stage_cacerts:

                    cc_name = 'cacert.pem.gz'
                    cc_path = os.path.abspath("%s/agent/%s" %
                                              (self._root_dir, cc_name))
                    self._log.debug("use CAs %s", cc_path)

                    ret['ft'].append({
                        'src': cc_path,
                        'tgt': '%s/%s' % (session_sandbox, cc_name),
                        'rem': False
                    })

                self._sandboxes[resource] = True

        # ----------------------------------------------------------------------
        # Create SAGA Job description and submit the pilot job

        jd = rs.job.Description()

        if shared_filesystem:
            bootstrap_tgt = '%s/%s' % (session_sandbox, BOOTSTRAPPER_0)
        else:
            bootstrap_tgt = '%s/%s' % ('.', BOOTSTRAPPER_0)

        jd.name = pid
        jd.executable = "/bin/bash"
        jd.arguments = ['-l %s' % bootstrap_tgt, bootstrap_args]
        jd.working_directory = pilot_sandbox
        jd.project = project
        jd.output = "bootstrap_0.out"
        jd.error = "bootstrap_0.err"
        jd.total_cpu_count = number_cores
        jd.total_gpu_count = number_gpus
        jd.processes_per_host = cores_per_node
        jd.spmd_variation = spmd_variation
        jd.wall_time_limit = runtime
        jd.total_physical_memory = memory
        jd.queue = queue
        jd.candidate_hosts = candidate_hosts
        jd.environment = dict()

        # we set any saga_jd_supplement keys which are not already set above
        for key, val in saga_jd_supplement.iteritems():
            if not jd[key]:
                self._log.debug('supplement %s: %s', key, val)
                jd[key] = val

        if 'RADICAL_PILOT_PROFILE' in os.environ:
            jd.environment['RADICAL_PILOT_PROFILE'] = 'TRUE'

        # for condor backends and the like which do not have shared FSs, we add
        # additional staging directives so that the backend system binds the
        # files from the session and pilot sandboxes to the pilot job.
        jd.file_transfer = list()
        if not shared_filesystem:

            jd.file_transfer.extend([
                'site:%s/%s > %s' %
                (session_sandbox, BOOTSTRAPPER_0, BOOTSTRAPPER_0),
                'site:%s/%s > %s' %
                (pilot_sandbox, agent_cfg_name, agent_cfg_name),
                'site:%s/%s.log.tgz > %s.log.tgz' % (pilot_sandbox, pid, pid),
                'site:%s/%s.log.tgz < %s.log.tgz' % (pilot_sandbox, pid, pid)
            ])

            if 'RADICAL_PILOT_PROFILE' in os.environ:
                jd.file_transfer.extend([
                    'site:%s/%s.prof.tgz > %s.prof.tgz' %
                    (pilot_sandbox, pid, pid),
                    'site:%s/%s.prof.tgz < %s.prof.tgz' %
                    (pilot_sandbox, pid, pid)
                ])

            for sdist in sdist_names:
                jd.file_transfer.extend(
                    ['site:%s/%s > %s' % (session_sandbox, sdist, sdist)])

            if stage_cacerts:
                jd.file_transfer.extend(
                    ['site:%s/%s > %s' % (session_sandbox, cc_name, cc_name)])

        self._log.debug("Bootstrap command line: %s %s", jd.executable,
                        jd.arguments)

        ret['jd'] = jd
        return ret
Example #17
0
def fetch_profiles (sid, dburl=None, src=None, tgt=None, access=None,
        session=None, skip_existing=False, fetch_client=False, log=None):
    '''
    Fetch all profiles for a session: optionally the client-side profiles,
    and the profiles from every pilot sandbox (preferring a single tarball
    per pilot over individual file transfers).

    sid          : session for which all profiles are fetched
    dburl        : MongoDB URL (default: $RADICAL_PILOT_DBURL)
    src          : dir to look for client session profiles ($src/$sid/*.prof)
    tgt          : dir to store the profile in
                   - $tgt/$sid/*.prof,
                   - $tgt/$sid/$pilot_id/*.prof)
    access       : access schema override for the pilot sandboxes
    session      : session instance (used for SAGA transfers and logging)
    skip_existing: don't re-fetch profiles which already exist locally
    fetch_client : also fetch the client-side profiles
    log          : logger to use (default: a new 'radical.pilot.utils' logger)

    returns list of file names
    '''

    if not log and session:
        log = session._log
        rep = session._rep
    elif not log:
        log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')
    else:
        # a logger was passed in explicitly -- we still need a reporter
        # (previously `rep` remained undefined on this path, causing a
        # NameError on the first `rep.ok()` call)
        rep = ru.Reporter('radical.pilot.utils')

    ret = list()

    if not dburl:
        dburl = os.environ['RADICAL_PILOT_DBURL']

    if not dburl:
        raise ValueError('RADICAL_PILOT_DBURL is not set')

    if not src:
        src = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    # make relative targets absolute (unless they carry a schema)
    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = rs.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they dont become interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    # first fetch session profile
    if fetch_client:
        client_profiles = glob.glob("%s/%s/*.prof" % (src, sid))
        if not client_profiles:
            raise RuntimeError('no client profiles in %s/%s' % (src, sid))

        for client_profile in client_profiles:

            ftgt = rs.Url('%s/%s' % (tgt_url, os.path.basename(client_profile)))
            ret.append("%s" % ftgt.path)

            # skip files which already exist (and are non-empty), if requested
            if skip_existing and os.path.isfile(ftgt.path) \
                    and os.stat(ftgt.path).st_size > 0:
                pass
            else:
                prof_file = rs.fs.File(client_profile, session=session)
                prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                prof_file.close()

            if not os.path.isfile(client_profile):
                raise RuntimeError('client profile file %s does not exist' % client_profile)

    _, db, _, _, _ = ru.mongodb_connect(dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.debug("Session: %s", sid)
    log.debug("Number of pilots in session: %d", num_pilots)

    for pilot in pilots:

        try:
            log.debug("processing pilot '%s'", pilot['uid'])

            sandbox_url = rs.Url(pilot['pilot_sandbox'])

            if access:
                # Allow to use a different access schema than used for the run.
                # Useful if you ran from the headnode, but would like to
                # retrieve the profiles to your desktop (Hello Titan).
                access_url = rs.Url(access)
                sandbox_url.schema = access_url.schema
                sandbox_url.host   = access_url.host

            sandbox = rs.fs.Directory(sandbox_url, session=session)

            # Try to fetch a tarball of profiles, so that we can get them
            # all in one (SAGA) go!
            PROFILES_TARBALL = '%s.prof.tgz' % pilot['uid']
            tarball_available = False
            try:
                if  sandbox.is_file(PROFILES_TARBALL) and \
                    sandbox.get_size(PROFILES_TARBALL):

                    log.info("profiles tarball exists")
                    ftgt = rs.Url('%s/%s' % (tgt_url, PROFILES_TARBALL))

                    if skip_existing and os.path.isfile(ftgt.path) \
                            and os.stat(ftgt.path).st_size > 0:

                        log.info("skip fetching of '%s/%s' to '%s'.",
                                 sandbox_url, PROFILES_TARBALL, tgt_url)
                        tarball_available = True
                    else:

                        log.info("fetch '%s%s' to '%s'.", sandbox_url,
                                 PROFILES_TARBALL, tgt_url)

                        prof_file = rs.fs.File("%s%s" % (sandbox_url,
                                            PROFILES_TARBALL), session=session)
                        prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                        prof_file.close()

                        tarball_available = True
                else:
                    log.warning("profiles tarball does not exist!")

            except rs.DoesNotExist:
                log.exception("exception(TODO): profiles tarball does not exist!")

            try:
                os.mkdir("%s/%s" % (tgt_url.path, pilot['uid']))
            except OSError:
                # target dir already exists -- that is fine
                pass

            # We now have a local tarball
            if tarball_available:
                log.info("Extract tarball %s to '%s'.", ftgt.path, tgt_url.path)
                try:
                    tarball = tarfile.open(ftgt.path, mode='r:gz')
                    tarball.extractall("%s/%s" % (tgt_url.path, pilot['uid']))

                    profiles = glob.glob("%s/%s/*.prof" % (tgt_url.path, pilot['uid']))
                    ret.extend(profiles)
                    os.unlink(ftgt.path)

                    # If extract succeeded, no need to fetch individual profiles
                    rep.ok("+ %s (profiles)\n" % pilot['uid'])
                    continue

                except Exception as e:
                    log.warning('could not extract tarball %s [%s]', ftgt.path, e)

            # If we dont have a tarball (for whichever reason), fetch
            # individual profiles
            profiles = sandbox.list('*.prof')
            for prof in profiles:

                ftgt = rs.Url('%s/%s/%s' % (tgt_url, pilot['uid'], prof))
                ret.append("%s" % ftgt.path)

                if skip_existing and os.path.isfile(ftgt.path) \
                                 and os.stat(ftgt.path).st_size > 0:
                    pass
                else:
                    prof_file = rs.fs.File("%s%s" % (sandbox_url, prof), session=session)
                    prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                    prof_file.close()

            rep.ok("+ %s (profiles)\n" % pilot['uid'])

        except Exception:
            rep.error("- %s (profiles)\n" % pilot['uid'])
            log.exception('failed to fetch profile for %s', pilot['uid'])

    return ret
    def run(self):
        """Starts the process when Process.start() is called.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:

            # Try to connect to the database and create a tailable cursor.
            try:
                db = self._session.get_db()
                um_col = db["%s.cu" % self._session.uid]
                logger.debug(
                    "Connected to MongoDB. Serving requests for UnitManager %s."
                    % self.unit_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            while not self._stop.is_set() and \
                  not self._session._terminate.is_set():

                # See if we can find a ComputeUnit that is waiting for client output file transfer.
                # FIXME: this method is not bulkable.  See agent pulling for
                #        units for an approach to split the call into two bulkable
                #        ones.
                ts = timestamp()
                compute_unit = um_col.find_and_modify(
                    query={
                        "unitmanager": self.unit_manager_id,
                        "state": PENDING_OUTPUT_STAGING,
                        "control": 'agent'
                    },
                    update={
                        "$set": {
                            "state": STAGING_OUTPUT,
                            "control": 'umgr'
                        },
                        "$push": {
                            "statehistory": {
                                "state": STAGING_OUTPUT,
                                "timestamp": ts
                            }
                        }
                    })

                if compute_unit is None:
                    # Sleep a bit if no new units are available.
                    time.sleep(IDLE_TIME)
                else:
                    logger.info("OFTW CU found, progressing ...")
                    state = STAGING_OUTPUT
                    compute_unit_id = None
                    try:
                        log_messages = []

                        # We have found a new CU. Now we can process the transfer
                        # directive(s) with SAGA.
                        compute_unit_id = str(compute_unit["_id"])

                        self._session.prof.prof('advance',
                                                uid=compute_unit_id,
                                                msg=STAGING_OUTPUT,
                                                state=STAGING_OUTPUT)
                        logger.debug(
                            "OutputStagingController: unit found: %s" %
                            compute_unit_id)

                        remote_sandbox = compute_unit["sandbox"]
                        output_staging = compute_unit.get(
                            "FTW_Output_Directives", [])

                        logger.info(
                            "OutputStagingController: Processing output file transfers for ComputeUnit %s"
                            % compute_unit_id)
                        # Loop over all staging directives and execute them.
                        for sd in output_staging:

                            logger.debug(
                                "OutputStagingController: sd: %s : %s" %
                                (compute_unit_id, sd))

                            # Check if there was a cancel request for this CU
                            # TODO: Can't these cancel requests come from a central place?
                            state_doc = um_col.find_one(
                                {"_id": compute_unit_id}, fields=["state"])
                            if state_doc['state'] == CANCELED:
                                logger.info(
                                    "Compute Unit Canceled, interrupting output file transfers."
                                )
                                self._session.prof.prof('advance',
                                                        uid=compute_unit_id,
                                                        msg=CANCELED,
                                                        state=CANCELED)
                                state = CANCELED
                                # Break out of the loop over all SD's, into the loop over CUs
                                break

                            abs_src = "%s/%s" % (remote_sandbox, sd['source'])

                            if os.path.basename(sd['target']) == sd['target']:
                                abs_target = "file://localhost%s" % os.path.join(
                                    os.getcwd(), sd['target'])
                            else:
                                abs_target = "file://localhost%s" % os.path.abspath(
                                    sd['target'])

                            log_msg = "Transferring output file %s -> %s" % (
                                abs_src, abs_target)
                            log_messages.append(log_msg)
                            logger.debug(log_msg)

                            output_file = saga.filesystem.File(
                                saga.Url(abs_src), session=self._session)

                            if CREATE_PARENTS in sd['flags']:
                                copy_flags = saga.filesystem.CREATE_PARENTS
                            else:
                                copy_flags = 0

                            try:
                                output_file.copy(saga.Url(abs_target),
                                                 flags=copy_flags)
                                output_file.close()
                            except Exception as e:
                                logger.exception(e)
                                raise Exception("copy failed(%s)" % e.message)

                        # If the CU was canceled we can skip the remainder of this loop,
                        # and return to the CU loop
                        if state == CANCELED:
                            continue

                        # Update the CU's state to 'DONE'.
                        ts = timestamp()
                        log_message = "Output transfer completed."
                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {
                                'state': DONE
                            },
                            '$push': {
                                'statehistory': {
                                    'state': DONE,
                                    'timestamp': ts
                                },
                                'log': {
                                    'message': log_message,
                                    'timestamp': ts
                                }
                            }
                        })
                        self._session.prof.prof('advance',
                                                uid=compute_unit_id,
                                                msg=DONE,
                                                state=DONE)

                    except Exception as e:
                        # Update the CU's state to 'FAILED'.
                        ts = timestamp()
                        log_message = "Output transfer failed: %s" % e
                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {
                                'state': FAILED
                            },
                            '$push': {
                                'statehistory': {
                                    'state': FAILED,
                                    'timestamp': ts
                                },
                                'log': {
                                    'message': log_message,
                                    'timestamp': ts
                                }
                            }
                        })
                        logger.exception(log_message)
                        self._session.prof.prof('advance',
                                                uid=compute_unit_id,
                                                msg=FAILED,
                                                state=FAILED)
                        raise

        except SystemExit as e:
            logger.exception(
                "output file transfer thread caught system exit -- forcing application shutdown"
            )
            thread.interrupt_main()
Example #19
0
    def execute_pattern(self, pattern, resource):

        pattern_start_time = datetime.datetime.now()

        def get_input_data(kernel, instance=None, iteration=None, ktype=None):
            """Assemble the input staging directives for *kernel*.

            Collects, in order: ``upload_input_data`` (plain transfer, no
            action key), ``link_input_data`` (LINK action),
            ``copy_input_data`` (COPY action), and ``download_input_data``
            (used verbatim).  Each path entry may have the form
            ``"source > target"``; without ``>`` the target defaults to the
            basename of the source.

            NOTE: fixes the original dead code — the accumulator was
            initialized to ``[]`` and then repeatedly tested with
            ``if ip_list is None``, a branch that could never be taken.
            """

            def _resolve(path):
                # instance/iteration placeholders only apply to simulation
                # and analysis kernels; other kernel types resolve against
                # the working dirs alone.
                if ktype in ('simulation', 'analysis'):
                    return resolve_placeholder_vars(
                        working_dirs=self.working_dirs,
                        path=path,
                        instance=instance,
                        iteration=iteration,
                        type=ktype)
                return resolve_placeholder_vars(
                    working_dirs=self.working_dirs, path=path)

            def _directives(paths, action=None):
                # Parse 'source > target' entries into staging directive
                # dicts, attaching the staging action when one is given.
                sds = []
                for path in paths:
                    var = _resolve(path)
                    parts = var.split('>')
                    source = parts[0].strip()
                    if len(parts) > 1:
                        target = parts[1].strip()
                    else:
                        target = os.path.basename(source)
                    sd = {'source': source, 'target': target}
                    if action is not None:
                        sd['action'] = action
                    sds.append(sd)
                return sds

            ip_list = []

            # upload_input_data: transferred from the client machine.
            # The single-item -> list normalization is done in place so that
            # repeated calls see a list (preserves original side effect).
            if kernel._kernel._upload_input_data is not None:
                if not isinstance(kernel._kernel._upload_input_data, list):
                    kernel._kernel._upload_input_data = [
                        kernel._kernel._upload_input_data
                    ]
                ip_list += _directives(kernel._kernel._upload_input_data)

            # link_input_data: symlinked within the resource
            if kernel._kernel._link_input_data is not None:
                if not isinstance(kernel._kernel._link_input_data, list):
                    kernel._kernel._link_input_data = [
                        kernel._kernel._link_input_data
                    ]
                ip_list += _directives(kernel._kernel._link_input_data,
                                       action=radical.pilot.LINK)

            # copy_input_data: copied within the resource
            if kernel._kernel._copy_input_data is not None:
                if not isinstance(kernel._kernel._copy_input_data, list):
                    kernel._kernel._copy_input_data = [
                        kernel._kernel._copy_input_data
                    ]
                ip_list += _directives(kernel._kernel._copy_input_data,
                                       action=radical.pilot.COPY)

            # download_input_data: already directive-shaped, used verbatim
            if kernel.download_input_data is not None:
                ip_list += kernel.download_input_data

            return ip_list

        def get_output_data(kernel, instance=None, iteration=None, ktype=None):
            """Assemble the output staging directives for *kernel*.

            Collects, in order: ``copy_output_data`` (COPY action) and
            ``download_output_data`` (plain transfer, no action key).  Each
            path entry may have the form ``"source > target"``; without
            ``>`` the target defaults to the basename of the source.

            NOTE: fixes the original dead code — the accumulator was
            initialized to ``[]`` and then repeatedly tested with
            ``if op_list is None``, a branch that could never be taken.
            """

            def _resolve(path):
                # instance/iteration placeholders only apply to simulation
                # and analysis kernels; other kernel types resolve against
                # the working dirs alone.
                if ktype in ('simulation', 'analysis'):
                    return resolve_placeholder_vars(
                        working_dirs=self.working_dirs,
                        path=path,
                        instance=instance,
                        iteration=iteration,
                        type=ktype)
                return resolve_placeholder_vars(
                    working_dirs=self.working_dirs, path=path)

            def _directives(paths, action=None):
                # Parse 'source > target' entries into staging directive
                # dicts, attaching the staging action when one is given.
                sds = []
                for path in paths:
                    var = _resolve(path)
                    parts = var.split('>')
                    source = parts[0].strip()
                    if len(parts) > 1:
                        target = parts[1].strip()
                    else:
                        target = os.path.basename(source)
                    sd = {'source': source, 'target': target}
                    if action is not None:
                        sd['action'] = action
                    sds.append(sd)
                return sds

            op_list = []

            # copy_output_data: copied within the resource after execution.
            # The single-item -> list normalization is done in place so that
            # repeated calls see a list (preserves original side effect).
            if kernel._kernel._copy_output_data is not None:
                if not isinstance(kernel._kernel._copy_output_data, list):
                    kernel._kernel._copy_output_data = [
                        kernel._kernel._copy_output_data
                    ]
                op_list += _directives(kernel._kernel._copy_output_data,
                                       action=radical.pilot.COPY)

            # download_output_data: transferred back to the client machine
            if kernel._kernel._download_output_data is not None:
                if not isinstance(kernel._kernel._download_output_data, list):
                    kernel._kernel._download_output_data = [
                        kernel._kernel._download_output_data
                    ]
                op_list += _directives(kernel._kernel._download_output_data)

            return op_list

        #-----------------------------------------------------------------------
        #
        def unit_state_cb(unit, state):
            # UnitManager state callback: abort the whole pattern run as
            # soon as any ComputeUnit fails.
            if state == radical.pilot.FAILED:
                # BUG FIX: the original format string used {0} twice
                # ("STDERR: {0}, STDOUT: {0}"), so the unit's stdout was
                # never shown in the error log.
                self.get_logger().error(
                    "ComputeUnit error: STDERR: {0}, STDOUT: {1}".format(
                        unit.stderr, unit.stdout))
                self.get_logger().error("Pattern execution FAILED.")
                sys.exit(1)

        #-----------------------------------------------------------------------
        #
        def create_filecheck_command(files_list):
            # For each expected remote file, emit a shell snippet suitable
            # as a CU post_exec: succeed if the file exists, otherwise
            # report the missing file on stderr and fail the unit.
            template = ('if [ -f "{0}" ]; then exit 0; '
                        'else echo "File {0} does not exist" >&2; exit 1; fi;')
            return [template.format(path) for path in files_list]

        self._reporter.ok('>>ok')
        self.get_logger().info(
            "Executing simulation-analysis loop with {0} iterations on {1} allocated core(s) on '{2}'"
            .format(pattern.iterations, resource._cores,
                    resource._resource_key))

        self._reporter.header(
            "Executing simulation-analysis loop with {0} iterations on {1} allocated core(s) on '{2}'"
            .format(pattern.iterations, resource._cores,
                    resource._resource_key))

        all_cus = []

        #print resource._pilot.description['cores']

        self.get_logger().info("Waiting for pilot on {0} to go Active".format(
            resource._resource_key))
        self._reporter.info("Job waiting on queue...".format(
            resource._resource_key))
        resource._pmgr.wait_pilots(resource._pilot.uid, 'Active')
        self._reporter.ok("\nJob is now running !".format(
            resource._resource_key))

        profiling = int(os.environ.get('RADICAL_ENMD_PROFILING', 0))

        if profiling == 1:
            from collections import OrderedDict as od
            pattern._execution_profile = []
            enmd_overhead_dict = od()
            cu_dict = od()

        try:

            start_now = datetime.datetime.now()

            resource._umgr.register_callback(unit_state_cb)

            ########################################################################
            # execute pre_loop
            #

            ################################################################
            # EXECUTE PRE-LOOP

            if profiling == 1:
                probe_preloop_start = datetime.datetime.now()
                enmd_overhead_dict['preloop'] = od()
                enmd_overhead_dict['preloop'][
                    'start_time'] = probe_preloop_start

            pre_loop = pattern.pre_loop()

            if pre_loop is not None:
                pre_loop._bind_to_resource(resource._resource_key)

                cud = radical.pilot.ComputeUnitDescription()
                cud.name = "pre_loop"

                cud.pre_exec = pre_loop._cu_def_pre_exec
                cud.executable = pre_loop._cu_def_executable
                cud.arguments = pre_loop.arguments
                cud.mpi = pre_loop.uses_mpi
                cud.input_staging = get_input_data(kernel=pre_loop)
                cud.output_staging = get_output_data(kernel=pre_loop)

                if pre_loop.exists_remote is not None:
                    cud.post_exec = create_filecheck_command(
                        pre_loop.exists_remote)

                self.get_logger().debug("Created pre_loop CU: {0}.".format(
                    cud.as_dict()))

                self.get_logger().info(
                    "Submitted ComputeUnit(s) for pre_loop step.")
                self._reporter.info("\nWaiting for pre_loop step to complete.")
                if profiling == 1:
                    probe_preloop_wait = datetime.datetime.now()
                    enmd_overhead_dict['preloop'][
                        'wait_time'] = probe_preloop_wait

                unit = resource._umgr.submit_units(cud)
                all_cus.append(unit)
                resource._umgr.wait_units(unit.uid)

                if profiling == 1:
                    probe_preloop_res = datetime.datetime.now()
                    enmd_overhead_dict['preloop'][
                        'res_time'] = probe_preloop_res

                self.get_logger().info("Pre_loop completed.")

                if unit.state != radical.pilot.DONE:
                    raise EnsemblemdError(
                        "Pre-loop CU failed with error: {0}".format(
                            unit.stdout))

                self.working_dirs["pre_loop"] = saga.Url(
                    unit.working_directory).path

                # Process CU information and append it to the dictionary
                if profiling == 1:
                    probe_preloop_done = datetime.datetime.now()
                    enmd_overhead_dict['preloop'][
                        'stop_time'] = probe_preloop_done
                    cu_dict['pre_loop'] = unit

                self._reporter.ok('>> done')
            else:
                self.get_logger().info("No pre_loop stage.")

            ########################################################################
            # execute simulation analysis loop
            #
            for iteration in range(1, pattern.iterations + 1):

                self.working_dirs['iteration_{0}'.format(iteration)] = {}

                ################################################################
                # EXECUTE SIMULATION STEPS

                if profiling == 1:
                    enmd_overhead_dict['iter_{0}'.format(iteration)] = od()
                    cu_dict['iter_{0}'.format(iteration)] = od()

                if isinstance(
                        pattern.simulation_step(iteration=iteration,
                                                instance=1), list):
                    num_sim_kerns = len(
                        pattern.simulation_step(iteration=iteration,
                                                instance=1))
                else:
                    num_sim_kerns = 1
                #print num_sim_kerns

                all_sim_cus = []
                if profiling == 1:
                    enmd_overhead_dict['iter_{0}'.format(
                        iteration)]['sim'] = od()
                    cu_dict['iter_{0}'.format(iteration)]['sim'] = list()

                for kern_step in range(0, num_sim_kerns):

                    if profiling == 1:
                        probe_sim_start = datetime.datetime.now()

                        enmd_overhead_dict['iter_{0}'.format(iteration)][
                            'sim']['kernel_{0}'.format(kern_step)] = od()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['sim']['kernel_{0}'.format(
                                kern_step)]['start_time'] = probe_sim_start

                    s_units = []
                    for s_instance in range(1,
                                            pattern._simulation_instances + 1):

                        if isinstance(
                                pattern.simulation_step(iteration=iteration,
                                                        instance=s_instance),
                                list):
                            sim_step = pattern.simulation_step(
                                iteration=iteration,
                                instance=s_instance)[kern_step]
                        else:
                            sim_step = pattern.simulation_step(
                                iteration=iteration, instance=s_instance)

                        sim_step._bind_to_resource(resource._resource_key)

                        # Resolve all placeholders
                        #if sim_step.link_input_data is not None:
                        #    for i in range(len(sim_step.link_input_data)):
                        #        sim_step.link_input_data[i] = resolve_placeholder_vars(working_dirs, s_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "simulation", sim_step.link_input_data[i])

                        cud = radical.pilot.ComputeUnitDescription()
                        cud.name = "sim ;{iteration} ;{instance}".format(
                            iteration=iteration, instance=s_instance)

                        cud.pre_exec = sim_step._cu_def_pre_exec
                        cud.executable = sim_step._cu_def_executable
                        cud.arguments = sim_step.arguments
                        cud.mpi = sim_step.uses_mpi
                        cud.input_staging = get_input_data(kernel=sim_step,
                                                           instance=s_instance,
                                                           iteration=iteration,
                                                           ktype='simulation')
                        cud.output_staging = get_output_data(
                            kernel=sim_step,
                            instance=s_instance,
                            iteration=iteration,
                            ktype='simulation')

                        if sim_step.cores is not None:
                            cud.cores = sim_step.cores

                        if sim_step.exists_remote is not None:
                            cud.post_exec = create_filecheck_command(
                                sim_step.exists_remote)

                        s_units.append(cud)

                        if sim_step.get_instance_type == 'single':
                            break

                    self.get_logger().debug(
                        "Created simulation CU: {0}.".format(cud.as_dict()))

                    self.get_logger().info(
                        "Submitted tasks for simulation iteration {0}.".format(
                            iteration))
                    self.get_logger().info(
                        "Waiting for {3} simulations in iteration {0}/ kernel {1}: {2} to complete."
                        .format(iteration, kern_step + 1, sim_step.name,
                                pattern._simulation_instances))

                    self._reporter.info(
                        "\nIteration {0}: Waiting for {2} simulation tasks: {1} to complete"
                        .format(iteration, sim_step.name,
                                pattern._simulation_instances))
                    if profiling == 1:
                        probe_sim_wait = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['sim']['kernel_{0}'.format(
                                kern_step)]['wait_time'] = probe_sim_wait

                    s_cus = resource._umgr.submit_units(s_units)
                    all_cus.extend(s_cus)
                    all_sim_cus.extend(s_cus)

                    uids = [cu.uid for cu in s_cus]
                    resource._umgr.wait_units(uids)

                    if profiling == 1:
                        probe_sim_res = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['sim']['kernel_{0}'.format(
                                kern_step)]['res_time'] = probe_sim_res

                    self.get_logger().info(
                        "Simulations in iteration {0}/ kernel {1}: {2} completed."
                        .format(iteration, kern_step + 1, sim_step.name))

                    failed_units = ""
                    for unit in s_cus:
                        if unit.state != radical.pilot.DONE:
                            failed_units += " * Simulation task {0} failed with an error: {1}\n".format(
                                unit.uid, unit.stderr)

                    if profiling == 1:
                        probe_sim_done = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['sim']['kernel_{0}'.format(
                                kern_step)]['stop_time'] = probe_sim_done

                    self._reporter.ok('>> done')

                if profiling == 1:
                    probe_post_sim_start = datetime.datetime.now()
                    enmd_overhead_dict['iter_{0}'.format(
                        iteration)]['sim']['post'] = od()
                    enmd_overhead_dict['iter_{0}'.format(iteration)]['sim'][
                        'post']['start_time'] = probe_post_sim_start

                # TODO: ensure working_dir <-> instance mapping
                i = 0
                for cu in s_cus:
                    i += 1
                    self.working_dirs['iteration_{0}'.format(iteration)][
                        'simulation_{0}'.format(i)] = saga.Url(
                            cu.working_directory).path

                if profiling == 1:
                    probe_post_sim_end = datetime.datetime.now()
                    enmd_overhead_dict['iter_{0}'.format(iteration)]['sim'][
                        'post']['stop_time'] = probe_post_sim_end
                    cu_dict['iter_{0}'.format(iteration)]['sim'] = all_sim_cus

                ################################################################
                # EXECUTE ANALYSIS STEPS

                if isinstance(
                        pattern.analysis_step(iteration=iteration, instance=1),
                        list):
                    num_ana_kerns = len(
                        pattern.analysis_step(iteration=iteration, instance=1))
                else:
                    num_ana_kerns = 1
                #print num_ana_kerns

                # ------------------------------------------------------------
                # Analysis phase of the current iteration: for every analysis
                # kernel step, build one ComputeUnit per analysis instance,
                # submit the batch, and block until it completes.
                all_ana_cus = []
                if profiling == 1:
                    # Per-iteration profiling buckets for the analysis phase.
                    enmd_overhead_dict['iter_{0}'.format(
                        iteration)]['ana'] = od()
                    cu_dict['iter_{0}'.format(iteration)]['ana'] = list()

                for kern_step in range(0, num_ana_kerns):

                    if profiling == 1:
                        # Timestamp: start of CU construction for this kernel.
                        probe_ana_start = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(iteration)][
                            'ana']['kernel_{0}'.format(kern_step)] = od()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['ana']['kernel_{0}'.format(
                                kern_step)]['start_time'] = probe_ana_start

                    a_units = []
                    # Analysis instances are 1-based in the pattern API.
                    for a_instance in range(1,
                                            pattern._analysis_instances + 1):

                        # analysis_step() may return either a single kernel or
                        # a list of kernels; for a list, pick this kernel step.
                        if isinstance(
                                pattern.analysis_step(iteration=iteration,
                                                      instance=a_instance),
                                list):
                            ana_step = pattern.analysis_step(
                                iteration=iteration,
                                instance=a_instance)[kern_step]
                        else:
                            ana_step = pattern.analysis_step(
                                iteration=iteration, instance=a_instance)

                        # Resolve the abstract kernel against the concrete
                        # resource (fills in executable, pre_exec, ...).
                        ana_step._bind_to_resource(resource._resource_key)

                        # Resolve all placeholders
                        #if ana_step.link_input_data is not None:
                        #    for i in range(len(ana_step.link_input_data)):
                        #        ana_step.link_input_data[i] = resolve_placeholder_vars(working_dirs, a_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "analysis", ana_step.link_input_data[i])

                        # Translate the bound kernel into a RADICAL-Pilot CU
                        # description.
                        cud = radical.pilot.ComputeUnitDescription()
                        cud.name = "ana ; {iteration}; {instance}".format(
                            iteration=iteration, instance=a_instance)

                        cud.pre_exec = ana_step._cu_def_pre_exec
                        cud.executable = ana_step._cu_def_executable
                        cud.arguments = ana_step.arguments
                        cud.mpi = ana_step.uses_mpi
                        cud.input_staging = get_input_data(kernel=ana_step,
                                                           instance=a_instance,
                                                           iteration=iteration,
                                                           ktype='analysis')
                        cud.output_staging = get_output_data(
                            kernel=ana_step,
                            instance=a_instance,
                            iteration=iteration,
                            ktype='analysis')

                        if ana_step.cores is not None:
                            cud.cores = ana_step.cores

                        if ana_step.exists_remote is not None:
                            # Post-exec check that the expected remote files
                            # were actually produced.
                            cud.post_exec = create_filecheck_command(
                                ana_step.exists_remote)

                        a_units.append(cud)

                        # A 'single' kernel is shared by all instances -- one
                        # CU is enough, so stop after the first.
                        if ana_step.get_instance_type == 'single':
                            break

                    # NOTE(review): only the *last* CU built above is logged.
                    self.get_logger().debug("Created analysis CU: {0}.".format(
                        cud.as_dict()))

                    self.get_logger().info(
                        "Submitted tasks for analysis iteration {0}.".format(
                            iteration))
                    self.get_logger().info(
                        "Waiting for analysis tasks in iteration {0}/kernel {1}: {2} to complete."
                        .format(iteration, kern_step + 1, ana_step.name))

                    self._reporter.info(
                        "\nIteration {0}: Waiting for analysis tasks: {1} to complete"
                        .format(iteration, ana_step.name))
                    if profiling == 1:
                        # Timestamp: just before submission / wait.
                        probe_ana_wait = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['ana']['kernel_{0}'.format(
                                kern_step)]['wait_time'] = probe_ana_wait

                    # Submit the batch and block until every unit of this
                    # kernel step reaches a final state.
                    a_cus = resource._umgr.submit_units(a_units)
                    all_cus.extend(a_cus)
                    all_ana_cus.extend(a_cus)

                    uids = [cu.uid for cu in a_cus]
                    resource._umgr.wait_units(uids)

                    if profiling == 1:
                        # Timestamp: wait_units() returned.
                        probe_ana_res = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['ana']['kernel_{0}'.format(
                                kern_step)]['res_time'] = probe_ana_res

                    self.get_logger().info(
                        "Analysis in iteration {0}/kernel {1}: {2} completed.".
                        format(iteration, kern_step + 1, ana_step.name))

                    # Collect error summaries for units that did not finish in
                    # DONE state.
                    # NOTE(review): 'failed_units' is built but never used or
                    # logged in this visible scope -- confirm whether it
                    # should be reported to the user.
                    failed_units = ""
                    for unit in a_cus:
                        if unit.state != radical.pilot.DONE:
                            failed_units += " * Analysis task {0} failed with an error: {1}\n".format(
                                unit.uid, unit.stderr)

                    if profiling == 1:
                        # Timestamp: end of this kernel step.
                        probe_ana_done = datetime.datetime.now()
                        enmd_overhead_dict['iter_{0}'.format(
                            iteration)]['ana']['kernel_{0}'.format(
                                kern_step)]['stop_time'] = probe_ana_done

                    self._reporter.ok('>> done')

                # Post-analysis bookkeeping for this iteration.
                if profiling == 1:
                    probe_post_ana_start = datetime.datetime.now()
                    enmd_overhead_dict['iter_{0}'.format(
                        iteration)]['ana']['post'] = od()
                    enmd_overhead_dict['iter_{0}'.format(iteration)]['ana'][
                        'post']['start_time'] = probe_post_ana_start

                # Adaptive patterns derive the next iteration's simulation
                # count from the stdout of the first analysis CU.
                # NOTE(review): 'a_cus' here is the batch of the *last* kernel
                # step of the loop above, and is undefined if num_ana_kerns
                # was 0 -- confirm that num_ana_kerns >= 1 is guaranteed.
                if (pattern.adaptive_simulation == False):
                    pass
                else:
                    pattern._simulation_instances = pattern.get_new_simulation_instances(
                        a_cus[0].stdout)

                # Record each analysis CU's remote working directory (1-based
                # index) for placeholder resolution in later iterations.
                i = 0
                for cu in a_cus:
                    i += 1
                    self.working_dirs['iteration_{0}'.format(iteration)][
                        'analysis_{0}'.format(i)] = saga.Url(
                            cu.working_directory).path

                if profiling == 1:
                    probe_post_ana_end = datetime.datetime.now()
                    enmd_overhead_dict['iter_{0}'.format(iteration)]['ana'][
                        'post']['stop_time'] = probe_post_ana_end
                    cu_dict['iter_{0}'.format(iteration)]['ana'] = all_ana_cus

            self._reporter.header('Pattern execution successfully finished')

            # ONLY PROFILING SECTION BELOW
            if profiling == 1:

                # Dump the EnMD pattern-overhead probes collected above into a
                # flat CSV: one row per (iteration, step, kernel, probe).
                #Pattern overhead logging
                title = "iteration,step,kernel,probe,timestamp"
                f1 = open('enmd_pat_overhead.csv', 'w')
                # NOTE(review): "\n\n" leaves a blank line after the header
                # row -- confirm downstream parsers expect that.
                f1.write(title + "\n\n")
                # NOTE(review): 'iter' shadows the Python builtin.
                iter = 'None'
                step = 'pre_loop'
                kern = 'None'
                # Pre-loop probes first (flat dict: probe name -> timestamp).
                for key, val in enmd_overhead_dict['preloop'].items():
                    probe = key
                    timestamp = val
                    entry = '{0},{1},{2},{3},{4}\n'.format(
                        iter, step, kern, probe, timestamp)
                    f1.write(entry)

                iters = pattern.iterations

                # Then the per-iteration probes, nested as
                # [iter][step][kernel][probe] -> timestamp.
                for i in range(1, iters + 1):
                    iter = 'iter_{0}'.format(i)
                    for key1, val1 in enmd_overhead_dict[iter].items():
                        step = key1
                        for key2, val2 in val1.items():
                            kern = key2
                            for key3, val3 in val2.items():
                                probe = key3
                                timestamp = val3
                                entry = '{0},{1},{2},{3},{4}\n'.format(
                                    iter.split('_')[1], step, kern, probe,
                                    timestamp)
                                f1.write(entry)

                f1.close()

                # Dump per-CU state-transition timestamps to a second CSV,
                # starting with the (optional) pre_loop CU.
                #CU data logging
                title = "uid, iter, step, Scheduling, StagingInput, AgentStagingInputPending, AgentStagingInput, AllocatingPending, Allocating, ExecutingPending, Executing, AgentStagingOutputPending, AgentStagingOutput, PendingOutputStaging, StagingOutput, Done"
                f2 = open(
                    "execution_profile_{mysession}.csv".format(
                        mysession=resource._session.uid), 'w')
                f2.write(title + "\n\n")
                iter = 'None'
                step = 'pre_loop'

                if step in cu_dict:
                    cu = cu_dict['pre_loop']

                    # Map each state name from the CU's state history to its
                    # timestamp.
                    st_data = {}
                    for st in cu.state_history:
                        st_dict = st.as_dict()
                        st_data["{0}".format(st_dict["state"])] = {}
                        st_data["{0}".format(
                            st_dict["state"])] = st_dict["timestamp"]

                    # BUG(review): missing comma after 'Scheduling,' -- the
                    # two adjacent string literals are implicitly concatenated
                    # into the single element 'Scheduling,StagingInput', so
                    # 'Scheduling' and 'StagingInput' are never defaulted to
                    # None below, and the .format() lookups can raise KeyError
                    # when either state is absent from the CU history.
                    states = [
                        'Scheduling,'
                        'StagingInput', 'AgentStagingInputPending',
                        'AgentStagingInput', 'AllocatingPending', 'Allocating',
                        'ExecutingPending', 'Executing',
                        'AgentStagingOutputPending', 'AgentStagingOutput',
                        'PendingOutputStaging', 'StagingOutput', 'Done'
                    ]

                    # States the CU never reached are reported as None.
                    for state in states:
                        if (state in st_data) is False:
                            st_data[state] = None

                    line = "{uid}, {iter}, {step}, {Scheduling}, {StagingInput}, {AgentStagingInputPending}, {AgentStagingInput}, {AllocatingPending}, {Allocating}, {ExecutingPending},{Executing}, {AgentStagingOutputPending}, {AgentStagingOutput}, {PendingOutputStaging}, {StagingOutput}, {Done}".format(
                        uid=cu.uid,
                        iter=0,
                        step='pre_loop',
                        Scheduling=(st_data['Scheduling']),
                        StagingInput=(st_data['StagingInput']),
                        AgentStagingInputPending=(
                            st_data['AgentStagingInputPending']),
                        AgentStagingInput=(st_data['AgentStagingInput']),
                        AllocatingPending=(st_data['AllocatingPending']),
                        Allocating=(st_data['Allocating']),
                        ExecutingPending=(st_data['ExecutingPending']),
                        Executing=(st_data['Executing']),
                        AgentStagingOutputPending=(
                            st_data['AgentStagingOutputPending']),
                        AgentStagingOutput=(st_data['AgentStagingOutput']),
                        PendingOutputStaging=(st_data['PendingOutputStaging']),
                        StagingOutput=(st_data['StagingOutput']),
                        Done=(st_data['Done']))
                    f2.write(line + '\n')
                else:
                    print 'No pre_loop step in the pattern'

                # One CSV row per CU for every iteration's 'sim' and 'ana'
                # steps.  The two branches below are structurally identical;
                # only the step label differs.
                for i in range(1, iters + 1):
                    iter = 'iter_{0}'.format(i)
                    for key, val in cu_dict[iter].items():
                        step = key
                        cus = val

                        if step == 'sim':
                            for cu in cus:
                                # state name -> timestamp, from the CU's
                                # state history.
                                st_data = {}
                                for st in cu.state_history:
                                    st_dict = st.as_dict()
                                    st_data["{0}".format(
                                        st_dict["state"])] = {}
                                    st_data["{0}".format(
                                        st_dict["state"]
                                    )] = st_dict["timestamp"]

                                # BUG(review): missing comma after
                                # 'Scheduling,' -- implicit literal
                                # concatenation yields one element
                                # 'Scheduling,StagingInput'; 'Scheduling' and
                                # 'StagingInput' are never defaulted to None
                                # and the .format() below can raise KeyError.
                                states = [
                                    'Scheduling,'
                                    'StagingInput', 'AgentStagingInputPending',
                                    'AgentStagingInput', 'AllocatingPending',
                                    'Allocating', 'ExecutingPending',
                                    'Executing', 'AgentStagingOutputPending',
                                    'AgentStagingOutput',
                                    'PendingOutputStaging', 'StagingOutput',
                                    'Done'
                                ]

                                # States never reached are reported as None.
                                for state in states:
                                    if (state in st_data) is False:
                                        st_data[state] = None

                                line = "{uid}, {iter}, {step}, {Scheduling}, {StagingInput}, {AgentStagingInputPending}, {AgentStagingInput}, {AllocatingPending}, {Allocating}, {ExecutingPending},{Executing}, {AgentStagingOutputPending}, {AgentStagingOutput}, {PendingOutputStaging}, {StagingOutput}, {Done}".format(
                                    uid=cu.uid,
                                    iter=iter.split('_')[1],
                                    step=step,
                                    Scheduling=(st_data['Scheduling']),
                                    StagingInput=(st_data['StagingInput']),
                                    AgentStagingInputPending=(
                                        st_data['AgentStagingInputPending']),
                                    AgentStagingInput=(
                                        st_data['AgentStagingInput']),
                                    AllocatingPending=(
                                        st_data['AllocatingPending']),
                                    Allocating=(st_data['Allocating']),
                                    ExecutingPending=(
                                        st_data['ExecutingPending']),
                                    Executing=(st_data['Executing']),
                                    AgentStagingOutputPending=(
                                        st_data['AgentStagingOutputPending']),
                                    AgentStagingOutput=(
                                        st_data['AgentStagingOutput']),
                                    PendingOutputStaging=(
                                        st_data['PendingOutputStaging']),
                                    StagingOutput=(st_data['StagingOutput']),
                                    Done=(st_data['Done']))

                                f2.write(line + '\n')

                        elif step == 'ana':
                            # Same per-CU dump as the 'sim' branch above.
                            for cu in cus:
                                st_data = {}
                                for st in cu.state_history:
                                    st_dict = st.as_dict()
                                    st_data["{0}".format(
                                        st_dict["state"])] = {}
                                    st_data["{0}".format(
                                        st_dict["state"]
                                    )] = st_dict["timestamp"]

                                # BUG(review): same missing comma after
                                # 'Scheduling,' as in the 'sim' branch -- the
                                # list has a single fused first element and
                                # KeyError is possible in the .format() below.
                                states = [
                                    'Scheduling,'
                                    'StagingInput', 'AgentStagingInputPending',
                                    'AgentStagingInput', 'AllocatingPending',
                                    'Allocating', 'ExecutingPending',
                                    'Executing', 'AgentStagingOutputPending',
                                    'AgentStagingOutput',
                                    'PendingOutputStaging', 'StagingOutput',
                                    'Done'
                                ]

                                # States never reached are reported as None.
                                for state in states:
                                    if (state in st_data) is False:
                                        st_data[state] = None

                                line = "{uid}, {iter}, {step}, {Scheduling}, {StagingInput}, {AgentStagingInputPending}, {AgentStagingInput}, {AllocatingPending}, {Allocating}, {ExecutingPending},{Executing}, {AgentStagingOutputPending}, {AgentStagingOutput}, {PendingOutputStaging}, {StagingOutput}, {Done}".format(
                                    uid=cu.uid,
                                    iter=iter.split('_')[1],
                                    step=step,
                                    Scheduling=(st_data['Scheduling']),
                                    StagingInput=(st_data['StagingInput']),
                                    AgentStagingInputPending=(
                                        st_data['AgentStagingInputPending']),
                                    AgentStagingInput=(
                                        st_data['AgentStagingInput']),
                                    AllocatingPending=(
                                        st_data['AllocatingPending']),
                                    Allocating=(st_data['Allocating']),
                                    ExecutingPending=(
                                        st_data['ExecutingPending']),
                                    Executing=(st_data['Executing']),
                                    AgentStagingOutputPending=(
                                        st_data['AgentStagingOutputPending']),
                                    AgentStagingOutput=(
                                        st_data['AgentStagingOutput']),
                                    PendingOutputStaging=(
                                        st_data['PendingOutputStaging']),
                                    StagingOutput=(st_data['StagingOutput']),
                                    Done=(st_data['Done']))

                                f2.write(line + '\n')

                f2.close()

        except KeyboardInterrupt:

            self._reporter.error('Execution interupted')
            traceback.print_exc()