Example #1
def run(pipeline_config_filename, dry_run=False):
    """
    Runs the pipeline
    """
    # YAML input file.
    # Load the text and then expand any environment variables
    raw_config_text = open(pipeline_config_filename).read()
    config_text = os.path.expandvars(raw_config_text)
    # Then parse with YAML
    pipe_config = yaml.safe_load(config_text)

    # Optional logging of pipeline infrastructure to
    # file.
    log_file = pipe_config.get('pipeline_log')
    if log_file:
        parsl.set_file_logger(log_file)

    # Required configuration information
    # List of stage names, must be imported somewhere
    stages = pipe_config['stages']

    # Python modules in which to search for pipeline stages
    modules = pipe_config['modules'].split()

    # parsl execution/launcher configuration information
    launcher = pipe_config.get("launcher", "local")
    if launcher == "local":
        launcher_config = sites.local.make_launcher(stages)
    elif launcher == "cori":
        launcher_config = sites.cori.make_launcher(stages)
    elif launcher == "cori-interactive":
        launcher_config = sites.cori_interactive.make_launcher(stages)
    else:
        raise ValueError(f"Unknown launcher {launcher}")
    #
    # launcher_config = pipe_config['launcher']

    # Inputs and outputs
    output_dir = pipe_config['output_dir']
    inputs = pipe_config['inputs']
    log_dir = pipe_config['log_dir']
    resume = pipe_config['resume']

    stages_config = pipe_config['config']

    for module in modules:
        __import__(module)

    # Create and run pipeline
    pipeline = Pipeline(launcher_config, stages)

    if dry_run:
        pipeline.dry_run(inputs, output_dir, stages_config)
    else:
        pipeline.run(inputs, output_dir, log_dir, resume, stages_config)
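A minimal invocation sketch for run() above (hypothetical; the project's real command-line wrapper is not shown in this example):

if __name__ == '__main__':
    import sys
    # Usage: python pipeline.py <pipeline_config.yml> [--dry-run]
    run(sys.argv[1], dry_run='--dry-run' in sys.argv)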
Example #2
def main():
    display("Loading DFK")
    parsl.set_file_logger("parsl.log", level=logging.DEBUG)

    dfk = parsl.DataFlowKernel(config=parsl_configs.rccNodeExclusive)

    display("Loading App")
    full_app = gen_full_tokenizer(dfk)

    display("Loading data iter")
    datIter = subjectIter()

    display("Starting run")

    running = {}
    done = False
    succCount = 0
    doneIter = False
    try:
        while not done:
            #Only add maxRunning jobs to the queue at once
            while len(running) < maxRunning:
                batch = []
                for i in range(perBatch):
                    try:
                        batch.append(next(datIter))
                    except StopIteration:
                        #If none left just skip adding
                        doneIter = True
                        break
                # Iterator may already be exhausted; avoid indexing an empty batch
                if not batch:
                    break
                batchName = batch[0]['wos_id']
                running[batchName] = full_app(batch)
            succCount += checkRunning(running)
            display("Completed {}".format(succCount))
            #End the loop if all jobs are done and no more can be added
            if doneIter and len(running) < 1:
                done = True

    except KeyboardInterrupt:
        display("Closing down")
        dfk.cleanup()
        raise
    except:
        resetStdout()
        raise

    dfk.cleanup()
    display("Done")
Example #3
def run(pipeline_config_filename):
    """
    Runs the pipeline
    """
    # YAML input file.
    pipe_config = yaml.safe_load(open(pipeline_config_filename))

    # Optional logging of pipeline infrastructure to
    # file.
    log_file = pipe_config.get('pipeline_log')
    if log_file:
        parsl.set_file_logger(log_file)

    # Required configuration information
    # List of stage names, must be imported somewhere
    stages = pipe_config['stages']

    # Python modules in which to search for pipeline stages
    modules = pipe_config['modules'].split()

    # parsl execution/launcher configuration information
    launcher = pipe_config.get("launcher", "local")
    if launcher == "local":
        launcher_config = sites.local.make_launcher(stages)
    elif launcher == "cori":
        launcher_config = sites.cori.make_launcher(stages)
    else:
        raise ValueError(f"Unknown launcher {launcher}")
    #
    # launcher_config = pipe_config['launcher']

    # Inputs and outputs
    output_dir = pipe_config['output_dir']
    inputs = pipe_config['inputs']
    log_dir = pipe_config['log_dir']
    resume = pipe_config['resume']

    stages_config = pipe_config['config']

    for module in modules:
        __import__(module)

    # Create and run pipeline
    pipeline = Pipeline(launcher_config, stages)
    pipeline.run(inputs, output_dir, log_dir, resume, stages_config)
Example #4
    def __init__(self, config=Config()):
        """Initialize the DataFlowKernel.

        Parameters
        ----------
        config : Config
            A specification of all configuration options. For more details see the
            :class:~`parsl.config.Config` documentation.
        """

        # this will be used to check cleanup only happens once
        self.cleanup_called = False

        if isinstance(config, dict):
            raise ConfigurationError(
                'Expected `Config` class, received dictionary. For help, '
                'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html'
            )
        self._config = config
        self.run_dir = make_rundir(config.run_dir)
        parsl.set_file_logger("{}/parsl.log".format(self.run_dir),
                              level=logging.DEBUG)
        logger.debug("Starting DataFlowKernel with config\n{}".format(config))
        logger.info("Parsl version: {}".format(get_version()))

        self.checkpoint_lock = threading.Lock()

        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        # ES logging
        self.tasks_completed_count = 0
        self.tasks_failed_count = 0
        self.monitoring_config = config.monitoring_config
        if self.monitoring_config is not None and self.monitoring_config.database_type == 'local_database'\
                and self.monitoring_config.eng_link is None:
            # uses the rundir as the default location.
            logger.info(
                'Local monitoring database can be found inside the run_dir at: {}'
                .format(self.run_dir))
            self.monitoring_config.eng_link = "sqlite:///{}".format(
                os.path.join(os.path.abspath(self.run_dir), 'monitoring.db'))
        if self.monitoring_config is None:
            self.db_logger = get_db_logger()
        else:
            self.db_logger = get_db_logger(
                monitoring_config=self.monitoring_config)
        self.workflow_name = None
        if self.monitoring_config is not None and self.monitoring_config.workflow_name is not None:
            self.workflow_name = self.monitoring_config.workflow_name
        else:
            for frame in inspect.stack():
                fname = os.path.basename(str(frame.filename))
                parsl_file_names = ['dflow.py']
                # Find first file name not considered a parsl file
                if fname not in parsl_file_names:
                    self.workflow_name = fname
                    break

        self.workflow_version = None
        if self.monitoring_config is not None and self.monitoring_config.version is not None:
            self.workflow_version = self.monitoring_config.version
        self.time_began = time.time()
        self.time_completed = None
        self.run_id = str(uuid4())
        self.dashboard = self.monitoring_config.dashboard_link if self.monitoring_config is not None else None
        # TODO: make configurable
        logger.info("Run id is: " + self.run_id)
        if self.dashboard is not None:
            logger.info("Dashboard is found at " + self.dashboard)
        # start tornado logging server
        if self.monitoring_config is not None and self.monitoring_config.database_type == 'local_database':
            self.logging_server = multiprocessing.Process(
                target=logging_server.run,
                kwargs={'monitoring_config': self.monitoring_config})
            self.logging_server.start()
            self.web_app = multiprocessing.Process(
                target=index.run,
                kwargs={'monitoring_config': self.monitoring_config})
            self.web_app.start()
        else:
            self.logging_server = None
            self.web_app = None
        workflow_info = {
            'python_version': sys.version_info,
            'parsl_version': get_version(),
            "time_began": str(self.time_began),
            'time_completed': str(None),
            'run_id': self.run_id,
            'workflow_name': self.workflow_name,
            'workflow_version': self.workflow_version,
            'rundir': self.run_dir,
            'tasks_completed_count': self.tasks_completed_count,
            'tasks_failed_count': self.tasks_failed_count,
            'user': getuser(),
            'host': gethostname(),
        }
        self.db_logger.info("DFK start", extra=workflow_info)
        # ES logging end

        checkpoints = self.load_checkpoints(config.checkpoint_files)
        self.memoizer = Memoizer(self,
                                 memoize=config.app_cache,
                                 checkpoint=checkpoints)
        self.checkpointed_tasks = 0
        self._checkpoint_timer = None
        self.checkpoint_mode = config.checkpoint_mode

        data_manager = DataManager(
            max_threads=config.data_management_max_threads,
            executors=config.executors)
        self.executors = {
            e.label: e
            for e in config.executors + [data_manager]
        }
        for executor in self.executors.values():
            executor.run_dir = self.run_dir
            if hasattr(executor, 'provider'):
                if hasattr(executor.provider, 'script_dir'):
                    executor.provider.script_dir = os.path.join(
                        self.run_dir, 'submit_scripts')
                    if executor.provider.channel.script_dir is None:
                        executor.provider.channel.script_dir = os.path.join(
                            self.run_dir, 'submit_scripts')
                        if not executor.provider.channel.isdir(self.run_dir):
                            parent, child = pathlib.Path(
                                self.run_dir).parts[-2:]
                            remote_run_dir = os.path.join(parent, child)
                            executor.provider.channel.script_dir = os.path.join(
                                remote_run_dir, 'remote_submit_scripts')
                            executor.provider.script_dir = os.path.join(
                                self.run_dir, 'local_submit_scripts')
                    executor.provider.channel.makedirs(
                        executor.provider.channel.script_dir, exist_ok=True)
                    os.makedirs(executor.provider.script_dir, exist_ok=True)
            executor.start()

        if self.checkpoint_mode == "periodic":
            try:
                h, m, s = map(int, config.checkpoint_period.split(':'))
                checkpoint_period = (h * 3600) + (m * 60) + s
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=checkpoint_period)
            except Exception:
                logger.error(
                    "invalid checkpoint_period provided:{0} expected HH:MM:SS".
                    format(config.checkpoint_period))
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=(30 * 60))

        if any([x.managed for x in config.executors]):
            self.flowcontrol = FlowControl(self)
        else:
            self.flowcontrol = FlowNoControl(self)

        self.task_count = 0
        self.tasks = {}
        self.submitter_lock = threading.Lock()

        atexit.register(self.atexit_cleanup)
Example #5
    def __init__(self, config=Config()):
        """Initialize the DataFlowKernel.

        Parameters
        ----------
        config : Config
            A specification of all configuration options. For more details see the
            :class:~`parsl.config.Config` documentation.
        """

        # this will be used to check cleanup only happens once
        self.cleanup_called = False

        if isinstance(config, dict):
            raise ConfigurationError(
                    'Expected `Config` class, received dictionary. For help, '
                    'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html')
        self._config = config
        self.run_dir = make_rundir(config.run_dir)

        if config.initialize_logging:
            parsl.set_file_logger("{}/parsl.log".format(self.run_dir), level=logging.DEBUG)

        logger.debug("Starting DataFlowKernel with config\n{}".format(config))

        if sys.version_info < (3, 6):
            logger.warning("Support for python versions < 3.6 is deprecated and will be removed after parsl 0.10")

        logger.info("Parsl version: {}".format(get_version()))

        self.checkpoint_lock = threading.Lock()

        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        # Monitoring
        self.run_id = str(uuid4())
        self.tasks_completed_count = 0
        self.tasks_failed_count = 0
        self.tasks_dep_fail_count = 0

        self.monitoring = config.monitoring
        # hub address and port for interchange to connect
        self.hub_address = None
        self.hub_interchange_port = None
        if self.monitoring:
            if self.monitoring.logdir is None:
                self.monitoring.logdir = self.run_dir
            self.hub_address = self.monitoring.hub_address
            self.hub_interchange_port = self.monitoring.start(self.run_id)

        self.time_began = datetime.datetime.now()
        self.time_completed = None

        # TODO: make configurable
        logger.info("Run id is: " + self.run_id)

        self.workflow_name = None
        if self.monitoring is not None and self.monitoring.workflow_name is not None:
            self.workflow_name = self.monitoring.workflow_name
        else:
            for frame in inspect.stack():
                fname = os.path.basename(str(frame.filename))
                parsl_file_names = ['dflow.py', 'typeguard.py']
                # Find first file name not considered a parsl file
                if fname not in parsl_file_names:
                    self.workflow_name = fname
                    break

        self.workflow_version = str(self.time_began.replace(microsecond=0))
        if self.monitoring is not None and self.monitoring.workflow_version is not None:
            self.workflow_version = self.monitoring.workflow_version

        workflow_info = {
                'python_version': "{}.{}.{}".format(sys.version_info.major,
                                                    sys.version_info.minor,
                                                    sys.version_info.micro),
                'parsl_version': get_version(),
                "time_began": self.time_began,
                'time_completed': None,
                'workflow_duration': None,
                'run_id': self.run_id,
                'workflow_name': self.workflow_name,
                'workflow_version': self.workflow_version,
                'rundir': self.run_dir,
                'tasks_completed_count': self.tasks_completed_count,
                'tasks_failed_count': self.tasks_failed_count,
                'user': getuser(),
                'host': gethostname(),
        }

        if self.monitoring:
            self.monitoring.send(MessageType.WORKFLOW_INFO,
                                 workflow_info)

        checkpoints = self.load_checkpoints(config.checkpoint_files)
        self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint=checkpoints)
        self.checkpointed_tasks = 0
        self._checkpoint_timer = None
        self.checkpoint_mode = config.checkpoint_mode

        self.data_manager = DataManager(self)
        self.executors = {}
        data_manager_executor = ThreadPoolExecutor(max_threads=config.data_management_max_threads, label='data_manager')
        self.add_executors(config.executors + [data_manager_executor])

        if self.checkpoint_mode == "periodic":
            try:
                h, m, s = map(int, config.checkpoint_period.split(':'))
                checkpoint_period = (h * 3600) + (m * 60) + s
                self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period, name="Checkpoint")
            except Exception:
                logger.error("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
                self._checkpoint_timer = Timer(self.checkpoint, interval=(30 * 60), name="Checkpoint")

        # if we use the functionality of dynamically adding executors
        # all executors should be managed.
        if any([x.managed for x in config.executors]):
            self.flowcontrol = FlowControl(self)
        else:
            self.flowcontrol = FlowNoControl(self)

        self.task_count = 0
        self.tasks = {}
        self.submitter_lock = threading.Lock()

        atexit.register(self.atexit_cleanup)
Example #6
    def __init__(self,
                 config=None,
                 executors=None,
                 lazyErrors=True,
                 appCache=True,
                 rundir=None,
                 retries=0,
                 checkpointFiles=None,
                 checkpointMode=None,
                 data_manager=None):
        """ Initialize the DataFlowKernel.

        Please note that keyword args passed to the DFK here will always override
        options passed in via the config.

        KWargs:
            - config (dict): A single data object encapsulating all config attributes
            - executors (list of Executor objs): Optional, kept for (somewhat) backward compatibility with 0.2.0
            - lazyErrors (bool): Default=True, allow workflow to continue on app failures.
            - appCache (bool): Enable caching of apps
            - rundir (str): Path to run directory. Defaults to ./runinfo/runNNN
            - retries (int): Default=0, set the number of retry attempts in case of failure
            - checkpointFiles (list of str): List of filepaths to checkpoint files
            - checkpointMode (None, 'dfk_exit', 'task_exit', 'periodic'): Method to use.
            - data_manager (DataManager): User-created DataManager
        Returns:
            DataFlowKernel object
        """
        # Create run dirs for this run
        self.rundir = make_rundir(config=config, path=rundir)
        parsl.set_file_logger("{}/parsl.log".format(self.rundir),
                              level=logging.DEBUG)

        logger.info("Parsl version: {}".format(parsl.__version__))
        logger.info("Libsubmit version: {}".format(libsubmit.__version__))

        # Update config with defaults
        self._config = update_config(config, self.rundir)

        # Set the data manager
        if data_manager:
            self.data_manager = data_manager
        else:
            self.data_manager = DataManager(config=self._config)

        # Start the anonymized usage tracker and send init msg
        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        # Load checkpoints if any
        cpts = self.load_checkpoints(checkpointFiles)
        # Initialize the memoizer
        self.memoizer = Memoizer(self, memoize=appCache, checkpoint=cpts)
        self.checkpointed_tasks = 0
        self._checkpoint_timer = None

        if self._config:
            self._executors_managed = True
            # Create the executors
            epf = EPF()
            self.executors = epf.make(self.rundir, self._config)

            # set global vars from config
            self.lazy_fail = self._config["globals"].get(
                "lazyErrors", lazyErrors)
            self.fail_retries = self._config["globals"].get("retries", retries)
            self.flowcontrol = FlowControl(self, self._config)
            self.checkpoint_mode = self._config["globals"].get(
                "checkpointMode", checkpointMode)
            if self.checkpoint_mode == "periodic":
                period = self._config["globals"].get("checkpointPeriod",
                                                     "00:30:00")
                try:
                    h, m, s = map(int, period.split(':'))
                    checkpoint_period = (h * 3600) + (m * 60) + s
                    self._checkpoint_timer = Timer(self.checkpoint,
                                                   interval=checkpoint_period)
                except Exception as e:
                    logger.error(
                        "invalid checkpointPeriod provided:{0} expected HH:MM:SS"
                        .format(period))
                    self._checkpoint_timer = Timer(self.checkpoint,
                                                   interval=(30 * 60))

        else:
            self._executors_managed = False
            self.fail_retries = retries
            self.lazy_fail = lazyErrors
            self.executors = {i: x for i, x in enumerate(executors)}
            self.flowcontrol = FlowNoControl(self, None)
            self.checkpoint_mode = checkpointMode

        self.task_count = 0
        self.fut_task_lookup = {}
        self.tasks = {}
        self.task_launch_lock = threading.Lock()

        logger.debug("Using executors: {0}".format(self.executors))
        atexit.register(self.cleanup)
Example #7
import parsl

from parsl.config import Config
from parsl.providers import SlurmProvider
from parsl.launchers import SrunLauncher
from parsl.executors import HighThroughputExecutor
from parsl.addresses import address_by_hostname
from parsl.app.app import bash_app
from parsl.app.app import python_app


@python_app()
def worker_info():
    #import subprocess
    import os
    return os.uname()


parsl.set_file_logger(filename='parsl-ornl-slurm-log')

config = Config(
    app_cache=True,
    checkpoint_files=None,
    checkpoint_mode=None,
    checkpoint_period=None,
    data_management_max_threads=10,
    executors=[
        HighThroughputExecutor(
            address='130.199.185.13',
            cores_per_worker=1.0,
            heartbeat_period=30,
            heartbeat_threshold=120,
            interchange_port_range=(55000, 56000),
            label='dcde-ext.ornl.gov-slurm',
Example #8
def remote_side_bash_executor(func, *args, **kwargs):
    """Executes the supplied function with *args and **kwargs to get a
    command-line to run, and then run that command-line using bash.
    """
    import os
    import time
    import subprocess
    import logging
    import parsl.app.errors as pe
    from parsl import set_file_logger
    from parsl.utils import get_std_fname_mode

    logbase = "/tmp"
    format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s]  %(message)s"

    # make this name unique per invocation so that each invocation can
    # log to its own file. It would be better to include the task_id here
    # but that is awkward to wire through at the moment as apps do not
    # have access to that execution context.
    t = time.time()

    logname = __name__ + "." + str(t)
    logger = logging.getLogger(logname)
    set_file_logger(filename='{0}/bashexec.{1}.log'.format(logbase, t),
                    name=logname,
                    level=logging.DEBUG,
                    format_string=format_string)

    func_name = func.__name__

    executable = None

    # Try to run the func to compose the commandline
    try:
        # Execute the func to get the commandline
        executable = func(*args, **kwargs)

        if not isinstance(executable, str):
            raise ValueError(
                f"Expected a str for bash_app commandline, got {type(executable)}"
            )

    except AttributeError as e:
        if executable is not None:
            raise pe.AppBadFormatting(
                "App formatting failed for app '{}' with AttributeError: {}".
                format(func_name, e))
        else:
            raise pe.BashAppNoReturn(
                "Bash app '{}' did not return a value, or returned None - with this exception: {}"
                .format(func_name, e))

    except IndexError as e:
        raise pe.AppBadFormatting(
            "App formatting failed for app '{}' with IndexError: {}".format(
                func_name, e))
    except Exception as e:
        logger.error(
            "Caught exception during formatting of app '{}': {}".format(
                func_name, e))
        raise e

    logger.debug("Executable: %s", executable)

    # Updating stdout, stderr if values passed at call time.

    def open_std_fd(fdname):
        # fdname is 'stdout' or 'stderr'
        stdfspec = kwargs.get(fdname)  # spec is str name or tuple (name, mode)
        if stdfspec is None:
            return None

        fname, mode = get_std_fname_mode(fdname, stdfspec)
        try:
            if os.path.dirname(fname):
                os.makedirs(os.path.dirname(fname), exist_ok=True)
            fd = open(fname, mode)
        except Exception as e:
            raise pe.BadStdStreamFile(fname, e)
        return fd

    std_out = open_std_fd('stdout')
    std_err = open_std_fd('stderr')
    timeout = kwargs.get('walltime')

    if std_err is not None:
        print('--> executable follows <--\n{}\n--> end executable <--'.format(
            executable),
              file=std_err,
              flush=True)

    returncode = None
    try:
        proc = subprocess.Popen(executable,
                                stdout=std_out,
                                stderr=std_err,
                                shell=True,
                                executable='/bin/bash')
        proc.wait(timeout=timeout)
        returncode = proc.returncode

    except subprocess.TimeoutExpired:
        raise pe.AppTimeout("[{}] App exceeded walltime: {}".format(
            func_name, timeout))

    except Exception as e:
        raise pe.AppException(
            "[{}] App caught exception with returncode: {}".format(
                func_name, returncode), e)

    if returncode != 0:
        raise pe.BashExitFailure(func_name, proc.returncode)

    # TODO : Add support for globs here

    missing = []
    for outputfile in kwargs.get('outputs', []):
        fpath = outputfile.filepath

        if not os.path.exists(fpath):
            missing.extend([outputfile])

    if missing:
        raise pe.MissingOutputs("[{}] Missing outputs".format(func_name),
                                missing)

    return returncode
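This wrapper is what a @bash_app call ultimately executes; a short usage sketch, assuming a Parsl configuration has already been loaded (the app and file names are illustrative):

from parsl import bash_app

@bash_app
def echo_hello(stdout='hello.out', stderr='hello.err'):
    # The returned string is the command line handed to the wrapper above.
    return 'echo "Hello from a bash_app"'

echo_hello().result()  # waits for the command to finish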
Example #9
    def __init__(self, config=Config()):
        """Initialize the DataFlowKernel.

        Parameters
        ----------
        config : Config
            A specification of all configuration options. For more details see the
            :class:~`parsl.config.Config` documentation.
        """

        # this will be used to check cleanup only happens once
        self.cleanup_called = False

        if isinstance(config, dict):
            raise ConfigurationError(
                'Expected `Config` class, received dictionary. For help, '
                'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html'
            )
        self._config = config
        logger.debug("Starting DataFlowKernel with config\n{}".format(config))
        self.run_dir = make_rundir(config.run_dir)
        parsl.set_file_logger("{}/parsl.log".format(self.run_dir),
                              level=logging.DEBUG)

        logger.info("Parsl version: {}".format(get_version()))
        logger.info("Libsubmit version: {}".format(libsubmit.__version__))

        self.checkpoint_lock = threading.Lock()

        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        checkpoints = self.load_checkpoints(config.checkpoint_files)
        self.memoizer = Memoizer(self,
                                 memoize=config.app_cache,
                                 checkpoint=checkpoints)
        self.checkpointed_tasks = 0
        self._checkpoint_timer = None
        self.checkpoint_mode = config.checkpoint_mode

        data_manager = DataManager.get_data_manager(
            max_threads=config.data_management_max_threads,
            executors=config.executors)
        self.executors = {
            e.label: e
            for e in config.executors + [data_manager]
        }
        for executor in self.executors.values():
            executor.run_dir = self.run_dir  # FIXME we should have a real interface for this
            executor.start()

        if self.checkpoint_mode == "periodic":
            try:
                h, m, s = map(int, config.checkpoint_period.split(':'))
                checkpoint_period = (h * 3600) + (m * 60) + s
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=checkpoint_period)
            except Exception:
                logger.error(
                    "invalid checkpoint_period provided: {0} expected HH:MM:SS".
                    format(config.checkpoint_period))
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=(30 * 60))

        if any([x.managed for x in config.executors]):
            self.flowcontrol = FlowControl(self)
        else:
            self.flowcontrol = FlowNoControl(self)

        self.task_count = 0
        self.fut_task_lookup = {}
        self.tasks = {}
        self.task_launch_lock = threading.Lock()

        atexit.register(self.atexit_cleanup)
Example #10
def remote_side_sandbox_executor(func, *args, **kwargs):
    """Executes the supplied function with *args and **kwargs to get a
    command-line to run, and then run that command-line using bash.
    """
    import os
    import time
    import subprocess
    import logging
    import parsl.app.errors as pe
    from parsl import set_file_logger
    from parsl.utils import get_std_fname_mode

    sandbox = Sandbox("scratch")

    logbase = "/tmp"
    format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s]  %(message)s"

    # make this name unique per invocation so that each invocation can
    # log to its own file. It would be better to include the task_id here
    # but that is awkward to wire through at the moment as apps do not
    # have access to that execution context.
    t = time.time()

    logname = __name__ + "." + str(t)
    logger = logging.getLogger(logname)
    set_file_logger(filename='{0}/bashexec.{1}.log'.format(logbase, t),
                    name=logname,
                    level=logging.DEBUG,
                    format_string=format_string)

    func_name = func.__name__

    executable = None

    # Try to run the func to compose the commandline
    try:
        # Execute the func to get the commandline
        executable = func(*args, **kwargs)

    except AttributeError as e:
        if executable is not None:
            raise pe.AppBadFormatting(
                "App formatting failed for app '{}' with AttributeError: {}".
                format(func_name, e))
        else:
            raise pe.BashAppNoReturn(
                "Bash app '{}' did not return a value, or returned None - with this exception: {}"
                .format(func_name, e))

    except IndexError as e:
        raise pe.AppBadFormatting(
            "App formatting failed for app '{}' with IndexError: {}".format(
                func_name, e))
    except Exception as e:
        logger.error(
            "Caught exception during formatting of app '{}': {}".format(
                func_name, e))
        raise e

    logger.debug("Executable: %s", executable)

    # Updating stdout, stderr if values passed at call time.

    def open_std_fd(fdname):
        # fdname is 'stdout' or 'stderr'
        stdfspec = kwargs.get(fdname)  # spec is str name or tuple (name, mode)
        if stdfspec is None:
            return None

        fname, mode = get_std_fname_mode(fdname, stdfspec)
        try:
            if os.path.dirname(fname):
                os.makedirs(os.path.dirname(fname), exist_ok=True)
            fd = open(fname, mode)
        except Exception as e:
            raise pe.BadStdStreamFile(fname, e)
        return fd

    std_out = open_std_fd('stdout')
    std_err = open_std_fd('stderr')
    timeout = kwargs.get('walltime')
    project = kwargs.get('project', "")
    unique_id = kwargs.get('unique_id', "HUMAN")

    sandbox.create_working_dir(unique_id, func_name)

    workflow_schema = "workflow://" + project + "/" + unique_id + "/"

    if std_err is not None:
        print('--> executable follows <--\n{}\n--> end executable <--'.format(
            executable),
              file=std_err,
              flush=True)

    return_value = None
    try:

        cwd = None

        working_directory = "scratch" + os.path.sep + unique_id

        os.makedirs(working_directory)

        cwd = os.getcwd()
        os.chdir(working_directory)
        logger.debug("workflow://schema: %s", workflow_schema)

        # Resolve workflow:// inputs
        for input in kwargs.get('inputs', []):
            if "workflow://" in input:
                print(input)

        proc = subprocess.Popen(executable,
                                stdout=std_out,
                                stderr=std_err,
                                shell=True,
                                executable='/bin/bash')
        proc.wait(timeout=timeout)

        return_value = {
            'unique_id': unique_id,
            'working_directory': working_directory,
            'workflow_schema': workflow_schema,
            'return_code': proc.returncode
        }

        if cwd is not None:
            os.chdir(cwd)

    except subprocess.TimeoutExpired:
        raise pe.AppTimeout("[{}] App exceeded walltime: {}".format(
            func_name, timeout))

    except Exception as e:
        raise pe.AppException(
            "[{}] App caught exception with return value: {}".format(
                func_name, json.dumps(return_value)), e)

    if proc.returncode != 0:
        raise pe.BashExitFailure(func_name, proc.returncode)

    # TODO : Add support for globs here

    missing = []
    for outputfile in kwargs.get('outputs', []):
        fpath = outputfile.filepath

        if not os.path.exists(fpath):
            missing.extend([outputfile])

    if missing:
        raise pe.MissingOutputs("[{}] Missing outputs".format(func_name),
                                missing)

    return return_value
Example #11
    def __init__(self,
                 config=None,
                 executors=None,
                 lazyErrors=True,
                 appCache=True,
                 rundir=None,
                 retries=0,
                 checkpointFiles=None):
        """ Initialize the DataFlowKernel

        Please note that keyword args passed to the DFK here will always override
        options passed in via the config.

        KWargs:
            - config (dict): A single data object encapsulating all config attributes
            - executors (list of Executor objs): Optional, kept for (somewhat) backward compatibility with 0.2.0
            - lazyErrors (bool): Default=True, allow workflow to continue on app failures.
            - appCache (bool): Enable caching of apps
            - rundir (str): Path to run directory. Defaults to ./runinfo/runNNN
            - retries (int): Default=0, set the number of retry attempts in case of failure
            - checkpointFiles (list of str): List of filepaths to checkpoint files

        Returns:
            DataFlowKernel object
        """
        # Create run dirs for this run
        self.rundir = make_rundir(config=config, path=rundir)
        parsl.set_file_logger("{}/parsl.log".format(self.rundir),
                              level=logging.INFO)

        logger.info("Parsl version: {}".format(parsl.__version__))
        logger.info("Libsubmit version: {}".format(libsubmit.__version__))

        # Update config with defaults
        self._config = update_config(config, self.rundir)

        # Start the anonymized usage tracker and send init msg
        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        # Load checkpoints if any
        cpts = self.load_checkpoints(checkpointFiles)
        # Initialize the memoizer
        self.memoizer = Memoizer(self, memoize=appCache, checkpoint=cpts)

        if self._config:
            self._executors_managed = True
            # Create the executors
            epf = EPF()
            self.executors = epf.make(self.rundir, self._config)

            # set global vars from config
            self.lazy_fail = self._config["globals"].get(
                "lazyErrors", lazyErrors)
            self.fail_retries = self._config["globals"].get("retries", retries)
            self.flowcontrol = FlowControl(self, self._config)
        else:
            self._executors_managed = False
            self.fail_retries = retries
            self.lazy_fail = lazyErrors
            self.executors = {i: x for i, x in enumerate(executors)}
            self.flowcontrol = FlowNoControl(self, None)

        self.task_count = 0
        self.fut_task_lookup = {}
        self.tasks = {}
        self.task_launch_lock = threading.Lock()

        logger.debug("Using executors: {0}".format(self.executors))
        atexit.register(self.cleanup)
Example #12
def sandbox_runner(func, *args, **kwargs):
    """Executes the supplied function with *args and **kwargs to get a
    command-line to run, and then run that command-line using bash.
    """
    import os
    import time
    import subprocess
    import logging
    import parsl.app.errors as pe
    from parsl import set_file_logger
    from parsl.utils import get_std_fname_mode
    from parsl.data_provider.files import File

    import json

    # create a sandbox passing the name of the scratch directory
    sandbox = Sandbox()

    logbase = "/tmp"
    format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s]  %(message)s"

    # make this name unique per invocation so that each invocation can
    # log to its own file. It would be better to include the task_id here
    # but that is awkward to wire through at the moment as apps do not
    # have access to that execution context.
    t = time.time()

    logname = __name__ + "." + str(t)
    logger = logging.getLogger(logname)
    set_file_logger(filename='{0}/bashexec.{1}.log'.format(logbase, t), name=logname, level=logging.DEBUG,
                    format_string=format_string)

    func_name = func.__name__

    executable = None

    # the workflow_name
    sandbox.workflow_name = kwargs.get('project', "")

    # app name
    sandbox.app_name = kwargs.get("workflow_app_name", "")

    # create a working dir with the sandbox
    sandbox.createWorkingDirectory()

    # workflow schema as workflow:///funcNameUUID
    workflow_schema = "workflow://" + sandbox.workflow_name + "/" + sandbox.app_name

    # tasks dep of the current task
    if 'tasks' in kwargs:
        sandbox.tasks_dep = kwargs.get('tasks', "")
        logger.debug(sandbox.tasks_dep)

    # Try to run the func to compose the commandline
    try:
        # Execute the func to get the commandline
        executable = func(*args, **kwargs)
        executable = sandbox.define_command(executable)
        logger.debug(executable)
    except AttributeError as e:
        if executable is not None:
            raise pe.AppBadFormatting("App formatting failed for app '{}' with AttributeError: {}".format(func_name, e))
        else:
            raise pe.BashAppNoReturn(
                "Bash app '{}' did not return a value, or returned None - with this exception: {}".format(func_name, e))

    except IndexError as e:
        raise pe.AppBadFormatting("App formatting failed for app '{}' with IndexError: {}".format(func_name, e))
    except Exception as e:
        raise e

    # Updating stdout, stderr if values passed at call time.

    def open_std_fd(fdname):
        # fdname is 'stdout' or 'stderr'
        stdfspec = kwargs.get(fdname)  # spec is str name or tuple (name, mode)
        if stdfspec is None:
            return None

        fname, mode = get_std_fname_mode(fdname, stdfspec)
        try:
            if os.path.dirname(fname):
                os.makedirs(os.path.dirname(fname), exist_ok=True)
            fd = open(fname, mode)
        except Exception as e:
            raise pe.BadStdStreamFile(fname, e)
        return fd

    std_out = open_std_fd('stdout')
    std_err = open_std_fd('stderr')
    timeout = kwargs.get('walltime')

    if std_err is not None:
        print('--> executable follows <--\n{}\n--> end executable <--'.format(executable), file=std_err, flush=True)

    return_value = None
    try:

        logger.debug("workflow://schema: %s", workflow_schema)

        proc = subprocess.Popen(executable, stdout=std_out, stderr=std_err, shell=True, executable='/bin/bash')
        proc.wait(timeout=timeout)

        return_value = {
            'return_code': proc.returncode,
            'working_directory': sandbox.working_directory,
        }

    except subprocess.TimeoutExpired:
        raise pe.AppTimeout("[{}] App exceeded walltime: {}".format(func_name, timeout))

    except Exception as e:
        raise pe.AppException("[{}] App caught exception with return value: {}"
                              .format(func_name, json.dumps(return_value)), e)

    if proc.returncode != 0:
        raise pe.BashExitFailure(func_name, proc.returncode)

    # TODO : Add support for globs here

    missing = []
    for outputfile in kwargs.get('outputs', []):
        fpath = outputfile.filepath

        if not os.path.exists(fpath):
            missing.extend([outputfile])

    if missing:
        raise pe.MissingOutputs("[{}] Missing outputs".format(func_name), missing)
    return return_value
Example #13
    def __init__(self, config=Config()):
        """Initialize the DataFlowKernel.

        Parameters
        ----------
        config : Config
            A specification of all configuration options. For more details see the
            :class:~`parsl.config.Config` documentation.
        """

        # this will be used to check cleanup only happens once
        self.cleanup_called = False

        if isinstance(config, dict):
            raise ConfigurationError(
                'Expected `Config` class, received dictionary. For help, '
                'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html'
            )
        self._config = config
        logger.debug("Starting DataFlowKernel with config\n{}".format(config))
        self.run_dir = make_rundir(config.run_dir)
        parsl.set_file_logger("{}/parsl.log".format(self.run_dir),
                              level=logging.DEBUG)

        logger.info("Parsl version: {}".format(get_version()))
        logger.info("Libsubmit version: {}".format(libsubmit.__version__))

        self.checkpoint_lock = threading.Lock()

        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        # ES logging
        self.db_logger_config = config.db_logger_config
        self.db_logger = get_db_logger(
            enable_es_logging=False
        ) if self.db_logger_config is None else get_db_logger(
            **self.db_logger_config)
        self.workflow_name = str(inspect.stack()[1][1])
        self.time_began = datetime.now()
        self.time_completed = None
        self.run_id = self.workflow_name + "-" + str(self.time_began.minute)
        self.dashboard = self.db_logger_config.get(
            'dashboard_link',
            None) if self.db_logger_config is not None else None
        # TODO: make configurable
        logger.info("Run id is: " + self.run_id)
        if self.dashboard is not None:
            logger.info("Dashboard is found at " + self.dashboard)
        self.db_logger.info("Python version: {}".format(sys.version_info))
        self.db_logger.info("Parsl version: {}".format(get_version()))
        self.db_logger.info("Libsubmit version: {}".format(
            libsubmit.__version__))
        self.db_logger.info(
            "DFK start",
            extra={
                "time_began":
                str(self.time_began.strftime('%Y-%m-%d %H:%M:%S')),
                'time_completed': str(self.time_completed),
                'task_run_id': self.run_id,
                'rundir': self.run_dir
            })
        self.db_logger.info("Name of script/workflow: " + self.run_id,
                            extra={'task_run_id': self.run_id})
        for executor in self._config.executors:
            self.db_logger.info("Listed executor: " + executor.label,
                                extra={'task_run_id': self.run_id})
        # ES logging end

        checkpoints = self.load_checkpoints(config.checkpoint_files)
        self.memoizer = Memoizer(self,
                                 memoize=config.app_cache,
                                 checkpoint=checkpoints)
        self.checkpointed_tasks = 0
        self._checkpoint_timer = None
        self.checkpoint_mode = config.checkpoint_mode

        data_manager = DataManager.get_data_manager(
            max_threads=config.data_management_max_threads,
            executors=config.executors)
        self.executors = {
            e.label: e
            for e in config.executors + [data_manager]
        }
        for executor in self.executors.values():
            executor.run_dir = self.run_dir  # FIXME we should have a real interface for this
            executor.start()

        if self.checkpoint_mode == "periodic":
            try:
                h, m, s = map(int, config.checkpoint_period.split(':'))
                checkpoint_period = (h * 3600) + (m * 60) + s
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=checkpoint_period)
            except Exception as e:
                logger.error(
                    "invalid checkpoint_period provided:{0} expected HH:MM:SS".
                    format(config.checkpoint_period))
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=(30 * 60))

        if any([x.managed for x in config.executors]):
            self.flowcontrol = FlowControl(self)
        else:
            self.flowcontrol = FlowNoControl(self)

        self.task_count = 0
        self.fut_task_lookup = {}
        self.tasks = {}
        self.task_launch_lock = threading.Lock()

        atexit.register(self.atexit_cleanup)
Example #14
def run(pipeline_config_filename, dry_run=False, pycmd='python3'):
    """
    Runs the pipeline
    """

    # Get current time in Unix milliseconds to define log directory
    init_time_ms = int(time.time() * 1e3)

    # YAML input file.
    pipe_config = yaml.safe_load(open(pipeline_config_filename))
    output_dir = pipe_config['output_dir']
    os.makedirs(output_dir, exist_ok=True)

    # Log directory should not already exist
    log_dir = f'{output_dir}/run_{str(init_time_ms)}'
    os.makedirs(log_dir, exist_ok=False)

    # Copy the main config files into the log directory
    shutil.copyfile(pipeline_config_filename, f'{log_dir}/pipeline_config.yml')
    stages_config = pipe_config['config']
    shutil.copyfile(stages_config, f'{log_dir}/stages_config.yml')

    # Optional logging of pipeline infrastructure to
    # file.
    log_file = f'{log_dir}/pipeline_log.txt'

    if log_file:
        parsl.set_file_logger(log_file)

    # Required configuration information
    # List of stage names, must be imported somewhere
    stages = pipe_config['stages']

    # Python modules in which to search for pipeline stages
    modules = pipe_config['modules'].split()

    # parsl execution/launcher configuration information
    launcher = pipe_config.get("launcher", "local")
    if launcher == "local":
        launcher_config = sites.local.make_launcher(stages)
    elif launcher == "cori":
        launcher_config = sites.cori.make_launcher(stages)
    elif launcher == "cori-interactive":
        launcher_config = sites.cori_interactive.make_launcher(stages)
    else:
        raise ValueError(f"Unknown launcher {launcher}")
    #
    # launcher_config = pipe_config['launcher']

    # Inputs and outputs
    inputs = pipe_config['inputs']
    resume = pipe_config['resume']

    for module in modules:
        __import__(module)

    # Create and run pipeline
    pipeline = Pipeline(launcher_config, stages, log_dir, pycmd=pycmd)

    if dry_run:
        pipeline.dry_run(inputs, output_dir, stages_config)
    else:
        pipeline.run(inputs, output_dir, resume, stages_config)
Example #15
from operator import itemgetter

import parsl
from parsl.app.app import python_app, bash_app
#from parsl.configs.local_threads import config

import logging
from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor
from parsl.monitoring import MonitoringHub
from parsl.addresses import address_by_hostname

# Define a configuration for using local threads and pilot jobs
#parsl.set_stream_logger()
FILENAME = 'log_monitor.txt'
parsl.set_file_logger(FILENAME, level=logging.DEBUG)
config = Config(
    executors=[
        ThreadPoolExecutor(
            max_threads=8, 
            label='local_threads'
        )
    ],
    monitoring=MonitoringHub(
        hub_address=address_by_hostname(),
        hub_port=55055,
        logging_level=logging.INFO,
        resource_monitoring_interval=10,
    ),
    strategy=None
)
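A short usage sketch to exercise the configuration above (the add() app below is only an illustration):

parsl.load(config)

@python_app
def add(a, b):
    return a + b

print(add(1, 2).result())  # runs under the MonitoringHub configured above
parsl.dfk().cleanup()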
Example #16
def workflow_config(name,
                    nodes,
                    cores_per_node=24,
                    interval=30,
                    monitor=False):
    import logging
    import parsl
    from parsl.config import Config
    from parsl.channels import LocalChannel
    from parsl.launchers import SrunLauncher
    from parsl.providers import LocalProvider
    from parsl.addresses import address_by_interface
    from parsl.executors import HighThroughputExecutor
    from parsl.monitoring.monitoring import MonitoringHub

    parsl.set_stream_logger()
    parsl.set_file_logger('script.output', level=logging.DEBUG)

    logging.info('Configuring Parsl Workflow Infrastructure')

    #Read where datasets are...
    env_str = str()
    with open('parsl.env', 'r') as reader:
        env_str = reader.read()

    logging.info(f'Task Environment {env_str}')

    mon_hub = MonitoringHub(
        workflow_name=name,
        hub_address=address_by_interface('ib0'),
        hub_port=60001,
        resource_monitoring_enabled=True,
        monitoring_debug=False,
        resource_monitoring_interval=interval,
    ) if monitor else None

    config = Config(
        executors=[
            HighThroughputExecutor(
                label=name,
                # Optional: The network interface on node 0 which compute nodes can communicate with.
                # address=address_by_interface('enp4s0f0' or 'ib0')
                address=address_by_interface('ib0'),
                # one worker per manager / node
                max_workers=cores_per_node,
                provider=LocalProvider(
                    channel=LocalChannel(script_dir='.'),
                    # make sure the nodes_per_block matches the nodes requested in the submit script in the next step
                    nodes_per_block=nodes,
                    # make sure
                    launcher=SrunLauncher(overrides=f'-c {cores_per_node}'),
                    cmd_timeout=120,
                    init_blocks=1,
                    max_blocks=1,
                    worker_init=env_str,
                ),
            )
        ],
        monitoring=mon_hub,
        strategy=None,
    )

    logging.info('Loading Parsl Config')

    parsl.load(config)
    return
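A possible call site for workflow_config(), assuming a two-node SLURM allocation with 24 cores per node and a parsl.env file describing the worker environment (the workflow name here is illustrative):

workflow_config('my_workflow', nodes=2, cores_per_node=24, monitor=True)
# ... @python_app / @bash_app tasks can now be defined and invoked ...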