def run(pipeline_config_filename, dry_run=False):
    """Run the pipeline described by a YAML configuration file.

    Parameters
    ----------
    pipeline_config_filename : str
        Path to the YAML pipeline configuration.  Environment variables
        in the file text are expanded before parsing.
    dry_run : bool
        If True, only report what would be executed instead of running
        the pipeline.

    Raises
    ------
    ValueError
        If the configured launcher name is not recognized.
    """
    # YAML input file.
    # Load the text and then expand any environment variables.
    # Use a context manager so the file handle is closed deterministically.
    with open(pipeline_config_filename) as config_file:
        raw_config_text = config_file.read()
    config_text = os.path.expandvars(raw_config_text)
    # Then parse with YAML.  safe_load avoids constructing arbitrary
    # Python objects from the config (yaml.load without an explicit
    # Loader is deprecated and unsafe on untrusted input).
    pipe_config = yaml.safe_load(config_text)

    # Optional logging of pipeline infrastructure to file.
    log_file = pipe_config.get('pipeline_log')
    if log_file:
        parsl.set_file_logger(log_file)

    # Required configuration information
    # List of stage names, must be imported somewhere
    stages = pipe_config['stages']

    # Python modules in which to search for pipeline stages
    modules = pipe_config['modules'].split()

    # parsl execution/launcher configuration information
    launcher = pipe_config.get("launcher", "local")
    if launcher == "local":
        launcher_config = sites.local.make_launcher(stages)
    elif launcher == "cori":
        launcher_config = sites.cori.make_launcher(stages)
    elif launcher == "cori-interactive":
        launcher_config = sites.cori_interactive.make_launcher(stages)
    else:
        raise ValueError(f"Unknown launcher {launcher}")

    # Inputs and outputs
    output_dir = pipe_config['output_dir']
    inputs = pipe_config['inputs']
    log_dir = pipe_config['log_dir']
    resume = pipe_config['resume']
    stages_config = pipe_config['config']

    # Import modules so their stages register themselves.
    for module in modules:
        __import__(module)

    # Create and run pipeline
    pipeline = Pipeline(launcher_config, stages)

    if dry_run:
        pipeline.dry_run(inputs, output_dir, stages_config)
    else:
        pipeline.run(inputs, output_dir, log_dir, resume, stages_config)
def main():
    """Drive the full-tokenizer app over all subjects via a parsl
    DataFlowKernel, keeping at most ``maxRunning`` batches in flight
    and submitting ``perBatch`` subjects per batch.
    """
    display("Loading DFK")
    parsl.set_file_logger("parsl.log", level=logging.DEBUG)
    dfk = parsl.DataFlowKernel(config=parsl_configs.rccNodeExclusive)
    display("Loading App")
    full_app = gen_full_tokenizer(dfk)
    display("Loading data iter")
    datIter = subjectIter()
    display("Starting run")
    running = {}        # batch name -> in-flight app future
    done = False        # main-loop termination flag
    succCount = 0       # total batches completed so far
    doneIter = False    # set once the subject iterator is exhausted
    try:
        while not done:
            #Only add maxRunning jobs to the queue at once
            while len(running) < maxRunning:
                batch = []
                for i in range(perBatch):
                    try:
                        batch.append(next(datIter))
                    except StopIteration:
                        #If none left just skip adding
                        doneIter = True
                        break
                # Batches are keyed by the first subject's wos_id.
                # NOTE(review): if the iterator is exhausted before any
                # item is appended, batch[0] raises IndexError, and this
                # inner while does not break on doneIter — confirm the
                # iterator never runs dry mid-fill in practice.
                batchName = batch[0]['wos_id']
                running[batchName] = full_app(batch)
            # checkRunning presumably prunes finished futures from
            # `running` and returns how many completed — verify against
            # its definition.
            succCount += checkRunning(running)
            display("Completed {}".format(succCount))
            #End the loop if all jobs are done and no more can be added
            if doneIter and len(running) < 1:
                done = True
    except KeyboardInterrupt:
        # Graceful shutdown on Ctrl-C: clean up the DFK, then re-raise.
        display("Closing down")
        dfk.cleanup()
        raise
    except:
        # Any other failure: restore stdout before propagating.
        resetStdout()
        raise
    dfk.cleanup()
    display("Done")
def run(pipeline_config_filename):
    """Run the pipeline described by a YAML configuration file.

    Parameters
    ----------
    pipeline_config_filename : str
        Path to the YAML pipeline configuration.

    Raises
    ------
    ValueError
        If the configured launcher name is not recognized.
    """
    # YAML input file.  Use a context manager so the file handle is
    # closed, and safe_load so arbitrary Python objects cannot be
    # constructed from the config (yaml.load without a Loader is
    # deprecated and unsafe on untrusted input).
    with open(pipeline_config_filename) as config_file:
        pipe_config = yaml.safe_load(config_file)

    # Optional logging of pipeline infrastructure to file.
    log_file = pipe_config.get('pipeline_log')
    if log_file:
        parsl.set_file_logger(log_file)

    # Required configuration information
    # List of stage names, must be imported somewhere
    stages = pipe_config['stages']

    # Python modules in which to search for pipeline stages
    modules = pipe_config['modules'].split()

    # parsl execution/launcher configuration information
    launcher = pipe_config.get("launcher", "local")
    if launcher == "local":
        launcher_config = sites.local.make_launcher(stages)
    elif launcher == "cori":
        launcher_config = sites.cori.make_launcher(stages)
    else:
        raise ValueError(f"Unknown launcher {launcher}")

    # Inputs and outputs
    output_dir = pipe_config['output_dir']
    inputs = pipe_config['inputs']
    log_dir = pipe_config['log_dir']
    resume = pipe_config['resume']
    stages_config = pipe_config['config']

    # Import modules so their stages register themselves.
    for module in modules:
        __import__(module)

    # Create and run pipeline
    pipeline = Pipeline(launcher_config, stages)
    pipeline.run(inputs, output_dir, log_dir, resume, stages_config)
def __init__(self, config=Config()):
    """Initialize the DataFlowKernel.

    Parameters
    ----------
    config : Config
        A specification of all configuration options. For more details
        see the :class:~`parsl.config.Config` documentation.
    """

    # this will be used to check cleanup only happens once
    self.cleanup_called = False
    # Guard against the pre-Config dict-based configuration style.
    if isinstance(config, dict):
        raise ConfigurationError(
            'Expected `Config` class, received dictionary. For help, '
            'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html'
        )
    self._config = config
    # NOTE(review): `config=Config()` is a mutable default evaluated once
    # at definition time — confirm Config construction is side-effect free.
    self.run_dir = make_rundir(config.run_dir)
    parsl.set_file_logger("{}/parsl.log".format(self.run_dir),
                          level=logging.DEBUG)
    logger.debug("Starting DataFlowKernel with config\n{}".format(config))
    logger.info("Parsl version: {}".format(get_version()))
    self.checkpoint_lock = threading.Lock()

    # Anonymized usage reporting.
    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # ES logging
    self.tasks_completed_count = 0
    self.tasks_failed_count = 0
    self.monitoring_config = config.monitoring_config
    if self.monitoring_config is not None and self.monitoring_config.database_type == 'local_database'\
            and self.monitoring_config.eng_link is None:
        # uses the rundir as the default location.
        logger.info(
            'Local monitoring database can be found inside the run_dir at: {}'
            .format(self.run_dir))
        self.monitoring_config.eng_link = "sqlite:///{}".format(
            os.path.join(os.path.abspath(self.run_dir), 'monitoring.db'))
    if self.monitoring_config is None:
        self.db_logger = get_db_logger()
    else:
        self.db_logger = get_db_logger(
            monitoring_config=self.monitoring_config)
    # Workflow name: explicit from monitoring config, otherwise the first
    # stack frame whose file is not a known parsl file.
    self.workflow_name = None
    if self.monitoring_config is not None and self.monitoring_config.workflow_name is not None:
        self.workflow_name = self.monitoring_config.workflow_name
    else:
        for frame in inspect.stack():
            fname = os.path.basename(str(frame.filename))
            parsl_file_names = ['dflow.py']
            # Find first file name not considered a parsl file
            if fname not in parsl_file_names:
                self.workflow_name = fname
                break
    self.workflow_version = None
    if self.monitoring_config is not None and self.monitoring_config.version is not None:
        self.workflow_version = self.monitoring_config.version
    self.time_began = time.time()
    self.time_completed = None
    self.run_id = str(uuid4())
    self.dashboard = self.monitoring_config.dashboard_link if self.monitoring_config is not None else None
    # TODO: make configurable
    logger.info("Run id is: " + self.run_id)
    if self.dashboard is not None:
        logger.info("Dashboard is found at " + self.dashboard)
    # start tornado logging server
    if self.monitoring_config is not None and self.monitoring_config.database_type == 'local_database':
        self.logging_server = multiprocessing.Process(
            target=logging_server.run,
            kwargs={'monitoring_config': self.monitoring_config})
        self.logging_server.start()
        self.web_app = multiprocessing.Process(
            target=index.run,
            kwargs={'monitoring_config': self.monitoring_config})
        self.web_app.start()
    else:
        self.logging_server = None
        self.web_app = None
    # Snapshot of workflow metadata emitted as the first DB-log record.
    workflow_info = {
        'python_version': sys.version_info,
        'parsl_version': get_version(),
        "time_began": str(self.time_began),
        'time_completed': str(None),
        'run_id': self.run_id,
        'workflow_name': self.workflow_name,
        'workflow_version': self.workflow_version,
        'rundir': self.run_dir,
        'tasks_completed_count': self.tasks_completed_count,
        'tasks_failed_count': self.tasks_failed_count,
        'user': getuser(),
        'host': gethostname(),
    }
    self.db_logger.info("DFK start", extra=workflow_info)
    # ES logging end

    # Checkpointing / app-result memoization.
    checkpoints = self.load_checkpoints(config.checkpoint_files)
    self.memoizer = Memoizer(self, memoize=config.app_cache,
                             checkpoint=checkpoints)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None
    self.checkpoint_mode = config.checkpoint_mode

    # Executors, keyed by label; the data manager is appended as one.
    data_manager = DataManager(
        max_threads=config.data_management_max_threads,
        executors=config.executors)
    self.executors = {
        e.label: e
        for e in config.executors + [data_manager]
    }
    for executor in self.executors.values():
        executor.run_dir = self.run_dir
        if hasattr(executor, 'provider'):
            if hasattr(executor.provider, 'script_dir'):
                executor.provider.script_dir = os.path.join(
                    self.run_dir, 'submit_scripts')
                if executor.provider.channel.script_dir is None:
                    executor.provider.channel.script_dir = os.path.join(
                        self.run_dir, 'submit_scripts')
                    # For remote channels where the run_dir is not
                    # visible, split local vs remote script dirs.
                    if not executor.provider.channel.isdir(self.run_dir):
                        parent, child = pathlib.Path(
                            self.run_dir).parts[-2:]
                        remote_run_dir = os.path.join(parent, child)
                        executor.provider.channel.script_dir = os.path.join(
                            remote_run_dir, 'remote_submit_scripts')
                        executor.provider.script_dir = os.path.join(
                            self.run_dir, 'local_submit_scripts')
                executor.provider.channel.makedirs(
                    executor.provider.channel.script_dir, exist_ok=True)
                os.makedirs(executor.provider.script_dir, exist_ok=True)
        executor.start()

    # Periodic checkpointing: period is HH:MM:SS; fall back to 30 min
    # on a malformed value.
    if self.checkpoint_mode == "periodic":
        try:
            h, m, s = map(int, config.checkpoint_period.split(':'))
            checkpoint_period = (h * 3600) + (m * 60) + s
            self._checkpoint_timer = Timer(self.checkpoint,
                                           interval=checkpoint_period)
        except Exception:
            logger.error(
                "invalid checkpoint_period provided:{0} expected HH:MM:SS".
                format(config.checkpoint_period))
            self._checkpoint_timer = Timer(self.checkpoint,
                                           interval=(30 * 60))

    # Flow control only if at least one executor is managed.
    if any([x.managed for x in config.executors]):
        self.flowcontrol = FlowControl(self)
    else:
        self.flowcontrol = FlowNoControl(self)

    self.task_count = 0
    self.tasks = {}
    self.submitter_lock = threading.Lock()

    atexit.register(self.atexit_cleanup)
def __init__(self, config=Config()):
    """Initialize the DataFlowKernel.

    Parameters
    ----------
    config : Config
        A specification of all configuration options. For more details
        see the :class:~`parsl.config.Config` documentation.
    """

    # this will be used to check cleanup only happens once
    self.cleanup_called = False
    # Guard against the pre-Config dict-based configuration style.
    if isinstance(config, dict):
        raise ConfigurationError(
            'Expected `Config` class, received dictionary. For help, '
            'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html')
    self._config = config
    self.run_dir = make_rundir(config.run_dir)
    if config.initialize_logging:
        parsl.set_file_logger("{}/parsl.log".format(self.run_dir),
                              level=logging.DEBUG)

    logger.debug("Starting DataFlowKernel with config\n{}".format(config))

    if sys.version_info < (3, 6):
        logger.warning("Support for python versions < 3.6 is deprecated and will be removed after parsl 0.10")

    logger.info("Parsl version: {}".format(get_version()))

    self.checkpoint_lock = threading.Lock()

    # Anonymized usage reporting.
    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # Monitoring
    self.run_id = str(uuid4())
    self.tasks_completed_count = 0
    self.tasks_failed_count = 0
    self.tasks_dep_fail_count = 0

    self.monitoring = config.monitoring
    # hub address and port for interchange to connect
    self.hub_address = None
    self.hub_interchange_port = None
    if self.monitoring:
        if self.monitoring.logdir is None:
            self.monitoring.logdir = self.run_dir
        self.hub_address = self.monitoring.hub_address
        self.hub_interchange_port = self.monitoring.start(self.run_id)

    self.time_began = datetime.datetime.now()
    self.time_completed = None

    # TODO: make configurable
    logger.info("Run id is: " + self.run_id)

    # Workflow name: explicit from monitoring config, otherwise the
    # first stack frame whose file is not a known parsl file.
    self.workflow_name = None
    if self.monitoring is not None and self.monitoring.workflow_name is not None:
        self.workflow_name = self.monitoring.workflow_name
    else:
        for frame in inspect.stack():
            fname = os.path.basename(str(frame.filename))
            parsl_file_names = ['dflow.py', 'typeguard.py']
            # Find first file name not considered a parsl file
            if fname not in parsl_file_names:
                self.workflow_name = fname
                break

    # Default version is the start timestamp (second resolution).
    self.workflow_version = str(self.time_began.replace(microsecond=0))
    if self.monitoring is not None and self.monitoring.workflow_version is not None:
        self.workflow_version = self.monitoring.workflow_version

    # Snapshot of workflow metadata sent as the first monitoring record.
    workflow_info = {
        'python_version': "{}.{}.{}".format(sys.version_info.major,
                                            sys.version_info.minor,
                                            sys.version_info.micro),
        'parsl_version': get_version(),
        "time_began": self.time_began,
        'time_completed': None,
        'workflow_duration': None,
        'run_id': self.run_id,
        'workflow_name': self.workflow_name,
        'workflow_version': self.workflow_version,
        'rundir': self.run_dir,
        'tasks_completed_count': self.tasks_completed_count,
        'tasks_failed_count': self.tasks_failed_count,
        'user': getuser(),
        'host': gethostname(),
    }

    if self.monitoring:
        self.monitoring.send(MessageType.WORKFLOW_INFO, workflow_info)

    # Checkpointing / app-result memoization.
    checkpoints = self.load_checkpoints(config.checkpoint_files)
    self.memoizer = Memoizer(self, memoize=config.app_cache,
                             checkpoint=checkpoints)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None
    self.checkpoint_mode = config.checkpoint_mode

    # The data manager runs on its own thread-pool executor, registered
    # alongside the configured executors.
    self.data_manager = DataManager(self)
    self.executors = {}
    data_manager_executor = ThreadPoolExecutor(
        max_threads=config.data_management_max_threads,
        label='data_manager')
    self.add_executors(config.executors + [data_manager_executor])

    # Periodic checkpointing: period is HH:MM:SS; fall back to 30 min
    # on a malformed value.
    if self.checkpoint_mode == "periodic":
        try:
            h, m, s = map(int, config.checkpoint_period.split(':'))
            checkpoint_period = (h * 3600) + (m * 60) + s
            self._checkpoint_timer = Timer(self.checkpoint,
                                           interval=checkpoint_period,
                                           name="Checkpoint")
        except Exception:
            logger.error("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
            self._checkpoint_timer = Timer(self.checkpoint,
                                           interval=(30 * 60),
                                           name="Checkpoint")

    # if we use the functionality of dynamically adding executors
    # all executors should be managed.
    if any([x.managed for x in config.executors]):
        self.flowcontrol = FlowControl(self)
    else:
        self.flowcontrol = FlowNoControl(self)

    self.task_count = 0
    self.tasks = {}
    self.submitter_lock = threading.Lock()

    atexit.register(self.atexit_cleanup)
def __init__(self,
             config=None,
             executors=None,
             lazyErrors=True,
             appCache=True,
             rundir=None,
             retries=0,
             checkpointFiles=None,
             checkpointMode=None,
             data_manager=None):
    """ Initialize the DataFlowKernel.

    Please note that keyword args passed to the DFK here will always
    override options passed in via the config.

    KWargs:
        - config (dict) : A single data object encapsulating all config attributes
        - executors (list of Executor objs): Optional, kept for (somewhat)
          backward compatibility with 0.2.0
        - lazyErrors(bool) : Default=True, allow workflow to continue on app failures.
        - appCache (bool) :Enable caching of apps
        - rundir (str) : Path to run directory. Defaults to ./runinfo/runNNN
        - retries(int): Default=0, Set the number of retry attempts in case of failure
        - checkpointFiles (list of str): List of filepaths to checkpoint files
        - checkpointMode (None, 'dfk_exit', 'task_exit', 'periodic'): Method to use.
        - data_manager (DataManager): User created DataManager

    Returns:
        DataFlowKernel object
    """

    # Create run dirs for this run
    self.rundir = make_rundir(config=config, path=rundir)
    parsl.set_file_logger("{}/parsl.log".format(self.rundir),
                          level=logging.DEBUG)

    logger.info("Parsl version: {}".format(parsl.__version__))
    logger.info("Libsubmit version: {}".format(libsubmit.__version__))

    # Update config with defaults
    self._config = update_config(config, self.rundir)

    # Set the data manager
    if data_manager:
        self.data_manager = data_manager
    else:
        self.data_manager = DataManager(config=self._config)

    # Start the anonymized usage tracker and send init msg
    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # Load checkpoints if any
    cpts = self.load_checkpoints(checkpointFiles)
    # Initialize the memoizer
    self.memoizer = Memoizer(self, memoize=appCache, checkpoint=cpts)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None

    # With a config, executors are created for us and flow control is
    # active; without one, the caller-supplied executor list is used.
    if self._config:
        self._executors_managed = True
        # Create the executors
        epf = EPF()
        self.executors = epf.make(self.rundir, self._config)

        # set global vars from config
        self.lazy_fail = self._config["globals"].get(
            "lazyErrors", lazyErrors)
        self.fail_retries = self._config["globals"].get("retries", retries)
        self.flowcontrol = FlowControl(self, self._config)
        self.checkpoint_mode = self._config["globals"].get(
            "checkpointMode", checkpointMode)
        # Periodic checkpointing: period is HH:MM:SS; fall back to
        # 30 min on a malformed value.
        if self.checkpoint_mode == "periodic":
            period = self._config["globals"].get("checkpointPeriod",
                                                 "00:30:00")
            try:
                h, m, s = map(int, period.split(':'))
                checkpoint_period = (h * 3600) + (m * 60) + s
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=checkpoint_period)
            except Exception as e:
                logger.error(
                    "invalid checkpointPeriod provided:{0} expected HH:MM:SS"
                    .format(period))
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=(30 * 60))
    else:
        self._executors_managed = False
        self.fail_retries = retries
        self.lazy_fail = lazyErrors
        self.executors = {i: x for i, x in enumerate(executors)}
        self.flowcontrol = FlowNoControl(self, None)
        self.checkpoint_mode = checkpointMode

    self.task_count = 0
    self.fut_task_lookup = {}
    self.tasks = {}
    self.task_launch_lock = threading.Lock()

    logger.debug("Using executors: {0}".format(self.executors))
    atexit.register(self.cleanup)
from parsl.providers import SlurmProvider from parsl.launchers import SrunLauncher from parsl.executors import HighThroughputExecutor from parsl.addresses import address_by_hostname from parsl.app.app import bash_app from parsl.app.app import python_app @python_app() def worker_info(): #import subprocess import os return os.uname() parsl.set_file_logger(filename='parsl-ornl-slurm-log') config = Config( app_cache=True, checkpoint_files=None, checkpoint_mode=None, checkpoint_period=None, data_management_max_threads=10, executors=[ HighThroughputExecutor( address='130.199.185.13', cores_per_worker=1.0, heartbeat_period=30, heartbeat_threshold=120, interchange_port_range=(55000, 56000), label='dcde-ext.ornl.gov-slurm',
def remote_side_bash_executor(func, *args, **kwargs):
    """Executes the supplied function with *args and **kwargs to get a
    command-line to run, and then run that command-line using bash.

    Returns the subprocess return code (0 on success); raises a
    parsl.app.errors exception on formatting, timeout, execution or
    missing-output failures.
    """
    # Imports are local because this function body is shipped to and
    # executed on the remote worker.
    import os
    import time
    import subprocess
    import logging
    import parsl.app.errors as pe
    from parsl import set_file_logger
    from parsl.utils import get_std_fname_mode

    logbase = "/tmp"
    format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s] %(message)s"

    # make this name unique per invocation so that each invocation can
    # log to its own file. It would be better to include the task_id here
    # but that is awkward to wire through at the moment as apps do not
    # have access to that execution context.
    t = time.time()
    logname = __name__ + "." + str(t)
    logger = logging.getLogger(logname)
    set_file_logger(filename='{0}/bashexec.{1}.log'.format(logbase, t),
                    name=logname,
                    level=logging.DEBUG,
                    format_string=format_string)

    func_name = func.__name__

    executable = None

    # Try to run the func to compose the commandline
    try:
        # Execute the func to get the commandline
        executable = func(*args, **kwargs)

        if not isinstance(executable, str):
            raise ValueError(
                f"Expected a str for bash_app commandline, got {type(executable)}"
            )

    except AttributeError as e:
        # An AttributeError after a non-None result means formatting
        # failed; before a result it means the app returned nothing.
        if executable is not None:
            raise pe.AppBadFormatting(
                "App formatting failed for app '{}' with AttributeError: {}".
                format(func_name, e))
        else:
            raise pe.BashAppNoReturn(
                "Bash app '{}' did not return a value, or returned None - with this exception: {}"
                .format(func_name, e))

    except IndexError as e:
        raise pe.AppBadFormatting(
            "App formatting failed for app '{}' with IndexError: {}".format(
                func_name, e))
    except Exception as e:
        logger.error(
            "Caught exception during formatting of app '{}': {}".format(
                func_name, e))
        raise e

    logger.debug("Executable: %s", executable)

    # Updating stdout, stderr if values passed at call time.

    def open_std_fd(fdname):
        # fdname is 'stdout' or 'stderr'
        stdfspec = kwargs.get(fdname)  # spec is str name or tuple (name, mode)
        if stdfspec is None:
            return None

        fname, mode = get_std_fname_mode(fdname, stdfspec)
        try:
            # Create parent directories of the stream file as needed.
            if os.path.dirname(fname):
                os.makedirs(os.path.dirname(fname), exist_ok=True)
            fd = open(fname, mode)
        except Exception as e:
            raise pe.BadStdStreamFile(fname, e)
        return fd

    std_out = open_std_fd('stdout')
    std_err = open_std_fd('stderr')
    timeout = kwargs.get('walltime')

    # Echo the command line into stderr for post-mortem debugging.
    if std_err is not None:
        print('--> executable follows <--\n{}\n--> end executable <--'.format(
            executable),
              file=std_err,
              flush=True)

    returncode = None
    try:
        proc = subprocess.Popen(executable,
                                stdout=std_out,
                                stderr=std_err,
                                shell=True,
                                executable='/bin/bash')
        proc.wait(timeout=timeout)
        returncode = proc.returncode

    except subprocess.TimeoutExpired:
        raise pe.AppTimeout("[{}] App exceeded walltime: {}".format(
            func_name, timeout))

    except Exception as e:
        raise pe.AppException(
            "[{}] App caught exception with returncode: {}".format(
                func_name, returncode), e)

    if returncode != 0:
        raise pe.BashExitFailure(func_name, proc.returncode)

    # TODO : Add support for globs here

    # Verify that all declared output files were actually produced.
    missing = []
    for outputfile in kwargs.get('outputs', []):
        fpath = outputfile.filepath

        if not os.path.exists(fpath):
            missing.extend([outputfile])

    if missing:
        raise pe.MissingOutputs("[{}] Missing outputs".format(func_name),
                                missing)

    return returncode
def __init__(self, config=Config()):
    """Initialize the DataFlowKernel.

    Parameters
    ----------
    config : Config
        A specification of all configuration options. For more details
        see the :class:~`parsl.config.Config` documentation.

    Raises
    ------
    ConfigurationError
        If a plain dictionary is passed instead of a Config instance.
    """

    # this will be used to check cleanup only happens once
    self.cleanup_called = False
    # Guard against the pre-Config dict-based configuration style.
    if isinstance(config, dict):
        raise ConfigurationError(
            'Expected `Config` class, received dictionary. For help, '
            'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html'
        )
    self._config = config
    logger.debug("Starting DataFlowKernel with config\n{}".format(config))
    self.run_dir = make_rundir(config.run_dir)
    parsl.set_file_logger("{}/parsl.log".format(self.run_dir),
                          level=logging.DEBUG)
    logger.info("Parsl version: {}".format(get_version()))
    logger.info("Libsubmit version: {}".format(libsubmit.__version__))
    self.checkpoint_lock = threading.Lock()

    # Anonymized usage reporting.
    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # Checkpointing / app-result memoization.
    checkpoints = self.load_checkpoints(config.checkpoint_files)
    self.memoizer = Memoizer(self, memoize=config.app_cache,
                             checkpoint=checkpoints)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None
    self.checkpoint_mode = config.checkpoint_mode

    # Executors, keyed by label; the data manager is appended as one.
    data_manager = DataManager.get_data_manager(
        max_threads=config.data_management_max_threads,
        executors=config.executors)
    self.executors = {
        e.label: e
        for e in config.executors + [data_manager]
    }
    for executor in self.executors.values():
        executor.run_dir = self.run_dir  # FIXME we should have a real interface for this
        executor.start()

    # Periodic checkpointing: period is HH:MM:SS; fall back to 30 min
    # on a malformed value.
    if self.checkpoint_mode == "periodic":
        try:
            h, m, s = map(int, config.checkpoint_period.split(':'))
            checkpoint_period = (h * 3600) + (m * 60) + s
            self._checkpoint_timer = Timer(self.checkpoint,
                                           interval=checkpoint_period)
        except Exception:
            # BUGFIX: the original referenced an undefined name `period`
            # here, so a malformed checkpoint_period raised NameError
            # instead of logging the intended message.
            logger.error(
                "invalid checkpoint_period provided:{0} expected HH:MM:SS".
                format(config.checkpoint_period))
            self._checkpoint_timer = Timer(self.checkpoint,
                                           interval=(30 * 60))

    # Flow control only if at least one executor is managed.
    if any([x.managed for x in config.executors]):
        self.flowcontrol = FlowControl(self)
    else:
        self.flowcontrol = FlowNoControl(self)

    self.task_count = 0
    self.fut_task_lookup = {}
    self.tasks = {}
    self.task_launch_lock = threading.Lock()

    atexit.register(self.atexit_cleanup)
def remote_side_sandbox_executor(func, *args, **kwargs):
    """Executes the supplied function with *args and **kwargs to get a
    command-line to run, and then run that command-line using bash.

    The command runs inside a per-invocation sandbox working directory
    under ``scratch/<unique_id>``; returns a dict with the unique id,
    working directory, workflow:// schema and the process return code.
    """
    # Imports are local because this function body is shipped to and
    # executed on the remote worker.
    import os
    import time
    import subprocess
    import logging
    import parsl.app.errors as pe
    from parsl import set_file_logger
    from parsl.utils import get_std_fname_mode

    sandbox = Sandbox("scratch")
    logbase = "/tmp"
    format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s] %(message)s"

    # make this name unique per invocation so that each invocation can
    # log to its own file. It would be better to include the task_id here
    # but that is awkward to wire through at the moment as apps do not
    # have access to that execution context.
    t = time.time()
    logname = __name__ + "." + str(t)
    logger = logging.getLogger(logname)
    set_file_logger(filename='{0}/bashexec.{1}.log'.format(logbase, t),
                    name=logname,
                    level=logging.DEBUG,
                    format_string=format_string)

    func_name = func.__name__

    executable = None

    # Try to run the func to compose the commandline
    try:
        # Execute the func to get the commandline
        executable = func(*args, **kwargs)
    except AttributeError as e:
        # An AttributeError after a non-None result means formatting
        # failed; before a result it means the app returned nothing.
        if executable is not None:
            raise pe.AppBadFormatting(
                "App formatting failed for app '{}' with AttributeError: {}".
                format(func_name, e))
        else:
            raise pe.BashAppNoReturn(
                "Bash app '{}' did not return a value, or returned None - with this exception: {}"
                .format(func_name, e))
    except IndexError as e:
        raise pe.AppBadFormatting(
            "App formatting failed for app '{}' with IndexError: {}".format(
                func_name, e))
    except Exception as e:
        logger.error(
            "Caught exception during formatting of app '{}': {}".format(
                func_name, e))
        raise e

    logger.debug("Executable: %s", executable)

    # Updating stdout, stderr if values passed at call time.

    def open_std_fd(fdname):
        # fdname is 'stdout' or 'stderr'
        stdfspec = kwargs.get(fdname)  # spec is str name or tuple (name, mode)
        if stdfspec is None:
            return None

        fname, mode = get_std_fname_mode(fdname, stdfspec)
        try:
            # Create parent directories of the stream file as needed.
            if os.path.dirname(fname):
                os.makedirs(os.path.dirname(fname), exist_ok=True)
            fd = open(fname, mode)
        except Exception as e:
            raise pe.BadStdStreamFile(fname, e)
        return fd

    std_out = open_std_fd('stdout')
    std_err = open_std_fd('stderr')
    timeout = kwargs.get('walltime')
    project = kwargs.get('project', "")
    unique_id = kwargs.get('unique_id', "HUMAN")
    sandbox.create_working_dir(unique_id, func_name)
    workflow_schema = "workflow://" + project + "/" + unique_id + "/"

    # Echo the command line into stderr for post-mortem debugging.
    if std_err is not None:
        print('--> executable follows <--\n{}\n--> end executable <--'.format(
            executable),
              file=std_err,
              flush=True)

    return_value = None
    try:
        cwd = None
        working_directory = "scratch" + os.path.sep + unique_id
        # NOTE(review): makedirs without exist_ok raises if the directory
        # already exists (e.g. on a retried unique_id) — confirm intended.
        os.makedirs(working_directory)
        # Run the command from inside the sandbox directory, restoring
        # the previous cwd afterwards.
        cwd = os.getcwd()
        os.chdir(working_directory)
        logger.debug("workflow://schema: %s", workflow_schema)

        # Resolve workflow:// inputs
        for input in kwargs.get('inputs', []):
            if "workflow://" in input:
                print(input)

        proc = subprocess.Popen(executable,
                                stdout=std_out,
                                stderr=std_err,
                                shell=True,
                                executable='/bin/bash')
        proc.wait(timeout=timeout)
        return_value = {
            'unique_id': unique_id,
            'working_directory': working_directory,
            'workflow_schema': workflow_schema,
            'return_code': proc.returncode
        }
        if cwd is not None:
            os.chdir(cwd)
    except subprocess.TimeoutExpired:
        raise pe.AppTimeout("[{}] App exceeded walltime: {}".format(
            func_name, timeout))
    except Exception as e:
        raise pe.AppException(
            "[{}] App caught exception with return value: {}".format(
                func_name, json.dumps(return_value)), e)

    # NOTE(review): `proc` is unbound here if Popen itself raised; that
    # path is only reachable via the except clauses above, which re-raise,
    # but verify no other path lands here without `proc`.
    if proc.returncode != 0:
        raise pe.BashExitFailure(func_name, proc.returncode)

    # TODO : Add support for globs here

    # Verify that all declared output files were actually produced.
    missing = []
    for outputfile in kwargs.get('outputs', []):
        fpath = outputfile.filepath

        if not os.path.exists(fpath):
            missing.extend([outputfile])

    if missing:
        raise pe.MissingOutputs("[{}] Missing outputs".format(func_name),
                                missing)

    return return_value
def __init__(self,
             config=None,
             executors=None,
             lazyErrors=True,
             appCache=True,
             rundir=None,
             retries=0,
             checkpointFiles=None):
    """ Initialize the DataFlowKernel

    Please note that keyword args passed to the DFK here will always
    override options passed in via the config.

    KWargs:
        - config (Dict) : A single data object encapsulating all config attributes
        - executors (list of Executor objs): Optional, kept for (somewhat)
          backward compatibility with 0.2.0
        - lazyErrors(Bool) : Default=True, allow workflow to continue on app failures.
        - appCache (Bool) :Enable caching of apps
        - rundir (str) : Path to run directory. Defaults to ./runinfo/runNNN
        - retries(int): Default=0, Set the number of retry attempts in case of failure
        - checkpointFiles (list of str): List of filepaths to checkpoint files

    Returns:
        DataFlowKernel object
    """

    # Create run dirs for this run
    self.rundir = make_rundir(config=config, path=rundir)
    parsl.set_file_logger("{}/parsl.log".format(self.rundir),
                          level=logging.INFO)

    logger.info("Parsl version: {}".format(parsl.__version__))
    logger.info("Libsubmit version: {}".format(libsubmit.__version__))

    # Update config with defaults
    self._config = update_config(config, self.rundir)

    # Start the anonymized usage tracker and send init msg
    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # Load checkpoints if any
    cpts = self.load_checkpoints(checkpointFiles)
    # Initialize the memoizer
    self.memoizer = Memoizer(self, memoize=appCache, checkpoint=cpts)

    # With a config, executors are created for us and flow control is
    # active; without one, the caller-supplied executor list is used.
    if self._config:
        self._executors_managed = True
        # Create the executors
        epf = EPF()
        self.executors = epf.make(self.rundir, self._config)

        # set global vars from config
        self.lazy_fail = self._config["globals"].get(
            "lazyErrors", lazyErrors)
        self.fail_retries = self._config["globals"].get("retries", retries)
        self.flowcontrol = FlowControl(self, self._config)
    else:
        self._executors_managed = False
        self.fail_retries = retries
        self.lazy_fail = lazyErrors
        self.executors = {i: x for i, x in enumerate(executors)}
        self.flowcontrol = FlowNoControl(self, None)

    self.task_count = 0
    self.fut_task_lookup = {}
    self.tasks = {}
    self.task_launch_lock = threading.Lock()

    logger.debug("Using executors: {0}".format(self.executors))
    atexit.register(self.cleanup)
def sandbox_runner(func, *args, **kwargs):
    """Executes the supplied function with *args and **kwargs to get a
    command-line to run, and then run that command-line using bash.

    The command is wrapped by a Sandbox working directory; returns a dict
    with the process return code and the sandbox working directory.
    """
    # Imports are local because this function body is shipped to and
    # executed on the remote worker.
    import os
    import time
    import subprocess
    import logging
    import parsl.app.errors as pe
    from parsl import set_file_logger
    from parsl.utils import get_std_fname_mode
    from parsl.data_provider.files import File
    import json

    # create a sandbox passing the name of the scratch directory
    sandbox = Sandbox()
    logbase = "/tmp"
    format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s] %(message)s"

    # make this name unique per invocation so that each invocation can
    # log to its own file. It would be better to include the task_id here
    # but that is awkward to wire through at the moment as apps do not
    # have access to that execution context.
    t = time.time()
    logname = __name__ + "." + str(t)
    logger = logging.getLogger(logname)
    set_file_logger(filename='{0}/bashexec.{1}.log'.format(logbase, t),
                    name=logname,
                    level=logging.DEBUG,
                    format_string=format_string)

    func_name = func.__name__

    executable = None

    # the workflow_name
    sandbox.workflow_name = kwargs.get('project', "")

    # app name
    sandbox.app_name = kwargs.get("workflow_app_name", "")

    # create a working dir with the sandbox
    sandbox.createWorkingDirectory()

    # workflow schema as workflow:///funcNameUUID
    workflow_schema = "workflow://" + sandbox.workflow_name + "/" + sandbox.app_name

    # tasks dep of the current task
    if 'tasks' in kwargs:
        sandbox.tasks_dep = kwargs.get('tasks', "")
        logger.debug(sandbox.tasks_dep)

    # Try to run the func to compose the commandline
    try:
        # Execute the func to get the commandline
        executable = func(*args, **kwargs)
        # Wrap the raw command so it executes inside the sandbox.
        executable = sandbox.define_command(executable)
        logger.debug(executable)
    except AttributeError as e:
        # An AttributeError after a non-None result means formatting
        # failed; before a result it means the app returned nothing.
        if executable is not None:
            raise pe.AppBadFormatting("App formatting failed for app '{}' with AttributeError: {}".format(func_name, e))
        else:
            raise pe.BashAppNoReturn(
                "Bash app '{}' did not return a value, or returned None - with this exception: {}".format(func_name, e))
    except IndexError as e:
        raise pe.AppBadFormatting("App formatting failed for app '{}' with IndexError: {}".format(func_name, e))
    except Exception as e:
        raise e

    # Updating stdout, stderr if values passed at call time.
    def open_std_fd(fdname):
        # fdname is 'stdout' or 'stderr'
        stdfspec = kwargs.get(fdname)  # spec is str name or tuple (name, mode)
        if stdfspec is None:
            return None
        fname, mode = get_std_fname_mode(fdname, stdfspec)
        try:
            # Create parent directories of the stream file as needed.
            if os.path.dirname(fname):
                os.makedirs(os.path.dirname(fname), exist_ok=True)
            fd = open(fname, mode)
        except Exception as e:
            raise pe.BadStdStreamFile(fname, e)
        return fd

    std_out = open_std_fd('stdout')
    std_err = open_std_fd('stderr')
    timeout = kwargs.get('walltime')

    # Echo the command line into stderr for post-mortem debugging.
    if std_err is not None:
        print('--> executable follows <--\n{}\n--> end executable <--'.format(executable), file=std_err, flush=True)

    return_value = None
    try:
        logger.debug("workflow://schema: %s", workflow_schema)
        proc = subprocess.Popen(executable, stdout=std_out, stderr=std_err, shell=True, executable='/bin/bash')
        proc.wait(timeout=timeout)
        return_value = {
            'return_code': proc.returncode,
            'working_directory': sandbox.working_directory,
        }

    except subprocess.TimeoutExpired:
        raise pe.AppTimeout("[{}] App exceeded walltime: {}".format(func_name, timeout))
    except Exception as e:
        raise pe.AppException("[{}] App caught exception with return value: {}"
                              .format(func_name, json.dumps(return_value)), e)

    # NOTE(review): `proc` is unbound here if Popen itself raised; the
    # except clauses above re-raise so this should be unreachable in that
    # case — verify.
    if proc.returncode != 0:
        raise pe.BashExitFailure(func_name, proc.returncode)

    # TODO : Add support for globs here

    # Verify that all declared output files were actually produced.
    missing = []
    for outputfile in kwargs.get('outputs', []):
        fpath = outputfile.filepath

        if not os.path.exists(fpath):
            missing.extend([outputfile])

    if missing:
        raise pe.MissingOutputs("[{}] Missing outputs".format(func_name), missing)

    return return_value
def __init__(self, config=Config()):
    """Initialize the DataFlowKernel.

    Parameters
    ----------
    config : Config
        A specification of all configuration options. For more details see the :class:~`parsl.config.Config` documentation.

    Raises
    ------
    ConfigurationError
        If a plain dict is passed instead of a Config instance.
    """

    # this will be used to check cleanup only happens once
    self.cleanup_called = False

    # Reject the legacy dict-style configuration with a pointer to the docs.
    if isinstance(config, dict):
        raise ConfigurationError(
            'Expected `Config` class, received dictionary. For help, '
            'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html')
    self._config = config

    logger.debug("Starting DataFlowKernel with config\n{}".format(config))

    # Per-run directory; all parsl infrastructure logging goes there.
    self.run_dir = make_rundir(config.run_dir)
    parsl.set_file_logger("{}/parsl.log".format(self.run_dir),
                          level=logging.DEBUG)
    logger.info("Parsl version: {}".format(get_version()))
    logger.info("Libsubmit version: {}".format(libsubmit.__version__))

    self.checkpoint_lock = threading.Lock()

    # Anonymous usage statistics (sends a message at startup).
    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # ES logging
    self.db_logger_config = config.db_logger_config
    # Fall back to a disabled ES logger when no db_logger_config is given.
    self.db_logger = get_db_logger(
        enable_es_logging=False) if self.db_logger_config is None else get_db_logger(
        **self.db_logger_config)
    # Workflow name is taken from the caller's script filename via the stack.
    self.workflow_name = str(inspect.stack()[1][1])
    self.time_began = datetime.now()
    self.time_completed = None
    # NOTE(review): run_id uses only the minute of the start time, so two
    # runs of the same script in the same minute collide — confirm intended.
    self.run_id = self.workflow_name + "-" + str(self.time_began.minute)
    self.dashboard = self.db_logger_config.get(
        'dashboard_link', None) if self.db_logger_config is not None else None
    # TODO: make configurable
    logger.info("Run id is: " + self.run_id)
    if self.dashboard is not None:
        logger.info("Dashboard is found at " + self.dashboard)
    self.db_logger.info("Python version: {}".format(sys.version_info))
    self.db_logger.info("Parsl version: {}".format(get_version()))
    self.db_logger.info("Libsubmit version: {}".format(libsubmit.__version__))
    self.db_logger.info(
        "DFK start",
        extra={
            "time_began": str(self.time_began.strftime('%Y-%m-%d %H:%M:%S')),
            'time_completed': str(self.time_completed),
            'task_run_id': self.run_id,
            'rundir': self.run_dir})
    self.db_logger.info("Name of script/workflow: " + self.run_id,
                        extra={'task_run_id': self.run_id})
    for executor in self._config.executors:
        self.db_logger.info("Listed executor: " + executor.label,
                            extra={'task_run_id': self.run_id})
    # ES logging end

    # Load previous checkpoints (if any) and wire them into the memoizer.
    checkpoints = self.load_checkpoints(config.checkpoint_files)
    self.memoizer = Memoizer(self, memoize=config.app_cache,
                             checkpoint=checkpoints)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None
    self.checkpoint_mode = config.checkpoint_mode

    # Shared data manager counts as an extra executor for staging tasks.
    data_manager = DataManager.get_data_manager(
        max_threads=config.data_management_max_threads,
        executors=config.executors)
    self.executors = {e.label: e for e in config.executors + [data_manager]}
    for executor in self.executors.values():
        executor.run_dir = self.run_dir  # FIXME we should have a real interface for this
        executor.start()

    # Periodic checkpointing: parse HH:MM:SS; on any parse error fall back
    # to a 30-minute interval.
    if self.checkpoint_mode == "periodic":
        try:
            h, m, s = map(int, config.checkpoint_period.split(':'))
            checkpoint_period = (h * 3600) + (m * 60) + s
            self._checkpoint_timer = Timer(self.checkpoint,
                                           interval=checkpoint_period)
        except Exception as e:
            logger.error(
                "invalid checkpoint_period provided:{0} expected HH:MM:SS".
                format(config.checkpoint_period))
            self._checkpoint_timer = Timer(self.checkpoint,
                                           interval=(30 * 60))

    # Flow control only engages when at least one executor is managed.
    if any([x.managed for x in config.executors]):
        self.flowcontrol = FlowControl(self)
    else:
        self.flowcontrol = FlowNoControl(self)

    # Task bookkeeping.
    self.task_count = 0
    self.fut_task_lookup = {}
    self.tasks = {}
    self.task_launch_lock = threading.Lock()

    # Ensure cleanup runs even if the user never calls it explicitly.
    atexit.register(self.atexit_cleanup)
def run(pipeline_config_filename, dry_run=False, pycmd='python3'):
    """Run the pipeline described by a YAML configuration file.

    Parameters
    ----------
    pipeline_config_filename : str
        Path to the pipeline YAML configuration file.
    dry_run : bool
        If True, only report what would be run instead of executing.
    pycmd : str
        Python interpreter command used to launch the stages.

    Raises
    ------
    ValueError
        If the configured launcher is not one of the known launchers.
    """
    # Get current time in Unix milliseconds to define a unique log directory
    init_time_ms = int(time.time() * 1e3)

    # YAML input file.  A context manager closes the handle (the original
    # leaked it), and safe_load replaces the deprecated no-Loader form of
    # yaml.load, which can execute arbitrary tags on untrusted input.
    with open(pipeline_config_filename) as config_file:
        pipe_config = yaml.safe_load(config_file)

    output_dir = pipe_config['output_dir']
    os.makedirs(output_dir, exist_ok=True)

    # Log directory should not already exist; the millisecond timestamp
    # keeps each run's logs separate.
    log_dir = f'{output_dir}/run_{str(init_time_ms)}'
    os.makedirs(log_dir, exist_ok=False)

    # Copy the main config files into the log directory for provenance
    shutil.copyfile(pipeline_config_filename, f'{log_dir}/pipeline_config.yml')
    stages_config = pipe_config['config']
    shutil.copyfile(stages_config, f'{log_dir}/stages_config.yml')

    # Log pipeline infrastructure messages to a file inside the log
    # directory (the path is always non-empty, so no conditional needed).
    log_file = f'{log_dir}/pipeline_log.txt'
    parsl.set_file_logger(log_file)

    # Required configuration information:
    # list of stage names, which must be importable from `modules`
    stages = pipe_config['stages']

    # Python modules in which to search for pipeline stages
    modules = pipe_config['modules'].split()

    # parsl execution/launcher configuration information
    launcher = pipe_config.get("launcher", "local")
    if launcher == "local":
        launcher_config = sites.local.make_launcher(stages)
    elif launcher == "cori":
        launcher_config = sites.cori.make_launcher(stages)
    elif launcher == "cori-interactive":
        launcher_config = sites.cori_interactive.make_launcher(stages)
    else:
        raise ValueError(f"Unknown launcher {launcher}")

    # Inputs and outputs
    inputs = pipe_config['inputs']
    resume = pipe_config['resume']

    # Import the stage modules so their stages register themselves
    for module in modules:
        __import__(module)

    # Create and run pipeline
    pipeline = Pipeline(launcher_config, stages, log_dir, pycmd=pycmd)

    if dry_run:
        pipeline.dry_run(inputs, output_dir, stages_config)
    else:
        pipeline.run(inputs, output_dir, resume, stages_config)
from operator import itemgetter
import parsl
from parsl.app.app import python_app, bash_app
#from parsl.configs.local_threads import config
import logging
from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor
from parsl.monitoring import MonitoringHub
from parsl.addresses import address_by_hostname

# Define a configuration for using local threads and pilot jobs
#parsl.set_stream_logger()

# Parsl infrastructure log file; DEBUG level captures everything.
FILENAME = 'log_monitor.txt'
parsl.set_file_logger(FILENAME, level=logging.DEBUG)

# Local-threads executor (8 worker threads) with the monitoring hub
# enabled on this host so resource usage can be tracked.
config = Config(
    executors=[
        ThreadPoolExecutor(
            max_threads=8,
            label='local_threads'
        )
    ],
    monitoring=MonitoringHub(
        hub_address=address_by_hostname(),
        hub_port=55055,
        logging_level=logging.INFO,
        # seconds between resource polls
        resource_monitoring_interval=10,
    ),
    strategy=None
)
def workflow_config(name, nodes, cores_per_node=24, interval=30, monitor=False):
    """Configure and load a Parsl HighThroughputExecutor for an SLURM-style
    allocation, optionally with resource monitoring.

    Parameters
    ----------
    name : str
        Label for the executor and (when monitoring) the workflow name.
    nodes : int
        Number of nodes per block; must match the submit script request.
    cores_per_node : int
        Workers per node, also passed to srun as ``-c``.
    interval : int
        Resource-monitoring poll interval in seconds (used when monitor=True).
    monitor : bool
        Whether to attach a MonitoringHub to the configuration.
    """
    import parsl
    from parsl.config import Config
    from parsl.channels import LocalChannel
    from parsl.launchers import SrunLauncher
    from parsl.providers import LocalProvider
    from parsl.addresses import address_by_interface
    from parsl.executors import HighThroughputExecutor
    from parsl.monitoring.monitoring import MonitoringHub

    # Mirror parsl's own logging to the console and to a file.
    parsl.set_stream_logger()
    parsl.set_file_logger('script.output', level=logging.DEBUG)

    logging.info('Configuring Parsl Workflow Infrastructure')

    # Worker environment setup, read verbatim from ./parsl.env
    with open('parsl.env', 'r') as reader:
        env_str = reader.read()
    logging.info(f'Task Environment {env_str}')

    if monitor:
        mon_hub = MonitoringHub(
            workflow_name=name,
            hub_address=address_by_interface('ib0'),
            hub_port=60001,
            resource_monitoring_enabled=True,
            monitoring_debug=False,
            resource_monitoring_interval=interval,
        )
    else:
        mon_hub = None

    provider = LocalProvider(
        channel=LocalChannel(script_dir='.'),
        # nodes_per_block must match the nodes requested in the submit script
        nodes_per_block=nodes,
        launcher=SrunLauncher(overrides=f'-c {cores_per_node}'),
        cmd_timeout=120,
        init_blocks=1,
        max_blocks=1,
        worker_init=env_str,
    )

    executor = HighThroughputExecutor(
        label=name,
        # The network interface on node 0 which compute nodes can reach.
        address=address_by_interface('ib0'),
        # one worker per core on each node
        max_workers=cores_per_node,
        provider=provider,
    )

    parsl_config = Config(
        executors=[executor],
        monitoring=mon_hub,
        strategy=None,
    )

    logging.info('Loading Parsl Config')
    parsl.load(parsl_config)
    return