def __init__(self, url, dman=None, cache=False, caching_dir=".", staging='direct'):
    """Construct a File object from a url string.

    Args:
        - url (string) : url string of the file, e.g.
            - 'input.txt'
            - 'file:///scratch/proj101/input.txt'
            - 'globus://go#ep1/~/data/input.txt'
            - 'globus://ddb59aef-6d04-11e5-ba46-22000b92c6ec/home/johndoe/data/input.txt'
        - dman (DataManager) : data manager
        - cache (bool) : enable caching of this file (default: False)
        - caching_dir (string) : directory used for cached copies (default: '.')
        - staging (string) : staging mode (default: 'direct')
    """
    self.url = url
    parsed_url = urlparse(self.url)
    self.scheme = parsed_url.scheme if parsed_url.scheme else 'file'
    self.netloc = parsed_url.netloc
    self.path = parsed_url.path
    self.filename = os.path.basename(self.path)
    self.dman = dman if dman else DataManager.get_data_manager()
    self.data_future = {}
    if self.scheme != 'file':
        self.dman.add_file(self)

    self.cache = cache
    self.caching_dir = caching_dir
    self.staging = staging
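# A minimal usage sketch for the constructor above (not part of the class).
# Assumptions: the import path below matches the Parsl release in use, and a
# default DataManager already exists (typically once a DataFlowKernel has been
# constructed), since the constructor falls back to DataManager.get_data_manager().
from parsl.data_provider.files import File

plain = File('input.txt')                           # no scheme -> treated as 'file'
local = File('file:///scratch/proj101/input.txt')   # explicit 'file' scheme
print(local.scheme, local.path, local.filename)     # -> file /scratch/proj101/input.txt input.txt
# Non-'file' schemes such as 'globus://...' are additionally registered with the
# DataManager so they can be staged before a task runs.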
def __init__(self, config=Config()):
    """Initialize the DataFlowKernel.

    Parameters
    ----------
    config : Config
        A specification of all configuration options. For more details see the
        :class:`~parsl.config.Config` documentation.
    """
    # this will be used to check cleanup only happens once
    self.cleanup_called = False
    if isinstance(config, dict):
        raise ConfigurationError(
            'Expected `Config` class, received dictionary. For help, '
            'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html')
    self._config = config
    logger.debug("Starting DataFlowKernel with config\n{}".format(config))
    self.run_dir = make_rundir(config.run_dir)
    parsl.set_file_logger("{}/parsl.log".format(self.run_dir), level=logging.DEBUG)
    logger.info("Parsl version: {}".format(get_version()))
    logger.info("Libsubmit version: {}".format(libsubmit.__version__))
    self.checkpoint_lock = threading.Lock()
    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # ES logging
    self.db_logger_config = config.db_logger_config
    self.db_logger = get_db_logger(enable_es_logging=False) \
        if self.db_logger_config is None \
        else get_db_logger(**self.db_logger_config)
    self.workflow_name = str(inspect.stack()[1][1])
    self.time_began = datetime.now()
    self.time_completed = None
    self.run_id = self.workflow_name + "-" + str(self.time_began.minute)
    self.dashboard = self.db_logger_config.get('dashboard_link', None) \
        if self.db_logger_config is not None else None  # TODO: make configurable
    logger.info("Run id is: " + self.run_id)
    if self.dashboard is not None:
        logger.info("Dashboard is found at " + self.dashboard)
    self.db_logger.info("Python version: {}".format(sys.version_info))
    self.db_logger.info("Parsl version: {}".format(get_version()))
    self.db_logger.info("Libsubmit version: {}".format(libsubmit.__version__))
    self.db_logger.info(
        "DFK start",
        extra={
            "time_began": str(self.time_began.strftime('%Y-%m-%d %H:%M:%S')),
            'time_completed': str(self.time_completed),
            'task_run_id': self.run_id,
            'rundir': self.run_dir
        })
    self.db_logger.info("Name of script/workflow: " + self.run_id,
                        extra={'task_run_id': self.run_id})
    for executor in self._config.executors:
        self.db_logger.info("Listed executor: " + executor.label,
                            extra={'task_run_id': self.run_id})
    # ES logging end

    checkpoints = self.load_checkpoints(config.checkpoint_files)
    self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint=checkpoints)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None
    self.checkpoint_mode = config.checkpoint_mode

    data_manager = DataManager.get_data_manager(
        max_threads=config.data_management_max_threads,
        executors=config.executors)
    self.executors = {e.label: e for e in config.executors + [data_manager]}
    for executor in self.executors.values():
        executor.run_dir = self.run_dir  # FIXME we should have a real interface for this
        executor.start()

    if self.checkpoint_mode == "periodic":
        try:
            h, m, s = map(int, config.checkpoint_period.split(':'))
            checkpoint_period = (h * 3600) + (m * 60) + s
            self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period)
        except Exception:
            logger.error("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(
                config.checkpoint_period))
            self._checkpoint_timer = Timer(self.checkpoint, interval=(30 * 60))

    if any([x.managed for x in config.executors]):
        self.flowcontrol = FlowControl(self)
    else:
        self.flowcontrol = FlowNoControl(self)

    self.task_count = 0
    self.fut_task_lookup = {}
    self.tasks = {}
    self.task_launch_lock = threading.Lock()

    atexit.register(self.atexit_cleanup)
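# A minimal usage sketch for the Config-based constructor above. Assumptions:
# the import paths and ThreadPoolExecutor arguments are illustrative and may
# differ between Parsl releases; the checkpoint fields mirror those read above.
from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor
from parsl.dataflow.dflow import DataFlowKernel

config = Config(
    executors=[ThreadPoolExecutor(label='threads', max_threads=4)],
    checkpoint_mode='periodic',
    checkpoint_period='00:30:00',   # parsed above as HH:MM:SS -> 1800 seconds
)
dfk = DataFlowKernel(config=config)   # passing a plain dict instead raises ConfigurationError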
def __init__(self, config=Config()):
    """Initialize the DataFlowKernel.

    Parameters
    ----------
    config : Config
        A specification of all configuration options. For more details see the
        :class:`~parsl.config.Config` documentation.
    """
    # this will be used to check cleanup only happens once
    self.cleanup_called = False
    if isinstance(config, dict):
        raise ConfigurationError(
            'Expected `Config` class, received dictionary. For help, '
            'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html')
    self._config = config
    logger.debug("Starting DataFlowKernel with config\n{}".format(config))
    self.run_dir = make_rundir(config.run_dir)
    parsl.set_file_logger("{}/parsl.log".format(self.run_dir), level=logging.DEBUG)
    logger.info("Parsl version: {}".format(get_version()))
    logger.info("Libsubmit version: {}".format(libsubmit.__version__))
    self.checkpoint_lock = threading.Lock()
    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    checkpoints = self.load_checkpoints(config.checkpoint_files)
    self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint=checkpoints)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None
    self.checkpoint_mode = config.checkpoint_mode

    data_manager = DataManager.get_data_manager(
        max_threads=config.data_management_max_threads,
        executors=config.executors)
    self.executors = {e.label: e for e in config.executors + [data_manager]}
    for executor in self.executors.values():
        executor.run_dir = self.run_dir  # FIXME we should have a real interface for this
        executor.start()

    if self.checkpoint_mode == "periodic":
        try:
            h, m, s = map(int, config.checkpoint_period.split(':'))
            checkpoint_period = (h * 3600) + (m * 60) + s
            self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period)
        except Exception:
            logger.error("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(
                config.checkpoint_period))
            self._checkpoint_timer = Timer(self.checkpoint, interval=(30 * 60))

    if any([x.managed for x in config.executors]):
        self.flowcontrol = FlowControl(self)
    else:
        self.flowcontrol = FlowNoControl(self)

    self.task_count = 0
    self.fut_task_lookup = {}
    self.tasks = {}
    self.task_launch_lock = threading.Lock()

    atexit.register(self.atexit_cleanup)
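# Standalone sketch of the checkpoint_period handling used above (the helper
# name is hypothetical and not part of the DFK): an 'HH:MM:SS' string becomes a
# number of seconds, with a 30-minute fallback for malformed values.
def _parse_checkpoint_period(period, default_seconds=30 * 60):
    try:
        h, m, s = map(int, period.split(':'))
        return (h * 3600) + (m * 60) + s
    except (AttributeError, ValueError):
        return default_seconds

assert _parse_checkpoint_period('00:30:00') == 1800
assert _parse_checkpoint_period('not-a-period') == 1800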
def __init__(self, config=None, executors=None, lazyErrors=True, appCache=True,
             rundir=None, retries=0, checkpointFiles=None, checkpointMode=None):
    """Initialize the DataFlowKernel.

    Please note that keyword args passed to the DFK here will always override
    options passed in via the config.

    KWargs:
        - config (dict) : A single data object encapsulating all config attributes
        - executors (list of Executor objs) : Optional, kept for (somewhat) backward
          compatibility with 0.2.0
        - lazyErrors (bool) : Default=True, allow workflow to continue on app failures.
        - appCache (bool) : Enable caching of apps
        - rundir (str) : Path to run directory. Defaults to ./runinfo/runNNN
        - retries (int) : Default=0, set the number of retry attempts in case of failure
        - checkpointFiles (list of str) : List of filepaths to checkpoint files
        - checkpointMode (None, 'dfk_exit', 'task_exit', 'periodic') : Checkpointing method to use.

    Returns:
        DataFlowKernel object
    """
    # this will be used to check cleanup only happens once
    self.cleanup_called = False

    # Create run dirs for this run
    self.rundir = make_rundir(config=config, path=rundir)
    parsl.set_file_logger("{}/parsl.log".format(self.rundir), level=logging.DEBUG)
    logger.info("Parsl version: {}".format(get_version()))
    logger.info("Libsubmit version: {}".format(libsubmit.__version__))

    self.checkpoint_lock = threading.Lock()

    # Update config with defaults
    self._config = update_config(config, self.rundir)

    # Initialize the data manager
    self.data_manager = DataManager.get_data_manager(self, config=self._config)

    # Start the anonymized usage tracker and send init msg
    self.usage_tracker = UsageTracker(self)
    self.usage_tracker.send_message()

    # Load Memoizer with checkpoints before we start the run.
    if checkpointFiles:
        checkpoint_src = checkpointFiles
    elif self._config and self._config["globals"]["checkpointFiles"]:
        checkpoint_src = self._config["globals"]["checkpointFiles"]
    else:
        checkpoint_src = None
    cpts = self.load_checkpoints(checkpoint_src)

    # Initialize the memoizer
    self.memoizer = Memoizer(self, memoize=appCache, checkpoint=cpts)
    self.checkpointed_tasks = 0
    self._checkpoint_timer = None

    if self._config:
        self._executors_managed = True
        # Create the executors
        epf = EPF()
        self.executors = epf.make(self.rundir, self._config)

        # Set global vars from config
        self.lazy_fail = self._config["globals"].get("lazyErrors", lazyErrors)
        self.fail_retries = self._config["globals"].get("retries", retries)
        self.flowcontrol = FlowControl(self, self._config)
        self.checkpoint_mode = self._config["globals"].get("checkpointMode", checkpointMode)
        if self.checkpoint_mode == "periodic":
            period = self._config["globals"].get("checkpointPeriod", "00:30:00")
            try:
                h, m, s = map(int, period.split(':'))
                checkpoint_period = (h * 3600) + (m * 60) + s
                self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period)
            except Exception:
                logger.error("invalid checkpointPeriod provided: {0} expected HH:MM:SS".format(period))
                self._checkpoint_timer = Timer(self.checkpoint, interval=(30 * 60))
    else:
        self._executors_managed = False
        self.fail_retries = retries
        self.lazy_fail = lazyErrors
        self.executors = {i: x for i, x in enumerate(executors)}
        self.flowcontrol = FlowNoControl(self, None)
        self.checkpoint_mode = checkpointMode

    self.task_count = 0
    self.fut_task_lookup = {}
    self.tasks = {}
    self.task_launch_lock = threading.Lock()

    logger.debug("Using executors: {0}".format(self.executors))
    atexit.register(self.atexit_cleanup)
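# A minimal usage sketch for this legacy keyword-style constructor. Assumption:
# at this point a plain concurrent.futures executor could be passed directly via
# `executors`; the exact executor types accepted depend on the Parsl 0.x release.
from concurrent.futures import ThreadPoolExecutor

workers = ThreadPoolExecutor(max_workers=4)
dfk = DataFlowKernel(executors=[workers],
                     lazyErrors=True,
                     retries=2,
                     checkpointMode='dfk_exit')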