Example 1
    def __init__(self, config=None, executors=None, lazy_fail=True,
                 rundir=None, fail_retries=2):
        """ Initialize the DataFlowKernel

        Please note that keyword args passed to the DFK here will always override
        options passed in via the config.

        KWargs:
            config (Dict) : A single data object encapsulating all config attributes
            executors (list of Executor objs): Optional, kept for (somewhat) backward compatibility with 0.2.0
            lazy_fail (bool) : Default=True, determines failure behavior
            rundir (str) : Path to run directory. Defaults to ./runinfo/runNNN
            fail_retries (int): Default=2, number of retry attempts in case of failure

        Returns:
            DataFlowKernel object
        """
        # Create run dirs for this run
        self.rundir = make_rundir(config=config, path=rundir)

        # Update config with defaults
        self._config = update_config(config, self.rundir)

        # Start the anonymized usage tracker and send init msg
        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        if self._config:
            self._executors_managed = True
            # Create the executors
            epf = EPF()
            self.executors = epf.make(self.rundir, self._config)

            # set global vars from config
            self.lazy_fail = self._config["globals"].get("lazyFail", lazy_fail)
            self.fail_retries = self._config["globals"].get("fail_retries", fail_retries)
            self.flowcontrol     = FlowControl(self, self._config)
        else:
            self._executors_managed = False
            self.fail_retries = fail_retries
            self.lazy_fail    = lazy_fail
            self.executors    = {i:x for i,x in enumerate(executors)}
            print("Executors : ", self.executors)
            self.flowcontrol  = FlowNoControl(self, None)

        self.task_count      = 0
        self.fut_task_lookup = {}
        self.tasks           = {}


        logger.debug("Using executors: {0}".format(self.executors))
        atexit.register(self.cleanup)
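
For the legacy keyword-argument path above (config=None), the DFK simply wraps whatever executor objects it is handed and indexes them 0..N-1. A minimal usage sketch, assuming ThreadPoolExecutor is importable from parsl.executors.threads in this Parsl release:

    from parsl.executors.threads import ThreadPoolExecutor
    from parsl.dataflow.dflow import DataFlowKernel

    # With config=None the executors list is used directly and the kwargs supply the settings.
    dfk = DataFlowKernel(config=None,
                         executors=[ThreadPoolExecutor()],
                         lazy_fail=True,
                         fail_retries=2)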
Example 2
    def __init__(self, config=Config()):
        """Initialize the DataFlowKernel.

        Parameters
        ----------
        config : Config
            A specification of all configuration options. For more details see the
            :class:`~parsl.config.Config` documentation.
        """

        # this will be used to check cleanup only happens once
        self.cleanup_called = False

        if isinstance(config, dict):
            raise ConfigurationError(
                'Expected `Config` class, received dictionary. For help, '
                'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html'
            )
        self._config = config
        self.run_dir = make_rundir(config.run_dir)
        parsl.set_file_logger("{}/parsl.log".format(self.run_dir),
                              level=logging.DEBUG)
        logger.debug("Starting DataFlowKernel with config\n{}".format(config))
        logger.info("Parsl version: {}".format(get_version()))

        self.checkpoint_lock = threading.Lock()

        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        # ES logging
        self.tasks_completed_count = 0
        self.tasks_failed_count = 0
        self.monitoring_config = config.monitoring_config
        if self.monitoring_config is not None and self.monitoring_config.database_type == 'local_database'\
                and self.monitoring_config.eng_link is None:
            # uses the rundir as the default location.
            logger.info(
                'Local monitoring database can be found inside the run_dir at: {}'
                .format(self.run_dir))
            self.monitoring_config.eng_link = "sqlite:///{}".format(
                os.path.join(os.path.abspath(self.run_dir), 'monitoring.db'))
        if self.monitoring_config is None:
            self.db_logger = get_db_logger()
        else:
            self.db_logger = get_db_logger(
                monitoring_config=self.monitoring_config)
        self.workflow_name = None
        if self.monitoring_config is not None and self.monitoring_config.workflow_name is not None:
            self.workflow_name = self.monitoring_config.workflow_name
        else:
            for frame in inspect.stack():
                fname = os.path.basename(str(frame.filename))
                parsl_file_names = ['dflow.py']
                # Find first file name not considered a parsl file
                if fname not in parsl_file_names:
                    self.workflow_name = fname
                    break

        self.workflow_version = None
        if self.monitoring_config is not None and self.monitoring_config.version is not None:
            self.workflow_version = self.monitoring_config.version
        self.time_began = time.time()
        self.time_completed = None
        self.run_id = str(uuid4())
        self.dashboard = self.monitoring_config.dashboard_link if self.monitoring_config is not None else None
        # TODO: make configurable
        logger.info("Run id is: " + self.run_id)
        if self.dashboard is not None:
            logger.info("Dashboard is found at " + self.dashboard)
        # start tornado logging server
        if self.monitoring_config is not None and self.monitoring_config.database_type == 'local_database':
            self.logging_server = multiprocessing.Process(
                target=logging_server.run,
                kwargs={'monitoring_config': self.monitoring_config})
            self.logging_server.start()
            self.web_app = multiprocessing.Process(
                target=index.run,
                kwargs={'monitoring_config': self.monitoring_config})
            self.web_app.start()
        else:
            self.logging_server = None
            self.web_app = None
        workflow_info = {
            'python_version': sys.version_info,
            'parsl_version': get_version(),
            "time_began": str(self.time_began),
            'time_completed': str(None),
            'run_id': self.run_id,
            'workflow_name': self.workflow_name,
            'workflow_version': self.workflow_version,
            'rundir': self.run_dir,
            'tasks_completed_count': self.tasks_completed_count,
            'tasks_failed_count': self.tasks_failed_count,
            'user': getuser(),
            'host': gethostname(),
        }
        self.db_logger.info("DFK start", extra=workflow_info)
        # ES logging end

        checkpoints = self.load_checkpoints(config.checkpoint_files)
        self.memoizer = Memoizer(self,
                                 memoize=config.app_cache,
                                 checkpoint=checkpoints)
        self.checkpointed_tasks = 0
        self._checkpoint_timer = None
        self.checkpoint_mode = config.checkpoint_mode

        data_manager = DataManager(
            max_threads=config.data_management_max_threads,
            executors=config.executors)
        self.executors = {
            e.label: e
            for e in config.executors + [data_manager]
        }
        for executor in self.executors.values():
            executor.run_dir = self.run_dir
            if hasattr(executor, 'provider'):
                if hasattr(executor.provider, 'script_dir'):
                    executor.provider.script_dir = os.path.join(
                        self.run_dir, 'submit_scripts')
                    if executor.provider.channel.script_dir is None:
                        executor.provider.channel.script_dir = os.path.join(
                            self.run_dir, 'submit_scripts')
                        if not executor.provider.channel.isdir(self.run_dir):
                            parent, child = pathlib.Path(
                                self.run_dir).parts[-2:]
                            remote_run_dir = os.path.join(parent, child)
                            executor.provider.channel.script_dir = os.path.join(
                                remote_run_dir, 'remote_submit_scripts')
                            executor.provider.script_dir = os.path.join(
                                self.run_dir, 'local_submit_scripts')
                    executor.provider.channel.makedirs(
                        executor.provider.channel.script_dir, exist_ok=True)
                    os.makedirs(executor.provider.script_dir, exist_ok=True)
            executor.start()

        if self.checkpoint_mode == "periodic":
            try:
                h, m, s = map(int, config.checkpoint_period.split(':'))
                checkpoint_period = (h * 3600) + (m * 60) + s
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=checkpoint_period)
            except Exception:
                logger.error(
                    "invalid checkpoint_period provided:{0} expected HH:MM:SS".
                    format(config.checkpoint_period))
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=(30 * 60))

        if any([x.managed for x in config.executors]):
            self.flowcontrol = FlowControl(self)
        else:
            self.flowcontrol = FlowNoControl(self)

        self.task_count = 0
        self.tasks = {}
        self.submitter_lock = threading.Lock()

        atexit.register(self.atexit_cleanup)
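
A minimal instantiation sketch for this Config-based signature, assuming Config and ThreadPoolExecutor live at the import paths shown (they may move between Parsl releases) and that no monitoring is configured:

    from parsl.config import Config
    from parsl.executors.threads import ThreadPoolExecutor
    from parsl.dataflow.dflow import DataFlowKernel

    config = Config(executors=[ThreadPoolExecutor(label='local_threads')])
    dfk = DataFlowKernel(config=config)   # passing a plain dict here raises ConfigurationError
    # ... submit work through apps bound to this DFK ...
    dfk.cleanup()                         # explicit cleanup; atexit_cleanup is also registered above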
Example 3
    def __init__(self, config=Config()):
        """Initialize the DataFlowKernel.

        Parameters
        ----------
        config : Config
            A specification of all configuration options. For more details see the
            :class:`~parsl.config.Config` documentation.
        """

        # this will be used to check cleanup only happens once
        self.cleanup_called = False

        if isinstance(config, dict):
            raise ConfigurationError(
                    'Expected `Config` class, received dictionary. For help, '
                    'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html')
        self._config = config
        self.run_dir = make_rundir(config.run_dir)

        if config.initialize_logging:
            parsl.set_file_logger("{}/parsl.log".format(self.run_dir), level=logging.DEBUG)

        logger.debug("Starting DataFlowKernel with config\n{}".format(config))

        if sys.version_info < (3, 6):
            logger.warning("Support for python versions < 3.6 is deprecated and will be removed after parsl 0.10")

        logger.info("Parsl version: {}".format(get_version()))

        self.checkpoint_lock = threading.Lock()

        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        # Monitoring
        self.run_id = str(uuid4())
        self.tasks_completed_count = 0
        self.tasks_failed_count = 0
        self.tasks_dep_fail_count = 0

        self.monitoring = config.monitoring
        # hub address and port for interchange to connect
        self.hub_address = None
        self.hub_interchange_port = None
        if self.monitoring:
            if self.monitoring.logdir is None:
                self.monitoring.logdir = self.run_dir
            self.hub_address = self.monitoring.hub_address
            self.hub_interchange_port = self.monitoring.start(self.run_id)

        self.time_began = datetime.datetime.now()
        self.time_completed = None

        # TODO: make configurable
        logger.info("Run id is: " + self.run_id)

        self.workflow_name = None
        if self.monitoring is not None and self.monitoring.workflow_name is not None:
            self.workflow_name = self.monitoring.workflow_name
        else:
            for frame in inspect.stack():
                fname = os.path.basename(str(frame.filename))
                parsl_file_names = ['dflow.py', 'typeguard.py']
                # Find first file name not considered a parsl file
                if fname not in parsl_file_names:
                    self.workflow_name = fname
                    break

        self.workflow_version = str(self.time_began.replace(microsecond=0))
        if self.monitoring is not None and self.monitoring.workflow_version is not None:
            self.workflow_version = self.monitoring.workflow_version

        workflow_info = {
                'python_version': "{}.{}.{}".format(sys.version_info.major,
                                                    sys.version_info.minor,
                                                    sys.version_info.micro),
                'parsl_version': get_version(),
                "time_began": self.time_began,
                'time_completed': None,
                'workflow_duration': None,
                'run_id': self.run_id,
                'workflow_name': self.workflow_name,
                'workflow_version': self.workflow_version,
                'rundir': self.run_dir,
                'tasks_completed_count': self.tasks_completed_count,
                'tasks_failed_count': self.tasks_failed_count,
                'user': getuser(),
                'host': gethostname(),
        }

        if self.monitoring:
            self.monitoring.send(MessageType.WORKFLOW_INFO,
                                 workflow_info)

        checkpoints = self.load_checkpoints(config.checkpoint_files)
        self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint=checkpoints)
        self.checkpointed_tasks = 0
        self._checkpoint_timer = None
        self.checkpoint_mode = config.checkpoint_mode

        self.data_manager = DataManager(self)
        self.executors = {}
        data_manager_executor = ThreadPoolExecutor(max_threads=config.data_management_max_threads, label='data_manager')
        self.add_executors(config.executors + [data_manager_executor])

        if self.checkpoint_mode == "periodic":
            try:
                h, m, s = map(int, config.checkpoint_period.split(':'))
                checkpoint_period = (h * 3600) + (m * 60) + s
                self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period, name="Checkpoint")
            except Exception:
                logger.error("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
                self._checkpoint_timer = Timer(self.checkpoint, interval=(30 * 60), name="Checkpoint")

        # if we use the functionality of dynamically adding executors
        # all executors should be managed.
        if any([x.managed for x in config.executors]):
            self.flowcontrol = FlowControl(self)
        else:
            self.flowcontrol = FlowNoControl(self)

        self.task_count = 0
        self.tasks = {}
        self.submitter_lock = threading.Lock()

        atexit.register(self.atexit_cleanup)
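
In routine use the DFK in this newer form is rarely built by hand; parsl.load(config) constructs one and registers it for the app decorators. A short sketch, assuming the top-level parsl names used below (load, python_app, clear) behave as in recent releases:

    import parsl
    from parsl.config import Config
    from parsl.executors.threads import ThreadPoolExecutor

    config = Config(executors=[ThreadPoolExecutor(label='local_threads')])
    dfk = parsl.load(config)        # builds the DataFlowKernel described above

    @parsl.python_app
    def double(x):
        return 2 * x

    print(double(21).result())      # -> 42, executed via the loaded DFK
    parsl.clear()                   # tears the DFK down again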
Example 4
    def __init__(self,
                 config=None,
                 executors=None,
                 lazyErrors=True,
                 appCache=True,
                 rundir=None,
                 retries=0,
                 checkpointFiles=None,
                 checkpointMode=None,
                 data_manager=None):
        """ Initialize the DataFlowKernel.

        Please note that keyword args passed to the DFK here will always override
        options passed in via the config.

        KWargs:
            - config (dict) : A single data object encapsulating all config attributes
            - executors (list of Executor objs): Optional, kept for (somewhat) backward compatibility with 0.2.0
            - lazyErrors (bool): Default=True, allow the workflow to continue on app failures.
            - appCache (bool): Enable caching of apps
            - rundir (str) : Path to run directory. Defaults to ./runinfo/runNNN
            - retries(int): Default=0, Set the number of retry attempts in case of failure
            - checkpointFiles (list of str): List of filepaths to checkpoint files
            - checkpointMode (None, 'dfk_exit', 'task_exit', 'periodic'): Method to use.
            - data_manager (DataManager): User created DataManager
        Returns:
            DataFlowKernel object
        """
        # Create run dirs for this run
        self.rundir = make_rundir(config=config, path=rundir)
        parsl.set_file_logger("{}/parsl.log".format(self.rundir),
                              level=logging.DEBUG)

        logger.info("Parsl version: {}".format(parsl.__version__))
        logger.info("Libsubmit version: {}".format(libsubmit.__version__))

        # Update config with defaults
        self._config = update_config(config, self.rundir)

        # Set the data manager
        if data_manager:
            self.data_manager = data_manager
        else:
            self.data_manager = DataManager(config=self._config)

        # Start the anonymized usage tracker and send init msg
        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        # Load checkpoints if any
        cpts = self.load_checkpoints(checkpointFiles)
        # Initialize the memoizer
        self.memoizer = Memoizer(self, memoize=appCache, checkpoint=cpts)
        self.checkpointed_tasks = 0
        self._checkpoint_timer = None

        if self._config:
            self._executors_managed = True
            # Create the executors
            epf = EPF()
            self.executors = epf.make(self.rundir, self._config)

            # set global vars from config
            self.lazy_fail = self._config["globals"].get(
                "lazyErrors", lazyErrors)
            self.fail_retries = self._config["globals"].get("retries", retries)
            self.flowcontrol = FlowControl(self, self._config)
            self.checkpoint_mode = self._config["globals"].get(
                "checkpointMode", checkpointMode)
            if self.checkpoint_mode == "periodic":
                period = self._config["globals"].get("checkpointPeriod",
                                                     "00:30:00")
                try:
                    h, m, s = map(int, period.split(':'))
                    checkpoint_period = (h * 3600) + (m * 60) + s
                    self._checkpoint_timer = Timer(self.checkpoint,
                                                   interval=checkpoint_period)
                except Exception as e:
                    logger.error(
                        "invalid checkpointPeriod provided:{0} expected HH:MM:SS"
                        .format(period))
                    self._checkpoint_timer = Timer(self.checkpoint,
                                                   interval=(30 * 60))

        else:
            self._executors_managed = False
            self.fail_retries = retries
            self.lazy_fail = lazyErrors
            self.executors = {i: x for i, x in enumerate(executors)}
            self.flowcontrol = FlowNoControl(self, None)
            self.checkpoint_mode = checkpointMode

        self.task_count = 0
        self.fut_task_lookup = {}
        self.tasks = {}
        self.task_launch_lock = threading.Lock()

        logger.debug("Using executors: {0}".format(self.executors))
        atexit.register(self.cleanup)
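
The periodic-checkpoint branch that recurs in these examples converts an 'HH:MM:SS' period string into seconds before arming the timer. Pulled out on its own (the helper name below is illustrative, not part of Parsl), the conversion is:

    def hms_to_seconds(period: str) -> int:
        """Convert an 'HH:MM:SS' checkpoint period string into seconds."""
        h, m, s = map(int, period.split(':'))
        return h * 3600 + m * 60 + s

    assert hms_to_seconds("00:30:00") == 1800   # the 30-minute fallback used when parsing fails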
Example 5
    def __init__(self, config=Config()):
        """Initialize the DataFlowKernel.

        Parameters
        ----------
        config : Config
            A specification of all configuration options. For more details see the
            :class:`~parsl.config.Config` documentation.
        """

        # this will be used to check cleanup only happens once
        self.cleanup_called = False

        if isinstance(config, dict):
            raise ConfigurationError(
                'Expected `Config` class, received dictionary. For help, '
                'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html'
            )
        self._config = config
        logger.debug("Starting DataFlowKernel with config\n{}".format(config))
        self.run_dir = make_rundir(config.run_dir)
        parsl.set_file_logger("{}/parsl.log".format(self.run_dir),
                              level=logging.DEBUG)

        logger.info("Parsl version: {}".format(get_version()))
        logger.info("Libsubmit version: {}".format(libsubmit.__version__))

        self.checkpoint_lock = threading.Lock()

        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        checkpoints = self.load_checkpoints(config.checkpoint_files)
        self.memoizer = Memoizer(self,
                                 memoize=config.app_cache,
                                 checkpoint=checkpoints)
        self.checkpointed_tasks = 0
        self._checkpoint_timer = None
        self.checkpoint_mode = config.checkpoint_mode

        data_manager = DataManager.get_data_manager(
            max_threads=config.data_management_max_threads,
            executors=config.executors)
        self.executors = {
            e.label: e
            for e in config.executors + [data_manager]
        }
        for executor in self.executors.values():
            executor.run_dir = self.run_dir  # FIXME we should have a real interface for this
            executor.start()

        if self.checkpoint_mode == "periodic":
            try:
                h, m, s = map(int, config.checkpoint_period.split(':'))
                checkpoint_period = (h * 3600) + (m * 60) + s
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=checkpoint_period)
            except Exception as e:
                logger.error(
                    "invalid checkpoint_period provided:{0} expected HH:MM:SS".
                    format(config.checkpoint_period))
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=(30 * 60))

        if any([x.managed for x in config.executors]):
            self.flowcontrol = FlowControl(self)
        else:
            self.flowcontrol = FlowNoControl(self)

        self.task_count = 0
        self.fut_task_lookup = {}
        self.tasks = {}
        self.task_launch_lock = threading.Lock()

        atexit.register(self.atexit_cleanup)
Example 6
    def __init__(self,
                 config=None,
                 executors=None,
                 lazyErrors=True,
                 appCache=True,
                 rundir=None,
                 retries=0,
                 checkpointFiles=None):
        """ Initialize the DataFlowKernel

        Please note that keyword args passed to the DFK here will always override
        options passed in via the config.

        KWargs:
            - config (Dict) : A single data object encapsulating all config attributes
            - executors (list of Executor objs): Optional, kept for (somewhat) backward compatibility with 0.2.0
            - lazyErrors (bool): Default=True, allow the workflow to continue on app failures.
            - appCache (bool): Enable caching of apps
            - rundir (str) : Path to run directory. Defaults to ./runinfo/runNNN
            - retries(int): Default=0, Set the number of retry attempts in case of failure
            - checkpointFiles (list of str): List of filepaths to checkpoint files

        Returns:
            DataFlowKernel object
        """
        # Create run dirs for this run
        self.rundir = make_rundir(config=config, path=rundir)
        parsl.set_file_logger("{}/parsl.log".format(self.rundir),
                              level=logging.INFO)

        logger.info("Parsl version: {}".format(parsl.__version__))
        logger.info("Libsubmit version: {}".format(libsubmit.__version__))

        # Update config with defaults
        self._config = update_config(config, self.rundir)

        # Start the anonymized usage tracker and send init msg
        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        # Load checkpoints if any
        cpts = self.load_checkpoints(checkpointFiles)
        # Initialize the memoizer
        self.memoizer = Memoizer(self, memoize=appCache, checkpoint=cpts)

        if self._config:
            self._executors_managed = True
            # Create the executors
            epf = EPF()
            self.executors = epf.make(self.rundir, self._config)

            # set global vars from config
            self.lazy_fail = self._config["globals"].get(
                "lazyErrors", lazyErrors)
            self.fail_retries = self._config["globals"].get("retries", retries)
            self.flowcontrol = FlowControl(self, self._config)
        else:
            self._executors_managed = False
            self.fail_retries = retries
            self.lazy_fail = lazyErrors
            self.executors = {i: x for i, x in enumerate(executors)}
            self.flowcontrol = FlowNoControl(self, None)

        self.task_count = 0
        self.fut_task_lookup = {}
        self.tasks = {}
        self.task_launch_lock = threading.Lock()

        logger.debug("Using executors: {0}".format(self.executors))
        atexit.register(self.cleanup)
Example 7
    def __init__(self, config=Config()):
        """Initialize the DataFlowKernel.

        Parameters
        ----------
        config : Config
            A specification of all configuration options. For more details see the
            :class:~`parsl.config.Config` documentation.
        """

        # this will be used to check cleanup only happens once
        self.cleanup_called = False

        if isinstance(config, dict):
            raise ConfigurationError(
                'Expected `Config` class, received dictionary. For help, '
                'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html'
            )
        self._config = config
        logger.debug("Starting DataFlowKernel with config\n{}".format(config))
        self.run_dir = make_rundir(config.run_dir)
        parsl.set_file_logger("{}/parsl.log".format(self.run_dir),
                              level=logging.DEBUG)

        logger.info("Parsl version: {}".format(get_version()))
        logger.info("Libsubmit version: {}".format(libsubmit.__version__))

        self.checkpoint_lock = threading.Lock()

        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        # ES logging
        self.db_logger_config = config.db_logger_config
        self.db_logger = get_db_logger(
            enable_es_logging=False
        ) if self.db_logger_config is None else get_db_logger(
            **self.db_logger_config)
        self.workflow_name = str(inspect.stack()[1][1])
        self.time_began = datetime.now()
        self.time_completed = None
        self.run_id = self.workflow_name + "-" + str(self.time_began.minute)
        self.dashboard = self.db_logger_config.get(
            'dashboard_link',
            None) if self.db_logger_config is not None else None
        # TODO: make configurable
        logger.info("Run id is: " + self.run_id)
        if self.dashboard is not None:
            logger.info("Dashboard is found at " + self.dashboard)
        self.db_logger.info("Python version: {}".format(sys.version_info))
        self.db_logger.info("Parsl version: {}".format(get_version()))
        self.db_logger.info("Libsubmit version: {}".format(
            libsubmit.__version__))
        self.db_logger.info(
            "DFK start",
            extra={
                "time_began":
                str(self.time_began.strftime('%Y-%m-%d %H:%M:%S')),
                'time_completed': str(self.time_completed),
                'task_run_id': self.run_id,
                'rundir': self.run_dir
            })
        self.db_logger.info("Name of script/workflow: " + self.run_id,
                            extra={'task_run_id': self.run_id})
        for executor in self._config.executors:
            self.db_logger.info("Listed executor: " + executor.label,
                                extra={'task_run_id': self.run_id})
        # ES logging end

        checkpoints = self.load_checkpoints(config.checkpoint_files)
        self.memoizer = Memoizer(self,
                                 memoize=config.app_cache,
                                 checkpoint=checkpoints)
        self.checkpointed_tasks = 0
        self._checkpoint_timer = None
        self.checkpoint_mode = config.checkpoint_mode

        data_manager = DataManager.get_data_manager(
            max_threads=config.data_management_max_threads,
            executors=config.executors)
        self.executors = {
            e.label: e
            for e in config.executors + [data_manager]
        }
        for executor in self.executors.values():
            executor.run_dir = self.run_dir  # FIXME we should have a real interface for this
            executor.start()

        if self.checkpoint_mode == "periodic":
            try:
                h, m, s = map(int, config.checkpoint_period.split(':'))
                checkpoint_period = (h * 3600) + (m * 60) + s
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=checkpoint_period)
            except Exception as e:
                logger.error(
                    "invalid checkpoint_period provided:{0} expected HH:MM:SS".
                    format(config.checkpoint_period))
                self._checkpoint_timer = Timer(self.checkpoint,
                                               interval=(30 * 60))

        if any([x.managed for x in config.executors]):
            self.flowcontrol = FlowControl(self)
        else:
            self.flowcontrol = FlowNoControl(self)

        self.task_count = 0
        self.fut_task_lookup = {}
        self.tasks = {}
        self.task_launch_lock = threading.Lock()

        atexit.register(self.atexit_cleanup)
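
All of the Config-based variants reject a plain dict up front. A quick sketch of triggering that guard, assuming only that parsl is importable; the exact exception seen may vary by release:

    from parsl.dataflow.dflow import DataFlowKernel

    try:
        DataFlowKernel(config={'executors': []})   # dict instead of a Config object
    except Exception as err:                        # ConfigurationError (or a type-check error) depending on the release
        print(type(err).__name__, err)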