Example 1
    def __init__(self,
                 vm_reference,
                 init_blocks=1,
                 min_blocks=0,
                 max_blocks=10,
                 parallelism=1,
                 worker_init='',
                 location='westus',
                 group_name='parsl.auto',
                 key_name=None,
                 key_file=None,
                 vnet_name="parsl.auto",
                 linger=False,
                 launcher=SingleNodeLauncher()):
        if not _api_enabled:
            raise OptionalModuleMissing(
                ['azure', 'msrestazure'], "Azure Provider requires the azure and msrestazure modules.")

        self._label = 'azure'
        self.init_blocks = init_blocks
        self.min_blocks = min_blocks
        self.max_blocks = max_blocks
        self.max_nodes = max_blocks
        self.parallelism = parallelism
        self.nodes_per_block = 1

        self.worker_init = worker_init
        self.vm_reference = vm_reference
        self.region = location
        self.vnet_name = vnet_name

        self.key_name = key_name
        self.key_file = key_file
        self.location = location
        self.group_name = group_name

        self.launcher = launcher
        self.linger = linger
        self.resources = {}
        self.instances = []

        env_specified = all(os.getenv(var) is not None for var in
                            ("AZURE_CLIENT_ID", "AZURE_CLIENT_SECRET",
                             "AZURE_TENANT_ID", "AZURE_SUBSCRIPTION_ID"))

        if key_file is None and not env_specified:
            raise ConfigurationError("Must specify either, 'key_file', or\
                 `AZURE_CLIENT_ID`, `AZURE_CLIENT_SECRET`,\
                  and `AZURE_TENANT_ID` environment variables.")

        if key_file is None:
            self.clientid = os.getenv("AZURE_CLIENT_ID")
            self.clientsecret = os.getenv("AZURE_CLIENT_SECRET")
            self.tenantid = os.getenv("AZURE_TENANT_ID")
            self.subid = os.getenv("AZURE_SUBSCRIPTION_ID")
        else:
            with open(key_file) as fh:
                keys = json.load(fh)
                self.clientid = keys.get("AZURE_CLIENT_ID")
                self.clientsecret = keys.get("AZURE_CLIENT_SECRET")
                self.tenantid = keys.get("AZURE_TENANT_ID")
                self.subid = keys.get("AZURE_SUBSCRIPTION_ID")

        self.get_clients()
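A minimal usage sketch for this constructor (assuming it belongs to Parsl's AzureProvider and that parsl.providers exports it; the vm_reference keys, credential file name, and values below are illustrative placeholders):

from parsl.providers import AzureProvider  # assumed import path

# Hypothetical vm_reference; the exact required keys depend on the provider version.
vm_reference = {
    "publisher": "Canonical",
    "offer": "UbuntuServer",
    "sku": "16.04-LTS",
    "version": "latest",
    "vm_size": "Standard_D1_v2",
    "admin_username": "parsl",
    "password": "...",
}

# azure_keys.json holds AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID,
# and AZURE_SUBSCRIPTION_ID, matching the key_file branch above.
provider = AzureProvider(vm_reference=vm_reference,
                         key_file="azure_keys.json",
                         location="westus",
                         init_blocks=1,
                         max_blocks=2)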
Example 2
    def __init__(self, config=Config()):
        """Initialize the DataFlowKernel.

        Parameters
        ----------
        config : Config
            A specification of all configuration options. For more details see the
            :class:`~parsl.config.Config` documentation.
        """

        # used to ensure that cleanup only happens once
        self.cleanup_called = False

        if isinstance(config, dict):
            raise ConfigurationError(
                    'Expected `Config` class, received dictionary. For help, '
                    'see http://parsl.readthedocs.io/en/stable/stubs/parsl.config.Config.html')
        self._config = config
        self.run_dir = make_rundir(config.run_dir)
        parsl.set_file_logger("{}/parsl.log".format(self.run_dir), level=logging.DEBUG)
        logger.debug("Starting DataFlowKernel with config\n{}".format(config))
        logger.info("Parsl version: {}".format(get_version()))

        self.checkpoint_lock = threading.Lock()

        self.usage_tracker = UsageTracker(self)
        self.usage_tracker.send_message()

        # Monitoring
        self.run_id = str(uuid4())
        self.tasks_completed_count = 0
        self.tasks_failed_count = 0

        self.monitoring = config.monitoring
        # hub address and port for interchange to connect
        self.hub_address = None
        self.hub_interchange_port = None
        if self.monitoring:
            if self.monitoring.logdir is None:
                self.monitoring.logdir = self.run_dir
            self.hub_address = self.monitoring.hub_address
            self.hub_interchange_port = self.monitoring.start(self.run_id)

        self.time_began = datetime.datetime.now()
        self.time_completed = None

        # TODO: make configurable
        logger.info("Run id is: " + self.run_id)

        self.workflow_name = None
        if self.monitoring is not None and self.monitoring.workflow_name is not None:
            self.workflow_name = self.monitoring.workflow_name
        else:
            for frame in inspect.stack():
                fname = os.path.basename(str(frame.filename))
                parsl_file_names = ['dflow.py', 'typeguard.py']
                # Find first file name not considered a parsl file
                if fname not in parsl_file_names:
                    self.workflow_name = fname
                    break

        self.workflow_version = str(self.time_began.replace(microsecond=0))
        if self.monitoring is not None and self.monitoring.workflow_version is not None:
            self.workflow_version = self.monitoring.workflow_version

        workflow_info = {
                'python_version': "{}.{}.{}".format(sys.version_info.major,
                                                    sys.version_info.minor,
                                                    sys.version_info.micro),
                'parsl_version': get_version(),
                "time_began": self.time_began,
                'time_completed': None,
                'workflow_duration': None,
                'run_id': self.run_id,
                'workflow_name': self.workflow_name,
                'workflow_version': self.workflow_version,
                'rundir': self.run_dir,
                'tasks_completed_count': self.tasks_completed_count,
                'tasks_failed_count': self.tasks_failed_count,
                'user': getuser(),
                'host': gethostname(),
        }

        if self.monitoring:
            self.monitoring.send(MessageType.WORKFLOW_INFO,
                                 workflow_info)

        checkpoints = self.load_checkpoints(config.checkpoint_files)
        self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint=checkpoints)
        self.checkpointed_tasks = 0
        self._checkpoint_timer = None
        self.checkpoint_mode = config.checkpoint_mode

        self.data_manager = DataManager(self)
        self.executors = {}
        data_manager_executor = ThreadPoolExecutor(max_threads=config.data_management_max_threads, label='data_manager')
        self.add_executors(config.executors + [data_manager_executor])

        if self.checkpoint_mode == "periodic":
            try:
                h, m, s = map(int, config.checkpoint_period.split(':'))
                checkpoint_period = (h * 3600) + (m * 60) + s
                self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period, name="Checkpoint")
            except Exception:
                logger.error("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
                self._checkpoint_timer = Timer(self.checkpoint, interval=(30 * 60), name="Checkpoint")

        # If executors are added dynamically, all executors should be managed.
        if any(x.managed for x in config.executors):
            self.flowcontrol = FlowControl(self)
        else:
            self.flowcontrol = FlowNoControl(self)

        self.task_count = 0
        self.tasks = {}
        self.submitter_lock = threading.Lock()

        atexit.register(self.atexit_cleanup)
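A short sketch of the usual entry point that reaches this constructor (assuming the standard Parsl API, where parsl.load builds and registers a DataFlowKernel from a Config; passing a plain dict would trigger the ConfigurationError raised above):

import parsl
from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor

# A Config instance, not a dict: the constructor rejects dictionaries.
config = Config(executors=[ThreadPoolExecutor(max_threads=4, label='local_threads')])
parsl.load(config)  # constructs the DataFlowKernel shown above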
Example 3
    def __init__(self,
                 label='HighThroughputExecutor',
                 provider=LocalProvider(),
                 launch_cmd=None,
                 address="127.0.0.1",
                 worker_ports=None,
                 worker_port_range=(54000, 55000),
                 interchange_port_range=(55000, 56000),
                 storage_access=None,
                 working_dir=None,
                 worker_debug=False,
                 cores_per_worker=1.0,
                 max_workers=float('inf'),
                 heartbeat_threshold=120,
                 heartbeat_period=30,
                 poll_period=10,
                 container_image=None,
                 worker_mode="singularity_reuse",
                 suppress_failure=False,
                 endpoint_id=None,
                 endpoint_db=None,
                 managed=True,
                 task_status_queue=None):

        logger.debug("Initializing HighThroughputExecutor")

        self.label = label
        self.launch_cmd = launch_cmd
        self.provider = provider
        self.worker_debug = worker_debug
        self.storage_access = storage_access if storage_access is not None else []
        if len(self.storage_access) > 1:
            raise ConfigurationError(
                'Multiple storage access schemes are not supported')
        self.working_dir = working_dir
        self.managed = managed
        self.blocks = []
        self.tasks = {}
        self.cores_per_worker = cores_per_worker
        self.max_workers = max_workers
        self.endpoint_db = endpoint_db
        if self.endpoint_db is not None:
            self.endpoint_db.connect()
        self.endpoint_id = endpoint_id
        self._task_counter = 0
        self.address = address
        self.worker_ports = worker_ports
        self.worker_port_range = worker_port_range
        self.interchange_port_range = interchange_port_range
        self.heartbeat_threshold = heartbeat_threshold
        self.heartbeat_period = heartbeat_period
        self.poll_period = poll_period
        self.suppress_failure = suppress_failure
        self.run_dir = '.'
        self.queue_proc = None

        self.task_status_queue = task_status_queue

        # FuncX specific options
        self.container_image = container_image
        self.worker_mode = worker_mode
        self.last_response_time = time.time()

        if not launch_cmd:
            self.launch_cmd = ("process_worker_pool.py {debug} {max_workers} "
                               "-c {cores_per_worker} "
                               "--poll {poll_period} "
                               "--task_url={task_url} "
                               "--result_url={result_url} "
                               "--logdir={logdir} "
                               "--hb_period={heartbeat_period} "
                               "--hb_threshold={heartbeat_threshold} "
                               "--mode={worker_mode} "
                               "--container_image={container_image} ")

        self.ix_launch_cmd = ("htex-interchange {debug} -c={client_address} "
                              "--client_ports={client_ports} "
                              "--worker_port_range={worker_port_range} "
                              "--logdir={logdir} "
                              "{suppress_failure} "
                              "--hb_threshold={heartbeat_threshold} ")
Example 4
    def __init__(self,
                 image_id,
                 key_name,
                 init_blocks=1,
                 min_blocks=0,
                 max_blocks=10,
                 nodes_per_block=1,
                 parallelism=1,

                 worker_init='',
                 instance_type='t2.small',
                 region='us-east-2',
                 spot_max_bid=0,

                 key_file=None,
                 profile=None,
                 iam_instance_profile_arn='',

                 state_file=None,
                 walltime="01:00:00",
                 linger=False,
                 launcher=SingleNodeLauncher()):
        if not _boto_enabled:
            raise OptionalModuleMissing(['boto3'], "AWS Provider requires the boto3 module.")

        self.image_id = image_id
        self._label = 'ec2'
        self.init_blocks = init_blocks
        self.min_blocks = min_blocks
        self.max_blocks = max_blocks
        self.nodes_per_block = nodes_per_block
        self.max_nodes = max_blocks * nodes_per_block
        self.parallelism = parallelism

        self.worker_init = worker_init
        self.instance_type = instance_type
        self.region = region
        self.spot_max_bid = spot_max_bid

        self.key_name = key_name
        self.key_file = key_file
        self.profile = profile
        self.iam_instance_profile_arn = iam_instance_profile_arn

        self.walltime = walltime
        self.launcher = launcher
        self.linger = linger
        self.resources = {}
        self.state_file = state_file if state_file is not None else 'awsproviderstate.json'

        env_specified = os.getenv("AWS_ACCESS_KEY_ID") is not None and os.getenv("AWS_SECRET_ACCESS_KEY") is not None
        if profile is None and key_file is None and not env_specified:
            raise ConfigurationError("Must specify either profile', 'key_file', or "
                                     "'AWS_ACCESS_KEY_ID' and 'AWS_SECRET_ACCESS_KEY' environment variables.")

        try:
            self.initialize_boto_client()
        except Exception as e:
            logger.error("{} failed to initialize.".format(self))
            raise e

        state_file_exists = False
        try:
            self.read_state_file(self.state_file)
            state_file_exists = True
        except Exception:
            logger.info("No state file found. Cannot load previous options. Creating new infrastructure.")

        if not state_file_exists:
            try:
                self.create_vpc()
            except Exception as e:
                logger.info("Failed to create ec2 infrastructure: {0}".format(e))
                raise
            else:
                self.write_state_file()
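A minimal usage sketch (assuming parsl.providers exports EC2Provider and that AWS credentials are available through the environment variables checked above or an AWS profile; the AMI ID and key pair name are placeholders):

from parsl.providers import EC2Provider

provider = EC2Provider(image_id="ami-0123456789abcdef0",  # placeholder AMI
                       key_name="my-ec2-keypair",         # placeholder key pair
                       instance_type="t2.small",
                       region="us-east-2",
                       init_blocks=1,
                       max_blocks=2)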
Example 5
    def __init__(self,
                 label: str = 'HighThroughputExecutor',
                 provider: ExecutionProvider = LocalProvider(),
                 launch_cmd: Optional[str] = None,
                 address: str = "127.0.0.1",
                 worker_ports: Optional[Tuple[int, int]] = None,
                 worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
                 interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
                 storage_access: Optional[List[Any]] = None,
                 working_dir: Optional[str] = None,
                 worker_debug: bool = False,
                 cores_per_worker: float = 1.0,
                 mem_per_worker: Optional[float] = None,
                 max_workers: Union[int, float] = float('inf'),
                 prefetch_capacity: int = 0,
                 heartbeat_threshold: int = 120,
                 heartbeat_period: int = 30,
                 poll_period: int = 10,
                 suppress_failure: bool = False,
                 managed: bool = True,
                 worker_logdir_root: Optional[str] = None):

        logger.debug("Initializing HighThroughputExecutor")

        self.label = label
        self.launch_cmd = launch_cmd
        self.provider = provider
        self.worker_debug = worker_debug
        self.storage_access = storage_access if storage_access is not None else []
        if len(self.storage_access) > 1:
            raise ConfigurationError(
                'Multiple storage access schemes are not supported')
        self.working_dir = working_dir
        self.managed = managed
        self.blocks = {}  # type: Dict[str, str]
        self.tasks = {}  # type: Dict[str, Future]
        self.cores_per_worker = cores_per_worker
        self.mem_per_worker = mem_per_worker
        self.max_workers = max_workers
        self.prefetch_capacity = prefetch_capacity

        self._task_counter = 0
        self.address = address
        self.hub_address = None  # set to the correct hub address in dfk
        self.hub_port = None  # set to the correct hub port in dfk
        self.worker_ports = worker_ports
        self.worker_port_range = worker_port_range
        self.interchange_port_range = interchange_port_range
        self.heartbeat_threshold = heartbeat_threshold
        self.heartbeat_period = heartbeat_period
        self.poll_period = poll_period
        self.suppress_failure = suppress_failure
        self.run_dir = '.'
        self.worker_logdir_root = worker_logdir_root

        if not launch_cmd:
            self.launch_cmd = ("process_worker_pool.py {debug} {max_workers} "
                               "-p {prefetch_capacity} "
                               "-c {cores_per_worker} "
                               "-m {mem_per_worker} "
                               "--poll {poll_period} "
                               "--task_url={task_url} "
                               "--result_url={result_url} "
                               "--logdir={logdir} "
                               "--block_id={{block_id}} "
                               "--hb_period={heartbeat_period} "
                               "--hb_threshold={heartbeat_threshold} ")