import os
from pathlib import Path
from typing import IO, AnyStr, List, Optional, Union

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

# MLRun context and data-item types (import paths assumed; they may vary across MLRun versions)
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem


def parquet_to_dask(context: MLClientCtx,
                    parquet_url: Union[DataItem, str, Path, IO[AnyStr]],
                    inc_cols: Optional[List[str]] = None,
                    index_cols: Optional[List[str]] = None,
                    shards: int = 4,
                    threads_per: int = 4,
                    processes: bool = False,
                    memory_limit: str = '2GB',
                    persist: bool = True,
                    dask_key: str = 'my_dask_dataframe',
                    target_path: str = '') -> None:
    """Load parquet dataset into dask cluster
    
    If no cluster is found loads a new one and persist the data to it. It
    shouold not be necessary to create a new cluster when the function
    is run as a 'dask' job.
    
    :param context:         the function context
    :param parquet_url:     url of the parquet file or partitioned dataset as either
                            artifact DataItem, string, or path object (see pandas read_csv)
    :param inc_cols:        include only these columns (very fast)
    :param index_cols:      list of index column names (can be a long-running process)
    :param shards:          number of workers to launch
    :param threads_per:     number of threads per worker
    :param processes:       
    """
    if hasattr(context, 'dask_client'):
        context.logger.info('found cluster...')
        dask_client = context.dask_client
    else:
        context.logger.info('starting new cluster...')
        cluster = LocalCluster(n_workers=shards,
                               threads_per_worker=threads_per,
                               processes=processes,
                               memory_limit=memory_limit)
        dask_client = Client(cluster)

    context.logger.info(dask_client)

    # read the dataset, applying the optional column selection and index columns
    df = dd.read_parquet(parquet_url, columns=inc_cols, index=index_cols)

    if persist and context:
        df = dask_client.persist(df)
        # publish under the name given in dask_key (not the literal string 'dask_key')
        dask_client.publish_dataset(**{dask_key: df})
        context.dask_client = dask_client

        # share the scheduler
        filepath = os.path.join(target_path, 'scheduler.json')
        dask_client.write_scheduler_file(filepath)
        context.log_artifact('scheduler', target_path=filepath)

        print(df.head())
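
A minimal usage sketch for the function above, assuming a local run outside an MLRun 'dask' job; the parquet path, target directory, and the use of mlrun.get_or_create_ctx to obtain a context are placeholders/assumptions:

# Hypothetical invocation of parquet_to_dask (paths below are placeholders)
from mlrun import get_or_create_ctx

context = get_or_create_ctx('parquet-to-dask-demo')
parquet_to_dask(context,
                parquet_url='/data/mydataset.parquet',
                shards=2,
                threads_per=2,
                memory_limit='1GB',
                dask_key='my_dask_dataframe',
                target_path='/tmp/artifacts')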
Example #2
import os
import time
import typing
import warnings

import dask
from dask.distributed import Client, Future, wait

# SMAC3 imports (module paths assumed from the SMAC3 1.x layout; they may differ in other versions)
from smac.configspace import Configuration
from smac.runhistory.runhistory import RunInfo, RunValue
from smac.tae import StatusType
from smac.tae.base import BaseRunner


class DaskParallelRunner(BaseRunner):
    """Interface to submit and collect a job in a distributed fashion.

    DaskParallelRunner is intended to comply with the bridge design pattern.

    Nevertheless, to reduce the amount of code within single-vs-parallel
    implementations, DaskParallelRunner wraps a BaseRunner object which
    is then executed in parallel on n_workers.

    This class is then constructed by passing a BaseRunner that implements
    a run() method and is capable of doing so in a serial fashion. This
    wrapper class, DaskParallelRunner, then uses dask to initialize
    N BaseRunner objects that actively wait for a RunInfo to produce a
    RunValue object.

    To be more precise, the work model is then:
    1- The smbo.intensifier dictates "what" to run (a configuration/instance/seed)
       via a RunInfo object.
    2- A tae_runner takes this RunInfo object and launches the task via
       tae_runner.submit_run(). In the case of DaskParallelRunner, n_workers
       receive a pickled copy of DaskParallelRunner.single_worker, each with a
       run() method coming from DaskParallelRunner.single_worker.run()
    3- RunInfo objects are run in a distributed fashion, and their results are
       available locally to each worker. Each result is collected by
       DaskParallelRunner.get_finished_runs() and then passed to the SMBO.
    4- Exceptions are also locally available to each worker and need to be
       collected.

    Dask works with Future objects, which are managed via DaskParallelRunner.client.


    Attributes
    ----------

    results
    ta
    stats
    run_obj
    par_factor
    cost_for_crash
    abort_on_first_run_crash
    n_workers
    futures
    client

    Parameters
    ----------
    single_worker: BaseRunner
        A runner to run in a distributed fashion
    n_workers: int
        Number of workers to use for distributed run. Will be ignored if ``dask_client`` is not ``None``.
    patience: int
        How long (in seconds) to wait for workers to become available if none are free
    output_directory: str, optional
        If given, this will be used for the dask worker directory and for storing server information.
        If a dask client is passed, it will only be used for storing server information as the
        worker directory must be set by the program/user starting the workers.
    dask_client: dask.distributed.Client
        User-created dask client, can be used to start a dask cluster and then attach SMAC to it.
    """
    def __init__(
        self,
        single_worker: BaseRunner,
        n_workers: int,
        patience: int = 5,
        output_directory: typing.Optional[str] = None,
        dask_client: typing.Optional[dask.distributed.Client] = None,
    ):
        super(DaskParallelRunner, self).__init__(
            ta=single_worker.ta,
            stats=single_worker.stats,
            run_obj=single_worker.run_obj,
            par_factor=single_worker.par_factor,
            cost_for_crash=single_worker.cost_for_crash,
            abort_on_first_run_crash=single_worker.abort_on_first_run_crash,
        )

        # The single worker, which is replicated on an as-needed
        # basis to every compute node
        self.single_worker = single_worker
        self.n_workers = n_workers

        # How much time to wait for workers to be available
        self.patience = patience

        self.output_directory = output_directory

        # Because a run() method can use pynisher, we need to prevent the multiprocessing
        # workers from being instantiated as daemonic - this cannot be passed via worker_kwargs
        dask.config.set({'distributed.worker.daemon': False})
        if dask_client is None:
            self.close_client_at_del = True
            self.client = Client(n_workers=self.n_workers,
                                 processes=True,
                                 threads_per_worker=1,
                                 local_directory=output_directory)
            if self.output_directory:
                self.scheduler_file = os.path.join(self.output_directory,
                                                   '.dask_scheduler_file')
                self.client.write_scheduler_file(
                    scheduler_file=self.scheduler_file)
        else:
            self.close_client_at_del = False
            self.client = dask_client
        self.futures = []  # type: typing.List[Future]

        self.scheduler_info = self.client._get_scheduler_info()

    def submit_run(self, run_info: RunInfo) -> None:
        """This function submits a configuration
        embedded in a run_info object, and uses one of the workers
        to produce a result locally to each worker.

        The execution of a configuration follows this procedure:
        1- SMBO/intensifier generates a run_info
        2- SMBO calls submit_run so that a worker launches the run_info
        3- submit_run internally calls self.run(). It does so via a call to self.run_wrapper(),
        which contains common code that any run() method would otherwise have to implement, such as
        the capping check.

        Child classes must implement a run() method.
        All results will be only available locally to each worker, so the
        main node needs to collect them.

        Parameters
        ----------
        run_info: RunInfo
            An object containing the configuration and the necessary data to run it

        """
        # Check for resources or block till one is available
        if not self._workers_available():
            wait(self.futures, return_when='FIRST_COMPLETED').done
            self._extract_completed_runs_from_futures()

        # In code check to make sure that there are resources
        if not self._workers_available():
            warnings.warn(
                "No workers are available. This could mean workers crashed"
                "Waiting for new workers...")
            time.sleep(self.patience)
            if not self._workers_available():
                raise ValueError(
                    "Tried to execute a job, but no worker was "
                    "available. This likely means that a worker crashed "
                    "or no workers were properly configured.")

        # At this point we can submit the job
        self.futures.append(
            self.client.submit(self.single_worker.run_wrapper, run_info))

    def get_finished_runs(
            self) -> typing.List[typing.Tuple[RunInfo, RunValue]]:
        """This method returns any finished configuration, and returns a list with
        the results of exercising the configurations. This class keeps populating results
        to self.results until a call to get_finished runs is done. In this case, the
        self.results list is emptied and all RunValues produced by running run() are
        returned.

        Returns
        -------
            List[RunInfo, RunValue]: A list of RunValues (and respective RunInfo), that is,
                the results of executing a run_info
            a submitted configuration
        """

        # Proactively see if more configs have finished
        self._extract_completed_runs_from_futures()

        results_list = []
        while self.results:
            results_list.append(self.results.pop())
        return results_list

    def _extract_completed_runs_from_futures(self) -> None:
        """
        A run is over when its future's done() is True.
        This function collects the completed futures and moves
        them from self.futures to self.results.

        *** We make sure futures never exceed the capacity of
        the scheduler
        """

        # In-code check to make sure we don't exceed resource allocation
        if len(self.futures) > sum(self.client.nthreads().values()):
            warnings.warn(
                "More running jobs than resources available "
                "Should not have more futures/runs in remote workers "
                "than the number of workers. This could mean a worker "
                "crashed and was not able to be recovered by dask. ")

        # A future is removed from the list of futures as an indication
        # that a worker is available to take on an extra job
        done_futures = [f for f in self.futures if f.done()]
        for future in done_futures:
            self.results.append(future.result())
            self.futures.remove(future)

    def wait(self) -> None:
        """SMBO/intensifier might need to wait for runs to finish before making a decision.
        This method waits until at least one run completes.
        """
        if self.futures:
            wait(self.futures, return_when='FIRST_COMPLETED').done

    def pending_runs(self) -> bool:
        """
        Whether or not there are configs still running. Generally if the runner is serial,
        launching a run instantly returns its result. On parallel runners, there might
        be pending configurations to complete.
        """
        # If there are futures available, it translates
        # to runs still not finished/processed
        return len(self.futures) > 0

    def run(
        self,
        config: Configuration,
        instance: str,
        cutoff: typing.Optional[float] = None,
        seed: int = 12345,
        budget: typing.Optional[float] = None,
        instance_specific: str = "0",
    ) -> typing.Tuple[StatusType, float, float, typing.Dict]:
        """
        This method only complies with the abstract parent class. In the parallel case,
        we call the single worker's run() method.

        Parameters
        ----------
            config : Configuration
                dictionary param -> value
            instance : string
                problem instance
            cutoff : float, optional
                Wallclock time limit of the target algorithm. If no value is
                provided no limit will be enforced.
            seed : int
                random seed
            budget : float, optional
                A positive, real-valued number representing an arbitrary limit to the target
                algorithm. Handled by the target algorithm internally
            instance_specific: str
                instance specific information (e.g., domain file or solution)

        Returns
        -------
            status: enum of StatusType (int)
                {SUCCESS, TIMEOUT, CRASHED, ABORT}
            cost: float
                cost/regret/quality (float) (None, if not returned by TA)
            runtime: float
                runtime (None if not returned by TA)
            additional_info: dict
                all further additional run information
        """
        return self.single_worker.run(
            config=config,
            instance=instance,
            cutoff=cutoff,
            seed=seed,
            budget=budget,
            instance_specific=instance_specific,
        )

    def num_workers(self) -> int:
        """Total number of workers available. This number is dynamic
        as more resources can be allocated"""
        return sum(self.client.nthreads().values())

    def _workers_available(self) -> bool:
        """"Query if there are workers available, which means
        that there are resources to launch a dask job"""
        total_compute_power = sum(self.client.nthreads().values())
        if len(self.futures) < total_compute_power:
            return True
        return False

    def __del__(self) -> None:
        """Make sure that when this object gets deleted, the client is terminated. This is only done if
        the client was created by the dask runner."""
        if self.close_client_at_del:
            self.client.close()
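
A minimal driver sketch for the runner above; my_serial_runner and run_info are hypothetical placeholders for an existing serial BaseRunner and an intensifier-produced RunInfo:

# Hypothetical usage of DaskParallelRunner with placeholder objects
parallel_runner = DaskParallelRunner(single_worker=my_serial_runner, n_workers=4)

parallel_runner.submit_run(run_info)      # dispatch the job to a dask worker

while parallel_runner.pending_runs():     # collect results as they finish
    parallel_runner.wait()
    for finished_info, run_value in parallel_runner.get_finished_runs():
        print(finished_info.config, run_value.status, run_value.cost)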