Example #1
def get_peeker():
    logger.info("getting new peeker")
    try:
        scheduler_address = os.environ[SCH_ADDR_KEY]
        client = get_client(scheduler_address)
    except Exception as e:
        logger.info("starting new dask client", exception=str(e))
        client = Client()
        with open(".env", "w") as fp:
            fp.write(f"{SCH_ADDR_KEY}={client.scheduler.address}")
    n_part = sum(client.nthreads().values())
    logger.info("dask client acquired",
                dashboard=client.dashboard_link,
                scheduler=client.scheduler.address,
                n_threads=n_part)

    app_data = load_entire_app_data()
    ddf = dd.from_pandas(app_data, npartitions=n_part).persist()

    cat_filt_cols = app_discrete_cols
    num_filt_cols = [chain_len_col]

    return FootballPeeker(ddf, cat_filt_cols, num_filt_cols)
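
The connect-or-create logic at the top of get_peeker() is a reusable pattern: try to attach to an already-running scheduler whose address was cached earlier, and only start a new local cluster if that fails. A minimal sketch of just that pattern, assuming the ".env" file name from the example; the SCH_ADDR_KEY value and the helper name connect_or_create_client are illustrative:

# Minimal sketch of the connect-or-create pattern above; the ".env" file name
# mirrors the example, the rest is illustrative.
import os
from dask.distributed import Client

SCH_ADDR_KEY = "DASK_SCHEDULER_ADDRESS"  # assumed value; the example does not show it


def connect_or_create_client() -> Client:
    """Reuse a running scheduler if its address is known, else start a new one."""
    address = os.environ.get(SCH_ADDR_KEY)
    if address:
        try:
            # Attach to the existing scheduler; connection failures typically
            # surface as OSError/TimeoutError.
            return Client(address, timeout="5s")
        except (OSError, TimeoutError):
            pass  # stale or unreachable address, fall through
    client = Client()  # start a fresh local cluster
    with open(".env", "w") as fp:
        # Cache the scheduler address so later processes can reuse it
        fp.write(f"{SCH_ADDR_KEY}={client.scheduler.address}\n")
    return client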
Example #2
    # Wait for a worker with spare capacity, not just for any worker.
    # The safest approach is to track the futures ourselves!

    run_history = []
    futures = []

    # Submit all jobs
    counter = 0

    while not dispatcher.is_complete():

        # Are there workers available?
        # nthreads() dynamically queries the scheduler for its current capacity.
        # On a local cluster, capacity can be increased with scale();
        # on a distributed cluster, add capacity with e.g.:
        #   dask-worker 'tcp://127.0.0.1:45739' --nprocs 1 --nthreads 1
        total_compute_power = sum(client.nthreads().values())
        if len(futures) < total_compute_power:
            now = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
            print(f"Scheduling job {counter} at {now}")
            counter += 1

            # submit a new job
            run_info = dispatcher.get_next_run()
            futures.append(client.submit(fit_and_score, run_info))

            # If the cluster is not under full load, check whether anything has
            # completed since the last iteration so we can immediately submit more jobs
            try:
                done_futures = wait(futures,
                                    timeout=0,
                                    return_when='FIRST_COMPLETED').done
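
The snippet above is cut off inside the try block around the zero-timeout wait(). For reference, a self-contained sketch of the same throttle-and-harvest loop; it polls Future.done() instead of a zero-timeout wait(), which avoids having to catch the timeout exception, and dispatcher/fit_and_score are hypothetical stand-ins for the objects used in the example:

# Self-contained sketch of the throttled submission loop above.
# `dispatcher` and `fit_and_score` are hypothetical stand-ins; only the
# dask calls (nthreads, submit, wait, done, result) are meant literally.
import time
from dask.distributed import Client, wait


def run_all(dispatcher, fit_and_score, client: Client):
    futures, run_history = [], []
    counter = 0
    while not dispatcher.is_complete() or futures:
        # Current cluster capacity; grows if workers join or scale() is called
        capacity = sum(client.nthreads().values())
        if not dispatcher.is_complete() and len(futures) < capacity:
            now = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
            print(f"Scheduling job {counter} at {now}")
            futures.append(client.submit(fit_and_score, dispatcher.get_next_run()))
            counter += 1
        else:
            # Cluster saturated (or nothing left to submit): block until one finishes
            wait(futures, return_when='FIRST_COMPLETED')
        # Harvest whatever has completed, without blocking
        for future in [f for f in futures if f.done()]:
            run_history.append(future.result())
            futures.remove(future)
    return run_history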
Example #3
class DaskParallelRunner(BaseRunner):
    """Interface to submit and collect a job in a distributed fashion.

    DaskParallelRunner is intended to comply with the bridge design pattern.

    Nevertheless, to reduce the amount of code within single-vs-parallel
    implementations, DaskParallelRunner wraps a BaseRunner object which
    is then executed in parallel on n_workers.

    This class is constructed by passing a BaseRunner that implements
    a run() method and is capable of doing so in a serial fashion. This
    wrapper class, DaskParallelRunner, then uses dask to initialize
    N BaseRunner objects that actively wait for a RunInfo to produce a
    RunValue object.

    To be more precise, the work model is then:
    1- The smbo.intensifier dictates "what" to run (a configuration/instance/seed)
       via a RunInfo object.
    2- a tae_runner takes this RunInfo object and launches the task via
       tae_runner.submit_run(). In the case of DaskParallelRunner, n_workers
       receive a pickled copy of DaskParallelRunner.single_worker, each with a
       run() method coming from DaskParallelRunner.single_worker.run()
    3- RunInfo objects are run in a distributed fashion, and their results are
       available locally to each worker. These results are collected by
       DaskParallelRunner.get_finished_runs() and then passed to the SMBO.
    4- Exceptions are also locally available to each worker and need to be
       collected.

    Dask works with Future objects, which are managed via DaskParallelRunner.client.


    Attributes
    ----------

    results
    ta
    stats
    run_obj
    par_factor
    cost_for_crash
    abort_on_first_run_crash
    n_workers
    futures
    client

    Parameters
    ----------
    single_worker: BaseRunner
        A runner to run in a distributed fashion
    n_workers: int
        Number of workers to use for distributed run. Will be ignored if ``dask_client`` is not ``None``.
    patience: int
        How long (in seconds) to wait for workers to become available if one fails
    output_directory: str, optional
        If given, this will be used for the dask worker directory and for storing server information.
        If a dask client is passed, it will only be used for storing server information as the
        worker directory must be set by the program/user starting the workers.
    dask_client: dask.distributed.Client
        User-created dask client, can be used to start a dask cluster and then attach SMAC to it.
    """
    def __init__(
        self,
        single_worker: BaseRunner,
        n_workers: int,
        patience: int = 5,
        output_directory: typing.Optional[str] = None,
        dask_client: typing.Optional[dask.distributed.Client] = None,
    ):
        super(DaskParallelRunner, self).__init__(
            ta=single_worker.ta,
            stats=single_worker.stats,
            run_obj=single_worker.run_obj,
            par_factor=single_worker.par_factor,
            cost_for_crash=single_worker.cost_for_crash,
            abort_on_first_run_crash=single_worker.abort_on_first_run_crash,
        )

        # The single worker, which is replicated on a need
        # basis to every compute node
        self.single_worker = single_worker
        self.n_workers = n_workers

        # How much time to wait for workers to be available
        self.patience = patience

        self.output_directory = output_directory

        # Because a run() method can use pynisher, we need to prevent the multiprocessing
        # workers from being instantiated as daemonic - this cannot be passed via worker_kwargs
        dask.config.set({'distributed.worker.daemon': False})
        if dask_client is None:
            self.close_client_at_del = True
            self.client = Client(n_workers=self.n_workers,
                                 processes=True,
                                 threads_per_worker=1,
                                 local_directory=output_directory)
            if self.output_directory:
                self.scheduler_file = os.path.join(self.output_directory,
                                                   '.dask_scheduler_file')
                self.client.write_scheduler_file(
                    scheduler_file=self.scheduler_file)
        else:
            self.close_client_at_del = False
            self.client = dask_client
        self.futures = []  # type: typing.List[Future]

        self.scheduler_info = self.client._get_scheduler_info()

    def submit_run(self, run_info: RunInfo) -> None:
        """This function submits a configuration
        embedded in a run_info object, and uses one of the workers
        to produce a result locally to each worker.

        The execution of a configuration follows this procedure:
        1- SMBO/intensifier generates a run_info
        2- SMBO calls submit_run so that a worker launches the run_info
        3- submit_run internally calls self.run(). It does so via a call to self.run_wrapper(),
        which contains common code that any run() method would otherwise have to implement, like
        capping checks.

        Child classes must implement a run() method.
        All results will be only available locally to each worker, so the
        main node needs to collect them.

        Parameters
        ----------
        run_info: RunInfo
            An object containing the configuration and the necessary data to run it

        """
        # Check for resources or block till one is available
        if not self._workers_available():
            wait(self.futures, return_when='FIRST_COMPLETED').done
            self._extract_completed_runs_from_futures()

        # In code check to make sure that there are resources
        if not self._workers_available():
            warnings.warn(
                "No workers are available. This could mean workers crashed. "
                "Waiting for new workers...")
            time.sleep(self.patience)
            if not self._workers_available():
                raise ValueError(
                    "Tried to execute a job, but no worker was "
                    "available. This likely means that a worker crashed "
                    "or no workers were properly configured.")

        # At this point we can submit the job
        self.futures.append(
            self.client.submit(self.single_worker.run_wrapper, run_info))

    def get_finished_runs(
            self) -> typing.List[typing.Tuple[RunInfo, RunValue]]:
        """This method returns any finished configuration, and returns a list with
        the results of exercising the configurations. This class keeps populating results
        to self.results until a call to get_finished runs is done. In this case, the
        self.results list is emptied and all RunValues produced by running run() are
        returned.

        Returns
        -------
            List[Tuple[RunInfo, RunValue]]: a list of RunValues (and their respective
                RunInfo), i.e. the results of executing the submitted configurations
        """

        # Proactively see if more configs have finished
        self._extract_completed_runs_from_futures()

        results_list = []
        while self.results:
            results_list.append(self.results.pop())
        return results_list

    def _extract_completed_runs_from_futures(self) -> None:
        """
        A run is over, when a future has done() equal true.
        This function collects the completed futures and move
        them from self.futures to self.results.

        *** We make sure futures never exceed the capacity of
        the scheduler
        """

        # In-code check to make sure we don't exceed the resource allocation
        if len(self.futures) > sum(self.client.nthreads().values()):
            warnings.warn(
                "More running jobs than resources available. "
                "There should not be more futures/runs in remote workers "
                "than the number of workers. This could mean a worker "
                "crashed and was not able to be recovered by dask. ")

        # A future is removed from the list of futures as an indication
        # that a worker is available to take on an extra job
        done_futures = [f for f in self.futures if f.done()]
        for future in done_futures:
            self.results.append(future.result())
            self.futures.remove(future)

    def wait(self) -> None:
        """SMBO/intensifier might need to wait for runs to finish before making a decision.
        This method waits until at least one of the pending runs completes
        """
        if self.futures:
            wait(self.futures, return_when='FIRST_COMPLETED').done

    def pending_runs(self) -> bool:
        """
        Whether or not there are configs still running. Generally, if the runner is serial,
        launching a run instantly returns its result. On parallel runners, there might
        be pending configurations left to complete.
        """
        # If there are futures available, it translates
        # to runs still not finished/processed
        return len(self.futures) > 0

    def run(
        self,
        config: Configuration,
        instance: str,
        cutoff: typing.Optional[float] = None,
        seed: int = 12345,
        budget: typing.Optional[float] = None,
        instance_specific: str = "0",
    ) -> typing.Tuple[StatusType, float, float, typing.Dict]:
        """
        This method only exists to comply with the abstract parent class. In the parallel case,
        we simply call the single worker's run() method.

        Parameters
        ----------
            config : Configuration
                dictionary param -> value
            instance : string
                problem instance
            cutoff : float, optional
                Wallclock time limit of the target algorithm. If no value is
                provided no limit will be enforced.
            seed : int
                random seed
            budget : float, optional
                A positive, real-valued number representing an arbitrary limit to the target
                algorithm. Handled by the target algorithm internally
            instance_specific: str
                instance specific information (e.g., domain file or solution)

        Returns
        -------
            status: enum of StatusType (int)
                {SUCCESS, TIMEOUT, CRASHED, ABORT}
            cost: float
                cost/regret/quality (float) (None, if not returned by TA)
            runtime: float
                runtime (None if not returned by TA)
            additional_info: dict
                all further additional run information
        """
        return self.single_worker.run(
            config=config,
            instance=instance,
            cutoff=cutoff,
            seed=seed,
            budget=budget,
            instance_specific=instance_specific,
        )

    def num_workers(self) -> int:
        """Total number of workers available. This number is dynamic
        as more resources can be allocated"""
        return sum(self.client.nthreads().values())

    def _workers_available(self) -> bool:
        """"Query if there are workers available, which means
        that there are resources to launch a dask job"""
        total_compute_power = sum(self.client.nthreads().values())
        if len(self.futures) < total_compute_power:
            return True
        return False

    def __del__(self) -> None:
        """Make sure that when this object gets deleted, the client is terminated. This is only done if
        the client was created by the dask runner."""
        if self.close_client_at_del:
            self.client.close()
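
Stripped of the SMAC-specific bookkeeping, the local-cluster branch of __init__ boils down to the following sketch; the worker count and output_directory value are illustrative, while the dask calls mirror the ones above:

# Minimal sketch of the cluster setup performed in DaskParallelRunner.__init__
# when no dask_client is passed; n_workers and output_directory are illustrative.
import os

import dask
from dask.distributed import Client

output_directory = "./smac_output"
os.makedirs(output_directory, exist_ok=True)

# Workers must not be daemonic, otherwise a run() wrapped in pynisher could
# not spawn its own subprocesses.
dask.config.set({'distributed.worker.daemon': False})

client = Client(n_workers=4,
                processes=True,
                threads_per_worker=1,
                local_directory=output_directory)

# Persist the scheduler information so externally started workers can join.
scheduler_file = os.path.join(output_directory, '.dask_scheduler_file')
client.write_scheduler_file(scheduler_file=scheduler_file)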
Example #4
def main():

    # client = Client(processes = False) # threads ?
    client = Client()
    size = 10000000
    # size       = 20
    # shards     = 20
    # shards     = 6
    # shards     = 1
    shards = 12
    shape = [size]
    lat = np.random.rand(size) * 180.0 - 90.0
    lon = np.random.rand(size) * 360.0 - 180.0
    resolution_ = 8
    resolution = np.full(shape, resolution_, dtype=np.int64)

    # print('lat shape: ',lat.shape)

    print('')
    serial_start = timer()
    s_sids = ps.from_latlon(lat, lon, resolution_)
    s_sidsstr = [hex16(s_sids[i]) for i in range(len(s_sids))]
    serial_end = timer()
    # print('0 s_sids: ',s_sids)
    print('time s_sids: ', serial_end - serial_start)

    def w_from_latlon(llr):
        # print('')
        # print('llr:  ',llr)
        sids = ps.from_latlon(llr[0], llr[1], int(llr[2][0]))
        # print('sids: ',sids)
        # print('')
        return sids

    # def w_from_latlon1(lat,lon,res):
    #     return ps.from_latlon(np.array([lat],dtype=np.double)\
    #                            ,np.array([lon],dtype=np.double)\
    #                            ,int(res))
    # sid        = ps.from_latlon(lat,lon,resolution)
    # sid        = client.map(w_from_latlon1,lat,lon,resolution) # futures

    dask_start = timer()
    shard_size = int(size / shards)
    shard_bins = np.arange(shards + 1) * shard_size
    shard_bins[-1] = size

    # print('---')
    # print('shards:     ',shards)
    # print('shard_size: ',shard_size)
    # print('shard_bins: ',shard_bins)
    # print('---')
    lat_shards = [lat[shard_bins[i]:shard_bins[i + 1]] for i in range(shards)]
    lon_shards = [lon[shard_bins[i]:shard_bins[i + 1]] for i in range(shards)]
    res_shards = [
        resolution[shard_bins[i]:shard_bins[i + 1]] for i in range(shards)
    ]

    llr_shards = []
    for i in range(shards):
        llr_shards.append([lat_shards[i], lon_shards[i], res_shards[i]])

    # print('llr_shards len: ',len(llr_shards))
    # print('llr_shards: ',llr_shards)

    ## future = client.submit(func, big_data)    # bad
    ##
    ## big_future = client.scatter(big_data)     # good
    ## future = client.submit(func, big_future)  # good

    # sid        = client.map(w_from_latlon,llr_shards) # futures

    big_future = client.scatter(llr_shards)
    sid = client.map(w_from_latlon, big_future)  # futures

    # print('0 sid:  ',sid)
    # print('9 len(sid): ',len(sid))
    # for i in range(shards):
    #     print(i, ' 10 sid: ',sid[i])
    #     print(i, ' 11 sid: ',sid[i].result())

    # print('15 sid:    ',[type(i) for i in sid])

    sid_cat = np.concatenate([i.result() for i in sid])
    sidsstr = [hex16(sid_cat[i]) for i in range(len(sid_cat))]
    dask_end = timer()
    # print('2 sids: ',sids)
    sids = sid_cat

    print('')
    # for i in range(size-20,size):
    for i in np.array(np.random.rand(20) * size, dtype=np.int64):
        print("%09i" % i, sidsstr[i], s_sidsstr[i], ' ', sids[i] - s_sids[i])

    print('')
    print('dask total threads:  ', sum(client.nthreads().values()))
    print('size:                ', size)
    print('shards:              ', shards)
    print('')
    print('time sids:           ', dask_end - dask_start)
    print('time s_sids:         ', serial_end - serial_start)
    print('parallel speed up:   ',
          (serial_end - serial_start) / (dask_end - dask_start))

    client.close()
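
The commented block in the middle of main() carries the key point: scatter the shards once and map over the resulting futures, instead of embedding the large arrays in every task. A minimal sketch of just that scatter-then-map pattern, assuming only numpy and dask.distributed; square() is an illustrative stand-in for the per-shard worker:

# Minimal sketch of the scatter-then-map pattern used above; `square` is an
# illustrative stand-in for the per-shard worker (w_from_latlon).
import numpy as np
from dask.distributed import Client


def square(chunk):
    return chunk ** 2


if __name__ == '__main__':
    client = Client()
    data = np.random.rand(1_000_000)
    shards = np.array_split(data, 12)        # split the work into 12 shards

    shard_futures = client.scatter(shards)   # ship each shard to the cluster once
    result_futures = client.map(square, shard_futures)
    result = np.concatenate(client.gather(result_futures))

    client.close()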
Example #5
    }

    # initialize the random number generator and draw unique seeds for each run
    random_generator = np.random.default_rng(RAND_SEED)
    all_seeds = random_generator.choice(MAX_SEED, TOTAL_RUN, replace=False)
    my_seed_generator = list(runautopilots.chunks(all_seeds, CHUNK_SIZE))
    ndebug = len(DEBUG_NUMS)
    my_seed_generator[0][0:ndebug] = DEBUG_NUMS

    # initialize best list
    best_list = runautopilots.reset_best_simulations(NUM_BEST)
    # if parallel, initialize client
    if not SERIES:
        client = Client()
        ncores = sum(client.ncores().values())
        nthreads = sum(client.nthreads().values())
        runtype = f"Parallel with [Cores:{ncores:.0f}] x [Threads:{nthreads:.0f}]"
    else:
        runtype = "Series"

    # needed for each autopilot
    my_mission = mission.mission(**class_params["mission_params"])

    t_start = time.time()
    if not DEBUG:

        # initialize autopilot list
        ap_template = autopilot.ap_nn(my_mission.survey_lines,
                                      **autopilot_params)

        all_autopilot_list = []
Example #6
# python local.py
def hello_world(n):
    import time
    time.sleep(1)
    return "Hello world " + str(n)


if __name__ == '__main__':
    from dask.distributed import Client, LocalCluster
    import time
    import numpy as np

    cluster = LocalCluster(n_workers=4, threads_per_worker=1)
    client = Client(cluster)

    print(cluster)
    print(client.nthreads())

    start = time.time()
    res = client.map(hello_world, np.arange(36))
    for r in res:
        print(r.result())

    client.close()
    print(time.time() - start)