Example #1
def initialize_dask():
    """Initialize Dask environment."""
    from distributed.client import default_client

    try:
        client = default_client()
    except ValueError:
        from distributed import Client

        # The indentation here is intentional; we want the code in the message to be indented.
        ErrorMessage.not_initialized(
            "Dask",
            """
    from distributed import Client

    client = Client()
""",
        )
        num_cpus = CpuCount.get()
        memory_limit = Memory.get()
        worker_memory_limit = memory_limit // num_cpus if memory_limit else "auto"
        client = Client(n_workers=num_cpus, memory_limit=worker_memory_limit)

    num_cpus = len(client.ncores())
    NPartitions._put(num_cpus)
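
For reference, a minimal standalone sketch of the per-worker memory split performed above. It uses os.cpu_count() and psutil as stand-ins for the CpuCount and Memory helpers, which are not part of this excerpt, so the exact values may differ from what the original helpers report.

import os

import psutil
from distributed import Client

num_cpus = os.cpu_count()
memory_limit = psutil.virtual_memory().total  # total system memory in bytes
# split the memory budget evenly across workers, mirroring the logic above
worker_memory_limit = memory_limit // num_cpus if memory_limit else "auto"

if __name__ == "__main__":
    client = Client(n_workers=num_cpus, memory_limit=worker_memory_limit)
    print(len(client.ncores()), "workers started")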
Example #2
def initialize_dask():
    from distributed.client import get_client

    try:
        client = get_client()
    except ValueError:
        from distributed import Client

        # The indentation here is intentional; we want the code in the message to be indented.
        ErrorMessage.not_initialized(
            "Dask",
            """
    from distributed import Client

    client = Client()
""",
        )
        client = Client(n_workers=CpuCount.get())

    num_cpus = len(client.ncores())
    NPartitions.put_if_default(num_cpus)
Example #3
                ignore_reinit_error=True,
            )
            num_cpus = ray.global_state.cluster_resources()["CPU"]
    except AssertionError:
        pass
elif execution_engine == "Dask":
    from distributed.client import _get_global_client

    if threading.current_thread().name == "MainThread":
        # initialize the dask client
        client = _get_global_client()
        if client is None:
            from distributed import Client

            client = Client()
        num_cpus = sum(client.ncores().values())
elif execution_engine != "Python":
    raise ImportError(
        "Unrecognized execution engine: {}.".format(execution_engine))

DEFAULT_NPARTITIONS = max(4, int(num_cpus))

__all__ = [
    "DataFrame",
    "Series",
    "read_csv",
    "read_parquet",
    "read_json",
    "read_html",
    "read_clipboard",
    "read_excel",
Example #4
class DaskHelper:
    """ Manages Dask client and provides associated helper functions.
    """
    def __init__(self, n_workers=1, client=None):
        if client is not None and isinstance(client, Client):
            self.client = client
            self.n_workers = self._get_n_workers()
        else:
            self.n_workers = n_workers
            self.client = Client(n_workers=self.n_workers,
                                 processes=True,
                                 threads_per_worker=1,
                                 scheduler_port=0)
        self.futures = []
        self.shared_data = None
        self.update_worker_list()  # sets self.worker_list (update_worker_list returns None)
        # adds only this many workers at a time when new workers available
        self._add_worker_batch_lim = 20

    def _get_n_workers(self):
        n_workers = len(self.client.ncores())
        return n_workers

    def update_n_workers(self):
        self.n_workers = self._get_n_workers()

    def _get_worker_list(self):
        worker_list = list(self.client.ncores().keys())
        return worker_list

    def update_worker_list(self):
        self.worker_list = self._get_worker_list()

    def __getstate__(self):
        """ Allows the object to picklable while having Dask client as a class attribute.
        """
        d = dict(self.__dict__)
        d["client"] = None  # hack to allow Dask client to be a class attribute
        return d

    def __del__(self):
        """ Ensures a clean kill of the Dask client and frees up a port.
        """
        if hasattr(self, "client") and isinstance(self, Client):
            self.client.close()

    def submit_job(self, func, x, workers):
        if self.shared_data is not None:
            # self.client.submit(func, x, self.shared_data, workers=workers)
            self.futures.append(
                self.client.submit(func, x, self.shared_data, workers=workers))
        else:
            # self.client.submit(func, x, workers=workers)
            self.futures.append(self.client.submit(func, x, workers=workers))

    def _check_a_worker(self, worker_metrics):
        """ Checks if a worker can take a new job or not

        If a worker is executing (executing > 0), it is certainly not free.
        If a worker is not executing (0) and has no jobs ready (0), it is likely free.
        If a worker has jobs both in memory and ready to be executed, it is not free.
        Otherwise, it is considered free.
        """
        if worker_metrics["executing"]:
            return False
        if not worker_metrics["executing"] and not worker_metrics["ready"]:
            return True
        if worker_metrics["in_memory"] * worker_metrics["ready"]:
            return False
        return True

    def is_worker_available(self) -> Union[List, bool]:
        """ Checks if at least one worker is available to run a job
        """
        if self.n_workers == 1:
            # in the synchronous case, one worker is always available
            return True
        self.update_n_workers()
        # `self.futures` contains only the futures submitted by the main process that
        # instantiated this class, which is adequate for checking worker availability when
        # the workers are available exclusively to this process
        if len(self.futures) >= self.n_workers:
            # pause/wait if the number of active futures is at least the number of allocated workers
            return []  # False
        # Since multiple benchmark processes can share the same pool of workers, retrieve the
        # status of each worker in the current worker_list to obtain a better estimate of the
        # queued jobs
        if hasattr(self.client, "_scheduler_identity") and \
                "workers" in self.client._scheduler_identity:
            workers = self.client._scheduler_identity["workers"]
            self.worker_list = list(workers.keys())
            worker_status = list(
                map(lambda k: self._check_a_worker(workers[k]['metrics']),
                    self.worker_list))
            # collect the addresses of the workers that are currently free
            available = np.array(
                self.worker_list)[np.where(worker_status)[0]].tolist()
        else:
            available = list(self.client.ncores().keys())
        np.random.shuffle(available)
        return available

    def fetch_futures(self, retries=1, wait_time=0.05):
        """ Removes the futures which are done from the list

        Loops over the list a given number of times, waiting for a given time, to check if any or
        more futures have finished. If at any time, all futures have been processed, it breaks out.
        """
        counter = 0
        while counter < retries:
            if self.n_workers > 1:
                self.futures = [
                    future for future in self.futures if not future.done()
                ]
                if len(self.futures) == 0:
                    break
            else:
                # Dask not invoked in the synchronous case (n_workers=1)
                self.futures = []
            time.sleep(wait_time)
            counter += 1
        return None

    def distribute_data_to_workers(self, data):
        """ Shares data across all workers to not serialize and transfer with every job
        """
        if self.worker_list is None:
            self.worker_list = self._get_worker_list()
            self.shared_data = self.client.scatter(data, broadcast=True)
        current_worker_list = list(self.client.ncores().keys())
        if len(set(current_worker_list) - set(self.worker_list)) > 0:
            # redistribute data across workers only when new workers have been added
            self.shared_data = self.client.scatter(data, broadcast=True)
        self.worker_list = self._get_worker_list()

    def is_worker_alive(self):
        return len(self.futures) > 0
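
A hedged usage sketch for the helper above. The toy square job, the worker count, the scattered dictionary, and the final gather call are illustrative assumptions and not part of the original class; the class itself is assumed to be importable together with its own dependencies (numpy, typing, dask.distributed.Client).

def square(x, shared=None):
    # toy job; `shared` receives the scattered data once DaskHelper.shared_data is set
    return x * x


if __name__ == "__main__":
    helper = DaskHelper(n_workers=2)
    helper.distribute_data_to_workers({"offset": 10})
    for x in range(8):
        free_workers = helper.is_worker_available()
        while not free_workers:
            # drop finished futures and re-check availability
            helper.fetch_futures(retries=1, wait_time=0.1)
            free_workers = helper.is_worker_available()
        helper.submit_job(square, x, workers=free_workers)
    # collect results before fetch_futures() discards the finished futures
    results = helper.client.gather(helper.futures)
    print(results)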
Example #5
class DEHB(DEHBBase):
    def __init__(self,
                 cs=None,
                 f=None,
                 dimensions=None,
                 mutation_factor=0.5,
                 crossover_prob=0.5,
                 strategy='rand1_bin',
                 min_budget=None,
                 save_smac=False,
                 max_budget=None,
                 eta=3,
                 min_clip=None,
                 max_clip=None,
                 configspace=True,
                 boundary_fix_type='random',
                 max_age=np.inf,
                 n_workers=None,
                 client=None,
                 **kwargs):
        super().__init__(cs=cs,
                         f=f,
                         dimensions=dimensions,
                         mutation_factor=mutation_factor,
                         crossover_prob=crossover_prob,
                         strategy=strategy,
                         min_budget=min_budget,
                         max_budget=max_budget,
                         eta=eta,
                         min_clip=min_clip,
                         max_clip=max_clip,
                         configspace=configspace,
                         boundary_fix_type=boundary_fix_type,
                         max_age=max_age,
                         **kwargs)
        self.iteration_counter = -1
        self.de = {}
        self._max_pop_size = None
        self.active_brackets = []  # list of SHBracketManager objects
        self.traj = []
        self.smac_trajectory = []
        self.runtime = []
        self.history = []
        self.smac_history = {"data": [], "config_origins": {}, "configs": {}}
        self.start = None
        self.save_smac = save_smac

        # Dask variables
        if n_workers is None and client is None:
            raise ValueError(
                "Need to specify either 'n_workers'(>0) or 'client' (a Dask client)!"
            )
        if client is not None and isinstance(client, Client):
            self.client = client
            self.n_workers = len(client.ncores())
        else:
            self.n_workers = n_workers
            if self.n_workers > 1:
                self.client = Client(
                    n_workers=self.n_workers,
                    processes=True,
                    threads_per_worker=1,
                    scheduler_port=0
                )  # port 0 makes Dask select a random free port
            else:
                self.client = None
        self.futures = []
        self.shared_data = None

        # Initializing DE subpopulations
        self._get_pop_sizes()
        self._init_subpop()

        # Misc.
        self.available_gpus = None
        self.gpu_usage = None
        self.single_node_with_gpus = None

    def __getstate__(self):
        """ Allows the object to picklable while having Dask client as a class attribute.
        """
        d = dict(self.__dict__)
        d["client"] = None  # hack to allow Dask client to be a class attribute
        d["logger"] = None  # hack to allow logger object to be a class attribute
        return d

    def __del__(self):
        """ Ensures a clean kill of the Dask client and frees up a port.
        """
        if hasattr(self, "client") and isinstance(self, Client):
            self.client.close()

    def _f_objective(self, job_info):
        """ Wrapper to call DE's objective function.
        """
        # check if the job_info added during job submission (self.submit_job()) includes "gpu_devices"
        if "gpu_devices" in job_info and self.single_node_with_gpus:
            # should set the environment variable for the spawned worker process
            # reprioritising a CUDA device order specific to this worker process
            os.environ.update(
                {"CUDA_VISIBLE_DEVICES": job_info["gpu_devices"]})

        config, budget, parent_id = job_info['config'], job_info[
            'budget'], job_info['parent_id']
        bracket_id = job_info['bracket_id']
        kwargs = job_info["kwargs"]
        res = self.de[budget].f_objective(config, budget, **kwargs)
        info = res["info"] if "info" in res else dict()
        run_info = {
            'fitness': res["fitness"],
            'cost': res["cost"],
            'config': config,
            'budget': budget,
            'parent_id': parent_id,
            'bracket_id': bracket_id,
            'info': info
        }

        if "gpu_devices" in job_info:
            # important for GPU usage tracking if single_node_with_gpus=True
            device_id = int(job_info["gpu_devices"].strip().split(",")[0])
            run_info.update({"device_id": device_id})
        return run_info

    def _create_cuda_visible_devices(self, available_gpus: List[int],
                                     start_id: int) -> str:
        """ Generates a string to set the CUDA_VISIBLE_DEVICES environment variable.

        Given a list of available GPU device IDs and a preferred ID (start_id), the environment
        variable is created by putting the start_id device first, followed by the remaining devices
        arranged randomly. The worker that uses this string to set the environment variable will
        then primarily use the start_id GPU device.
        """
        assert start_id in available_gpus
        available_gpus = deepcopy(available_gpus)
        available_gpus.remove(start_id)
        np.random.shuffle(available_gpus)
        final_variable = [str(start_id)] + [str(_id) for _id in available_gpus]
        final_variable = ",".join(final_variable)
        return final_variable
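
        # Illustrative example (hypothetical values): with available_gpus=[0, 1, 2, 3] and
        # start_id=2, this returns "2" followed by the remaining ids in a random order,
        # e.g. "2,0,3,1", so the worker that applies it prefers GPU 2.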

    def distribute_gpus(self):
        """ Function to create a GPU usage tracker dict.

        The idea is to extract the exact GPU device IDs available. During job submission, each
        submitted job is given a preference of a GPU device ID based on the GPU device with the
        least number of active running jobs. On retrieval of the result, this gpu usage dict is
        updated for the device ID that the finished job was mapped to.
        """
        try:
            available_gpus = os.environ["CUDA_VISIBLE_DEVICES"]
            available_gpus = available_gpus.strip().split(",")
            self.available_gpus = [int(_id) for _id in available_gpus]
        except KeyError as e:
            print("Unable to find valid GPU devices. "
                  "Environment variable {} not visible!".format(str(e)))
            self.available_gpus = []
        self.gpu_usage = dict()
        for _id in self.available_gpus:
            self.gpu_usage[_id] = 0

    def vector_to_configspace(self, config):
        assert hasattr(self, "de")
        assert len(self.budgets) > 0
        return self.de[self.budgets[0]].vector_to_configspace(config)

    def configspace_to_vector(self, config):
        assert hasattr(self, "de")
        assert len(self.budgets) > 0
        return self.de[self.budgets[0]].configspace_to_vector(config)

    def reset(self):
        super().reset()
        if self.n_workers > 1 and hasattr(self, "client") and isinstance(
                self.client, Client):
            self.client.restart()
        else:
            self.client = None
        self.futures = []
        self.shared_data = None
        self.iteration_counter = -1
        self.de = {}
        self._max_pop_size = None
        self.start = None
        self.active_brackets = []
        self.traj = []
        self.smac_trajectory = []
        self.runtime = []
        self.history = []
        self.smac_history = {"data": [], "config_origins": {}, "configs": {}}
        self._get_pop_sizes()
        self._init_subpop()
        self.available_gpus = None
        self.gpu_usage = None

    def init_population(self, pop_size):
        if self.configspace:
            population = self.cs.sample_configuration(size=pop_size)
            population = [
                self.configspace_to_vector(individual)
                for individual in population
            ]
        else:
            population = np.random.uniform(low=0.0,
                                           high=1.0,
                                           size=(pop_size, self.dimensions))
        return population

    def clean_inactive_brackets(self):
        """ Removes brackets from the active list if it is done as communicated by Bracket Manager
        """
        if len(self.active_brackets) == 0:
            return
        self.active_brackets = [
            bracket for bracket in self.active_brackets
            if not bracket.is_bracket_done()
        ]
        return

    # Also generates a SMAC-like history
    def _update_trackers(self, traj, runtime, history):
        if len(self.traj) == self.inc_run:
            incumbent = ConfigSpace.Configuration(
                self.cs, vector=traj[1]).get_dictionary()
            for key in list(incumbent.keys()):
                if key != "agent":
                    incumbent[key[2:]] = incumbent.pop(key)
            traj_obj = {
                "cpu_time": time.time() - self.start,
                "wall_clocktime": time.time() - self.start,
                "evaluations": len(self.traj),
                "cost": traj[0],
                "incumbent": incumbent,
                "budget": 0,
                "origin": "DEHB"
            }
            self.smac_trajectory.append(traj_obj)
        self.smac_history["data"].append((
            [
                len(self.traj) + 1,  # config id
                None,  # instance id
                0,  # seed...
                history[-2],  # budget
            ],
            [
                history[1],  # cost / fitness
                history[2],  # time / cost
                {  # status
                    "__enum__": "StatusType.SUCCESS"
                },
                time.time() - history[2],  # start time
                time.time(),  # end time,
                history[-1]  # info
            ]))
        self.smac_history["config_origins"][str(len(self.traj) + 1)] = "DEHB"
        config = ConfigSpace.Configuration(self.cs,
                                           vector=history[0]).get_dictionary()
        for key in list(config.keys()):
            if key != "agent":
                config[key[2:]] = config.pop(key)
        self.smac_history["configs"][str(len(self.traj) + 1)] = config

        self.traj.append(traj[0])
        self.runtime.append(runtime)
        self.history.append(history)

    def _update_incumbents(self, config, score, run, info):
        self.inc_config = config
        self.inc_score = score
        self.inc_run = run
        self.inc_info = info

    def _get_pop_sizes(self):
        """Determines maximum pop size for each budget
        """
        self._max_pop_size = {}
        for i in range(self.max_SH_iter):
            n, r = self.get_next_iteration(i)
            for j, r_j in enumerate(r):
                self._max_pop_size[r_j] = max(
                    n[j], self._max_pop_size[r_j]
                ) if r_j in self._max_pop_size.keys() else n[j]

    def _init_subpop(self):
        """ List of DE objects corresponding to the budgets (fidelities)
        """
        self.de = {}
        for i, b in enumerate(self._max_pop_size.keys()):
            self.de[b] = AsyncDE(**self.de_params,
                                 budget=b,
                                 pop_size=self._max_pop_size[b])
            self.de[b].population = self.de[b].init_population(
                pop_size=self._max_pop_size[b])
            self.de[b].fitness = np.array([np.inf] * self._max_pop_size[b])
            # adding attributes to DEHB objects to allow communication across subpopulations
            self.de[b].parent_counter = 0
            self.de[b].promotion_pop = None
            self.de[b].promotion_fitness = None

    def _concat_pops(self, exclude_budget=None):
        """ Concatenates all subpopulations
        """
        budgets = list(self.budgets)
        if exclude_budget is not None:
            budgets.remove(exclude_budget)
        pop = []
        for budget in budgets:
            pop.extend(self.de[budget].population.tolist())
        return np.array(pop)

    def _start_new_bracket(self):
        """ Starts a new bracket based on Hyperband
        """
        # start new bracket
        self.iteration_counter += 1  # iteration counter gives the bracket count or bracket ID
        n_configs, budgets = self.get_next_iteration(self.iteration_counter)
        bracket = SHBracketManager(n_configs=n_configs,
                                   budgets=budgets,
                                   bracket_id=self.iteration_counter)
        self.active_brackets.append(bracket)
        return bracket

    def _get_worker_count(self):
        if isinstance(self.client, Client):
            return len(self.client.ncores())
        else:
            return 1

    def is_worker_available(self, verbose=False):
        """ Checks if at least one worker is available to run a job
        """
        if self.n_workers == 1 or self.client is None or not isinstance(
                self.client, Client):
            # in the synchronous case, one worker is always available
            return True
        # checks the absolute number of workers mapped to the client scheduler
        # client.ncores() should return a dict with the keys as unique addresses to these workers
        # treating the number of available workers in this manner
        workers = self._get_worker_count()  # len(self.client.ncores())
        if len(self.futures) >= workers:
            # pause/wait if the number of active futures is at least the number of workers
            return False
        return True

    def _get_promotion_candidate(self, low_budget, high_budget, n_configs):
        """ Manages the population to be promoted from the lower to the higher budget.

        This is in effect only during the first full Hyperband bracket, i.e., while the number of
        brackets is <= max_SH_iter.
        """
        # finding the individuals that have been evaluated (fitness < np.inf)
        evaluated_configs = np.where(self.de[low_budget].fitness != np.inf)[0]
        promotion_candidate_pop = self.de[low_budget].population[
            evaluated_configs]
        promotion_candidate_fitness = self.de[low_budget].fitness[
            evaluated_configs]
        # ordering the evaluated individuals based on their fitness values
        pop_idx = np.argsort(promotion_candidate_fitness)

        # creating population for promotion if none promoted yet or nothing to promote
        if self.de[high_budget].promotion_pop is None or \
                len(self.de[high_budget].promotion_pop) == 0:
            self.de[high_budget].promotion_pop = np.empty((0, self.dimensions))
            self.de[high_budget].promotion_fitness = np.array([])

            # iterating over the evaluated individuals from the lower budget and including them
            # in the promotion population for the higher budget only if it's not in the population
            # this is done to ensure diversity of population and avoid redundant evaluations
            for idx in pop_idx:
                individual = promotion_candidate_pop[idx]
                # checks if the candidate individual already exists in the high budget population
                if np.any(
                        np.all(individual == self.de[high_budget].population,
                               axis=1)):
                    # skipping already present individual to allow diversity and reduce redundancy
                    continue
                self.de[high_budget].promotion_pop = np.append(
                    self.de[high_budget].promotion_pop, [individual], axis=0)
                self.de[high_budget].promotion_fitness = np.append(
                    self.de[high_budget].promotion_fitness,
                    promotion_candidate_fitness[idx])
            # retaining only n_configs
            self.de[high_budget].promotion_pop = self.de[
                high_budget].promotion_pop[:n_configs]
            self.de[high_budget].promotion_fitness = \
                self.de[high_budget].promotion_fitness[:n_configs]

        if len(self.de[high_budget].promotion_pop) > 0:
            config = self.de[high_budget].promotion_pop[0]
            # removing selected configuration from population
            self.de[high_budget].promotion_pop = self.de[
                high_budget].promotion_pop[1:]
            self.de[high_budget].promotion_fitness = self.de[
                high_budget].promotion_fitness[1:]
        else:
            # in the edge case where all high-budget individuals are identical,
            # just choose the best performing individual from the lower budget (again)
            config = self.de[low_budget].population[pop_idx[0]]
        return config

    def _get_next_parent_for_subpop(self, budget):
        """ Maintains a looping counter over a subpopulation, to iteratively select a parent
        """
        parent_id = self.de[budget].parent_counter
        self.de[budget].parent_counter += 1
        self.de[budget].parent_counter = self.de[
            budget].parent_counter % self._max_pop_size[budget]
        return parent_id

    def _acquire_config(self, bracket, budget):
        """ Generates/chooses a configuration based on the budget and iteration number
        """
        # select a parent/target
        parent_id = self._get_next_parent_for_subpop(budget)
        target = self.de[budget].population[parent_id]
        # identify lower budget/fidelity to transfer information from
        lower_budget, num_configs = bracket.get_lower_budget_promotions(budget)

        if self.iteration_counter < self.max_SH_iter:
            # promotions occur only in the first set of SH brackets under Hyperband
            # for the first rung/budget in the current bracket, no promotion is possible and
            # evolution can begin straight away
            # for the subsequent rungs, individuals will be promoted from the lower_budget
            if budget != bracket.budgets[0]:
                # TODO: check if generalizes to all budget spacings
                config = self._get_promotion_candidate(lower_budget, budget,
                                                       num_configs)
                return config, parent_id

        # DE evolution occurs when all individuals in the subpopulation have been evaluated
        # at least once, i.e., have fitness < np.inf; this can happen while
        # iteration_counter <= max_SH_iter and certainly happens when iteration_counter > max_SH_iter

        # a single DE evolution --- (mutation + crossover) occurs here
        mutation_pop_idx = np.argsort(
            self.de[lower_budget].fitness)[:num_configs]
        mutation_pop = self.de[lower_budget].population[mutation_pop_idx]
        # generate mutants from previous budget subpopulation or global population
        if len(mutation_pop) < self.de[budget]._min_pop_size:
            filler = self.de[budget]._min_pop_size - len(mutation_pop) + 1
            new_pop = self.de[budget]._init_mutant_population(
                pop_size=filler,
                population=self._concat_pops(),
                target=target,
                best=self.inc_config)
            mutation_pop = np.concatenate((mutation_pop, new_pop))
        # generate mutant from among individuals in mutation_pop
        mutant = self.de[budget].mutation(current=target,
                                          best=self.inc_config,
                                          alt_pop=mutation_pop)
        # perform crossover with selected parent
        config = self.de[budget].crossover(target=target, mutant=mutant)
        config = self.de[budget].boundary_check(config)
        return config, parent_id

    def _get_next_job(self):
        """ Loads a configuration and budget to be evaluated next by a free worker
        """
        bracket = None
        if len(self.active_brackets) == 0 or \
                np.all([bracket.is_bracket_done() for bracket in self.active_brackets]):
            # start new bracket when no pending jobs from existing brackets or empty bracket list
            bracket = self._start_new_bracket()
        else:
            for _bracket in self.active_brackets:
                # check if _bracket is not waiting for previous rung results of same bracket
                # _bracket is not waiting on the last rung results
                # these 2 checks allow DEHB to have a "synchronous" Successive Halving
                if not _bracket.previous_rung_waits() and _bracket.is_pending(
                ):
                    # bracket eligible for job scheduling
                    bracket = _bracket
                    break
            if bracket is None:
                # start new bracket when existing list has all waiting brackets
                bracket = self._start_new_bracket()
        # budget that the SH bracket allots
        budget = bracket.get_next_job_budget()
        config, parent_id = self._acquire_config(bracket, budget)
        # notifies the Bracket Manager that a single config is to run for the budget chosen
        job_info = {
            "config": config,
            "budget": budget,
            "parent_id": parent_id,
            "bracket_id": bracket.bracket_id
        }
        return job_info

    def _get_gpu_id_with_low_load(self):
        candidates = []
        for k, v in self.gpu_usage.items():
            if v == min(self.gpu_usage.values()):
                candidates.append(k)
        device_id = np.random.choice(candidates)
        # creating string for setting environment variable CUDA_VISIBLE_DEVICES
        gpu_ids = self._create_cuda_visible_devices(self.available_gpus,
                                                    device_id)
        # updating GPU usage
        self.gpu_usage[device_id] += 1
        self.logger.debug("GPU device selected: {}".format(device_id))
        self.logger.debug("GPU device usage: {}".format(self.gpu_usage))
        return gpu_ids

    def submit_job(self, job_info, **kwargs):
        """ Asks a free worker to run the objective function on config and budget
        """
        job_info["kwargs"] = self.shared_data if self.shared_data is not None else kwargs
        # submit to the Dask client
        if self.n_workers > 1 or isinstance(self.client, Client):
            if self.single_node_with_gpus:
                # managing GPU allocation for the job to be submitted
                job_info.update(
                    {"gpu_devices": self._get_gpu_id_with_low_load()})
            self.futures.append(self.client.submit(self._f_objective,
                                                   job_info))
        else:
            # skipping scheduling to Dask worker to avoid added overheads in the synchronous case
            self.futures.append(self._f_objective(job_info))

        # pass information of job submission to Bracket Manager
        for bracket in self.active_brackets:
            if bracket.bracket_id == job_info['bracket_id']:
                # registering is IMPORTANT for Bracket Manager to perform SH
                bracket.register_job(job_info['budget'])
                break

    def _fetch_results_from_workers(self):
        """ Iterate over futures and collect results from finished workers
        """
        if self.n_workers > 1 or isinstance(self.client, Client):
            done_list = [(i, future) for i, future in enumerate(self.futures)
                         if future.done()]
        else:
            # Dask not invoked in the synchronous case
            done_list = [(i, future) for i, future in enumerate(self.futures)]
        if len(done_list) > 0:
            self.logger.debug("Collecting {} of the {} job(s) active.".format(
                len(done_list), len(self.futures)))
        for _, future in done_list:
            if self.n_workers > 1 or isinstance(self.client, Client):
                run_info = future.result()
                if "device_id" in run_info:
                    # updating GPU usage
                    self.gpu_usage[run_info["device_id"]] -= 1
                    self.logger.debug("GPU device released: {}".format(
                        run_info["device_id"]))
                future.release()
            else:
                # Dask not invoked in the synchronous case
                run_info = future
            # update bracket information
            fitness, cost = run_info["fitness"], run_info["cost"]
            info = run_info["info"] if "info" in run_info else dict()
            budget, parent_id = run_info["budget"], run_info["parent_id"]
            config = run_info["config"]
            bracket_id = run_info["bracket_id"]
            for bracket in self.active_brackets:
                if bracket.bracket_id == bracket_id:
                    # bracket job complete
                    bracket.complete_job(
                        budget)  # IMPORTANT to perform synchronous SH

            # carry out DE selection
            if fitness <= self.de[budget].fitness[parent_id]:
                self.de[budget].population[parent_id] = config
                self.de[budget].fitness[parent_id] = fitness
            # updating incumbents
            if self.de[budget].fitness[parent_id] < self.inc_score:
                self._update_incumbents(
                    config=self.de[budget].population[parent_id],
                    score=self.de[budget].fitness[parent_id],
                    run=len(self.traj),
                    info=info)
            # book-keeping
            self._update_trackers(traj=(self.inc_score, self.inc_config),
                                  runtime=cost,
                                  history=(config, float(fitness), float(cost),
                                           float(budget), info))
        # remove processed future
        self.futures = np.delete(self.futures,
                                 [i for i, _ in done_list]).tolist()

    def _is_run_budget_exhausted(self,
                                 fevals=None,
                                 brackets=None,
                                 total_cost=None):
        """ Checks if the DEHB run should be terminated or continued
        """
        delimiters = [fevals, brackets, total_cost]
        delim_sum = sum(x is not None for x in delimiters)
        if delim_sum == 0:
            raise ValueError(
                "Need one of 'fevals', 'brackets' or 'total_cost' as budget for DEHB to run."
            )
        if fevals is not None:
            if len(self.traj) >= fevals:
                return True
        elif brackets is not None:
            if self.iteration_counter >= brackets:
                for bracket in self.active_brackets:
                    # waits for all brackets < iteration_counter to finish by collecting results
                    if bracket.bracket_id < self.iteration_counter and \
                            not bracket.is_bracket_done():
                        return False
                return True
        else:
            if time.time() - self.start >= total_cost:
                return True
            if len(self.runtime
                   ) > 0 and self.runtime[-1] - self.start >= total_cost:
                return True
        return False

    def _save_incumbent(self, name=None):
        if name is None:
            name = time.strftime("%x %X %Z", time.localtime(self.start))
            name = name.replace("/", '-').replace(":", '-').replace(" ", '_')
        try:
            res = dict()
            if self.configspace:
                config = self.vector_to_configspace(self.inc_config)
                res["config"] = config.get_dictionary()
            else:
                res["config"] = self.inc_config.tolist()
            res["score"] = self.inc_score
            res["info"] = self.inc_info
            with open(
                    os.path.join(self.output_path,
                                 "incumbent_{}.json".format(name)), 'w') as f:
                json.dump(res, f)
        except Exception as e:
            self.logger.warning("Incumbent not saved: {}".format(repr(e)))

    def _save_history(self, name=None):
        if name is None:
            name = time.strftime("%x %X %Z", time.localtime(self.start))
            name = name.replace("/", '-').replace(":", '-').replace(" ", '_')
        try:
            with open(
                    os.path.join(self.output_path,
                                 "history_{}.pkl".format(name)), 'wb') as f:
                pickle.dump(self.history, f)
            if self.save_smac:
                with open(
                        os.path.join(self.output_path,
                                     "runhistory_{}.json".format(name)),
                        'w') as f:
                    json.dump(self.smac_history, f, indent=2)
                with open(
                        os.path.join(self.output_path,
                                     "traj_{}.json".format(name)), 'w') as f:
                    for traj in self.smac_trajectory:
                        json.dump(traj, f)
                        f.write("\n")
        except Exception as e:
            self.logger.warning("History not saved: {}".format(repr(e)))

    def _verbosity_debug(self):
        for bracket in self.active_brackets:
            self.logger.debug("Bracket ID {}:\n{}".format(
                bracket.bracket_id, str(bracket)))

    def _verbosity_runtime(self, fevals, brackets, total_cost):
        if fevals is not None:
            remaining = (len(self.traj), fevals, "function evaluation(s) done")
        elif brackets is not None:
            _suffix = "bracket(s) started; # active brackets: {}".format(
                len(self.active_brackets))
            remaining = (self.iteration_counter + 1, brackets, _suffix)
        else:
            elapsed = np.format_float_positional(time.time() - self.start,
                                                 precision=2)
            remaining = (elapsed, total_cost, "seconds elapsed")
        self.logger.info("{}/{} {}".format(remaining[0], remaining[1],
                                           remaining[2]))

    @logger.catch
    def run(self,
            fevals=None,
            brackets=None,
            total_cost=None,
            single_node_with_gpus=False,
            verbose=False,
            debug=False,
            save_intermediate=True,
            save_history=True,
            **kwargs):
        """ Main interface to run optimization by DEHB

        This function waits on the workers; when a worker is free, it asks for a configuration
        and a budget to evaluate and submits it to that worker. In each loop iteration, it checks
        whether any job has completed, fetches its result, and carries out the necessary
        processing asynchronously with respect to the worker computations.

        The duration of the DEHB run can be controlled by specifying one of 3 parameters. If more
        than one is specified, DEHB selects only one in the following priority order (high to low):
        1) Number of function evaluations (fevals)
        2) Number of Successive Halving brackets run under Hyperband (brackets)
        3) Total computational cost (in seconds) aggregated by all function evaluations (total_cost)
        """
        # checks if a Dask client exists
        if len(kwargs) > 0 and self.n_workers > 1 and isinstance(
                self.client, Client):
            # broadcasts all additional data passed as **kwargs to all client workers
            # this reduces overload in the client-worker communication by not having to
            # serialize the redundant data used by all workers for every job
            self.shared_data = self.client.scatter(kwargs, broadcast=True)

        # allows each worker to be mapped to a different GPU when running on a single node
        # where all available GPUs are accessible
        self.single_node_with_gpus = single_node_with_gpus
        if self.single_node_with_gpus:
            self.distribute_gpus()

        self.start = time.time()
        if verbose:
            print("\nLogging at {} for optimization starting at {}\n".format(
                os.path.join(os.getcwd(), self.log_filename),
                time.strftime("%x %X %Z", time.localtime(self.start))))
        if debug:
            logger.configure(handlers=[{"sink": sys.stdout}])
        while True:
            if self._is_run_budget_exhausted(fevals, brackets, total_cost):
                break
            if self.is_worker_available():
                job_info = self._get_next_job()
                if brackets is not None and job_info['bracket_id'] >= brackets:
                    # ignore submission and only collect results
                    # when brackets are chosen as run budget, an extra bracket is created
                    # since iteration_counter is incremented in _get_next_job() and then checked
                    # in _is_run_budget_exhausted(), therefore, need to skip suggestions
                    # coming from the extra allocated bracket
                    # _is_run_budget_exhausted() will not return True until all the lower brackets
                    # have finished computation and returned their results
                    pass
                else:
                    if self.n_workers > 1 or isinstance(self.client, Client):
                        self.logger.debug("{}/{} worker(s) available.".format(
                            self._get_worker_count() - len(self.futures),
                            self._get_worker_count()))
                    # submits job_info to a worker for execution
                    self.submit_job(job_info, **kwargs)
                    if verbose:
                        budget = job_info['budget']
                        self._verbosity_runtime(fevals, brackets, total_cost)
                        self.logger.info(
                            "Evaluating a configuration with budget {} under "
                            "bracket ID {}".format(budget,
                                                   job_info['bracket_id']))
                        self.logger.info(
                            "Best score seen/Incumbent score: {}".format(
                                self.inc_score))
                    self._verbosity_debug()
            self._fetch_results_from_workers()
            if save_intermediate and self.inc_config is not None:
                self._save_incumbent()
            if save_history and self.history is not None:
                self._save_history()
            self.clean_inactive_brackets()
        # end of while

        if verbose and len(self.futures) > 0:
            self.logger.info(
                "DEHB optimisation over! Waiting to collect results from workers running..."
            )
        while len(self.futures) > 0:
            self._fetch_results_from_workers()
            if save_intermediate and self.inc_config is not None:
                self._save_incumbent()
            if save_history and self.history is not None:
                self._save_history()
            time.sleep(0.05)  # waiting 50ms

        if verbose:
            time_taken = time.time() - self.start
            self.logger.info(
                "End of optimisation! Total duration: {}; Total fevals: {}\n".
                format(time_taken, len(self.traj)))
            self.logger.info("Incumbent score: {}".format(self.inc_score))
            self.logger.info("Incumbent config: ")
            if self.configspace:
                config = self.vector_to_configspace(self.inc_config)
                for k, v in config.get_dictionary().items():
                    self.logger.info("{}: {}".format(k, v))
            else:
                self.logger.info("{}".format(self.inc_config))
        self._save_incumbent()
        self._save_history()
        return np.array(self.traj), np.array(self.runtime), np.array(
            self.history, dtype=object)
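
A hedged usage sketch for the optimizer above. The toy objective, the single hyperparameter, and the budget values are illustrative assumptions; the sketch also assumes the defaults provided by the surrounding package (the DEHBBase attributes such as logger and output_path, and the module-level imports used in the class).

import ConfigSpace


def toy_objective(config, budget=None, **kwargs):
    # the objective must return a dict with at least "fitness" (minimized) and "cost"
    x = config["x0"]
    return {"fitness": (x - 0.5) ** 2, "cost": float(budget or 1)}


if __name__ == "__main__":
    cs = ConfigSpace.ConfigurationSpace()
    cs.add_hyperparameter(
        ConfigSpace.UniformFloatHyperparameter("x0", lower=0.0, upper=1.0))
    dehb = DEHB(cs=cs, f=toy_objective, dimensions=1, min_budget=1, max_budget=9,
                eta=3, n_workers=1)
    # run synchronously (n_workers=1) for 20 function evaluations
    traj, runtime, history = dehb.run(fevals=20, verbose=False)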
Example #6
START_TIMEOUT = 1800  # 30 min
SILENCE_TIMEOUT = 900  # 15 min
MAX_COLLECT_TIME = 86400  # 1 day


def processing_task_list(cli):
    return list(itertools.chain.from_iterable(cli.processing().values()))


cli = Client('127.0.0.1:8786')

print("Waiting for tasks to start running")

timeout = time.time() + START_TIMEOUT

while not cli.ncores():
    time.sleep(5)
    if time.time() > timeout:
        raise Exception("workers never started")

print("First worker connected. Starting data collection.")

start_time = time.time()
end_time = time.time() + MAX_COLLECT_TIME

with open('graph.csv', 'w', newline='') as outfile:  # text mode for csv.writer (Python 3)
    writer = csv.writer(outfile)

    writer.writerow(['Time', 'Cores', 'Scheduled Tasks', 'Completed Tasks'])

    silence_end_time = time.time() + SILENCE_TIMEOUT
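
    # Hedged sketch of the collection loop that the truncated script presumably continues with.
    # It reuses the helpers defined above; the "Scheduled Tasks" column is filled with the number
    # of currently processing tasks and "Completed Tasks" with the cumulative drop in that number.
    # Both are assumptions, not the original author's metrics.
    last_processing = 0
    completed_total = 0
    while time.time() < end_time:
        cores = sum(cli.ncores().values())
        processing = len(processing_task_list(cli))
        if processing:
            silence_end_time = time.time() + SILENCE_TIMEOUT
        elif time.time() > silence_end_time:
            break  # nothing has been processing for SILENCE_TIMEOUT seconds
        completed_total += max(0, last_processing - processing)
        last_processing = processing
        writer.writerow([time.time() - start_time, cores, processing, completed_total])
        time.sleep(5)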
Example #7
def preprocessing_script():
    """
    This script will process all the hybridization folders combined in a 
    processing folder. The input parameters are passed using arparse

    Parameters:
    -----------
    
    scheduler: string
        tcp address of the dask.distributed scheduler (ex. tcp://192.168.0.4:7003). 
        default = False. If False the process will run on the local computer using nCPUs-1

    path: string
        Path to the processing directory


    """


    # Inputs of the function
    parser = argparse.ArgumentParser(description='Preprocessing script')
    parser.add_argument('-scheduler', default=False, help='dask scheduler address ex. tcp://192.168.0.4:7003')
    parser.add_argument('-path', help='processing directory')
    args = parser.parse_args()
    
    # Directory to process
    processing_directory = args.path
    # Dask scheduler address
    scheduler_address = args.scheduler
    
    if scheduler_address:
        # Start dask client on server or cluster
        client=Client(scheduler_address)

    else:
        # Start dask client on the local machine. It will use all the
        # available cores - 1

        # number of cores to use
        ncores = multiprocessing.cpu_count() - 1
        cluster = LocalCluster(n_workers=ncores)
        client=Client(cluster)

    # Subdirectories of the processing_directory that need to be skipped for the
    # analysis
    blocked_directories = ['_logs']

    # Starting logger
    utils.init_file_logger(processing_directory)
    logger = logging.getLogger()

    # Determine the operating system running the code
    os_windows, add_slash = utils.determine_os()

    # Check the trailing slash in the processing directory
    processing_directory = utils.check_trailing_slash(processing_directory, os_windows)

    # Get a list of the hybridization to process
    processing_hyb_list = next(os.walk(processing_directory))[1]

    # Remove the blocked directories from the directories to process
    processing_hyb_list = [el for el in processing_hyb_list if el not in blocked_directories ]

    for processing_hyb in processing_hyb_list:
    
        # Determine the hyb number from the name
        hybridization_number = processing_hyb.split('_hyb')[-1]
        hybridization = 'Hybridization' + hybridization_number
        hyb_dir = processing_directory + processing_hyb + add_slash
        
        # Parse the Experimental metadata file (serial)
        experiment_infos,image_properties, hybridizations_infos, \
        converted_positions, microscope_parameters =\
        utils.experimental_metadata_parser(hyb_dir)
        
        # Parse the configuration file 
        flt_rawcnt_config = utils.filtering_raw_counting_config_parser(hyb_dir)
        
        
        # ----------------- .nd2 FILE CONVERSION ------------------------------

        # Create the temporary subdirectory tree (serial)
        tmp_dir_path, tmp_gene_dirs=utils.create_subdirectory_tree(hyb_dir,\
                    hybridization,hybridizations_infos,processing_hyb,suffix='tmp',add_slash=add_slash)

        # Get the list of the nd2 files to process inside the directory
        files_list = glob.glob(hyb_dir+processing_hyb+'_raw_data'+add_slash+'*.nd2')

        # Get the list of genes that are analyzed in the current hybridization
        gene_list = list(hybridizations_infos[hybridization].keys())

        # Organize the files to process in a list whose order matches gene_list, for
        # parallel processing
        organized_files_list = [f for gene in gene_list for f in files_list if gene+'.nd2' in f  ]
        organized_tmp_dir_list = [f for gene in gene_list for f in tmp_gene_dirs if gene in f  ]

        # Each .nd2 file will be processed by a worker on a different node
        # Get the address of one process per node to use for the conversion
        node_addresses = utils.identify_nodes(client)
        workers_conversion = [list(el.items())[0][1] for key,el in node_addresses.items()]

        # Run the conversion
        futures_processes=client.map(io.nd2_to_npy,gene_list,organized_files_list,
                                    organized_tmp_dir_list,processing_hyb=processing_hyb,
                                    use_ram=flt_rawcnt_config['use_ram'],
                                    max_ram=flt_rawcnt_config['max_ram'],
                                    workers=workers_conversion)
        client.gather(futures_processes)

        

        # ---------------------------------------------------------------------
        
        
        # ----------------- FILTERING AND RAW COUNTING ------------------------
        
        # Create directories 

        # Create the directory where to save the filtered images
        suffix = 'filtered_png'
        filtered_png_img_dir_path, filtered_png_img_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,
                            processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

        suffix = 'filtered_npy'
        filtered_img_dir_path, filtered_img_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,
                            processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

        # Create the directory where to save the counting
        suffix = 'counting'
        counting_dir_path, counting_gene_dirs = \
            utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb,
                            suffix,add_slash,flt_rawcnt_config['skip_tags_counting'],
                            flt_rawcnt_config['skip_genes_counting'],
                            analysis_name=flt_rawcnt_config['analysis_name'])


        if flt_rawcnt_config['illumination_correction']:

            # Create the directory where to save the illumination functions
            suffix = 'illumination_funcs'
            illumination_func_dir_path, illumination_func_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb,
                                                suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

            # Loop through channels and calculate illumination
            for gene in hybridizations_infos[hybridization].keys():
                
                flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy')

                logger.debug('Create average image for gene %s', gene)

                # Chunking the image list
                num_chunks = sum(list(client.ncores().values()))
                chunked_list = utils.list_chunking(flist_img_to_filter,num_chunks)

                # Scatter the images sublists to process in parallel
                futures = client.scatter(chunked_list)

                # Create dask processing graph
                output = []
                for future in futures:
                    ImgMean = delayed(utils.partial_image_mean)(future)
                    output.append(ImgMean)
                ImgMean_all = delayed(sum)(output)
                ImgMean_all = ImgMean_all/float(len(futures))

                # Compute the graph
                ImgMean = ImgMean_all.compute()
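
                # Note (assumption): ImgMean is the mean of the per-chunk partial means, which
                # equals the overall image mean only when utils.list_chunking splits the file
                # list into (near-)equal-sized chunks.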

                logger.debug('Create illumination function for gene %s',gene)
                # Create illumination function
                Illumination=filters.gaussian(ImgMean,sigma=(20,300,300))

                # Normalization of the illumination
                Illumination_flat=np.amax(Illumination,axis=0)
                Illumination_norm=Illumination_flat/np.amax(Illumination_flat)

                logger.debug('Save illumination function for gene %s',gene)
                # Save the illumination function
                illumination_path = [ill_path for ill_path in illumination_func_gene_dirs if gene in ill_path][0]
                illumination_fname=illumination_path+gene+'_illumination_func.npy'
                np.save(illumination_fname,Illumination_norm,allow_pickle=False)  

                # Broadcast the illumination function to all the cores
                client.scatter(Illumination_norm, broadcast=True)

                logger.debug('Filtering %s',gene)
                # Filtering and counting
                futures_processes=client.map(counting.filtering_and_counting_ill_correction,flist_img_to_filter, \
                                illumination_function=Illumination_norm,\
                                filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,\
                                filtered_img_gene_dirs =filtered_img_gene_dirs,\
                                counting_gene_dirs=counting_gene_dirs,plane_keep=flt_rawcnt_config['plane_keep'], \
                                min_distance=flt_rawcnt_config['min_distance'], stringency=flt_rawcnt_config['stringency'],\
                                skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])
                client.gather(futures_processes)
               

        else:
            for gene in hybridizations_infos[hybridization].keys():
                flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy')
                # filtering
                logger.debug('Filtering without illumination correction %s',gene)

                futures_processes=client.map(counting.filtering_and_counting,flist_img_to_filter, \
                                        filtered_png_img_gene_dirs=filtered_png_img_gene_dirs, \
                                        filtered_img_gene_dirs=filtered_img_gene_dirs, \
                                        counting_gene_dirs=counting_gene_dirs, \
                                        plane_keep=flt_rawcnt_config['plane_keep'], min_distance=flt_rawcnt_config['min_distance'],\
                                        stringency=flt_rawcnt_config['stringency'],\
                                        skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])

                client.gather(futures_processes)
                
        # ---------------------------------------------------------------------
        
        # # ----------------- COMBINE THE FILTERED DATA IN .ppf.hdf5 ------------------------
        # # Combine the filter data in one single .ppf for each hybridization
        # # This step will run in serial mode and will not need to shuffle data
        # #  between cores because everything is on the common file system

        # logger.debug('Create .ppf.hdf5 file')

        # # Create the ppf.hdf5 file that contains the filtered data in uint16
        # preprocessing_file_path = hdf5_utils.hdf5_create_preprocessing_file(hybridizations_infos,processing_hyb,
        #                                 hybridization,flt_rawcnt_config['analysis_name'], hyb_dir,converted_positions,image_properties)

        # logger.debug('Write the .npy filtered files into the .ppf file')
        # # Load and write the .npy tmp images into the hdf5 file

        # # open the hdf5 file
        # with h5py.File(preprocessing_file_path) as f_hdl:
        #     # Loop through each gene
        #     for gene in hybridizations_infos[hybridization].keys():

        #         logger.debug('Writing %s images in .ppf.hdf5',gene)
        #         # list of the files to transfer
        #         filtered_gene_dir = [fdir for fdir in filtered_img_gene_dirs if gene in fdir][0]
        #         filtered_files_list = glob.glob(filtered_gene_dir+'*.npy')

        #         # loop through the list of file
        #         for f_file in filtered_files_list:
        #             pos = f_file.split('/')[-1].split('_')[-1].split('.')[0]
        #             f_hdl[gene]['FilteredData'][pos][:] =np.load(f_file)
        #             f_hdl.flush()
        
        # # ---------------------------------------------------------------------
        
        # # ----------------- STITCHING ------------------------
        # # Load the stitching parameters from the .yaml file

        # # Stitch the image in 2D or 3D (3D needs more work/testing)
        # nr_dim = flt_rawcnt_config['nr_dim']

        # # Estimated overlapping between images according to the Nikon software
        # est_overlap = image_properties['Overlapping_percentage']

        # # Number of peaks to use for the alignment
        # nr_peaks = flt_rawcnt_config['nr_peaks']

        # # Determine if the coords need to be flipped

        # y_flip = flt_rawcnt_config['y_flip']

        # # Method to use for blending
        # # can be 'linear' or 'non linear'
        # # The method that performs best is 'non linear'

        # blend = flt_rawcnt_config['blend']

        # # Reference gene for stitching
        # reference_gene = flt_rawcnt_config['reference_gene']

        # pixel_size = image_properties['PixelSize']

        # # Get the list of the filtered files of the reference gene
        # filtered_gene_dir = [gene_dir for gene_dir in filtered_img_gene_dirs if reference_gene in gene_dir][0]
        # filtered_files_list = glob.glob(filtered_gene_dir+'*.npy')

        # # Create pointer of the hdf5 file that will store the stitched reference image
        # # for the current hybridization
        # # Writing
        # tile_file_base_name = flt_rawcnt_config['analysis_name']+'_'+ processing_hyb
        # data_name   = (tile_file_base_name
        #                 + '_' + reference_gene
        #                 + '_stitching_data')

        # stitching_file_name = tile_file_base_name + '.sf.hdf5'
        # stitching_file= h5py.File(hyb_dir+stitching_file_name,'w',libver='latest')  # replace with 'a' as soon as you fix the error


        # # Determine the tiles organization
        # tiles, contig_tuples, nr_pixels, z_count, micData = stitching.get_pairwise_input_npy(image_properties,converted_positions, hybridization,
        #                         est_overlap = est_overlap, y_flip = False, nr_dim = 2)



        # # Align the tiles 
        # futures_processes=client.map(pairwisesingle.align_single_pair_npy,contig_tuples,
        #                             filtered_files_list=filtered_files_list,micData=micData, 
        #                         nr_peaks=nr_peaks)

        # # Gather the futures
        # data = client.gather(futures_processes)


        # # In this case the order of the returned contingency tuples matches
        # # the order of the input contig_tuples

        # # P_all = [el for data_single in data for el in data_single[0]]
        # P_all =[data_single[0] for data_single in data ]
        # P_all = np.array(P_all)
        # P_all = P_all.flat[:]
        # covs_all = [data_single[1] for data_single in data]
        # alignment = {'P': P_all,
        #             'covs': covs_all}


        # # Calculates a shift in global coordinates for each tile (global
        # # alignment) and then applies these shifts to the corner coordinates
        # # of each tile and returns and saves these shifted corner coordinates.
        # joining = stitching.get_place_tile_input(hyb_dir, tiles, contig_tuples,
        #                                             micData, nr_pixels, z_count,
        #                                             alignment, data_name,
        #                                             nr_dim=nr_dim)

        # # Create the hdf5 file structure
        # stitched_group, linear_blending, blend =  hdf5preparation.create_structures_hdf5_stitched_ref_gene_file_npy(stitching_file, joining, nr_pixels,
        #                                 reference_gene, blend = 'non linear')

        # # Fill the hdf5 containing the stitched image with empty data and
        # # create the blending mask
        # stitched_group['final_image'][:]= np.zeros(joining['final_image_shape'],dtype=np.float64)
        # if blend is not None:
        #     # make mask
        #     stitched_group['blending_mask'][:] = np.zeros(joining['final_image_shape'][-2:],dtype=np.float64)
        #     tilejoining.make_mask(joining, nr_pixels, stitched_group['blending_mask'])

            
        # # Create the subdirectory used to save the blended tiles
        # suffix = 'blended_tiles'
        # blended_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash,
        #                                 analysis_name=flt_rawcnt_config['analysis_name'])

        # # Get the directory with the filtered npy images of the reference_gene to use for stitching
        # stitching_files_dir = [npy_dir for npy_dir in filtered_img_gene_dirs if reference_gene in npy_dir][0]


        # # Create the tmp directory where to save the masks
        # suffix = 'masks'
        # masked_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash,
        #                                 analysis_name=flt_rawcnt_config['analysis_name'])

        # # Create and save the mask files
        # for corn_value,corner_coords in joining['corner_list']:
        #     if not(np.isnan(corner_coords[0])):
        #         cur_mask = stitched_group['blending_mask'][int(corner_coords[0]):int(corner_coords[0]) + int(nr_pixels),
        #                             int(corner_coords[1]):int(corner_coords[1]) + int(nr_pixels)]

        #         fname = masked_tiles_directory + flt_rawcnt_config['analysis_name'] +'_'+processing_hyb+'_'+reference_gene+'_masks_joining_pos_'+str(corn_value)
        #         np.save(fname,cur_mask)


        # # Blend all the tiles and save them in a directory
        # futures_processes = client.map(tilejoining.generate_blended_tile_npy,joining['corner_list'],
        #                             stitching_files_dir = stitching_files_dir,
        #                             blended_tiles_directory = blended_tiles_directory,
        #                             masked_tiles_directory = masked_tiles_directory,
        #                             analysis_name = flt_rawcnt_config['analysis_name'],
        #                             processing_hyb = processing_hyb,reference_gene = reference_gene,
        #                             micData = micData,tiles = tiles,nr_pixels=nr_pixels,
        #                             linear_blending=linear_blending)



        # _ = client.gather(futures_processes)


        # # Write the stitched image
        # tilejoining.make_final_image_npy(joining, stitching_file, blended_tiles_directory, tiles,reference_gene, nr_pixels)

        # # close the hdf5 file
        # stitching_file.close()


        # # Delete the directories with blended tiles and masks
        # shutil.rmtree(blended_tiles_directory)
        # shutil.rmtree(masked_tiles_directory)

        # ----------------- DELETE FILES ------------------------
        # Don't delete the *.npy files here because they can be used to
        # create the final images using the apply stitching related function

    client.close()
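The example above distributes the per-image work with client.map and collects it with client.gather; every keyword argument is broadcast unchanged to each task. Below is a minimal sketch of that pattern, assuming a hypothetical worker function process_image and a made-up file list in place of the pipeline's counting functions:

from distributed import Client

def process_image(path, threshold=0.5):
    # Hypothetical stand-in for counting.filtering_and_counting: load the
    # image at `path`, filter it, count peaks, and return a summary.
    return path, threshold

if __name__ == "__main__":
    client = Client(n_workers=4, threads_per_worker=1)

    file_list = ['pos_0.npy', 'pos_1.npy', 'pos_2.npy']  # hypothetical inputs

    # One future per element of file_list; keyword arguments are broadcast
    # to every call, as in the filtering step above.
    futures = client.map(process_image, file_list, threshold=0.8)
    results = client.gather(futures)  # blocks until all tasks have finished

    client.close()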
Example #8
0
def _update_engine(publisher: Parameter):
    global DEFAULT_NPARTITIONS, dask_client, num_cpus
    from modin.config import Backend, CpuCount

    if publisher.get() == "Ray":
        import ray
        from modin.engines.ray.utils import initialize_ray

        # With OmniSci backend there is only a single worker per node
        # and we allow it to work on all cores.
        if Backend.get() == "Omnisci":
            CpuCount.put(1)
            os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
        if _is_first_update.get("Ray", True):
            initialize_ray()
        num_cpus = ray.cluster_resources()["CPU"]
    elif publisher.get() == "Dask":  # pragma: no cover
        from distributed.client import get_client

        if (
            threading.current_thread().name == "MainThread"
            and _is_first_update.get("Dask", True)
        ):
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")

            try:
                dask_client = get_client()
            except ValueError:
                from distributed import Client

                dask_client = Client(n_workers=CpuCount.get())

            num_cpus = len(dask_client.ncores())

    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection

        conn = get_connection()
        remote_ray = conn.modules["ray"]
        if _is_first_update.get("Cloudray", True):

            @conn.teleport
            def init_remote_ray(partition):
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(Backend.get())
            # import EngineDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.data_management.factories.dispatcher  # noqa: F401
        else:
            get_connection().modules["modin"].set_backends("Ray", Backend.get())

        num_cpus = remote_ray.cluster_resources()["CPU"]
    elif publisher.get() == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_backends("Python")

    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError(
            "Unrecognized execution engine: {}.".format(publisher.get())
        )

    _is_first_update[publisher.get()] = False
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))
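Example #8 creates a Dask client lazily, only on the first update from the main thread, and then derives a default partition count from the number of workers. A minimal, self-contained sketch of that get-or-create pattern follows; get_or_create_client is an illustrative helper, not part of Modin or distributed:

from distributed import Client
from distributed.client import get_client

def get_or_create_client(n_workers=4):
    # get_client() raises ValueError when no client is active in this process.
    try:
        return get_client()
    except ValueError:
        return Client(n_workers=n_workers)

if __name__ == "__main__":
    client = get_or_create_client()

    # ncores() maps each worker address to its thread count, so its length
    # is the number of workers attached to the scheduler.
    num_workers = len(client.ncores())
    default_npartitions = max(4, num_workers)
    print(num_workers, default_npartitions)

    client.close()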
Example #9
0
    default="out.tdb",
    help="Output TDB file")

def recursive_glob(start, pattern):
    matches = []
    for root, dirnames, filenames in os.walk(start):
        for filename in fnmatch.filter(filenames, pattern):
            matches.append(os.path.join(root, filename))
    return sorted(matches)

if __name__ == '__main__':
    args = parser.parse_args(sys.argv[1:])
    if not args.dask_scheduler:
        args.dask_scheduler = LocalCluster(n_workers=int(multiprocessing.cpu_count() / 2), threads_per_worker=1, nanny=True)
    client = Client(args.dask_scheduler)
    logging.info(
        "Running with dask scheduler: %s [%s cores]" % (
            args.dask_scheduler,
            sum(client.ncores().values())))
    datasets = load_datasets(sorted(recursive_glob('Al-Ni', '*.json')))
    recfile = open(args.iter_record, 'a') if args.iter_record else None
    try:
        dbf, mdl, model_dof = fit(args.fit_settings, datasets, scheduler=client, recfile=recfile)
    finally:
        if recfile:
            recfile.close()
    dbf.to_file(args.output_tdb, if_exists='overwrite')
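Example #9 sizes a LocalCluster to half of the machine's cores and reports the total core count by summing client.ncores(). A minimal sketch of that setup, assuming nothing about the fitting code and replacing it with a log message:

import logging
import multiprocessing

from distributed import Client, LocalCluster

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Half the cores, one thread per worker, mirroring the example above.
    cluster = LocalCluster(n_workers=max(1, multiprocessing.cpu_count() // 2),
                           threads_per_worker=1)
    client = Client(cluster)

    # ncores() returns {worker_address: n_threads}; the sum is the core total.
    logging.info("Running with dask scheduler: %s [%s cores]",
                 cluster, sum(client.ncores().values()))

    client.close()
    cluster.close()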



Example #10
0
    parser = argparse.ArgumentParser(
        prog='orthophoto_feat_extraction',
        description='Extracts and GeoLocates Features From a UVAN Flight')
    parser.add_argument('-o',
                        '--output',
                        help='Directory where to create test datasets')
    parser.add_argument('-g', '--geoid', help='Path to Geoid File')
    parser.add_argument('-s', '--srtm', help='Path to SRTM Root')
    parser.add_argument('-d', '--scheduler', help='Node hosting the scheduler')
    parser.add_argument('-i', '--orthophoto', help='Path to OrthoPhoto')
    args = parser.parse_args()

    in_ophoto = orthophoto.VRTOrthophoto(args.orthophoto)
    slices = tbdb.slices_from_ophoto(in_ophoto, 5000)
    in_ophoto.close()

    slice_df = pd.DataFrame(slices, columns=['x_slice', 'y_slice'])
    slice_df['out_path'] = args.output
    slice_df['terrain_path'] = args.srtm
    slice_df['geoid'] = args.geoid
    slice_df['ophoto'] = args.orthophoto
    slice_df['idx'] = slice_df.index

    client = Client(args.scheduler)

    # Builtin str/int replace the deprecated NumPy aliases np.str/np.int
    meta = {'file_path': str, 'num_landmarks': int}

    ddf = dd.from_pandas(slice_df, npartitions=len(client.ncores()))
    ask_df = ddf.apply(db_from_slice, axis=1, meta=meta)
    ask_df.compute()
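Example #10 converts a pandas DataFrame of work items into a Dask DataFrame with one partition per worker and applies a row-wise function with an explicit meta describing the output schema. A self-contained sketch of that pattern, with a hypothetical summarize_row standing in for db_from_slice:

import pandas as pd
import dask.dataframe as dd
from distributed import Client

def summarize_row(row):
    # Hypothetical stand-in for db_from_slice: return the per-row outputs
    # declared in `meta` below.
    return pd.Series({'file_path': 'tile_%d.hdf5' % row['idx'],
                      'num_landmarks': int(row['idx']) * 10})

if __name__ == "__main__":
    client = Client(n_workers=2, threads_per_worker=1)

    work_df = pd.DataFrame({'idx': range(8)})

    # One partition per Dask worker, mirroring npartitions=len(client.ncores()).
    ddf = dd.from_pandas(work_df, npartitions=len(client.ncores()))

    # `meta` tells Dask the output columns and dtypes before any work runs.
    meta = {'file_path': str, 'num_landmarks': int}
    result = ddf.apply(summarize_row, axis=1, meta=meta).compute()
    print(result.head())

    client.close()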