def setup_dataset(dataset_params):
    """Build a :class:`Dataset` from the given parameters and share its
    cluster-level variables with the :class:`Cluster` class.

    Args:
        dataset_params (dict): Keyword arguments for the ``Dataset`` constructor.

    Returns:
        Dataset: The freshly constructed dataset instance.
    """
    dataset_obj = Dataset(**dataset_params)
    # Expose the dataset's cluster variables as class attributes on Cluster
    for attr_name, attr_value in dataset_obj.cluster_vars.items():
        setattr(Cluster, attr_name, attr_value)
    return dataset_obj
def test_exact_min_clust_size(self):
    """When min_clust_size exactly divides the examples (200/5 = 40),
    every cluster should get precisely that size."""
    kwargs = self.args.copy()
    kwargs["num_examples"] = 200
    kwargs["num_clusters"] = 5
    kwargs["min_clust_size"] = 40  # Exactly feasible: fills all clusters with no remainder
    obj = Dataset(**kwargs)
    self.assertEqual(obj.cluster_sizes, [40] * 5)
def test_incorrect_min_clust_size(self):
    """An infeasible min_clust_size must be reduced so the clusters fit."""
    kwargs = self.args.copy()
    kwargs.update(
        num_examples=100,
        num_clusters=10,
        min_clust_size=20,  # 20 * 10 > 100, so Dataset has to shrink this
    )
    obj = Dataset(**kwargs)
    # The adjusted minimum can be at most the average cluster size
    self.assertLessEqual(obj.min_clust_size, obj.num_examples / obj.num_clusters)
def test_random_cluster_sizes_different(self):
    """Randomly generated cluster sizes should not all be identical.

    With 100 clusters there is a (tiny) chance of every size clashing,
    but in practice at least two distinct sizes must appear.
    """
    kwargs = self.args.copy()
    kwargs["num_examples"] = 10000
    kwargs["num_clusters"] = 100
    obj = Dataset(**kwargs)
    # Idiomatic: build the set in one call instead of a manual add-loop
    unique_sizes = set(obj.cluster_sizes)
    # assertGreater gives a clearer failure message than assertTrue(len > 1)
    self.assertGreater(len(unique_sizes), 1)
def test_equal_clusters_are_equal(self):
    """With equal_clusters=True, all cluster sizes match the expected value."""
    kwargs = self.args.copy()
    # Deliberately awkward numbers (7883 examples / 17 clusters)
    kwargs["num_examples"] = 7883
    kwargs["num_clusters"] = 17
    kwargs["equal_clusters"] = True
    obj = Dataset(**kwargs)
    expected_size = 464
    for clust_size in obj.cluster_sizes:
        with self.subTest(size=clust_size):
            self.assertEqual(clust_size, expected_size)
def test_random_cluster_sizes_sum(self):
    """The randomly drawn cluster sizes should sum close to num_examples."""
    kwargs = self.args.copy()
    kwargs["num_examples"] = 10000
    kwargs["num_clusters"] = 100
    obj = Dataset(**kwargs)
    total = np.sum(obj.cluster_sizes)
    # Rounding the total to the nearest 10 should recover the requested size
    rounded_matches = np.around(total, decimals=-1) == kwargs["num_examples"]
    self.assertTrue(rounded_matches)
def create_individual(self):
    """Hacky function to do the bare minimum needed to create some
    individuals.

    The initial population is generated and we yield from that. Useful for
    debugging, or just playing with a single dataset/individual.

    Yields:
        Individuals from the initial population.

    Raises:
        ValueError: If a multi-config is specified; a single config is
            required to generate an individual.
    """
    if self.multi_config:
        # Fixed typo ("mdoe" -> "mode"); plain string, no placeholders needed
        raise ValueError("Not available in multi_config mode - Need a single config to generate an individual from")
    # Create the RandomState instance
    self.set_global_rng(self.seed_num)
    # Create the Dataset instance
    dataset_obj = Dataset(**self.full_config["dataset"])
    # Setup some attributes for the Cluster class
    Cluster.setup_variables(dataset_obj, self.full_config["ga"])
    # Setup the GA
    objective_dict, pop = self.setup_ga(dataset_obj, self.full_config)
    # Take from the initial population
    yield from pop
def animate(self, record_stats=False, plot_pop=True, **kwargs):
    """Function to animate a run of HAWKS (showing how the datasets evolve).
    An example of this can be found in the README. Produces a series of PNGs,
    and creates a gif using `ImageMagick <https://imagemagick.org/index.php>`_.

    Args:
        record_stats (bool, optional): Whether the results of the run should
            be recorded (and therefore can be saved, depending on the
            config). Defaults to False.
        plot_pop (bool, optional): Whether to plot the whole population. If
            False, just plots the best individual. Defaults to True.

    Raises:
        ValueError: Animation cannot be run for a multi_config; only a
            single set of parameters is permitted.
    """
    # Raise error if multi-config specified
    if self.multi_config:
        raise ValueError(f"Animation is not implemented for multi-config")
    # Perform initial setup
    total_configs, key_paths, param_lists = self._setup()
    # Setup the containers for storing results
    if record_stats:
        # num_rows: individuals per generation (rows added to results per gen)
        num_rows = self.full_config["ga"]["num_indivs"]
        results_dict = defaultdict(list)
    # Setup the plot folder
    plot_folder = self._plot_save_setup()
    # Loop over each run
    for num_run in tqdm(range(self.num_runs), desc="Runs", leave=False):
        # One output folder of frames per run
        animate_folder = plot_folder / f"animate_run{num_run}"
        animate_folder.mkdir(exist_ok=True, parents=True)
        # Super special seed selection
        global_seed = self.increment_seed(num_run)
        # Create the RandomState instance
        self.set_global_rng(global_seed)
        # Create the Dataset instance
        dataset_obj = Dataset(**self.full_config["dataset"])
        # Setup some attributes for the Cluster class
        Cluster.setup_variables(dataset_obj, self.full_config["ga"])
        # Setup the GA
        objective_dict, pop = self.setup_ga(dataset_obj, self.full_config)
        # Plot the initial population (generation 0)
        if plot_pop:
            plotting.plot_pop(
                pop,
                fpath=animate_folder / "pop_gen-0",
                fig_format="png",
                save=True,
                remove_axis=True,
                fig_title="Generation 0",
                show=False,
                **kwargs
            )
        # Plot the best indiv only
        else:
            best_indiv, best_index = self._best_in_pop(pop)
            plotting.plot_indiv(
                best_indiv,
                fpath=animate_folder / "indiv_gen-0",
                remove_axis=True,
                save=True,
                show=False,
                **kwargs
            )
        if record_stats:
            # Store results from the initial population (gen 0)
            results_dict = self._store_results(
                results_dict, pop, num_run, 0, num_rows, objective_dict
            )
        # Go through each generation
        for gen in tqdm(
            range(1, self.full_config["ga"]["num_gens"]),
            desc="Generations",
            leave=False
        ):
            # Advance the GA by one generation
            pop = ga.generation(
                pop,
                self.deap_toolbox,
                self.full_config["constraints"],
                cxpb=self.full_config["ga"]["mate_prob"]
            )
            # Plot the population for this generation
            if plot_pop:
                plotting.plot_pop(
                    pop,
                    fpath=animate_folder / f"gen-{gen}",
                    fig_format="png",
                    save=True,
                    remove_axis=True,
                    fig_title=f"Generation {gen}",
                    show=False,
                    **kwargs
                )
            # Plot the best indiv only
            else:
                best_indiv, best_index = self._best_in_pop(pop)
                plotting.plot_indiv(
                    best_indiv,
                    fpath=animate_folder / f"indiv_gen-{gen}",
                    remove_axis=True,
                    save=True,
                    show=False,
                    **kwargs
                )
            if record_stats:
                # Store results from each generation
                results_dict = self._store_results(
                    results_dict, pop, num_run, gen, num_rows, objective_dict
                )
        # Keep a reference to the most recent population
        self.population = pop
        if record_stats:
            # NOTE(review): DataFrame.append is deprecated in modern pandas;
            # pd.concat is the replacement — confirm pandas version pinned.
            self.stats = self.stats.append(
                pd.DataFrame.from_dict(results_dict),
                ignore_index=True
            )
            if self.save_stats:
                # Save to CSV
                utils.df_to_csv(
                    df=self.stats,
                    path=self.base_folder,
                    filename="hawks_stats"
                )
        # Create the gif if convert is available
        which_convert = shutil.which("convert")
        if which_convert is not None:
            # Shell string is needed here for the `ls -v | grep` glob ordering;
            # runs inside the per-run frame folder
            subprocess.run(
                "convert -resize 50% -delay 30 -loop 0 `ls -v | grep 'gen-'` hawks_animation.gif",
                shell=True,
                check=True,
                cwd=animate_folder
            )
def run_step(self):
    """Run function that contains the actual code, yielding after each run,
    if desired.

    Yields:
        :class:`~hawks.generator.SingleObjective`: The generator instance at
            the time, allowing inspection of the process.
    """
    total_configs, key_paths, param_lists = self._setup()
    # Initialize the config_id
    config_id = 0
    # Loop over each config
    for params, config in tqdm(
        self._get_configs(key_paths, param_lists),
        desc="Configs",
        total=total_configs
    ):
        # Add the config to the list
        self.config_list.append(config)
        # Add a list as container for new runs
        self.best_each_run.append([])
        # Local ref to best for each config
        best_indiv_run = None
        # Setup the containers for storing results
        num_rows = config["ga"]["num_indivs"]
        results_dict = defaultdict(list)
        # Add the config_id, which is also used for the filename when saving
        # the config; repeated once per result row for this config
        results_dict["config_id"] = [config_id]*(num_rows*config["ga"]["num_gens"]*self.num_runs)
        # Add the specific parameters for this config
        if self.multi_config:
            for key, param in zip(key_paths, params):
                # Column name is the joined key path (minus the root section)
                name = "_".join(key[1:])
                results_dict[name] += [param]*(num_rows*config["ga"]["num_gens"]*self.num_runs)
        # Loop over each run
        for num_run in tqdm(range(self.num_runs), desc="Runs", leave=False):
            # Increment the seed for this run
            global_seed = self.increment_seed(num_run)
            # Create the RandomState instance
            self.set_global_rng(global_seed)
            # Create the Dataset instance
            dataset_obj = Dataset(**config["dataset"])
            # Setup some attributes for the Cluster class
            Cluster.setup_variables(dataset_obj, config["ga"])
            # Setup the GA
            objective_dict, pop = self.setup_ga(dataset_obj, config)
            # Store results from the initial population
            results_dict = self._store_results(
                results_dict, pop, num_run, 0, num_rows, objective_dict
            )
            # Go through each generation
            for gen in tqdm(
                range(1, config["ga"]["num_gens"]),
                desc="Generations",
                leave=False
            ):
                pop = ga.generation(
                    pop,
                    self.deap_toolbox,
                    config["constraints"],
                    cxpb=config["ga"]["mate_prob"]
                )
                # Store results from each generation
                results_dict = self._store_results(
                    results_dict, pop, num_run, gen, num_rows, objective_dict
                )
            best_indiv_run, best_index = self._best_in_pop(pop)
            # Store the best indiv from each run
            self.best_each_run[-1].append(best_indiv_run)
            # Add column to show best dataset from run
            results_dict = self._store_best_indiv(
                results_dict, best_index, config["ga"]["num_gens"], num_rows
            )
            # Keep a reference to the most recent population
            self.population = pop
            # YIELDIT — hand control back to the caller after each run
            yield self
        # Iterate the config_id
        config_id += 1
        # Append the results of this config to the overall results
        # NOTE(review): DataFrame.append is deprecated in modern pandas;
        # pd.concat is the replacement — confirm pandas version pinned.
        self.stats = self.stats.append(
            pd.DataFrame.from_dict(results_dict),
            ignore_index=True
        )
    # Save the stats for this run if specified
    if self.save_stats:
        # Save to CSV
        utils.df_to_csv(
            df=self.stats,
            path=self.base_folder,
            filename="hawks_stats"
        )
    # Save the best individual(s) and their associated config(s)
    if self.save_best_data:
        # Loop over each indiv in each config
        for config_num, indiv_list in enumerate(self.best_each_run):
            for run_num, indiv in enumerate(indiv_list):
                # Save the best data
                indiv.save_clusters(
                    folder=self.base_folder / "datasets",
                    fname=f"config-{config_num}_run-{run_num}_best_data"
                )