def create_individual(self):
    """Yield individuals from a freshly generated initial population.

    Hacky helper that does the bare minimum needed to create some
    individuals: the initial population is generated and we yield from it.
    Useful for debugging, or for just playing with a single
    dataset/individual.

    Because this is a generator, nothing (including the multi-config check)
    runs until the first item is requested.

    Yields:
        Each member of the initial population returned by ``self.setup_ga``.

    Raises:
        ValueError: If ``self.multi_config`` is set; a single config is
            needed to generate an individual from.
    """
    if self.multi_config:
        raise ValueError(
            "Not available in multi_config mode - "
            "Need a single config to generate an individual from"
        )
    # Create the RandomState instance
    self.set_global_rng(self.seed_num)
    # Create the Dataset instance
    dataset_obj = Dataset(**self.full_config["dataset"])
    # Setup some attributes for the Cluster class
    Cluster.setup_variables(dataset_obj, self.full_config["ga"])
    # Setup the GA; only the population is needed here
    _, pop = self.setup_ga(dataset_obj, self.full_config)
    # Take from the initial population
    yield from pop
def generate_indiv(icls, dataset_obj):
    """Build one individual with a cluster per entry in ``cluster_sizes``.

    Args:
        icls: Callable used to construct the individual from a list of
            clusters (the DEAP wrapper around Genotype()).
        dataset_obj: Object providing ``cluster_sizes``, the size of each
            cluster to create.

    Returns:
        The new individual, with its views created and initial values
        sampled.
    """
    clusters = [Cluster(n_points) for n_points in dataset_obj.cluster_sizes]
    new_indiv = icls(clusters)
    # Each cluster.values is a view into genotype.all_values
    new_indiv.create_views()
    # Draw the initial values
    new_indiv.resample_values()
    return new_indiv
def setUp(self):
    """Create a seeded four-cluster individual set up for silhouette tests."""
    # A single seeded RandomState is shared by both classes
    seeded_rng = np.random.RandomState(42)
    Genotype.global_rng = seeded_rng
    Cluster.global_rng = seeded_rng
    cluster_sizes = (190, 20, 30, 110)
    self.indiv = Genotype([Cluster(n_points) for n_points in cluster_sizes])
    indiv = self.indiv
    indiv.create_views()
    indiv.resample_values()
    hawks.objectives.Silhouette.setup_indiv(indiv)
def test_eigenratio_eccentric(self):
    """Check ``eigenval_ratio`` is 10 for two zero-mean, axis-aligned clusters.

    The clusters have diagonal covariances (10, 1) and (9.9, 1).
    """
    # (size, covariance) for each cluster; both means are the origin
    cluster_specs = (
        (50, [[10, 0], [0, 1]]),
        (80, [[9.9, 0], [0, 1]]),
    )
    clusters = []
    for n_points, covariance in cluster_specs:
        cluster = Cluster(n_points)
        cluster.mean = np.array([0, 0])
        cluster.cov = np.array(covariance)
        clusters.append(cluster)
    ratio = hawks.constraints.eigenval_ratio(Genotype(clusters))
    self.assertEqual(ratio, 10)
def setUp(self):
    """Create a seeded four-cluster individual set up for silhouette tests.

    Also sets the ``num_dims``, ``initial_mean_upper`` and
    ``initial_cov_upper`` class attributes on ``Cluster`` before any
    cluster is constructed.
    """
    # A single seeded RandomState is shared by both classes
    seeded_rng = np.random.RandomState(42)
    Genotype.global_rng = seeded_rng
    Cluster.global_rng = seeded_rng
    cluster_sizes = (190, 20, 30, 110)
    # Class-level settings, assigned directly on Cluster
    Cluster.num_dims = 2
    Cluster.initial_mean_upper = 1.0
    Cluster.initial_cov_upper = 0.5
    self.indiv = Genotype([Cluster(n_points) for n_points in cluster_sizes])
    indiv = self.indiv
    indiv.create_views()
    indiv.resample_values()
    hawks.objectives.Silhouette.setup_indiv(indiv)
def test_overlap_same(self):
    """Check overlap is about 0.5 (one decimal place) for identical clusters.

    Both clusters have 4000 points, zero mean and identity covariance.
    """
    clusters = []
    for _ in range(2):
        # Fresh arrays per cluster, so nothing is shared between them
        cluster = Cluster(4000)
        cluster.mean = np.array([0, 0])
        cluster.cov = np.array([[1, 0], [0, 1]])
        clusters.append(cluster)
    indiv = Genotype(clusters)
    indiv.create_views()
    indiv.resample_values()
    measured = hawks.constraints.overlap(indiv)
    self.assertAlmostEqual(measured, 0.5, places=1)
def test_overlap_separated(self):
    """Check overlap is 0 for unit-covariance clusters at (0, 0) and (10, 10)."""
    # (size, mean) for each cluster; both use the identity covariance
    cluster_specs = (
        (50, [0, 0]),
        (80, [10, 10]),
    )
    clusters = []
    for n_points, centre in cluster_specs:
        cluster = Cluster(n_points)
        cluster.mean = np.array(centre)
        cluster.cov = np.array([[1, 0], [0, 1]])
        clusters.append(cluster)
    indiv = Genotype(clusters)
    indiv.create_views()
    indiv.resample_values()
    measured = hawks.constraints.overlap(indiv)
    self.assertEqual(measured, 0)
def test_silhouette_singleton_cluster(self):
    """Compare the silhouette with ``silhouette_score`` for a singleton cluster.

    The individual has cluster sizes 1, 20, 30 and 110; the reference value
    is computed with the "sqeuclidean" metric and compared via ``np.isclose``.
    """
    # A single seeded RandomState is shared by both classes
    seeded_rng = np.random.RandomState(42)
    Genotype.global_rng = seeded_rng
    Cluster.global_rng = seeded_rng
    cluster_sizes = (1, 20, 30, 110)
    self.indiv = Genotype([Cluster(n_points) for n_points in cluster_sizes])
    indiv = self.indiv
    indiv.create_views()
    indiv.resample_values()
    hawks.objectives.Silhouette.setup_indiv(indiv)
    hawks.objectives.Silhouette.eval_objective(indiv)
    reference = silhouette_score(
        indiv.all_values, indiv.labels, metric="sqeuclidean"
    )
    matches_reference = np.isclose(reference, indiv.silhouette)
    self.assertTrue(matches_reference)
def setUp(self):
    """Create two seeded two-cluster individuals, ``indiv1`` and ``indiv2``."""
    seeded_rng = np.random.RandomState(42)
    Cluster.global_rng = seeded_rng
    Genotype.global_rng = seeded_rng
    Cluster.num_dims = 2

    def make_clusters(cluster_specs):
        # Build clusters in the given order from (size, mean, cov) triples
        built = []
        for n_points, centre, covariance in cluster_specs:
            cluster = Cluster(n_points)
            cluster.mean = np.array(centre)
            cluster.cov = np.array(covariance)
            built.append(cluster)
        return built

    # The first individual is fully built and sampled before the second
    self.indiv1 = Genotype(make_clusters([
        (50, [0, 0], [[1, 0], [0, 1]]),
        (30, [5, 5], [[5, 0], [0, 10]]),
    ]))
    self.indiv1.create_views()
    self.indiv1.resample_values()

    self.indiv2 = Genotype(make_clusters([
        (50, [2, 2], [[2, 0], [0, 2]]),
        (30, [10, 10], [[4, 0], [0, 2]]),
    ]))
    self.indiv2.create_views()
    self.indiv2.resample_values()
def animate(self, record_stats=False, plot_pop=True, **kwargs):
    """Function to animate a run of HAWKS (showing how the datasets evolve).

    An example of this can be found in the README. Produces a series of
    PNGs, and creates a gif using
    `ImageMagick <https://imagemagick.org/index.php>`_.

    One ``animate_run{N}`` folder is created per run. Population frames are
    saved as ``pop_gen-{G}`` and best-individual frames as ``indiv_gen-{G}``,
    so that version-sorting the files yields chronological order. If the
    ``convert`` executable is on the PATH, ``hawks_animation.gif`` is built
    inside each run folder.

    Args:
        record_stats (bool, optional): Whether the results of the run should
            be recorded (and therefore can be saved, depending on the
            config). Defaults to False.
        plot_pop (bool, optional): Whether to plot the whole population. If
            False, just plots the best individual. Defaults to True.
        **kwargs: Passed through to the plotting function.

    Raises:
        ValueError: Animation cannot be run for a multi_config; only a
            single set of parameters is permitted.
        subprocess.CalledProcessError: If the ``convert`` command exits with
            a non-zero status.
    """
    # Raise error if multi-config specified
    if self.multi_config:
        raise ValueError("Animation is not implemented for multi-config")
    # Perform initial setup (the returned values are not needed here)
    self._setup()
    # Setup the containers for storing results
    if record_stats:
        num_rows = self.full_config["ga"]["num_indivs"]
        results_dict = defaultdict(list)
    # Setup the plot folder
    plot_folder = self._plot_save_setup()

    def plot_frame(population, gen, folder):
        # Save a single frame for generation `gen` into `folder`
        if plot_pop:
            plotting.plot_pop(
                population,
                # Same prefix for every generation so that generation 0
                # sorts first when the frames are assembled into the gif
                fpath=folder / f"pop_gen-{gen}",
                fig_format="png",
                save=True,
                remove_axis=True,
                fig_title=f"Generation {gen}",
                show=False,
                **kwargs
            )
        else:
            best_indiv, _ = self._best_in_pop(population)
            plotting.plot_indiv(
                best_indiv,
                fpath=folder / f"indiv_gen-{gen}",
                remove_axis=True,
                save=True,
                show=False,
                **kwargs
            )

    # Remember every run folder so each one gets its own gif at the end
    animate_folders = []
    # Loop over each run
    for num_run in tqdm(range(self.num_runs), desc="Runs", leave=False):
        animate_folder = plot_folder / f"animate_run{num_run}"
        animate_folder.mkdir(exist_ok=True, parents=True)
        animate_folders.append(animate_folder)
        # Super special seed selection
        global_seed = self.increment_seed(num_run)
        # Create the RandomState instance
        self.set_global_rng(global_seed)
        # Create the Dataset instance
        dataset_obj = Dataset(**self.full_config["dataset"])
        # Setup some attributes for the Cluster class
        Cluster.setup_variables(dataset_obj, self.full_config["ga"])
        # Setup the GA
        objective_dict, pop = self.setup_ga(dataset_obj, self.full_config)
        # Plot the initial population (or its best individual)
        plot_frame(pop, 0, animate_folder)
        if record_stats:
            # Store results from the initial population
            results_dict = self._store_results(
                results_dict, pop, num_run, 0, num_rows, objective_dict
            )
        # Go through each generation
        for gen in tqdm(
                range(1, self.full_config["ga"]["num_gens"]),
                desc="Generations",
                leave=False
            ):
            pop = ga.generation(
                pop,
                self.deap_toolbox,
                self.full_config["constraints"],
                cxpb=self.full_config["ga"]["mate_prob"]
            )
            # Plot this generation's population (or its best individual)
            plot_frame(pop, gen, animate_folder)
            if record_stats:
                # Store results from each generation
                results_dict = self._store_results(
                    results_dict, pop, num_run, gen, num_rows, objective_dict
                )
        # Keep a reference to the most recent population
        self.population = pop
    if record_stats:
        # results_dict accumulates over all runs, so it is added exactly once.
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        self.stats = pd.concat(
            [self.stats, pd.DataFrame.from_dict(results_dict)],
            ignore_index=True
        )
        if self.save_stats:
            # Save to CSV
            utils.df_to_csv(
                df=self.stats,
                path=self.base_folder,
                filename="hawks_stats"
            )
    # Create the gifs if convert is available
    if shutil.which("convert") is not None:
        for folder in animate_folders:
            # The command is a fixed string (no external input); it needs a
            # shell for the backtick substitution and the pipe
            subprocess.run(
                "convert -resize 50% -delay 30 -loop 0 `ls -v | grep 'gen-'` hawks_animation.gif",
                shell=True,
                check=True,
                cwd=folder
            )
def run_step(self):
    """Run function that contains the actual code, yielding after each run, if desired.

    For every config, and every run within it, the GA is set up and run for
    the configured number of generations. Results are gathered per config
    and added to ``self.stats``. Once all configs are done, the stats and
    the best individuals are saved if ``self.save_stats`` /
    ``self.save_best_data`` are set.

    Yields:
        :class:`~hawks.generator.SingleObjective`: The generator instance at
        the time, allowing inspection of the process.
    """
    total_configs, key_paths, param_lists = self._setup()
    # Initialize the config_id
    config_id = 0
    # Loop over each config
    for params, config in tqdm(
            self._get_configs(key_paths, param_lists),
            desc="Configs",
            total=total_configs
        ):
        # Add the config to the list
        self.config_list.append(config)
        # Add a list as container for new runs
        self.best_each_run.append([])
        # Setup the containers for storing results
        num_rows = config["ga"]["num_indivs"]
        results_dict = defaultdict(list)
        # Total number of result rows this config produces over all runs
        rows_per_config = num_rows * config["ga"]["num_gens"] * self.num_runs
        # Add the config_id, which is also used for the filename when saving the config
        results_dict["config_id"] = [config_id] * rows_per_config
        # Add the specific parameters for this config
        if self.multi_config:
            for key, param in zip(key_paths, params):
                name = "_".join(key[1:])
                results_dict[name] += [param] * rows_per_config
        # Loop over each run
        for num_run in tqdm(range(self.num_runs), desc="Runs", leave=False):
            # Increment the seed for this run
            global_seed = self.increment_seed(num_run)
            # Create the RandomState instance
            self.set_global_rng(global_seed)
            # Create the Dataset instance
            dataset_obj = Dataset(**config["dataset"])
            # Setup some attributes for the Cluster class
            Cluster.setup_variables(dataset_obj, config["ga"])
            # Setup the GA
            objective_dict, pop = self.setup_ga(dataset_obj, config)
            # Store results from the initial population
            results_dict = self._store_results(
                results_dict, pop, num_run, 0, num_rows, objective_dict
            )
            # Go through each generation
            for gen in tqdm(
                    range(1, config["ga"]["num_gens"]),
                    desc="Generations",
                    leave=False
                ):
                pop = ga.generation(
                    pop,
                    self.deap_toolbox,
                    config["constraints"],
                    cxpb=config["ga"]["mate_prob"]
                )
                # Store results from each generation
                results_dict = self._store_results(
                    results_dict, pop, num_run, gen, num_rows, objective_dict
                )
            best_indiv_run, best_index = self._best_in_pop(pop)
            # Store the best indiv from each run
            self.best_each_run[-1].append(best_indiv_run)
            # Add column to show best dataset from run
            results_dict = self._store_best_indiv(
                results_dict, best_index, config["ga"]["num_gens"], num_rows
            )
            # Keep a reference to the most recent population
            self.population = pop
            # Hand control back to the caller after every run
            yield self
        # Iterate the config_id
        config_id += 1
        # Append the results of this config to the overall results.
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        self.stats = pd.concat(
            [self.stats, pd.DataFrame.from_dict(results_dict)],
            ignore_index=True
        )
    # Save the stats if specified
    if self.save_stats:
        # Save to CSV
        utils.df_to_csv(
            df=self.stats,
            path=self.base_folder,
            filename="hawks_stats"
        )
    # Save the best individual(s) and their associated config(s)
    if self.save_best_data:
        # Loop over each indiv in each config
        for config_num, indiv_list in enumerate(self.best_each_run):
            for run_num, indiv in enumerate(indiv_list):
                # Save the best data
                indiv.save_clusters(
                    folder=self.base_folder / "datasets",
                    fname=f"config-{config_num}_run-{run_num}_best_data"
                )