Beispiel #1
0
def setup_dataset(dataset_params):
    """Build a ``Dataset`` and mirror its cluster variables onto ``Cluster``.

    Args:
        dataset_params (dict): Keyword arguments forwarded to ``Dataset``.

    Returns:
        The newly created ``Dataset`` instance.
    """
    dataset = Dataset(**dataset_params)
    # Expose every entry of ``cluster_vars`` as a class-level attribute of
    # Cluster, so the Cluster class can read these values directly
    for attr_name, attr_value in dataset.cluster_vars.items():
        setattr(Cluster, attr_name, attr_value)
    return dataset
Beispiel #2
0
    def test_exact_min_clust_size(self):
        # A minimum size of 40 is exactly 200 examples / 5 clusters, so every
        # cluster is expected to have exactly that size
        params = {
            **self.args,
            "num_examples": 200,
            "num_clusters": 5,
            "min_clust_size": 40,
        }
        dataset = Dataset(**params)

        expected_sizes = [40] * 5
        self.assertEqual(dataset.cluster_sizes, expected_sizes)
Beispiel #3
0
    def test_incorrect_min_clust_size(self):
        # An infeasible minimum cluster size should end up no larger than the
        # even share of examples per cluster
        params = {
            **self.args,
            "num_examples": 100,
            "num_clusters": 10,
            # 10 clusters of at least 20 would need 200 examples, not 100
            "min_clust_size": 20,
        }
        dataset = Dataset(**params)

        adjusted_min = dataset.min_clust_size
        even_share = dataset.num_examples / dataset.num_clusters
        self.assertLessEqual(adjusted_min, even_share)
Beispiel #4
0
    def test_random_cluster_sizes_different(self):
        # The 100 generated cluster sizes must not all be the same value
        kwargs = self.args.copy()
        kwargs["num_examples"] = 10000
        kwargs["num_clusters"] = 100
        obj = Dataset(**kwargs)

        # Collapse duplicates so only the distinct sizes remain
        unique_sizes = set(obj.cluster_sizes)
        # There is a chance of a clash, but ensure not all are the same
        # (assertGreater reports the actual count if this fails)
        self.assertGreater(len(unique_sizes), 1)
Beispiel #5
0
    def test_equal_clusters_are_equal(self):
        # With equal_clusters enabled, every cluster size should be the same,
        # expected value
        params = self.args.copy()
        # Deliberately awkward numbers: 7883 is not a multiple of 17
        params.update(num_examples=7883, num_clusters=17, equal_clusters=True)
        dataset = Dataset(**params)

        expected_size = 464
        for size in dataset.cluster_sizes:
            # One sub-test per cluster so every mismatch is reported
            with self.subTest(size=size):
                self.assertEqual(size, expected_size)
Beispiel #6
0
    def test_random_cluster_sizes_sum(self):
        # The cluster sizes should add up to (roughly) the requested number of
        # examples: the total is rounded to the nearest 10 before comparing
        # TODO: loop over several different sizes rather than a single case
        params = self.args.copy()
        params.update(num_examples=10000, num_clusters=100)
        dataset = Dataset(**params)

        rounded_total = np.around(np.sum(dataset.cluster_sizes), decimals=-1)
        matches_request = rounded_total == params["num_examples"]

        self.assertTrue(matches_request)
Beispiel #7
0
 def create_individual(self):
     """Yield individuals from a freshly generated initial population.

     Hacky function that does the bare minimum setup (RNG, dataset, ``Cluster`` variables and the GA) and then yields each member of the initial population. Useful for debugging, or for just playing with a single dataset/individual.

     Note:
         This is a generator function, so nothing runs (including the multi-config check) until the first item is requested.

     Yields:
         Each member of the initial population returned by ``setup_ga``.

     Raises:
         ValueError: If a multi-config was given; a single config is needed to generate an individual from.
     """
     if self.multi_config:
         raise ValueError("Not available in multi_config mode - Need a single config to generate an individual from")
     # Create the RandomState instance
     self.set_global_rng(self.seed_num)
     # Create the Dataset instance
     dataset_obj = Dataset(**self.full_config["dataset"])
     # Setup some attributes for the Cluster class
     Cluster.setup_variables(dataset_obj, self.full_config["ga"])
     # Setup the GA (the objective dict is not needed here, only the population)
     _, pop = self.setup_ga(dataset_obj, self.full_config)
     # Take from the initial population
     yield from pop
Beispiel #8
0
 def animate(self, record_stats=False, plot_pop=True, **kwargs):
     """Function to animate a run of HAWKS (showing how the datasets evolve). An example of this can be found in the README. Produces a series of PNGs, and creates a gif using `ImageMagick <https://imagemagick.org/index.php>`_.

     One PNG is saved per generation into a separate ``animate_run{num_run}`` folder for each run. The gif is only created if the ``convert`` executable is found, and only for the folder of the last run.

     Args:
         record_stats (bool, optional): Whether the results of the run should be recorded (and therefore can be saved, depending on the config). Defaults to False.
         plot_pop (bool, optional): Whether to plot the whole population. If False, just plots the best individual. Defaults to True.
         **kwargs: Extra keyword arguments passed through to the plotting function.

     Raises:
         ValueError: Animation cannot be run for a multi_config; only a single set of parameters is permitted.
     """
     # Raise error if multi-config specified
     if self.multi_config:
         raise ValueError("Animation is not implemented for multi-config")
     # Perform initial setup (the returned config details are not used here)
     self._setup()
     # Setup the containers for storing results
     if record_stats:
         num_rows = self.full_config["ga"]["num_indivs"]
         results_dict = defaultdict(list)
     # Setup the plot folder
     plot_folder = self._plot_save_setup()
     # Stays None when there are no runs, in which case there is nothing to animate
     animate_folder = None
     # Loop over each run
     for num_run in tqdm(range(self.num_runs), desc="Runs", leave=False):
         animate_folder = plot_folder / f"animate_run{num_run}"
         animate_folder.mkdir(exist_ok=True, parents=True)
         # Super special seed selection
         global_seed = self.increment_seed(num_run)
         # Create the RandomState instance
         self.set_global_rng(global_seed)
         # Create the Dataset instance
         dataset_obj = Dataset(**self.full_config["dataset"])
         # Setup some attributes for the Cluster class
         Cluster.setup_variables(dataset_obj, self.full_config["ga"])
         # Setup the GA
         objective_dict, pop = self.setup_ga(dataset_obj, self.full_config)
         # Plot the initial population (or its best individual)
         self._plot_animation_frame(pop, 0, animate_folder, plot_pop, kwargs)
         if record_stats:
             # Store results from the initial population
             results_dict = self._store_results(
                 results_dict, pop, num_run, 0, num_rows, objective_dict
             )
         # Go through each generation
         generations = tqdm(
             range(1, self.full_config["ga"]["num_gens"]),
             desc="Generations", leave=False
         )
         for gen in generations:
             pop = ga.generation(
                 pop,
                 self.deap_toolbox,
                 self.full_config["constraints"],
                 cxpb=self.full_config["ga"]["mate_prob"]
             )
             # Plot the population (or its best individual) for this generation
             self._plot_animation_frame(pop, gen, animate_folder, plot_pop, kwargs)
             if record_stats:
                 # Store results from each generation
                 results_dict = self._store_results(
                     results_dict, pop, num_run, gen, num_rows, objective_dict
                 )
             # Keep a reference to the most recent population
             self.population = pop
     if record_stats:
         # DataFrame.append was removed in pandas 2.0; concat is the equivalent
         self.stats = pd.concat(
             [self.stats, pd.DataFrame.from_dict(results_dict)],
             ignore_index=True
         )
         if self.save_stats:
             # Save to CSV
             utils.df_to_csv(
                 df=self.stats,
                 path=self.base_folder,
                 filename="hawks_stats"
             )
     # Create the gif if convert is available. animate_folder refers to the
     # folder of the last run here, so only that run is turned into a gif.
     which_convert = shutil.which("convert")
     if which_convert is not None and animate_folder is not None:
         # Frames are picked up by the 'gen-' part of their name and ordered
         # by `ls -v`, so every frame of a run must share the same prefix
         subprocess.run(
             "convert -resize 50% -delay 30 -loop 0 `ls -v | grep 'gen-'` hawks_animation.gif",
             shell=True,
             check=True,
             cwd=animate_folder
         )

 def _plot_animation_frame(self, pop, gen, animate_folder, plot_pop, plot_kwargs):
     """Save the frame for a single generation of an animated run.

     Args:
         pop: The population at this generation.
         gen (int): The generation number, used in the file name (and the title when plotting the population).
         animate_folder: Folder (path object) the frame is saved into.
         plot_pop (bool): If True the whole population is plotted, otherwise only the best individual.
         plot_kwargs (dict): Extra keyword arguments passed through to the plotting function.
     """
     if plot_pop:
         # Same "pop_gen-" prefix for every generation so that the frames
         # sort in generation order when the gif is assembled
         plotting.plot_pop(
             pop,
             fpath=animate_folder / f"pop_gen-{gen}",
             fig_format="png",
             save=True,
             remove_axis=True,
             fig_title=f"Generation {gen}",
             show=False,
             **plot_kwargs
         )
     else:
         # Only the individual is needed, not its index in the population
         best_indiv, _ = self._best_in_pop(pop)
         plotting.plot_indiv(
             best_indiv,
             fpath=animate_folder / f"indiv_gen-{gen}",
             remove_axis=True,
             save=True,
             show=False,
             **plot_kwargs
         )
Beispiel #9
0
 def run_step(self):
     """Run function that contains the actual code, yielding after each run, if desired.

     For every config, each run creates the dataset, sets up the GA and evolves the population for the configured number of generations. The results of a config are only appended to ``self.stats`` once all of its runs have finished. Saving of the stats and of the best individuals happens after the final config, i.e. only once the generator has been exhausted.

     Yields:
         :class:`~hawks.generator.SingleObjective`: The generator instance at the time, allowing inspection of the process.
     """
     total_configs, key_paths, param_lists = self._setup()
     configs = tqdm(
         self._get_configs(key_paths, param_lists),
         desc="Configs", total=total_configs
     )
     # Loop over each config; config_id is also used for the filename when saving the config
     for config_id, (params, config) in enumerate(configs):
         # Add the config to the list
         self.config_list.append(config)
         # Add a list as container for new runs
         self.best_each_run.append([])
         # Setup the containers for storing results
         num_rows = config["ga"]["num_indivs"]
         results_dict = defaultdict(list)
         # Number of result rows this config produces across all runs and generations
         total_rows = num_rows * config["ga"]["num_gens"] * self.num_runs
         # Add the config_id to every row
         results_dict["config_id"] = [config_id] * total_rows
         # Add the specific parameters for this config
         if self.multi_config:
             for key, param in zip(key_paths, params):
                 # Column name is the key path without its first element
                 name = "_".join(key[1:])
                 results_dict[name] += [param] * total_rows
         # Loop over each run
         for num_run in tqdm(range(self.num_runs), desc="Runs", leave=False):
             # Increment the seed for this run
             global_seed = self.increment_seed(num_run)
             # Create the RandomState instance
             self.set_global_rng(global_seed)
             # Create the Dataset instance
             dataset_obj = Dataset(**config["dataset"])
             # Setup some attributes for the Cluster class
             Cluster.setup_variables(dataset_obj, config["ga"])
             # Setup the GA
             objective_dict, pop = self.setup_ga(dataset_obj, config)
             # Store results from the initial population
             results_dict = self._store_results(
                 results_dict, pop, num_run, 0, num_rows, objective_dict
             )
             # Go through each generation
             generations = tqdm(
                 range(1, config["ga"]["num_gens"]),
                 desc="Generations", leave=False
             )
             for gen in generations:
                 pop = ga.generation(
                     pop,
                     self.deap_toolbox,
                     config["constraints"],
                     cxpb=config["ga"]["mate_prob"]
                 )
                 # Store results from each generation
                 results_dict = self._store_results(
                     results_dict, pop, num_run, gen, num_rows, objective_dict
                 )
             best_indiv_run, best_index = self._best_in_pop(pop)
             # Store the best indiv from each run
             self.best_each_run[-1].append(best_indiv_run)
             # Add column to show best dataset from run
             results_dict = self._store_best_indiv(
                 results_dict, best_index, config["ga"]["num_gens"], num_rows
             )
             # Keep a reference to the most recent population
             self.population = pop
             # Hand control back to the caller after every run
             yield self
         # Append the results of this config to the overall results
         # (DataFrame.append was removed in pandas 2.0; concat is the equivalent)
         self.stats = pd.concat(
             [self.stats, pd.DataFrame.from_dict(results_dict)],
             ignore_index=True
         )
     # Save the stats for this run if specified
     if self.save_stats:
         # Save to CSV
         utils.df_to_csv(
             df=self.stats,
             path=self.base_folder,
             filename="hawks_stats"
         )
     # Save the best individual(s) and their associated config(s)
     if self.save_best_data:
         # Loop over each indiv in each config
         for config_num, indiv_list in enumerate(self.best_each_run):
             for run_num, indiv in enumerate(indiv_list):
                 # Save the best data
                 indiv.save_clusters(
                     folder=self.base_folder / "datasets",
                     fname=f"config-{config_num}_run-{run_num}_best_data"
                 )