Ejemplo n.º 1
0
 def create_individual(self):
     """Yield individuals from a freshly generated initial population.

     Hacky helper that does the bare minimum setup (RNG, dataset, Cluster
     class variables, GA) for a single config and then yields each member of
     the initial population. Useful for debugging, or for just playing with a
     single dataset/individual.

     Yields:
         The members of the initial population returned by ``setup_ga``.

     Raises:
         ValueError: If ``multi_config`` is set; a single config is needed.
             Because this is a generator function, the error surfaces when
             iteration starts, not when the method is called.
     """
     if self.multi_config:
         raise ValueError("Not available in multi_config mode - Need a single config to generate an individual from")
     # Create the RandomState instance
     self.set_global_rng(self.seed_num)
     # Create the Dataset instance
     dataset_obj = Dataset(**self.full_config["dataset"])
     # Setup some attributes for the Cluster class
     Cluster.setup_variables(dataset_obj, self.full_config["ga"])
     # Setup the GA (the objective dict it also returns is not needed here)
     _, pop = self.setup_ga(dataset_obj, self.full_config)
     # Take from the initial population
     yield from pop
Ejemplo n.º 2
0
def generate_indiv(icls, dataset_obj):
    """Build a new individual with one cluster per requested cluster size.

    Args:
        icls: Callable that wraps a list of clusters into an individual
            (per the original comment, the DEAP wrapper around Genotype()).
        dataset_obj: Object exposing ``cluster_sizes``, an iterable of sizes.

    Returns:
        The new individual, after its views have been created and its
        initial values sampled.
    """
    # One Cluster per entry in cluster_sizes, in order
    clusters = [Cluster(n_points) for n_points in dataset_obj.cluster_sizes]
    new_indiv = icls(clusters)
    # Each cluster.values becomes a view into genotype.all_values
    new_indiv.create_views()
    # Draw the initial values
    new_indiv.resample_values()
    return new_indiv
Ejemplo n.º 3
0
 def setUp(self):
     """Create a seeded four-cluster individual and run the Silhouette setup on it."""
     # Both classes share a single RandomState seeded with 42
     shared_rng = np.random.RandomState(42)
     Genotype.global_rng = shared_rng
     Cluster.global_rng = shared_rng
     # One cluster per size, in this order
     cluster_sizes = (190, 20, 30, 110)
     clusters = [Cluster(n_points) for n_points in cluster_sizes]
     self.indiv = Genotype(clusters)
     self.indiv.create_views()
     self.indiv.resample_values()
     hawks.objectives.Silhouette.setup_indiv(self.indiv)
Ejemplo n.º 4
0
    def test_eigenratio_eccentric(self):
        """Check eigenval_ratio is 10 for clusters with covariances diag(10, 1) and diag(9.9, 1)."""
        # First cluster: the more eccentric covariance
        more_eccentric = Cluster(50)
        more_eccentric.mean = np.array([0, 0])
        more_eccentric.cov = np.array([[10, 0], [0, 1]])

        # Second cluster: slightly less eccentric, same mean
        less_eccentric = Cluster(80)
        less_eccentric.mean = np.array([0, 0])
        less_eccentric.cov = np.array([[9.9, 0], [0, 1]])

        genotype = Genotype([more_eccentric, less_eccentric])
        ratio = hawks.constraints.eigenval_ratio(genotype)

        self.assertEqual(ratio, 10)
Ejemplo n.º 5
0
 def setUp(self):
     """Create a seeded four-cluster individual after configuring Cluster class attributes."""
     # Both classes share a single RandomState seeded with 42
     shared_rng = np.random.RandomState(42)
     Genotype.global_rng = shared_rng
     Cluster.global_rng = shared_rng
     cluster_sizes = (190, 20, 30, 110)
     # Class-level settings assigned before any Cluster is constructed
     Cluster.num_dims = 2
     Cluster.initial_mean_upper = 1.0
     Cluster.initial_cov_upper = 0.5
     clusters = [Cluster(n_points) for n_points in cluster_sizes]
     self.indiv = Genotype(clusters)
     self.indiv.create_views()
     self.indiv.resample_values()
     hawks.objectives.Silhouette.setup_indiv(self.indiv)
Ejemplo n.º 6
0
    def test_overlap_same(self):
        """Two identically parameterised clusters should give an overlap of about 0.5."""
        # Both clusters: 4000 points, zero mean, identity covariance
        first = Cluster(4000)
        first.mean = np.array([0, 0])
        first.cov = np.array([[1, 0], [0, 1]])

        second = Cluster(4000)
        second.mean = np.array([0, 0])
        second.cov = np.array([[1, 0], [0, 1]])

        genotype = Genotype([first, second])
        genotype.create_views()
        genotype.resample_values()

        measured = hawks.constraints.overlap(genotype)

        # Only checked to one decimal place
        self.assertAlmostEqual(measured, 0.5, places=1)
Ejemplo n.º 7
0
    def test_overlap_separated(self):
        """Clusters centred at (0, 0) and (10, 10) with identity covariance should give zero overlap."""
        near_origin = Cluster(50)
        near_origin.mean = np.array([0, 0])
        near_origin.cov = np.array([[1, 0], [0, 1]])

        far_away = Cluster(80)
        far_away.mean = np.array([10, 10])
        far_away.cov = np.array([[1, 0], [0, 1]])

        genotype = Genotype([near_origin, far_away])
        genotype.create_views()
        genotype.resample_values()

        measured = hawks.constraints.overlap(genotype)

        self.assertEqual(measured, 0)
Ejemplo n.º 8
0
    def test_silhouette_singleton_cluster(self):
        """Compare the Silhouette objective with silhouette_score when one cluster has a single point."""
        shared_rng = np.random.RandomState(42)
        Genotype.global_rng = shared_rng
        Cluster.global_rng = shared_rng
        # The first cluster is a singleton
        cluster_sizes = (1, 20, 30, 110)
        clusters = [Cluster(n_points) for n_points in cluster_sizes]
        self.indiv = Genotype(clusters)
        self.indiv.create_views()
        self.indiv.resample_values()
        hawks.objectives.Silhouette.setup_indiv(self.indiv)

        hawks.objectives.Silhouette.eval_objective(self.indiv)

        # Reference value computed with the squared Euclidean metric
        reference = silhouette_score(
            self.indiv.all_values,
            self.indiv.labels,
            metric="sqeuclidean",
        )
        matches_reference = np.isclose(reference, self.indiv.silhouette)

        self.assertTrue(matches_reference)
Ejemplo n.º 9
0
    def setUp(self):
        """Create two seeded two-cluster individuals with fixed means and covariances."""
        shared_rng = np.random.RandomState(42)
        Cluster.global_rng = shared_rng
        Genotype.global_rng = shared_rng

        Cluster.num_dims = 2

        def make_cluster(size, mean, cov):
            # Construct first, then assign mean and cov, in that order
            cluster = Cluster(size)
            cluster.mean = np.array(mean)
            cluster.cov = np.array(cov)
            return cluster

        # First individual
        self.indiv1 = Genotype([
            make_cluster(50, [0, 0], [[1, 0], [0, 1]]),
            make_cluster(30, [5, 5], [[5, 0], [0, 10]]),
        ])
        self.indiv1.create_views()
        self.indiv1.resample_values()

        # Second individual, built only after the first has been resampled
        self.indiv2 = Genotype([
            make_cluster(50, [2, 2], [[2, 0], [0, 2]]),
            make_cluster(30, [10, 10], [[4, 0], [0, 2]]),
        ])
        self.indiv2.create_views()
        self.indiv2.resample_values()
Ejemplo n.º 10
0
 def animate(self, record_stats=False, plot_pop=True, **kwargs):
     """Function to animate a run of HAWKS (showing how the datasets evolve). An example of this can be found in the README. Produces a series of PNGs, and creates a gif using `ImageMagick <https://imagemagick.org/index.php>`_.

     One frame is saved per generation into ``animate_run<N>`` inside the plot folder. Population frames are all named ``pop_gen-<gen>`` and best-individual frames ``indiv_gen-<gen>``, so that the version sort used when building the gif orders them by generation. The gif (``hawks_animation.gif``) is only built when ``convert`` is found on the PATH, and only in the folder of the last run.

     Args:
         record_stats (bool, optional): Whether the results of the run should be recorded (and therefore can be saved, depending on the config). Defaults to False.
         plot_pop (bool, optional): Whether to plot the whole population. If False, just plots the best individual. Defaults to True.
         **kwargs: Extra keyword arguments forwarded to the plotting function.

     Raises:
         ValueError: Animation cannot be run for a multi_config; only a single set of parameters is permitted.
     """
     # Raise error if multi-config specified
     if self.multi_config:
         raise ValueError("Animation is not implemented for multi-config")

     def _save_frame(folder, pop, gen):
         """Save the frame for generation ``gen`` into ``folder``."""
         if plot_pop:
             # Plot the whole population
             plotting.plot_pop(
                 pop,
                 fpath=folder / f"pop_gen-{gen}",
                 fig_format="png",
                 save=True,
                 remove_axis=True,
                 fig_title=f"Generation {gen}",
                 show=False,
                 **kwargs
             )
         else:
             # Plot the best indiv
             best_indiv, _ = self._best_in_pop(pop)
             plotting.plot_indiv(
                 best_indiv,
                 fpath=folder / f"indiv_gen-{gen}",
                 remove_axis=True,
                 save=True,
                 show=False,
                 **kwargs
             )

     # Perform initial setup (the returned values are not needed here)
     self._setup()
     # Setup the containers for storing results
     if record_stats:
         num_rows = self.full_config["ga"]["num_indivs"]
         results_dict = defaultdict(list)
     # Setup the plot folder
     plot_folder = self._plot_save_setup()
     # Stays None if there are no runs, so the gif step can be skipped
     animate_folder = None
     # Loop over each run
     for num_run in tqdm(range(self.num_runs), desc="Runs", leave=False):
         animate_folder = plot_folder / f"animate_run{num_run}"
         animate_folder.mkdir(exist_ok=True, parents=True)
         # Super special seed selection
         global_seed = self.increment_seed(num_run)
         # Create the RandomState instance
         self.set_global_rng(global_seed)
         # Create the Dataset instance
         dataset_obj = Dataset(**self.full_config["dataset"])
         # Setup some attributes for the Cluster class
         Cluster.setup_variables(dataset_obj, self.full_config["ga"])
         # Setup the GA
         objective_dict, pop = self.setup_ga(dataset_obj, self.full_config)
         # Plot the initial population (or its best indiv)
         _save_frame(animate_folder, pop, 0)
         if record_stats:
             # Store results from the initial population
             results_dict = self._store_results(
                 results_dict, pop, num_run, 0, num_rows, objective_dict
             )
         # Go through each generation
         for gen in tqdm(
                 range(1, self.full_config["ga"]["num_gens"]),
                 desc="Generations", leave=False
             ):
             pop = ga.generation(
                 pop,
                 self.deap_toolbox,
                 self.full_config["constraints"],
                 cxpb=self.full_config["ga"]["mate_prob"]
             )
             # Plot the population (or its best indiv) for this generation
             _save_frame(animate_folder, pop, gen)
             if record_stats:
                 # Store results from each generation
                 results_dict = self._store_results(
                     results_dict, pop, num_run, gen, num_rows, objective_dict
                 )
             # Keep a reference to the most recent population
             self.population = pop
     if record_stats:
         # DataFrame.append was removed in pandas 2.0; concat is the replacement
         self.stats = pd.concat(
             [self.stats, pd.DataFrame.from_dict(results_dict)],
             ignore_index=True
         )
         if self.save_stats:
             # Save to CSV
             utils.df_to_csv(
                 df=self.stats,
                 path=self.base_folder,
                 filename="hawks_stats"
             )
     # Create the gif if convert is available and at least one run happened
     if animate_folder is not None and shutil.which("convert") is not None:
         # Static command string (no user input); the shell is needed for the backticks
         subprocess.run(
             "convert -resize 50% -delay 30 -loop 0 `ls -v | grep 'gen-'` hawks_animation.gif",
             shell=True,
             check=True,
             cwd=animate_folder
         )
Ejemplo n.º 11
0
 def run_step(self):
     """Run function that contains the actual code, yielding after each run, if desired.

     For every config, each run sets up the RNG, dataset and GA, evolves the population for the configured number of generations, and records the results. The stats for a config are added to ``self.stats`` once all of its runs are done. After all configs have been processed, the stats and the best individuals are saved if ``save_stats`` / ``save_best_data`` are set.

     Yields:
         :class:`~hawks.generator.SingleObjective`: The generator instance at the time, allowing inspection of the process.
     """
     total_configs, key_paths, param_lists = self._setup()
     # Loop over each config; the enumeration index is the config_id
     configs = tqdm(self._get_configs(key_paths, param_lists), desc="Configs", total=total_configs)
     for config_id, (params, config) in enumerate(configs):
         # Add the config to the list
         self.config_list.append(config)
         # Add a list as container for new runs
         self.best_each_run.append([])
         # Setup the containers for storing results
         num_rows = config["ga"]["num_indivs"]
         num_gens = config["ga"]["num_gens"]
         # One row per individual, per generation, per run
         rows_per_config = num_rows * num_gens * self.num_runs
         results_dict = defaultdict(list)
         # Add the config_id, which is also used for the filename when saving the config
         results_dict["config_id"] = [config_id] * rows_per_config
         # Add the specific parameters for this config
         if self.multi_config:
             for key, param in zip(key_paths, params):
                 # Column name is the key path without its first element
                 name = "_".join(key[1:])
                 results_dict[name] += [param] * rows_per_config
         # Loop over each run
         for num_run in tqdm(range(self.num_runs), desc="Runs", leave=False):
             # Increment the seed for this run
             global_seed = self.increment_seed(num_run)
             # Create the RandomState instance
             self.set_global_rng(global_seed)
             # Create the Dataset instance
             dataset_obj = Dataset(**config["dataset"])
             # Setup some attributes for the Cluster class
             Cluster.setup_variables(dataset_obj, config["ga"])
             # Setup the GA
             objective_dict, pop = self.setup_ga(dataset_obj, config)
             # Store results from the initial population
             results_dict = self._store_results(
                 results_dict, pop, num_run, 0, num_rows, objective_dict
             )
             # Go through each generation
             for gen in tqdm(
                     range(1, num_gens),
                     desc="Generations", leave=False
                 ):
                 pop = ga.generation(
                     pop,
                     self.deap_toolbox,
                     config["constraints"],
                     cxpb=config["ga"]["mate_prob"]
                 )
                 # Store results from each generation
                 results_dict = self._store_results(
                     results_dict, pop, num_run, gen, num_rows, objective_dict
                 )
             best_indiv_run, best_index = self._best_in_pop(pop)
             # Store the best indiv from each run
             self.best_each_run[-1].append(best_indiv_run)
             # Add column to show best dataset from run
             results_dict = self._store_best_indiv(
                 results_dict, best_index, num_gens, num_rows
             )
             # Keep a reference to the most recent population
             self.population = pop
             # Hand control back to the caller after every run
             yield self
         # Append the results of this config to the overall results
         # (DataFrame.append was removed in pandas 2.0; concat is the replacement)
         self.stats = pd.concat(
             [self.stats, pd.DataFrame.from_dict(results_dict)],
             ignore_index=True
         )
     # Save the stats for this run if specified
     if self.save_stats:
         # Save to CSV
         utils.df_to_csv(
             df=self.stats,
             path=self.base_folder,
             filename="hawks_stats"
         )
     # Save the best individual(s) and their associated config(s)
     if self.save_best_data:
         # Loop over each indiv in each config
         for config_num, indiv_list in enumerate(self.best_each_run):
             for run_num, indiv in enumerate(indiv_list):
                 # Save the best data
                 indiv.save_clusters(
                     folder=self.base_folder / "datasets",
                     fname=f"config-{config_num}_run-{run_num}_best_data"
                 )