def test_full_hawks_run_multiple(self):
    """Run the validation config twice with fresh generators and check the
    final statistics still match the stored reference, i.e. no state
    carries over between generator instances.
    """
    from pathlib import Path
    # Resolve paths relative to the installed package so the test does not
    # depend on the current working directory
    test_fpath = Path(hawks.__file__).parents[1] / "tests"
    gen = hawks.create_generator(test_fpath / "validation.json")
    gen.run()
    # Run a second time to ensure there's no carryover
    gen = hawks.create_generator(test_fpath / "validation.json")
    gen.run()
    res = gen.get_stats()
    known_result = pd.read_csv(test_fpath / "validation.csv", index_col=False)
    # Pandas can be iffy with data types, so compare numerically
    equals = np.allclose(res.values, known_result.values)
    self.assertTrue(equals)
def test_full_hawks_run_multiple(self):
    """Run the validation config twice with fresh generators and check the
    final statistics still match the stored reference, i.e. no state
    carries over between generator instances.
    """
    # Resolve paths relative to the installed package so the test does not
    # depend on the current working directory
    test_fpath = Path(hawks.__file__).parents[1] / "tests"
    gen = hawks.create_generator(test_fpath / "validation.json")
    gen.run()
    # Run a second time to ensure there's no carryover
    gen = hawks.create_generator(test_fpath / "validation.json")
    gen.run()
    res = gen.get_stats()
    known_result = pd.read_csv(test_fpath / "validation.csv", index_col=False)
    # Pandas can be iffy with data types, so compare numerically
    # (leftover debug prints of res/known_result removed)
    equals = np.allclose(res.values, known_result.values)
    self.assertTrue(equals)
def test_nested_config_arg(self):
    """A deeply nested partial config should propagate into the full config."""
    partial_config = {
        "constraints": {
            "overlap": {
                "limit": "TEST",
            },
        },
    }
    gen = hawks.create_generator(partial_config)
    self.assertEqual(
        gen.full_config["constraints"]["overlap"]["limit"], "TEST"
    )
def test_full_hawks_run(self):
    """Run the validation config once and compare the resulting stats
    against the stored reference CSV.
    """
    from pathlib import Path
    # Resolve paths relative to the installed package so the test does not
    # depend on the current working directory
    test_fpath = Path(hawks.__file__).parents[1] / "tests"
    gen = hawks.create_generator(test_fpath / "validation.json")
    gen.run()
    res = gen.get_stats()
    known_result = pd.read_csv(test_fpath / "validation.csv", index_col=False)
    # Pandas can be iffy with data types, so compare numerically
    # (leftover debug prints of res/known_result removed)
    equals = np.allclose(res.values, known_result.values)
    self.assertTrue(equals)
def test_full_hawks_run(self):
    """Full run of the validation config, with the stats compared
    numerically against the stored reference CSV.
    """
    # Locate the bundled test files relative to the package install
    tests_dir = Path(hawks.__file__).parents[1] / "tests"
    generator = hawks.create_generator(tests_dir / "validation.json")
    generator.run()
    result = generator.get_stats()
    expected = pd.read_csv(tests_dir / "validation.csv", index_col=False)
    print("Result:")
    print(result)
    print("Known result:")
    print(expected)
    print("---")
    # Pandas can be iffy with data types
    self.assertTrue(np.allclose(result.values, expected.values))
def test_incorrect_config_arg(self):
    """An unrecognized config key must raise a ValueError."""
    bad_config = {
        "hawks": {
            "seed_num": 4,
            "num_runs": 1,
        },
        "objectives": {
            "silhouette": {
                "target": 0.9,
            },
        },
        "constraints": {
            "eigenval_ratio": {
                "lim": "upper"  # <--- error
            },
        },
    }
    with self.assertRaises(ValueError):
        hawks.create_generator(bad_config)
def test_multiconfig_deep(self):
    """List-valued params multiply out: 3 * 2 * 4 * 2 = 48 configs."""
    config = {
        "dataset": {
            "num_examples": [10, 100, 1000],
        },
        "constraints": {
            "overlap": {
                "limit": ["upper", "lower"],
            },
        },
        "ga": {
            "num_gens": [50, 100, 10, 200],
            "mut_args_mean": {
                "dims": ["each", "all"],
            },
        },
    }
    generator = hawks.create_generator(config)
    num_configs, _, _ = generator._count_multiconfigs()
    self.assertEqual(num_configs, 48)
def main(ndim, nclusters, run_id):
    """Generate a dataset with HAWKS and score a KMeans clustering on it.

    Args:
        ndim: Number of dimensions for the generated dataset.
        nclusters: Number of clusters to generate.
        run_id: Run identifier (not referenced in this body).
    """
    config = {
        "hawks": {
            "folder_name": "",
            "save_best_data": True,
        },
        "dataset": {
            "num_clusters": nclusters,
            "num_dims": ndim,
        },
        "ga": {
            "num_gens": 500,
        },
        "constraints": {
            "overlap": {
                "threshold": 0.3,
                "limit": "lower",
            },
        },
    }
    generator = hawks.create_generator(config)
    print(generator.folder_name)
    print(generator.save_best_data)
    generator.run()
    # get_best_dataset returns lists (one entry per run); take the first
    datasets, label_sets = generator.get_best_dataset()
    data, labels = datasets[0], label_sets[0]
    # Cluster with KMeans, using the true number of clusters
    true_k = len(np.unique(labels))
    km = KMeans(n_clusters=true_k, random_state=0).fit(data)
    # External (ARI) and internal (silhouette) validation of the clustering
    ari = adjusted_rand_score(labels, km.labels_)
    sil = silhouette_score(data, km.labels_)
    print(f"ARI: {ari}, SIL: {sil}")
    generator.plot_best_indivs(show=True)
""" import seaborn as sns import hawks # Create the generator generator = hawks.create_generator({ "hawks": { "seed_num": 42, "num_runs": 5 }, "dataset": { "num_clusters": 5 }, "objectives": { "silhouette": { "target": 0.9 } }, "constraints": { "overlap": { "threshold": 0.05, "limit": "lower" } } }) # Run HAWKS! generator.run() # Make a dictionary of options common to both plots converg_kws = dict(show=True, xlabel="Generation", ci="sd", legend_type=None) # Make the font etc. larger sns.set_context("talk")
from pathlib import Path

import hawks

# Regenerate the reference statistics used by the validation tests
generator = hawks.create_generator("validation.json")
generator.run()
# Dump the run statistics to validation.csv in the current directory
hawks.utils.df_to_csv(
    generator.stats,
    Path.cwd(),
    "validation",
)
def setUp(self):
    """Build a generator from the bundled validation config and collect
    the initial population for each test.
    """
    from pathlib import Path
    # Resolve the config relative to the package so the test does not
    # depend on the current working directory
    config_path = Path(hawks.__file__).parents[1] / "tests" / "validation.json"
    self.gen = hawks.create_generator(config_path)
    # Collect all individuals produced by the generator
    self.init_pop = list(self.gen.create_individual())
from sklearn.datasets import make_blobs, make_moons
import seaborn as sns
import hawks

# Experiment constants
SEED_NUM = 10
NUM_RUNS = 10 # May take a few minutes
NUM_CLUSTERS = 5

# Half the runs go to each silhouette target (0.5 and 0.9), so num_runs is
# halved to keep the total dataset count comparable with the other sources
generator = hawks.create_generator({
    "hawks": {
        "seed_num": SEED_NUM,
        "num_runs": int(NUM_RUNS / 2) # for parity
    },
    "dataset": {
        "num_clusters": NUM_CLUSTERS
    },
    "objectives": {
        "silhouette": {
            "target": [0.5, 0.9]
        }
    }
})
generator.run()
# Analyse the hawks datasets
df, _ = hawks.analysis.analyse_datasets(generator=generator, source="HAWKS", seed=SEED_NUM, save=False)
# Make the blobs datasets
# (datasets/label_sets are filled in by code past the end of this chunk)
datasets = []
label_sets = []
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import hawks

# Fix the seed number so the run is reproducible
SEED_NUM = 42

# Any missing parameters will take the default seen in configs/defaults.json
generator = hawks.create_generator({
    "hawks": {
        "seed_num": SEED_NUM
    }
})
# Run the generator
generator.run()

# The best dataset and its labels are stored as lists (one per run)
datasets, label_sets = generator.get_best_dataset()
data, labels = datasets[0], label_sets[0]

# Cluster the data with KMeans, using the true number of clusters
kmeans = KMeans(
    n_clusters=len(np.unique(labels)),
    random_state=SEED_NUM,
).fit(data)

# Score the clustering against the true labels
ari = adjusted_rand_score(labels, kmeans.labels_)
print(f"ARI: {ari}")
def setUp(self):
    """Build a generator from the bundled validation config and collect
    the initial population for each test.
    """
    config_path = Path(hawks.__file__).parents[1] / "tests" / "validation.json"
    self.gen = hawks.create_generator(config_path)
    # Collect all individuals produced by the generator
    self.init_pop = [indiv for indiv in self.gen.create_individual()]
# Use a HAWKS and sklearn example, and show the performance (boxplot) for HAWKS, moons, and blobs from pathlib import Path from sklearn.datasets import make_blobs, make_moons import hawks SEED_NUM = 42 SAVE_FOLDER = Path.cwd() NUM_RUNS = 5 NUM_CLUSTERS = 5 generator = hawks.create_generator({ "hawks": { "seed_num": SEED_NUM, "num_runs": NUM_RUNS }, "dataset": { "num_clusters": NUM_CLUSTERS } }) generator.run() # Analyse the hawks datasets df, _ = hawks.analysis.analyse_datasets(generator=generator, source="HAWKS", seed=SEED_NUM, save=False) # Make the blobs datasets datasets = [] label_sets = [] for run in range(NUM_RUNS): data, labels = make_blobs(n_samples=1000, n_features=2,