def run(self, data1: Data, data2: Data) -> ClusterMatcherResult:
    """Rename the clusters of ``data2`` so that they match ``data1``.

    Both data objects are deep-copied and restricted to the index labels
    that they have in common. Every cluster of the second data object is
    then renamed to the cluster of the first data object that occurs most
    often among the rows belonging to it.

    Args:
        data1: Data object that provides the reference cluster names.
        data2: Data object whose clusters get renamed.

    Returns:
        :class:`ClusterMatcherResult` built from the two copied (and
        restricted) data objects and the rename dictionary.
    """
    # todo [perf, low effort, med prio]: for speedup: only use pd.Series of
    #  clusters
    ndata1 = data1.copy(deep=True)
    ndata2 = data2.copy(deep=True)

    # 1. Throw out rows that are not available in both data objects.
    # Use Index.intersection rather than a Python set: sets are unordered
    # and recent pandas versions refuse them as ``.loc`` indexers.
    shared_index = ndata1.df.index.intersection(ndata2.df.index)
    ndata1.df = ndata1.df.loc[shared_index]
    ndata2.df = ndata2.df.loc[shared_index]

    # 2. Rename clusters
    clusters1 = ndata1.df[self.cluster_column]
    clusters2 = ndata2.df[self.cluster_column]
    dct = {}
    for cluster2 in set(clusters2):
        mask = clusters2 == cluster2
        # Most frequent cluster of data1 among the rows of this cluster.
        # np.bincount only accepts non-negative integer labels.
        dct[cluster2] = np.argmax(np.bincount(clusters1[mask]))

    ndata2.df[self.cluster_column] = clusters2.map(dct)

    return ClusterMatcherResult(data1=ndata1, data2=ndata2, rename_dct=dct)
def setUp(self):
    """Create four ``Data`` objects that only carry a ``cluster`` column."""
    cluster_assignments = (
        ("d1", [1, 1, 2, 2, 3]),
        ("d2", [2, 2, 3, 3, 1]),
        ("d3", [2, 1, 2, 2, 3]),
        ("d4", [4, 1, 2, 2, 3]),
    )
    # Instantiate all objects first, then attach the data frames.
    for attribute, _ in cluster_assignments:
        setattr(self, attribute, Data())
    for attribute, clusters in cluster_assignments:
        getattr(self, attribute).df = pd.DataFrame({"cluster": clusters})
def test_run_identity(self):
    """Scan parameter ``a`` with ``func_identity`` and check the table."""
    scanner = Scanner()
    data = Data()
    scanner.set_spoints_equidist({"a": (0, 1, 2)})
    scanner.set_dfunction(func_identity)
    scan_result = scanner.run(data)
    scan_result.write()

    column_names = sorted(data.df.columns)
    self.assertEqual(column_names, ["a", "bin0"])

    expected_values = np.array([[0.0, 0.0], [1.0, 1.0]])
    self.assertAllClose(data.df.values, expected_values)

    # Writing the result to disk must not throw either.
    output_path = Path(self.tmpdir.name) / "test.sql"
    data.write(output_path)
def test_dress_rehearsal(tmp_path):
    """Chain scanning, writing, reloading, clustering and benchmarking."""
    scanner = WilsonScanner(scale=5, eft="WET", basis="flavio")
    scanner.set_dfunction(
        random_kinematics,
        sampling=np.linspace(0.0, 1.0, 10),
        normalize=True,
    )
    scanner.set_no_workers(no_workers=1)
    scanner.set_spoints_equidist(
        {
            "CVL_bctaunutau": (-0.5, 0.5, 3),
            "CSL_bctaunutau": (-0.5, 0.5, 3),
            "CT_bctaunutau": (-0.1, 0.1, 3),
        }
    )
    data = Data()
    scan_result = scanner.run(data)
    scan_result.write()

    # Can remove str casting once we remove py3.5 support
    sql_path = str(tmp_path / "dress_rehearsal.sql")
    data.write(sql_path, overwrite="overwrite")

    # Reload what was just written and attach uncertainties.
    data_with_errors = DataWithErrors(sql_path)
    data_with_errors.add_rel_err_uncorr(0.01)
    data_with_errors.add_err_poisson(1000)

    cluster = HierarchyCluster()
    cluster.set_metric(chi2_metric)
    benchmark = Benchmark()
    benchmark.set_metric(chi2_metric)
    cluster.set_max_d(1)

    cluster.run(data_with_errors).write()
    benchmark.run(data_with_errors).write()
def test_run_simple_bins_singlecore(self):
    """Scan ``func_zero_bins`` with a binning using a single worker."""
    scanner = Scanner()
    data = Data()
    scanner.set_spoints_equidist({"a": (0, 1, 2)})
    scanner.set_dfunction(func_zero_bins, binning=[0, 1, 2])
    scanner.set_no_workers(1)
    scan_result = scanner.run(data)
    scan_result.write()

    column_names = sorted(data.df.columns)
    self.assertEqual(column_names, ["a", "bin0", "bin1"])

    expected_values = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])
    self.assertAllClose(data.df.values, expected_values)

    # Writing the result to disk must not throw either.
    output_path = Path(self.tmpdir.name) / "test.sql"
    data.write(output_path)
def run(
    self,
    data: Data,
    cluster: Cluster,
    benchmark: Optional[AbstractBenchmark] = None,
) -> SubSampleStabilityTesterResult:
    """Run test.

    A deep copy of ``data`` is clustered once as the reference. Then,
    ``self._repeat`` times, a random subsample of ``data`` is drawn,
    clustered (and benchmarked, if a benchmark is given) and compared to
    the reference with every configured figure of merit (FOM).

    Args:
        data: :class:`~clusterking.data.Data` object
        cluster: Pre-configured :class:`~clusterking.cluster.Cluster`
            object
        benchmark: Optional: Pre-configured benchmark object that is run
            on every subsample

    Returns:
        :class:`SubSampleStabilityTesterResult` object

    Raises:
        ValueError: If no sampling options have been configured.
    """
    if not self._sample_kwargs:
        raise ValueError(
            "You need to configure sampling with set_sampling before "
            "you can run this method."
        )

    reference = data.copy(deep=True)
    cluster.run(reference).write()

    if self._progress_bar:
        repetitions = tqdm.auto.tqdm(range(self._repeat))
    else:
        repetitions = range(self._repeat)

    fom_values = collections.defaultdict(list)

    with_benchmark = benchmark is not None
    sampling_options = copy.deepcopy(self._sample_kwargs)
    if with_benchmark and "bpoints" not in self._sample_kwargs:
        # Unless configured otherwise, request bpoints in the sampling
        # whenever a benchmark is run on the subsamples.
        sampling_options["bpoints"] = True

    for _ in repetitions:
        subsample = data.sample_param_random(**sampling_options)
        cluster.run(subsample).write()
        if with_benchmark:
            benchmark.run(subsample).write()
        for fom_name, fom in self._foms.items():
            try:
                value = fom.run(reference, subsample).fom
            except ValueError:
                # A FOM that cannot be evaluated is recorded as -1.
                value = -1
            fom_values[fom_name].append(value)

    return SubSampleStabilityTesterResult(df=pd.DataFrame(fom_values))
def load(cls, directory: Union[str, PurePath],
         loader: Optional[Callable] = None) -> "NoisySampleResult":
    """Load from output directory

    All files matching ``data_*.sql`` in the directory are loaded in
    sorted order.

    Args:
        directory: Path to directory to load from
        loader: Function used to load data (optional). It is called with
            the path of each file; if not given, ``Data(path)`` is used.

    Raises:
        FileNotFoundError: If ``directory`` is not an existing directory.

    Example:

    .. code-block:: python

        def loader(path):
            d = clusterking.DataWithErrors(path)
            d.add_rel_err_uncorr(0.01)
            return d

        nsr = NoisySampleResult.load("/path/to/dir/", loader=loader)
    """
    directory = Path(directory)
    if not directory.is_dir():
        message = "{} does not exist or is not a directory".format(directory)
        raise FileNotFoundError(message)

    sample_paths = sorted(directory.glob("data_*.sql"))
    samples = [
        Data(sample_path) if loader is None else loader(sample_path)
        for sample_path in sample_paths
    ]
    return NoisySampleResult(samples=samples)
def setUp(self):
    """Create a configured ``WilsonScanner`` and an empty ``Data``."""
    scan_range = (-1, 1, 2)
    coefficients = ("CVL_bctaunutau", "CSL_bctaunutau", "CT_bctaunutau")

    self.s = WilsonScanner(scale=5, eft="WET", basis="flavio")
    self.s.set_spoints_equidist(
        {coefficient: scan_range for coefficient in coefficients}
    )
    self.s.set_dfunction(simple_func, binning=[0, 1, 2], normalize=True)
    self.d = Data()
def test_run_simple_bins_sample(self):
    """Scan ``func_sum_indentity_x`` with the ``sampling`` option."""
    scanner = Scanner()
    data = Data()
    scanner.set_spoints_equidist({"a": (0, 2, 3)})
    scanner.set_dfunction(func_sum_indentity_x, sampling=[0, 1, 2])
    scan_result = scanner.run(data)
    scan_result.write()

    column_names = sorted(data.df.columns)
    self.assertEqual(column_names, ["a", "bin0", "bin1", "bin2"])

    print(data.df.values)
    expected_values = np.array(
        [
            [0.0, 0.0, 0.0, 0.0],
            [1.0, 0.0, 1.0, 2.0],
            [2.0, 0.0, 2.0, 4.0],
        ]
    )
    self.assertAllClose(data.df.values, expected_values)

    # Writing the result to disk must not throw either.
    output_path = Path(self.tmpdir.name) / "test.sql"
    data.write(output_path)
def run(self, data1: Data, data2: Data) -> ClusterMatcherResult:
    """Rename the clusters of ``data2`` based on the cluster ordering.

    The order of the clusters of both data objects is obtained from
    ``self._get_order_of_clusters``. Each cluster of ``data2`` is renamed
    to the cluster of ``data1`` that has the same value in that ordering.

    Args:
        data1: Data object that provides the reference cluster names.
        data2: Data object whose clusters get renamed.

    Returns:
        :class:`ClusterMatcherResult` built from deep copies of both data
        objects and the rename dictionary.

    Raises:
        ValueError: If the two data objects do not contain the same
            number of distinct clusters.
    """
    ndata1 = data1.copy(deep=True)
    ndata2 = data2.copy(deep=True)

    n_distinct = [
        len(data.df[self.cluster_column].unique()) for data in (data1, data2)
    ]
    if n_distinct[0] != n_distinct[1]:
        raise ValueError("Cluster numbers don't match")

    order1 = self._get_order_of_clusters(data1)
    order2 = self._get_order_of_clusters(data2)

    # Look up clusters of data1 by their position in the ordering.
    cluster1_by_rank = {}
    for cluster1, rank in order1.items():
        cluster1_by_rank[rank] = cluster1

    rename_dct = {
        cluster2: cluster1_by_rank[order2[cluster2]] for cluster2 in order2
    }

    renamed = ndata2.df[self.cluster_column].map(rename_dct)
    ndata2.df[self.cluster_column] = renamed

    return ClusterMatcherResult(
        data1=ndata1, data2=ndata2, rename_dct=rename_dct
    )
def test(self):
    """Check the rename dictionary for permuted cluster names."""
    reference = Data()
    permuted = Data()
    reference.df = pd.DataFrame({"cluster": [1, 1, 2, 2, 3]})
    permuted.df = pd.DataFrame({"cluster": [2, 2, 3, 3, 1]})

    matcher = TrivialClusterMatcher()
    result = matcher.run(reference, permuted)

    expected_renaming = {2: 1, 3: 2, 1: 3}
    self.assertDictEqual(result.rename_dct, expected_renaming)
def test_sss(self):
    """Run the ``SubSampleStabilityTester`` on a small scan."""
    # Produce the data to be tested.
    data = Data()
    scanner = Scanner()
    scanner.set_no_workers(1)
    scanner.set_spoints_equidist({"a": (0, 1, 4)})
    scanner.set_dfunction(func_one)
    scan_result = scanner.run(data)
    scan_result.write()

    kmeans = KmeansCluster()
    kmeans.set_kmeans_options(n_clusters=2)

    # We only check that the tester runs without throwing.
    tester = SubSampleStabilityTester()
    tester.set_sampling(frac=0.95)
    tester.set_repeat(2)
    tester.run(data=data, cluster=kmeans)
def test_noisy_sample(self):
    """Generate noisy samples, write/reload them and run the tester."""
    data = Data()
    scanner = Scanner()
    scanner.set_no_workers(1)
    scanner.set_spoints_equidist({"a": (0, 1, 2)})
    scanner.set_dfunction(func_zero)

    noisy_sample = NoisySample()
    noisy_sample.set_repeat(1)
    noisy_sample.set_noise("gauss", mean=0.0, sigma=1 / 30 / 4)
    result = noisy_sample.run(scanner=scanner, data=data)
    self.assertEqual(len(result.samples), 2)

    # Round trip through the file system.
    result.write(self.tmpdir.name, non_empty="raise")
    reloaded = NoisySampleResult.load(self.tmpdir.name)
    for index in range(2):
        written_dict = result.samples[index].df.to_dict()
        loaded_dict = reloaded.samples[index].df.to_dict()
        self.assertDictEqual(written_dict, loaded_dict)

    kmeans = KmeansCluster()
    kmeans.set_kmeans_options(n_clusters=2)
    tester = NoisySampleStabilityTester()
    tester.run(result, cluster=kmeans)
class TestHierarchyCluster(MyTestCase):
    """Tests for ``HierarchyCluster`` based on the ``1d.sql`` test data."""

    def setUp(self):
        self.ddir = Path(__file__).parent / "data"
        self.dname = "1d.sql"
        self.d = Data(self.ddir / self.dname)

    def test_cluster(self):
        """Cluster with two different ``max_d`` values and count clusters."""
        d = self.d.copy()
        c = HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(0.75)
        c.run(d).write()
        c.set_max_d(1.5)
        c.run(d).write(cluster_column="cluster15")
        # The minimal distance between our distributions is 1, so they all
        # end up in different clusters
        self.assertEqual(len(d.clusters()), self.d.n)
        # This is a bit unfortunate, since we have so many distribution
        # pairs with equal distance (so it's up to the implementation of
        # the algorithm, which clusters develop) but this is what happened
        # so far:
        self.assertEqual(len(d.clusters(cluster_column="cluster15")), 6)

    def test_reuse_hierarchy(self):
        """Reusing the hierarchy must give the same cluster assignment."""
        d = self.d.copy()
        c = HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(1.5)
        r = c.run(d)
        r.write()
        r2 = c.run(d, reuse_hierarchy_from=r)
        r2.write(cluster_column="reused")
        self.assertListEqual(d.df["cluster"].tolist(), d.df["reused"].tolist())

    def test_reuse_hierarchy_fail_different_data(self):
        """Reusing a hierarchy with another data object must fail."""
        d = self.d.copy()
        e = self.d.copy()
        c = HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(1.5)
        r = c.run(d)
        r.write()
        with self.assertRaises(ValueError) as ex:
            c.run(e, reuse_hierarchy_from=r)
        self.assertIn("different data object", str(ex.exception))

    def test_reuse_hierarchy_fail_different_cluster(self):
        """Reusing a hierarchy with another cluster object must fail."""
        d = self.d.copy()
        c = HierarchyCluster()
        c2 = HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(1.5)
        c2.set_metric("euclidean")
        c2.set_max_d(1.5)
        r = c.run(d)
        r.write()
        with self.assertRaises(ValueError) as ex:
            # Pass the very same data object that ``r`` was produced from,
            # so that only the cluster object differs.
            c2.run(d, reuse_hierarchy_from=r)
        self.assertIn("different HierarchyCluster object", str(ex.exception))

    def test_hierarchy_cluster_no_max_d(self):
        """Running without configuring ``max_d`` must fail."""
        d = self.d.copy()
        c = HierarchyCluster()
        with self.assertRaises(ValueError) as ex:
            c.run(d)
        self.assertIn("set_max_d", str(ex.exception))

    def test_dendrogram_plot(self):
        """The dendrogram plot only has to run without throwing."""
        c = HierarchyCluster()
        c.set_metric()
        c.set_max_d(0.2)
        r = c.run(self.d)
        r.dendrogram()
def _data():
    """Load the ``1d.sql`` file from the ``data`` directory next to this file."""
    data_directory = Path(__file__).parent / "data"
    return Data(data_directory / "1d.sql")
def setUp(self):
    """Load ``1d_clustered.sql`` from the ``data`` directory."""
    here = Path(__file__).parent
    self.ddir = here / "data"
    self.dname = "1d_clustered.sql"
    data_path = self.ddir / self.dname
    self.d = Data(data_path)
def setUp(self):
    """Load ``test.sql`` and store the values expected for its bins."""
    fixture_path = Path(__file__).parent.joinpath("data", "test.sql")
    self.data = [[100, 200], [400, 500]]
    self.d = Data(fixture_path)
class TestData(MyTestCase):
    """Tests for ``Data`` based on the small ``test.sql`` file."""

    def setUp(self):
        fixture_path = Path(__file__).parent.joinpath("data", "test.sql")
        self.data = [[100, 200], [400, 500]]
        self.d = Data(fixture_path)

    def nd(self):
        """Return a deep copy of the loaded data object."""
        return self.d.copy(deep=True)

    # ----------------------------------------------------------------------
    # Property shortcuts
    # ----------------------------------------------------------------------

    def test_bin_cols(self):
        expected = ["bin0", "bin1"]
        self.assertEqual(self.d.bin_cols, expected)

    def test_par_cols(self):
        expected = ["CVL_bctaunutau", "CT_bctaunutau", "CSL_bctaunutau"]
        self.assertEqual(self.d.par_cols, expected)

    def test_n(self):
        self.assertEqual(self.d.n, 2)

    def test_nbins(self):
        self.assertEqual(self.d.nbins, 2)

    def test_npars(self):
        self.assertEqual(self.d.npars, 3)

    def test__dist_xrange(self):
        self.assertEqual(self.d._dist_xrange, (0, 20))

    # ----------------------------------------------------------------------
    # Returning things
    # ----------------------------------------------------------------------

    def test_data(self):
        self.assertAllClose(self.d.data(), self.data)

    def test_norms(self):
        self.assertAllClose(self.d.norms(), [300, 900])

    def test_clusters(self):
        default_clusters = self.d.clusters()
        self.assertEqual(default_clusters, [0])
        other_clusters = self.d.clusters(cluster_column="other_cluster")
        self.assertEqual(other_clusters, [0, 1])

    def test_get_param_values(self):
        parameter_names = ["CVL_bctaunutau", "CT_bctaunutau", "CSL_bctaunutau"]
        self.assertEqual(
            sorted(self.d.get_param_values().keys()),
            sorted(parameter_names),
        )
        cvl_values = self.d.get_param_values("CVL_bctaunutau")
        self.assertAlmostEqual(cvl_values[0], -1.0)
        ct_values = self.d.get_param_values("CT_bctaunutau")
        self.assertAlmostEqual(ct_values[1], 0.0)

    def test_data_normed(self):
        expected = [[1 / 3, 2 / 3], [4 / 9, 5 / 9]]
        self.assertAllClose(self.d.data(normalize=True), expected)

    # ----------------------------------------------------------------------
    # Subsample
    # ----------------------------------------------------------------------

    # see next class

    # ----------------------------------------------------------------------
    # Quick plots
    # ----------------------------------------------------------------------

    # We just check that they run without throwing.

    def test_plot_dist(self):
        self.d.plot_dist()

    def test_plot_dist_minmax(self):
        self.d.plot_dist_minmax()

    def test_plot_dist_box(self):
        self.d.plot_dist_box()

    def test_plot_clusters_scatter(self):
        self.d.plot_clusters_scatter()
        parameter_names = ["CVL_bctaunutau", "CT_bctaunutau", "CSL_bctaunutau"]
        # Plot with three, two and finally one parameter.
        for n_parameters in (3, 2, 1):
            self.d.plot_clusters_scatter(parameter_names[:n_parameters])

    def test_plot_clusters_fill(self):
        axes_parameters = ["CVL_bctaunutau", "CT_bctaunutau"]
        self.d.plot_clusters_fill(axes_parameters)
def setUp(self):
    """Load ``test_longer.sql`` from the ``data`` directory."""
    fixture_path = Path(__file__).parent.joinpath("data", "test_longer.sql")
    self.d = Data(fixture_path)
class TestSubSample(MyTestCase):
    """Tests for the subsampling methods of ``Data`` (``test_longer.sql``)."""

    def setUp(self):
        fixture_path = Path(__file__).parent.joinpath("data", "test_longer.sql")
        self.d = Data(fixture_path)

    def nd(self):
        """Return a deep copy of the loaded data object."""
        return self.d.copy(deep=True)

    def test_only_bpoints(self):
        self.assertEqual(self.d.only_bpoints().n, 1)
        for column, expected_n in (("bpoint1", 2), ("bpoint2", 3)):
            selection = self.d.only_bpoints(bpoint_column=column)
            self.assertEqual(selection.n, expected_n)

    def test_fix_param(self):
        # Each case: (keyword arguments, expected number of points,
        # expected remaining values per parameter).
        cases = (
            ({"a": 0}, 16, (("a", [0.0]),)),
            ({"a": -100}, 16, (("a", [0.0]),)),
            ({"a": 2.3}, 16, (("a", [2.0]),)),
            ({"a": [0, 2.3]}, 32, (("a", [0.0, 2.0]),)),
            (
                {"a": [0, 2.3], "b": 0},
                8,
                (("a", [0.0, 2.0]), ("b", [0.0])),
            ),
            (
                {"a": [0, 2.3], "b": 0, "c": 0.0},
                2,
                (("a", [0.0, 2.0]), ("b", [0.0]), ("c", [0.0])),
            ),
        )
        for fixed, expected_n, expected_values in cases:
            selection = self.d.fix_param(**fixed)
            self.assertEqual(selection.n, expected_n)
            for parameter, values in expected_values:
                self.assertAllClose(
                    selection.get_param_values(parameter), values
                )

    def test_fix_param_bpoints(self):
        selection = self.d.fix_param(a=[], bpoints=True)
        self.assertEqual(selection.n, 1)

        # All remaining cases use the "bpoint1" column.
        cases = (
            ({"a": []}, 2),
            ({"a": 0.0}, 16),
            ({"c": 0.0}, 17),
            ({"a": 0.0, "b": 0.0, "c": 0.0}, 2),
        )
        for fixed, expected_n in cases:
            selection = self.d.fix_param(
                bpoints=True, bpoint_column="bpoint1", **fixed
            )
            self.assertEqual(selection.n, expected_n)

    def test_fix_param_bpoint_slices(self):
        selection = self.d.fix_param(a=[], bpoint_slices=True)
        self.assertEqual(selection.n, 16)

        selection = self.d.fix_param(
            c=[], bpoint_slices=True, bpoint_column="bpoint2"
        )
        self.assertEqual(selection.n, 3 * 16)

        selection = self.d.fix_param(
            a=[], b=[], c=[], bpoint_slices=True, bpoint_column="bpoint2"
        )
        self.assertEqual(selection.n, 3)

    def test_sample_param(self):
        # Each case: (keyword arguments, expected number of points).
        cases = (
            ({"a": 0}, 0),
            ({"a": 3}, 3 * 4 * 4),
            ({"a": 4}, 4 * 4 * 4),
            ({"a": 10}, 4 * 4 * 4),
            ({"a": 3, "b": 3, "c": 3}, 3 * 3 * 3),
            ({"a": (0, 0.4, 3)}, 1 * 4 * 4),
            ({"a": (0, 1, 3)}, 2 * 4 * 4),
            ({"a": (0, 1, 3), "b": 2, "c": 2}, 2 * 2 * 2),
            ({"a": (0, 1, 3), "b": (0, 1, 3), "c": 2}, 2 * 2 * 2),
        )
        for sampling, expected_n in cases:
            sample = self.d.sample_param(**sampling)
            self.assertEqual(sample.n, expected_n)

    def test_sample_param_bpoints(self):
        sample = self.d.sample_param(a=0, bpoints=True)
        self.assertEqual(sample.n, 1)

        sample = self.d.sample_param(
            a=0, bpoints=True, bpoint_column="bpoint2"
        )
        self.assertEqual(sample.n, 3)

    def test_sample_param_bpoint_slices(self):
        sample = self.d.sample_param(a=0, bpoint_slices=True)
        self.assertEqual(sample.n, 16)

        sample = self.d.sample_param(
            a=0, bpoint_slices=True, bpoint_column="bpoint2"
        )
        self.assertEqual(sample.n, 16)

    def test_sample_param_random(self):
        sample = self.d.sample_param_random(n=5)
        self.assertEqual(sample.n, 5)

    def test_find_closest_spoints(self):
        def closest(point, n):
            """Return the parameter values of the ``n`` closest points."""
            found = self.d.find_closest_spoints(point=point, n=n)
            return found.df[["a", "b", "c"]].values

        origin = dict(a=0, b=0, c=0)

        self.assertAllClose(closest(origin, 1), np.array([0, 0, 0]))

        self.assertAllClose(
            sorted(closest(origin, 4).tolist()),
            [[0, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0]],
        )

        self.assertAllClose(
            sorted(closest(dict(a=0, b=1, c=0), 5).tolist()),
            [[0, 0, 0], [0, 1, 0], [0, 1, 1], [0, 2, 0], [1, 1, 0]],
        )