def test_perform_clustering(self):
    """
    Check the cluster code file written by Clusterer.perform_clustering.

    The clustering is run twice: once on "feature_array.csv" with no non
    feature columns, and once on "feature_array_with_non_feat_cols.csv"
    with the non feature columns "id" and "param". After each run the
    output file must have the same content as either
    "sample_cluster_code.txt" or "sample_cluster_code_inv.txt".
    """
    test_dir_name = os.path.dirname(__file__)
    data_dir = os.path.join(test_dir_name, "data")
    output_fn = os.path.join(self.__class__.TMP_DIR, "code.txt")

    # Two reference files are accepted (presumably the same clustering
    # with the cluster labels swapped -- confirm against the fixtures).
    sample_output_filename = os.path.join(
        data_dir, "sample_cluster_code.txt")
    sample_inv_output_filename = os.path.join(
        data_dir, "sample_cluster_code_inv.txt")

    scenarios = [
        ("feature_array.csv", []),
        ("feature_array_with_non_feat_cols.csv", ["id", "param"]),
    ]
    for feat_array_name, non_feat_cols in scenarios:
        # Remove the output of any earlier run so that a stale file
        # cannot make this scenario pass without being rewritten.
        if os.path.exists(output_fn):
            os.remove(output_fn)

        feat_array_fn = os.path.join(data_dir, feat_array_name)
        clusterer = Clusterer(feat_array_fn, output_fn, non_feat_cols)
        clusterer.iter = 100
        clusterer.perform_clustering()

        # shallow=False forces a comparison of the file contents; the
        # default may declare files equal from os.stat() data alone.
        same_as_sample = filecmp.cmp(
            output_fn, sample_output_filename, shallow=False)
        same_as_inv = filecmp.cmp(
            output_fn, sample_inv_output_filename, shallow=False)
        self.assertTrue(
            same_as_sample or same_as_inv,
            "%s matches neither reference file for %s"
            % (output_fn, feat_array_name))
def test_determine_k(self):
    """
    Test the clusterer._determine_k function.

    Two data sets are checked: the "x"/"y" columns of
    "four_clusters.csv" must give 4, and the four sepal/petal columns of
    "iris.csv" must give 2. In both cases 9 is passed as the second
    argument (presumably the largest k to consider -- confirm against
    Clusterer._determine_k).
    """
    test_dir_name = os.path.dirname(__file__)
    data_dir = os.path.join(test_dir_name, "data")

    # (file name, feature columns, expected best k)
    scenarios = [
        ("four_clusters.csv", ["x", "y"], 4),
        ("iris.csv",
         ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"],
         2),
    ]
    for file_name, feat_cols, expected_k in scenarios:
        feat_array_fn = os.path.join(data_dir, file_name)
        df = pd.read_csv(feat_array_fn)
        feat_array = df[feat_cols].values
        # os.devnull is the platform's null device ("/dev/null" on
        # POSIX), so the test no longer hard-codes a POSIX-only path.
        clusterer = Clusterer(feat_array_fn, os.devnull, [])
        best_k = clusterer._determine_k(feat_array, 9)
        self.assertEqual(best_k, expected_k)
def test_determine_max_k(self):
    """
    Check Clusterer._determine_max_k on the sample feature array.

    The array is loaded from "data/feature_array.csv" next to this test
    module, skipping the first row, and the result is expected to be 2.
    """
    csv_path = os.path.join(
        os.path.dirname(__file__), "data", "feature_array.csv")
    # skiprows=1 drops the first line of the file (presumably the column
    # header -- confirm against the fixture).
    features = np.loadtxt(csv_path, delimiter=",", skiprows=1)
    max_k = Clusterer("_", "_", [])._determine_max_k(features)
    self.assertEqual(max_k, 2)
def test_determine_max_k(self):
    """
    Verify that _determine_max_k yields 2 for "feature_array.csv".

    The CSV file is read with numpy (comma separated, first row skipped)
    and the resulting array is handed directly to the method.
    """
    here = os.path.dirname(__file__)
    data_file = os.path.join(here, "data", "feature_array.csv")
    samples = np.loadtxt(data_file, skiprows=1, delimiter=",")

    # The array is passed in directly; the "_" file names are presumably
    # unused placeholders -- confirm against Clusterer.__init__.
    instance = Clusterer("_", "_", [])
    expected_max_k = 2
    self.assertEqual(instance._determine_max_k(samples), expected_max_k)
def test_perform_clustering(self):
    """
    Run Clusterer.perform_clustering and compare its output file with
    the stored reference files.

    First "feature_array.csv" is clustered with an empty list of non
    feature columns, then "feature_array_with_non_feat_cols.csv" with
    the non feature columns "id" and "param". Each run must produce a
    file whose content equals "sample_cluster_code.txt" or
    "sample_cluster_code_inv.txt".
    """
    test_dir_name = os.path.dirname(__file__)
    tmp_dir = self.__class__.TMP_DIR
    output_fn = os.path.join(tmp_dir, "code.txt")
    sample_output_filename = os.path.join(
        test_dir_name, "data", "sample_cluster_code.txt")
    sample_inv_output_filename = os.path.join(
        test_dir_name, "data", "sample_cluster_code_inv.txt")

    def run_and_check(feat_array_fn, non_feat_cols):
        # Start from a clean slate: a file left over from an earlier run
        # must not be able to satisfy the comparison below.
        if os.path.exists(output_fn):
            os.remove(output_fn)
        clusterer = Clusterer(feat_array_fn, output_fn, non_feat_cols)
        clusterer.iter = 100
        clusterer.perform_clustering()
        # Compare file contents (shallow=False); the default shallow
        # mode may accept files based on their os.stat() data only.
        same_as_sample = filecmp.cmp(
            output_fn, sample_output_filename, shallow=False)
        same_as_inv = filecmp.cmp(
            output_fn, sample_inv_output_filename, shallow=False)
        self.assertTrue(same_as_sample or same_as_inv)

    run_and_check(
        os.path.join(test_dir_name, "data", "feature_array.csv"), [])

    # Now test the data set with non feat cols
    run_and_check(
        os.path.join(
            test_dir_name, "data", "feature_array_with_non_feat_cols.csv"),
        ["id", "param"])
def test_determine_k(self):
    """
    Test the clusterer._determine_k function.

    The expected result is 4 for the "x" and "y" columns of
    "four_clusters.csv" and 2 for the sepal/petal length and width
    columns of "iris.csv". Both calls pass 9 as the second argument
    (presumably an upper bound for k -- confirm against the method).
    """
    test_dir_name = os.path.dirname(__file__)
    # Use the platform's null device instead of a hard-coded "/dev/null"
    # so the test does not depend on a POSIX-only path.
    null_output_fn = os.devnull

    feat_array_fn = os.path.join(test_dir_name, "data", "four_clusters.csv")
    df = pd.read_csv(feat_array_fn)
    feat_array = df[["x", "y"]].values
    clusterer = Clusterer(feat_array_fn, null_output_fn, [])
    best_k = clusterer._determine_k(feat_array, 9)
    self.assertEqual(best_k, 4)

    feat_array_fn = os.path.join(test_dir_name, "data", "iris.csv")
    df = pd.read_csv(feat_array_fn)
    feat_array = df[[
        "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"
    ]].values
    clusterer = Clusterer(feat_array_fn, null_output_fn, [])
    best_k = clusterer._determine_k(feat_array, 9)
    self.assertEqual(best_k, 2)