def test_dask_vs_numpy():
    """A GAM built from a dask DataFrame must match one built from pandas.

    BUG FIX: the original asserted ``a.all() == b.all()``, which collapses
    each array to a single truthiness value first — it passes even when the
    two attribution arrays differ element-wise. Compare the arrays directly.
    """
    client = Client()
    try:
        ddf = dd.read_csv("tests/test_attributes.csv")
        ddf = ddf.repartition(npartitions=4)
        g_ddf = gam.GAM(attributions=ddf)
        g_ddf.generate()

        df = pd.read_csv("tests/test_attributes.csv")
        g_df = gam.GAM(attributions=df)
        g_df.generate()

        # Element-wise equality of the full attribution matrices.
        assert np.array_equal(g_ddf.attributions, g_df.attributions)
        assert g_ddf.feature_labels == g_df.feature_labels
    finally:
        # Always release the dask client, even if an assertion fails.
        client.close()
def test_find_optimal_clusters(): """"Create sample attributions with 4 clusters""" attributions_file = "test_opt.csv" X, y = make_blobs( n_samples=40, n_features=2, centers=4, cluster_std=0.01, center_box=(0.0, 1.0), shuffle=True, random_state=42, ) logging.info(f"blobs made - {X.shape},{y.shape}") df = pd.DataFrame(columns=["x1", "x2"], data=X) df.to_csv(attributions_file) """"Check cluster search via silhouette score""" g = gam.GAM(attributions_path=attributions_file, distance="kendall_tau") g.get_optimal_clustering(max_clusters=6, verbose=True) logging.info(f"attributions file - {g.attributions_path}") logging.info(f"what we settled on - {g.k}") assert g.silh_scores == [ (-0.6997008553383604, 4), (-0.49646173501281243, 3), (-0.20264778322315316, 6), (-0.15419026951164008, 5), (0.3760497014154821, 2), ] assert g.k == 2
def test_read_csv():
    """Reading a local CSV populates attributions and feature labels."""
    g = gam.GAM(attributions_path="tests/test_attributes.csv")
    g._read_local()
    # Both attributes must exist after a local read.
    for attr_name in ("attributions", "feature_labels"):
        assert hasattr(g, attr_name)
    # The fixture file holds 4 rows of 3 labeled columns.
    assert g.attributions.shape == (4, 3)
    assert g.feature_labels == ["a1", "a2", "a3"]
def test_read_csv():
    # NOTE(review): this duplicates the test_read_csv defined earlier in the
    # file — the later definition shadows the earlier one under pytest, so
    # only one of them actually runs; consider renaming or removing one.
    """Check that _read_local loads the CSV fixture into the expected shape."""
    g = gam.GAM(attributions_path="tests/test_attributes.csv")
    g._read_local()
    assert hasattr(g, "attributions")
    rows, cols = g.attributions.shape
    assert rows == 4 and cols == 3
    assert hasattr(g, "feature_labels")
    assert g.feature_labels == ["a1", "a2", "a3"]
def test_normalize():
    """Tests normalization of attributions from csv.

    BUG FIX: the original non-negativity check was
    ``not np.any(np.where(normalized < 0))`` — single-argument ``np.where``
    returns *index* arrays, so a negative value sitting at index 0 produces
    all-zero indices and the check falsely passes. Test the boolean mask
    directly instead.
    """
    g = gam.GAM(attributions_path="tests/test_attributes.csv")
    g._read_local()
    normalized_attributions = gam.GAM.normalize(g.attributions)
    # Normalization must not change the matrix shape.
    assert normalized_attributions.shape == g.attributions.shape
    # No entry may be negative.
    assert not np.any(normalized_attributions < 0)
    # Each row is scaled to sum to 1 (spot-check the first row).
    assert normalized_attributions.sum(axis=1)[0] == pytest.approx(1.0)
def test_plotting_2attributes():
    """Plotting the top-2 features for two explanations writes image files."""
    explanations = [
        [("height", 0.05), ("weight", 0.05), ("hair color", 0.9)],
        [("height", 0.9), ("weight", 0.05), ("hair color", 0.05)],
    ]
    g = gam.GAM(attributions_path="tests/test_attributes.csv", k=len(explanations))
    g.explanations = explanations
    base = "tests/image3"
    g.plot(num_features=2, output_path_base=base, display=False)
    generated = glob.glob(base + "*")
    # At least one image file must have been produced.
    assert generated
    # Remove the files so repeated runs start clean.
    for path in generated:
        os.remove(path)
def test_plotting_2attributes():
    # NOTE(review): duplicate of the test_plotting_2attributes defined
    # earlier in the file — only the later definition runs under pytest.
    """Smoke-test plot(): image files appear for each cluster, then clean up."""
    first_cluster = [("height", 0.05), ("weight", 0.05), ("hair color", 0.9)]
    second_cluster = [("height", 0.9), ("weight", 0.05), ("hair color", 0.05)]
    explanations = [first_cluster, second_cluster]
    g = gam.GAM(attributions_path="tests/test_attributes.csv", k=len(explanations))
    g.explanations = explanations
    out_base = "tests/image3"
    g.plot(num_features=2, output_path_base=out_base, display=False)
    produced = glob.glob(out_base + "*")
    assert len(produced) > 0
    # Delete every artifact we just created.
    while produced:
        os.remove(produced.pop())
def test_cluster():
    """Tests subpopulations generated by clustering attributions.

    Cleanup: the original repeated ``len(g.explanations) == 2`` twice and
    carried a commented-out line; the duplicate assert and dead code are
    removed, and the per-explanation weight-sum check is folded into a loop.
    """
    g = gam.GAM(attributions_path="tests/test_attributes.csv")
    g._read_local()
    g.clustering_attributions = gam.GAM.normalize(g.attributions)
    g._cluster()
    # The 4x3 fixture clusters into exactly two non-empty subpopulations.
    assert len(g.explanations) == 2
    assert g.subpopulation_sizes[0] > 0
    assert g.subpopulation_sizes[1] > 0
    # Explanations preserve the feature-label ordering.
    assert g.explanations[0][0][0] == g.feature_labels[0]
    # Each explanation's weights are normalized to sum to 1.
    for explanation in g.explanations:
        total = sum(weight for _, weight in explanation)
        assert total == pytest.approx(1)
def test_find_optimal_clusters(): """"Create sample attributions with 4 clusters""" attributions_file = 'test_opt.csv' X, y = make_blobs(n_samples=40, n_features=2, centers=4, cluster_std=0.01, center_box=(0.0, 1.0), shuffle=True, random_state=42) print('blobs made - ', X.shape, y.shape) df = pd.DataFrame(columns=['x1', 'x2'], data=X) df.to_csv(attributions_file) """"Check cluster search via silhouette score""" g = gam.GAM(attributions_path=attributions_file, distance="kendall_tau") g.get_optimal_clustering() print('attributions file - ', g.attributions_path) print('data size = ', g.normalized_attributions.shape) print('what we settled on - ', g.k) assert(g.k == 4)
def test_read_df_or_list():
    """GAM accepts dask/pandas DataFrames, lists, and numpy/dask arrays.

    Decomposition: the same four post-``generate()`` assertions were
    copy-pasted five times in the original; they are extracted into a local
    helper so each input type is checked identically.
    """

    def _check_generated(g):
        # Shared invariants for the 4x3 fixture after generate().
        assert hasattr(g, "attributions")
        assert g.attributions.shape == (4, 3)
        assert hasattr(g, "feature_labels")
        assert g.feature_labels == ["a1", "a2", "a3"]

    # Preprocessing: derive every input flavor from the same CSV fixture.
    df = pd.read_csv("tests/test_attributes.csv")
    att_list = df.values.tolist()
    feat_labels_list = df.columns.tolist()
    att_arr = np.asarray(att_list)
    feat_labels_arr = np.asarray(feat_labels_list)

    client = Client()
    ddf = dd.read_csv("tests/test_attributes.csv")
    ddf = ddf.repartition(npartitions=4)
    dask_att_arr = da.from_array(att_list)
    dask_feat_labels_arr = da.from_array(feat_labels_list)

    # Testing dask DataFrame
    g_ddf = gam.GAM(attributions=ddf)
    g_ddf.generate()
    _check_generated(g_ddf)

    # Testing dask array
    g_dask_list = gam.GAM(
        attributions=dask_att_arr, batchsize=100, feature_labels=dask_feat_labels_arr
    )
    g_dask_list.generate()
    _check_generated(g_dask_list)
    client.close()

    # Testing pandas DataFrame
    g_df = gam.GAM(attributions=df)
    g_df.generate()
    _check_generated(g_df)

    # Testing plain lists
    g_list = gam.GAM(
        attributions=att_list, batchsize=100, feature_labels=feat_labels_list
    )
    g_list.generate()
    _check_generated(g_list)

    # Testing numpy arrays
    g_arr = gam.GAM(
        attributions=att_arr, batchsize=100, feature_labels=feat_labels_arr
    )
    g_arr.generate()
    _check_generated(g_arr)

    # Raw array input without feature labels must raise.
    with pytest.raises(ValueError):
        g_fail = gam.GAM(attributions=att_arr, batchsize=100)
        g_fail.generate()