def _cluster(self):
    """Group attributions into subpopulations via the local kmedoids module.

    If ``self.cluster_method`` is set, clustering is delegated to it: the
    callable receives this object as its only argument and nothing else is
    done here. Otherwise a ``KMedoids`` model is built from the settings
    stored on ``self`` (``k``, ``distance_function``, ``max_iter``, ``tol``,
    ``init_medoids``, ``swap_medoids``), fit on
    ``self.clustering_attributions``, and the outcome is written to
    ``self.subpopulations``, ``self.subpopulation_sizes`` and
    ``self.explanations``.
    """
    # A caller-supplied clustering routine replaces the built-in path.
    if self.cluster_method is not None:
        self.cluster_method(self)
        return

    kmedoids = KMedoids(
        self.k,
        dist_func=self.distance_function,
        max_iter=self.max_iter,
        tol=self.tol,
        init_medoids=self.init_medoids,
        swap_medoids=self.swap_medoids,
    )
    kmedoids.fit(self.clustering_attributions, verbose=self.verbose)

    self.subpopulations = kmedoids.members
    self.subpopulation_sizes = GAM.get_subpopulation_sizes(kmedoids.members)
    self.explanations = self._get_explanations(kmedoids.centers)

    # Make explanations hold numerical values instead of dask arrays. Only
    # the first value of the first explanation is inspected to decide.
    first_value = self.explanations[0][0][1]
    if isinstance(first_value, da.Array):
        self.explanations = [
            [(entry[0], entry[1].compute()) for entry in explanation]
            for explanation in self.explanations
        ]
Example #2
0
def test_banditPAM():
    """Fit KMedoids with "bandit" init/swap on the CSV fixture and check centers."""
    # Load the data.
    attributions = pd.read_csv("tests/banditPAM_data.csv").values

    # Run kmedoids on the sample attributions.
    model = KMedoids(
        4,
        dist_func="euclidean",
        max_iter=20,
        tol=0.001,
        init_medoids="bandit",
        swap_medoids="bandit",
        verbose=False,
    )

    # Time only the fit call.
    started = time.time()
    model.fit(attributions, verbose=False)
    elapsed_time = time.time() - started
    print(f"Finished test in {elapsed_time:.2f}")
    print(model.centers)

    # Expected centers when testing with the 'euclidean' distance.
    expected_centers = [256, 209, 470, 304]
    assert model.centers == expected_centers
Example #3
0
def test_banditPAM_dask():
    """Fit KMedoids with "bandit" init/swap on a dask array and check centers."""
    # Load the data as a 4-partition dask dataframe, then as a dask array
    # with computed chunk lengths.
    ddf = dd.read_csv(
        "tests/banditPAM_data.csv",
        dtype={"ARTICLE_ID": "object"},
    )
    ddf = ddf.repartition(npartitions=4)
    attributions = ddf.to_dask_array(lengths=True)

    # Run kmedoids on the sample attributions.
    model = KMedoids(
        n_clusters=4,
        dist_func="euclidean",
        batchsize=200,
        max_iter=20,
        tol=0.001,
        init_medoids="bandit",
        swap_medoids="bandit",
        verbose=False,
    )

    # Time only the fit call.
    started = time.time()
    model.fit(attributions, verbose=False)
    elapsed_time = time.time() - started
    print(f"Finished test in {elapsed_time:.2f}")
    print(model.centers)

    # Expected centers when testing with the 'euclidean' distance; the check
    # is membership-based, so the order of the centers does not matter.
    expected_centers = [256, 209, 470, 304]
    assert np.isin(model.centers, expected_centers).all()
Example #4
0
    def _cluster(self, distance_function=spearman_squared_distance, max_iter=1000, tol=0.0001):
        """Group attributions into subpopulations via the kmedoids module.

        Args:
            distance_function: forwarded to ``KMedoids`` as ``dist_func``.
            max_iter: forwarded to ``KMedoids`` as ``max_iter``.
            tol: forwarded to ``KMedoids`` as ``tol``.

        The model is fit on ``self.normalized_attributions`` and the outcome
        is written to ``self.subpopulations``, ``self.subpopulation_sizes``
        and ``self.explanations``.
        """
        kmedoids = KMedoids(
            self.k,
            dist_func=distance_function,
            max_iter=max_iter,
            tol=tol,
        )
        kmedoids.fit(self.normalized_attributions, verbose=False)

        # Cluster membership, per-cluster sizes, then explanations derived
        # from the cluster centers.
        self.subpopulations = kmedoids.members
        self.subpopulation_sizes = GAM.get_subpopulation_sizes(kmedoids.members)
        self.explanations = self._get_explanations(kmedoids.centers)
def test_kmedoids():
    """Run kmedoids on sample attributions."""
    attributions = np.array(
        [(0.2, 0.8), (0.1, 0.9), (0.91, 0.09), (0.88, 0.12)]
    )
    model = KMedoids(
        2,
        dist_func=spearman_squared_distance,
        max_iter=1000,
        tol=0.0001,
    )
    model.fit(attributions, verbose=False)

    # Test that 2 attributions are in each cluster.
    # NOTE(review): this relies on `members` holding 0/1 cluster labels, so
    # that the sum counts the points assigned to cluster 1 -- confirm.
    assert sum(model.members) == 2
Example #6
0
    def _cluster(self):
        """Group attributions into subpopulations via the local kmedoids module.

        If ``self.cluster_method`` is set, clustering is delegated to it: the
        callable receives this object as its only argument and nothing else
        is done here. Otherwise a ``KMedoids`` model is built from ``self.k``,
        ``self.distance_function``, ``self.max_iter`` and ``self.tol``, fit on
        ``self.clustering_attributions``, and the outcome is written to
        ``self.subpopulations``, ``self.subpopulation_sizes`` and
        ``self.explanations``.
        """
        # A caller-supplied clustering routine replaces the built-in path.
        if self.cluster_method is not None:
            self.cluster_method(self)
            return

        kmedoids = KMedoids(
            self.k,
            dist_func=self.distance_function,
            max_iter=self.max_iter,
            tol=self.tol,
        )
        kmedoids.fit(self.clustering_attributions, verbose=False)

        # Cluster membership, per-cluster sizes, then explanations derived
        # from the cluster centers.
        self.subpopulations = kmedoids.members
        self.subpopulation_sizes = GAM.get_subpopulation_sizes(kmedoids.members)
        self.explanations = self._get_explanations(kmedoids.centers)
from gam.clustering import KMedoids
from gam.spearman_distance import spearman_squared_distance

# Seed NumPy's global RNG so repeated runs draw the same random numbers.
np.random.seed(42)

# Load the data.
df = pd.read_csv("samples_3500.csv")
attributions = df.values
print(df.shape)

# Run kmedoids on the sample attributions.
kmed2 = KMedoids(
    5,
    dist_func=spearman_squared_distance,
    max_iter=10,
    tol=0.01,
    init_medoids="bandit",
    swap_medoids="bandit",
    verbose=True,
)
# Small hand-written alternative input, kept disabled:
# attributions = np.array([(0.2, 0.8), (0.1, 0.9), (0.91, 0.09), (0.88, 0.12)])

# Time only the fit call.
start_time = time.time()
kmed2.fit(attributions, verbose=True)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Finished test in {elapsed_time:.2f}")
print(kmed2.centers)

# np.unique(..., return_counts=True) returns (values, counts); keep the counts.
cluster_sizes = np.unique(kmed2.members, return_counts=True)[1]
print(f"cluster sizes - {cluster_sizes}")

# Disabled check; it refers to `kmedoids_2`, which is not defined in the
# visible part of this script.
# test that 2 attributions are in each cluster
# assert(sum(kmedoids_2.members) == 2)