def test_seuclidean():
    with pytest.warns(None) as record:
        km = KMedoids(2, metric="seuclidean", method="pam")
        km.fit(np.array([0, 0, 0, 1]).reshape((4, 1)))
        km.predict(np.array([0, 0, 0, 1]).reshape((4, 1)))
        km.transform(np.array([0, 0, 0, 1]).reshape((4, 1)))
    assert len(record) == 0
def test_kmedoids_iris():
    """Test kmedoids on the Iris dataset"""
    rng = np.random.RandomState(seed)
    X_iris = load_iris()["data"]

    ref_model = KMeans(n_clusters=3).fit(X_iris)

    avg_dist_to_closest_centroid = (ref_model.transform(X_iris).min(
        axis=1).mean())

    for init in ["random", "heuristic", "k-medoids++"]:
        distance_metric = "euclidean"
        model = KMedoids(n_clusters=3,
                         metric=distance_metric,
                         init=init,
                         random_state=rng)
        model.fit(X_iris)

        # test convergence in reasonable number of steps
        assert model.n_iter_ < (len(X_iris) // 10)

        distances = PAIRWISE_DISTANCE_FUNCTIONS[distance_metric](X_iris)
        avg_dist_to_random_medoid = np.mean(distances.ravel())
        avg_dist_to_closest_medoid = model.inertia_ / X_iris.shape[0]
        # We want distance-to-closest-medoid to be reduced from average
        # distance by more than 50%
        assert avg_dist_to_random_medoid > 2 * avg_dist_to_closest_medoid
        # When K-Medoids is using Euclidean distance,
        # we can compare its performance to
        # K-Means. We want the average distance to cluster centers
        # to be similar between K-Means and K-Medoids
        assert_allclose(avg_dist_to_closest_medoid,
                        avg_dist_to_closest_centroid,
                        rtol=0.1)
def test_kmedoids_empty_clusters():
    """When a cluster is empty, it should throw a warning."""
    rng = np.random.RandomState(seed)
    X = [[1], [1], [1]]
    kmedoids = KMedoids(n_clusters=2, random_state=rng)
    with pytest.warns(UserWarning, match="Cluster 1 is empty!"):
        kmedoids.fit(X)
Example #4
def test_kmedoid_results(method, init):
    expected = np.hstack([np.zeros(50), np.ones(50)])
    km = KMedoids(n_clusters=2, init=init, method=method, random_state=rng)
    km.fit(X_cc)
    # This test uses data that is not perfectly separable, so the
    # accuracy is not 1 (it is around 0.85).
    assert (np.mean(km.labels_ == expected) >
            0.8) or (1 - np.mean(km.labels_ == expected) > 0.8)
Example #5
def sklearn_kmedoids(ds, numClusters, numSamples):

    km = KMedoids(n_clusters=numClusters, random_state=0)

    df = ds.df[["x1", "x2"]]
    df = df[:numSamples]

    km.fit(df.to_numpy())

    return pd.DataFrame(km.labels_, columns=["cluster"])
Example #6
def run_KMedoids(n_clusters, pca_components, data, components):
    clustering = KMedoids(n_clusters=n_clusters, random_state=0)
    clustering.fit(pca_components)
    df_seg_pca_kmedoids = pd.concat(
        [data.reset_index(drop=True),
         pd.DataFrame(pca_components)], axis=1)
    df_seg_pca_kmedoids.columns.values[(-1 * components):] = [
        "Component " + str(i + 1) for i in range(components)
    ]
    df_seg_pca_kmedoids['Cluster'] = clustering.labels_
    return df_seg_pca_kmedoids
def test_outlier_robustness():
    rng = np.random.RandomState(seed)
    kmeans = KMeans(n_clusters=2, random_state=rng)
    kmedoids = KMedoids(n_clusters=2, random_state=rng)

    X = [[-11, 0], [-10, 0], [-9, 0], [0, 0], [1, 0], [2, 0], [1000, 0]]
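    # [1000, 0] is an outlier: k-means spends a whole cluster on it and merges
    # the two dense groups, while k-medoids assigns it to the nearest medoid
    # and recovers both groups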

    kmeans.fit(X)
    kmedoids.fit(X)

    assert_array_equal(kmeans.labels_, [0, 0, 0, 0, 0, 0, 1])
    assert_array_equal(kmedoids.labels_, [0, 0, 0, 1, 1, 1, 1])
def test_max_iter():
    """Test that warning message is thrown when max_iter is reached."""
    rng = np.random.RandomState(seed)
    X_iris = load_iris()["data"]

    model = KMedoids(
        n_clusters=10, init="random", random_state=rng, max_iter=1
    )
    msg = "Maximum number of iteration reached before"

    with pytest.warns(UserWarning, match=msg):
        model.fit(X_iris)
def test_kmedoid_nclusters(method, init):
    n_clusters = 50

    km = KMedoids(
        n_clusters=n_clusters,
        init=init,
        method=method,
        max_iter=1,
        random_state=rng,
    )
    km.fit(X_cc)
    assert len(np.unique(km.medoid_indices_)) == n_clusters
def test_build():
    X, y = fetch_20newsgroups_vectorized(return_X_y=True)
    # Select only the first 500 samples
    X = X[:500]
    y = y[:500]
    # Precompute cosine distance matrix
    diss = cosine_distances(X)
    # run build
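    # max_iter=0 keeps the BUILD initialization as-is (no swap iterations)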
    ske = KMedoids(20, "precomputed", init="build", max_iter=0)
    ske.fit(diss)
    assert ske.inertia_ <= 230
    assert len(np.unique(ske.labels_)) == 20
Example #11
def find_optimal_clusters_and_display(pca_components):
    wcss = []
    max_clusters = 21
    for i in range(1, max_clusters):
        kmedoids_pca = KMedoids(n_clusters=i, random_state=0)
        kmedoids_pca.fit(pca_components)
        wcss.append(kmedoids_pca.inertia_)
    n_clusters = KneeLocator(list(range(1, max_clusters)),
                             wcss,
                             curve='convex',
                             direction='decreasing').knee
    st.write("Optimal number of clusters", n_clusters)
    return n_clusters
class Clustering:
    def __init__(self, data):
        self.states = data.keys()
        self.kmeans = KMedoids(n_clusters=3)
        self.kmeans.fit(np.array(tuple(data.values())).reshape(-1, 1))

        self.mapping = list(
            np.argsort(np.squeeze(self.kmeans.cluster_centers_)))
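        # mapping orders the cluster ids by their center value, so result[0]
        # in cluster() collects the states with the lowest values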

    def cluster(self):
        result = [[], [], []]
        for state, cluster in zip(self.states, self.kmeans.labels_):
            result[self.mapping.index(cluster)].append(state)
        return result
Example #13
def find_cluster_centres(text, num_clusters):
    corpus = nltk.sent_tokenize(text)
    corpus_embeddings = embedder.encode(corpus)
    clustering_model = KMedoids(n_clusters=num_clusters,
                                random_state=0,
                                metric="cosine")
    clustering_model.fit(corpus_embeddings)
    cluster_center_embeddings = clustering_model.cluster_centers_
    cluster_centers = []
    for center_embedding in cluster_center_embeddings:
        for index, sentence_embedding in enumerate(corpus_embeddings):
            if np.array_equal(sentence_embedding, center_embedding):
                if corpus[index] not in cluster_centers:
                    cluster_centers.append(corpus[index])
    return cluster_centers
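
Since KMedoids exposes medoid_indices_, the brute-force embedding comparison
above can be replaced by direct indexing; a minimal sketch reusing the same
corpus, embedder, and model setup:

def find_cluster_centres_via_indices(text, num_clusters):
    corpus = nltk.sent_tokenize(text)
    clustering_model = KMedoids(n_clusters=num_clusters,
                                random_state=0,
                                metric="cosine")
    clustering_model.fit(embedder.encode(corpus))
    # medoid_indices_ points straight at the medoid rows of the input
    return [corpus[i] for i in clustering_model.medoid_indices_]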
def test_array_like_init():
    centroids = np.array([X_cc[0], X_cc[50]])

    expected = np.hstack([np.zeros(50), np.ones(50)])
    km = KMedoids(n_clusters=len(centroids), init=centroids)
    km.fit(X_cc)
    # This test uses data that is not perfectly separable, so the
    # accuracy is not 1 (it is around 0.85).
    assert (np.mean(km.labels_ == expected) >
            0.8) or (1 - np.mean(km.labels_ == expected) > 0.8)

    # Override n_clusters if array-like init method is used
    km = KMedoids(n_clusters=len(centroids) + 2, init=centroids)
    km.fit(X_cc)

    assert len(km.cluster_centers_) == len(centroids)
def test_clara_consistency_iris():
    # test that CLARA is PAM when full sample is used

    rng = np.random.RandomState(seed)
    X_iris = load_iris()["data"]

    clara = CLARA(
        n_clusters=3,
        n_sampling_iter=1,
        n_sampling=len(X_iris),
        random_state=rng,
    )

    model = KMedoids(n_clusters=3, init="build", random_state=rng)

    model.fit(X_iris)
    clara.fit(X_iris)
    assert np.sum(model.labels_ == clara.labels_) == len(X_iris)
def test_kmedoids_fit_naive():
    n_clusters = 3
    metric = "euclidean"

    model = KMedoids(n_clusters=n_clusters, metric=metric)
    Xnaive = np.asarray([[1, 0, 0], [0, 1, 0], [0, 0, 1]])

    model.fit(Xnaive)

    assert_array_equal(model.cluster_centers_,
                       [[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    assert_array_equal(model.labels_, [0, 1, 2])
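    # every point is its own medoid, so the total intra-cluster distance is zero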
    assert model.inertia_ == 0.0

    # diagonal must be zero, off-diagonals must be positive
    X_new = model.transform(Xnaive)
    for c in range(n_clusters):
        assert X_new[c, c] == 0
        for c2 in range(n_clusters):
            if c != c2:
                assert X_new[c, c2] > 0
def compare_k_med(k_list, X):
    # Run clustering with different k and check the metrics
    silhouette_list = []
    inertia_list = []
    for p in k_list:
        print("Calculating silhouette score for k =", p)
        clusters = KMedoids(n_clusters=p,
                            metric='precomputed',
                            random_state=2248,
                            init='k-medoids++')
        clusters.fit(X)
        # The higher (up to 1) the better
        s = round(silhouette_score(X, clusters.labels_, metric="precomputed"),
                  4)
        silhouette_list.append(s)
        i = clusters.inertia_
        inertia_list.append(i)
    # Pick the k that maximizes the silhouette score
    key = silhouette_list.index(max(silhouette_list))
    k = k_list[key]
    print("Kmed best silhouette =", max(silhouette_list), " for k =", k)
    return k, silhouette_list, inertia_list
def test_precomputed():
    """Test the 'precomputed' distance metric."""
    rng = np.random.RandomState(seed)
    X_1 = [[1.0, 0.0], [1.1, 0.0], [0.0, 1.0], [0.0, 1.1]]
    D_1 = euclidean_distances(X_1)
    X_2 = [[1.1, 0.0], [0.0, 0.9]]
    D_2 = euclidean_distances(X_2, X_1)
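    # D_1 is the square 4x4 training matrix; D_2 holds distances from the two
    # new points to the four training points, the shape that predict and
    # transform expect with metric="precomputed"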

    kmedoids = KMedoids(metric="precomputed", n_clusters=2, random_state=rng)
    kmedoids.fit(D_1)

    assert_allclose(kmedoids.inertia_, 0.2)
    assert_array_equal(kmedoids.medoid_indices_, [2, 0])
    assert_array_equal(kmedoids.labels_, [1, 1, 0, 0])
    assert kmedoids.cluster_centers_ is None

    med_1, med_2 = tuple(kmedoids.medoid_indices_)
    predictions = kmedoids.predict(D_2)
    assert_array_equal(predictions, [med_1 // 2, med_2 // 2])
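    # medoid_indices_ is [2, 0], so med_1 // 2 == 1 and med_2 // 2 == 0: the
    # first new point is closest to the medoid of cluster 1, the second to
    # the medoid of cluster 0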

    transformed = kmedoids.transform(D_2)
    assert_array_equal(transformed, D_2[:, kmedoids.medoid_indices_])
def test_medoids_indices():
    rng = np.random.RandomState(seed)
    X_iris = load_iris()["data"]

    clara = CLARA(
        n_clusters=3,
        n_sampling_iter=1,
        n_sampling=len(X_iris),
        random_state=rng,
    )

    model = KMedoids(n_clusters=3, init="build", random_state=rng)

    centroids = np.array([X_iris[0], X_iris[50]])
    array_like_model = KMedoids(n_clusters=len(centroids),
                                init=centroids,
                                max_iter=0)

    model.fit(X_iris)
    clara.fit(X_iris)
    array_like_model.fit(X_iris)
    assert_array_equal(X_iris[model.medoid_indices_], model.cluster_centers_)
    assert_array_equal(X_iris[clara.medoid_indices_], clara.cluster_centers_)
    assert_array_equal(centroids, array_like_model.cluster_centers_)
Example #20
def define_k(answers):
    sse = []
    for k in range(1, 11):
        kmedoids = KMedoids(n_clusters=k,
                            metric=distance_function,
                            random_state=None)
        kmedoids.fit(answers)
        sse.append(kmedoids.inertia_)

    plt.style.use("fivethirtyeight")
    plt.plot(range(1, 11), sse)
    plt.xticks(range(1, 11))
    plt.xlabel("Number of Clusters")
    plt.ylabel("SSE")
    plt.show()
    kl = KneeLocator(range(1, 11), sse, curve="convex", direction="decreasing")
    return kl.elbow
Example #21
def kmedoids_dm(input_data, cluster_no):
    start = time.time()

    dataset = pd.read_csv(input_data,
                          sep=',',
                          error_bad_lines=False,
                          index_col=False,
                          dtype='unicode')
    tf_idf_vectorizer = TfidfVectorizer(stop_words='english',
                                        max_features=20000)

    # KMedoids cannot cluster raw strings, so vectorize the documents first
    # (this assumes the first column of the CSV holds the text)
    vectors = tf_idf_vectorizer.fit_transform(
        dataset.iloc[:, 0].astype(str)).toarray()

    kmed = KMedoids(n_clusters=cluster_no, random_state=0)

    kmed.fit(vectors)

    labels = kmed.labels_

    dbi = davies_bouldin_score(vectors, labels)
    si = silhouette_score(vectors, labels)

    print("Runtime: ")
    print(time.time() - start)

    return dbi, si
Example #22
    def models(self):
        """
         Get sentences embeddings and generate cluster according to the number of cluster previously defined.
        An UMAP dimension reduction and a Kmenoid with cosine distance are performed for this task.

        Returns:
            sklearn.model:  fitted umap model with attribute 'sentence_embeddings'
        Returns:
            sklearn.model:  fitted kmenoid model from umaped 'sentence_embeddings'
        """
        umap_model = umap.UMAP(n_neighbors=15,
                               n_components=self.n_umap,
                               metric="cosine")
        umap_model = umap_model.fit(self.sentence_embeddings)
        umap_embeddings = umap_model.transform(self.sentence_embeddings)
        kmedoids_model = KMedoids(n_clusters=self.n_clusters,
                                  metric="cosine",
                                  init="random",
                                  random_state=15)
        cluster = kmedoids_model.fit(umap_embeddings)
        return cluster, umap_model
Example #23
    def clusterization(self,
                       cols=None,
                       method="k_means",
                       visualize=True,
                       n_clusters=None):

        if not self.is_standardize:
            raise ValueError("You should standardize your columns first.")

        if method == "k_means":
            logger.info("=" * 27)
            logger.info("Clustering using K-Means")
            logger.info("=" * 27)

            kmeans_kwargs = {
                "init": "random",
                "n_init": 10,
                "max_iter": 300,
                "random_state": 42,
            }
            sse = []
            kmeans_silhouette_coefficients = []
            for k in range(2, 11):
                kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
                kmeans.fit(self.dataset[cols])
                sse.append(kmeans.inertia_)
                score = silhouette_score(self.dataset[cols], kmeans.labels_)
                kmeans_silhouette_coefficients.append(score)

            if visualize:
                # plt.style.use("fivethirtyeight")
                plt.plot(range(2, 11), sse)
                plt.xticks(range(2, 11))
                plt.title("K-Means")
                plt.xlabel("Number of Clusters")
                plt.ylabel("SSE")
                plt.show()

                # plt.style.use("fivethirtyeight")
                plt.plot(range(2, 11), kmeans_silhouette_coefficients)
                plt.xticks(range(2, 11))
                plt.title("K-Means")
                plt.xlabel("Number of Clusters")
                plt.ylabel("Silhouette Coefficient")
                plt.show()

            kl = KneeLocator(range(2, 11),
                             sse,
                             curve="convex",
                             direction="decreasing")

            number_clusters_best = kl.elbow
            logger.info(
                f"Best number of clusters using elbow method: {number_clusters_best}"
            )
            logger.info("")
            logger.info(
                "See the graph of Silhouette coefficient vs. number of clusters "
                "to choose the best number of clusters for your case "
                "(the Silhouette coefficient ranges from -1 to 1; closer to 1 is better)."
            )
            logger.info("")

        elif method == "k_medoids":
            logger.info("=" * 27)
            logger.info("Clustering using K-Medoids")
            logger.info("=" * 27)

            kmedoids_kwargs = {
                "metric": "euclidean",
            }
            sse = []
            kmedoids_silhouette_coefficients = []
            for k in range(2, 11):
                kmedoids = KMedoids(n_clusters=k, **kmedoids_kwargs)
                kmedoids.fit(self.dataset[cols])
                sse.append(kmedoids.inertia_)
                score = silhouette_score(self.dataset[cols], kmedoids.labels_)
                kmedoids_silhouette_coefficients.append(score)

            if visualize:
                # plt.style.use("fivethirtyeight")
                plt.plot(range(2, 11), sse)
                plt.xticks(range(2, 11))
                plt.title("K-Medoids")
                plt.xlabel("Number of Clusters")
                plt.ylabel("SSE")
                plt.show()

                # plt.style.use("fivethirtyeight")
                plt.plot(range(2, 11), kmedoids_silhouette_coefficients)
                plt.xticks(range(2, 11))
                plt.title("K-Medoids")
                plt.xlabel("Number of Clusters")
                plt.ylabel("Silhouette Coefficient")
                plt.show()

            kl = KneeLocator(range(2, 11),
                             sse,
                             curve="convex",
                             direction="decreasing")

            number_clusters_best = kl.elbow
            logger.info(
                f"Best number of clusters using elbow method: {number_clusters_best}"
            )
            logger.info("")
            logger.info(
                "See the graph of Silhouette coefficient vs. number of clusters "
                "to choose the best number of clusters for your case "
                "(the Silhouette coefficient ranges from -1 to 1; closer to 1 is better)."
            )
            logger.info("")

        elif method == "dbscan":
            logger.info("=" * 27)
            logger.info("Clustering using DBScan")
            logger.info("=" * 27)

            silhouette_eps_ncluster = {}
            for eps in np.linspace(0.1, 4, 10):
                dbscan = DBSCAN(eps=eps)
                dbscan.fit(self.dataset[cols])
                if len(set(dbscan.labels_)) > 1:
                    # Silhouette score requires at least 2 clusters.
                    # Rows with dbscan.labels_ == -1 don't belong to a real
                    # cluster; they are considered noise.
                    score = round(
                        silhouette_score(self.dataset[cols], dbscan.labels_),
                        4)
                    nclusters = len(set(dbscan.labels_))

                    silhouette_eps_ncluster[score] = ((eps, nclusters))

            if visualize:
                y, tup = zip(*silhouette_eps_ncluster.items())
                x = [eps for eps, nclusters in tup]

                # plt.style.use("fivethirtyeight")
                plt.plot(x, y)
                plt.xticks(np.linspace(0.1, 4, 10))
                plt.title("DBScan")
                plt.xlabel("eps")
                plt.ylabel("Silhouette Coefficient")
                plt.show()

            nclusters_best = silhouette_eps_ncluster[max(
                silhouette_eps_ncluster.keys())][1]
            logger.info(
                f"Best number of clusters using Silhouette over multiple eps: {nclusters_best}"
            )
            logger.info("")

            # TODO: Add column with the id of the cluster each row belongs to
            # TODO: Implement scatter plot of clusters.
        else:
            raise ValueError("Clustering method not implemented.")
Example #24
def test_elbow(X, dtw_value, seed):
    print(len(X))
    distortions = []
    silhouette_value = []
    dists = dtw_value
    print(dists)
    if seed == -1:
        for seed in range(0, 21):
            cur_silhouette = [seed]
            cur_distortions = [seed]
            for i in range(2, 15):
                print(i)
                km = KMedoids(n_clusters=i,
                              random_state=seed,
                              metric="precomputed",
                              init='k-medoids++',
                              max_iter=30000)
                km.fit(dists)
                # record the sum of errors (inertia)
                cur_distortions.append(km.inertia_)
                # reuse the labels from the fit above instead of refitting
                y_pred = km.labels_
                np.fill_diagonal(dists, 0)
                score = silhouette_score(dists, y_pred, metric="precomputed")
                cur_silhouette.append(score)
            distortions.append(cur_distortions)
            silhouette_value.append(cur_silhouette)
        with open(r".//res//grid_distortions_destination.csv",
                  "w",
                  encoding='UTF-8',
                  newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in distortions:
                writer.writerow(row)
                print(row)
        with open(r".//res//grid_silhouette_destination.csv",
                  "w",
                  encoding='UTF-8',
                  newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in silhouette_value:
                writer.writerow(row)
                print(row)
    else:
        csv_reader = csv.reader(
            open(".//res//grid_distortions_destination.csv", encoding='UTF-8'))
        for row in csv_reader:
            distortions.append([float(item) for item in row])
        csv_reader = csv.reader(
            open(".//res//grid_silhouette_destination.csv", encoding='UTF-8'))
        for row in csv_reader:
            silhouette_value.append([float(item) for item in row])
        chosen_distortions = distortions[seed][1:]
        chosen_silhouette = silhouette_value[seed][1:]
        plt.figure(1)
        plt.plot(range(2, 15), chosen_distortions, marker='o')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        plt.savefig(r'.//res//grid_distortions_destination.png')
        plt.close()
        plt.figure(1)
        plt.bar(range(2, 15), chosen_silhouette, color='grey')
        plt.xlabel('Number of clusters')
        plt.ylabel('Silhouette score')
        plt.savefig(r'.//res//grid_silhouette_destination.png')
Example #25
features = pd.read_csv('./../Data/specPowerDatamartTransform.csv')
# PCA
# preprocessing.scale centers and scales the data
scaled_data = preprocessing.scale(features)

# Express the data as a linear combination of only two components
pca = PCA(n_components=2)
dataset_questions_pca = pca.fit_transform(scaled_data)

# Choose the number of clusters from the information in the PCA projection
# by applying the elbow method to the dataset
wcss = []
for i in range(1, 7):
    kmedoids = KMedoids(n_clusters=i, random_state=0)
    kmedoids.fit(dataset_questions_pca)
    sse = kmedoids.inertia_
    print("Clusters", i, "SSE", sse)
    wcss.append(sse)
plt.plot(range(1, 7), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Apply k-medoids to the data produced by PCA
kmedoids = KMedoids(n_clusters=3, random_state=0)
y_kmedoids = kmedoids.fit_predict(dataset_questions_pca)
final_medoids = kmedoids.cluster_centers_  # after fitting, these are the medoids

print("Final medoids")
Example #26
class ArgumentClusterer:
    english_clusterer = None
    greek_clusterer = None

    def __init__(self, n_components=2):
        #self.__pca = PCA(n_components=n_components, random_state=0)
        self.__clusterer = None
        self.__medoid_texts = None

    def fit(self, x, output_filename_suffix='output.pdf'):
        x = np.array(x)
        num_samples, num_features = x.shape[0], x.shape[1]
        self.__pca = PCA(n_components=min(num_samples, num_features),
                         random_state=0)
        x_transformed = self.__pca.fit_transform(x)

        visualizer = KElbowVisualizer(KMedoids(random_state=0),
                                      k=(1, num_samples),
                                      timings=False,
                                      locate_elbow=True)
        visualizer.fit(x_transformed)
        best_n_clusters = visualizer.elbow_value_ if visualizer.elbow_value_ is not None else 1

        self.__clusterer = KMedoids(n_clusters=best_n_clusters, random_state=0)
        self.__clusterer.fit(x_transformed)

    def predict(self, x):
        x_transformed = self.__pca.transform(x)
        return self.__clusterer.predict(x_transformed)

    def get_medoid_indices(self):
        return self.__clusterer.medoid_indices_.tolist()

    # Sort different arguments into similar clusters.
    @staticmethod
    @counter
    def suggest_clusters(discussions, lang_det, en_nlp, el_nlp):

        # The workspace doesn't have enough discussions, early exit.
        if len(discussions) < 3:
            return {'greek_clusters': {}, 'english_clusters': {}}

        # Fit all clusterers for all discussions of a single workspace.
        ArgumentClusterer.fit_clusterers(discussions, lang_det, en_nlp, el_nlp)
        english_clusters = {
            label: {
                'nodes': [],
                'texts': [],
                'summary': '',
                'medoid_text': ''
            }
            for label in map(
                str, ArgumentClusterer.english_clusterer.__clusterer.labels_)
        } if ArgumentClusterer.english_clusterer is not None else {}
        greek_clusters = {
            label: {
                'nodes': [],
                'texts': [],
                'summary': '',
                'medoid_text': ''
            }
            for label in map(
                str, ArgumentClusterer.greek_clusterer.__clusterer.labels_)
        } if ArgumentClusterer.greek_clusterer is not None else {}

        for discussion in discussions:
            if discussion['Position'] in ['Issue', 'Solution']:
                continue
            text = discussion['DiscussionText']
            language = detect_language(lang_det, text)
            text = remove_punctuation_and_whitespace(text)
            if language == 'english':
                if ArgumentClusterer.english_clusterer is None:
                    continue
                predicted = str(
                    ArgumentClusterer.english_clusterer.predict(
                        [en_nlp.tokenizer(text).vector])[0])
                english_clusters[predicted]['nodes'].append(discussion['id'])
                english_clusters[predicted]['texts'].append(text)
                english_clusters[predicted][
                    'medoid_text'] = ArgumentClusterer.english_clusterer.__medoid_texts[
                        predicted]
            elif language == 'greek':
                if ArgumentClusterer.greek_clusterer is None:
                    continue
                predicted = str(
                    ArgumentClusterer.greek_clusterer.predict(
                        [el_nlp.tokenizer(text).vector])[0])
                greek_clusters[predicted]['nodes'].append(discussion['id'])
                greek_clusters[predicted]['texts'].append(text)
                greek_clusters[predicted][
                    'medoid_text'] = ArgumentClusterer.greek_clusterer.__medoid_texts[
                        predicted]

        # Run textrank on the non-empty aggregated text from each cluster for
        # each language (done once, after all discussions are processed).
        for en_cluster in english_clusters.keys():
            en_text = '. '.join(english_clusters[en_cluster]['texts'])
            if en_text != '':
                en_doc = run_textrank(en_text, en_nlp)
                english_clusters[en_cluster]['summary'] = text_summarization(
                    en_doc, en_nlp, config.top_n, config.top_sent)

        for el_cluster in greek_clusters.keys():
            el_text = '. '.join(greek_clusters[el_cluster]['texts'])
            if el_text != '':
                el_doc = run_textrank(el_text, el_nlp)
                greek_clusters[el_cluster]['summary'] = text_summarization(
                    el_doc, el_nlp, config.top_n, config.top_sent)

        return {
            'greek_clusters': greek_clusters,
            'english_clusters': english_clusters
        }

    @staticmethod
    @counter
    def fit_clusterers(discussions, lang_det, en_nlp, el_nlp):
        english_clusterer = None
        greek_clusterer = None

        english_texts, greek_texts = [], []
        for discussion in discussions:
            if discussion['Position'] in ['Issue']:
                continue
            text = discussion['DiscussionText']
            language = detect_language(lang_det, text)
            text = remove_punctuation_and_whitespace(text)
            if language == 'english':
                english_texts.append(text)
            elif language == 'greek':
                greek_texts.append(text)

        if len(english_texts) > 2:
            # Initialize the English Clusterer.
            english_clusterer = ArgumentClusterer()

            # Calculate the embeddings for each text of this discussion.
            english_embeddings = [
                en_nlp.tokenizer(text).vector for text in english_texts
            ]

            # Fit the clusterer using the textual embeddings of this discussion.
            english_clusterer.fit(english_embeddings, 'english.pdf')

            # Find the medoids of each cluster from each language.
            english_clusterer.__medoid_texts = {
                str(english_clusterer.__clusterer.labels_[i]): english_texts[i]
                for i in english_clusterer.__clusterer.medoid_indices_
            }

        if len(greek_texts) > 2:
            # Initialize the Greek Clusterer.
            greek_clusterer = ArgumentClusterer()

            # Calculate the embeddings for each text of this discussion.
            greek_embeddings = [
                el_nlp.tokenizer(text).vector for text in greek_texts
            ]

            # Fit the clusterer using the textual embeddings of this discussion.
            greek_clusterer.fit(greek_embeddings, 'greek.pdf')

            # Find the medoids of each cluster from each language.
            greek_clusterer.__medoid_texts = {
                str(greek_clusterer.__clusterer.labels_[i]): greek_texts[i]
                for i in greek_clusterer.__clusterer.medoid_indices_
            }

        ArgumentClusterer.english_clusterer = english_clusterer
        ArgumentClusterer.greek_clusterer = greek_clusterer
Example #27
    def clustering(self, clusterNo):
        km = KMedoids(clusterNo, metric='precomputed', init='k-medoids++')
        km.fit(self.metric)

        self.labels_ = km.labels_[:len(self.classes)]
Example #28
                               alpha=0.9).run()
scores = orig_scores.copy()

cos_dists = squareform(pdist(X, metric='cosine'))

np.fill_diagonal(scores, np.inf)

# >>

k = int(0.1 * n_train)

print('-' * 50)

# KMedoids
kmed = KMedoids(n_clusters=k, metric='precomputed', max_iter=1000)
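# the scores are similarities (higher = closer), so flip them into distances
# before fitting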
train = kmed.fit(orig_scores.max() - orig_scores).medoid_indices_
pred_idx = orig_scores[:, train].argmax(axis=-1)
print('kmedoid   (diff)', metric_fn(y, y[train][pred_idx]))

# Heuristic
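# pick the k points with the highest mean similarity to all others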
train = scores.mean(axis=0).argsort()[-k:]
pred_idx = orig_scores[:, train].argmax(axis=-1)
print('heuristic (diff)', metric_fn(y, y[train][pred_idx]))

# Random
train = np.random.choice(X.shape[0], k, replace=False)
pred_idx = orig_scores[:, train].argmax(axis=-1)
print('random    (diff)', metric_fn(y, y[train][pred_idx]))

print('-' * 50)

# Clustering - KMedoids (initial approach with 3 clusters)
kmedoids = KMedoids(
    metric="euclidean",
    n_clusters=3,
)
kmedoids.fit(standardized_features)

# The final sum of squared errors (SSE)
kmedoids.inertia_

# Final cluster centers; for KMedoids these are actual data points (medoids)
kmedoids.cluster_centers_

# The number of iterations required to converge
kmedoids.n_iter_

# How many clusters should be calculated?
#   Using elbow method
print("="*27)
print("Clustering using K-Medoids")
print("="*27)
Example #30
class DFKMedoids(BaseEstimator, ClusterMixin):
    def __init__(self, cluster_name='KMedoids', columns=None,
                 eval_inertia=False, eval_silhouette=False, eval_chi=False, eval_dbi=False, eval_sample_size=None,
                 **kwargs):
        self.cluster_name     = cluster_name
        self.columns          = columns
        self.model            = KMedoids(**kwargs)
        self.eval_inertia     = eval_inertia
        self.eval_silhouette  = eval_silhouette
        self.eval_chi         = eval_chi
        self.eval_dbi         = eval_dbi
        self.eval_sample_size = eval_sample_size
        self.transform_cols   = None
        self.eval_df          = None
        self.centroid_df      = None
        
    def fit(self, X, y=None):
        self.columns        = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]

        # Evaluation
        if any([self.eval_inertia, self.eval_silhouette, self.eval_chi, self.eval_dbi]):
            inertias    = []
            silhouettes = []
            chis        = []
            dbis        = []

            self.eval_df = pd.DataFrame({
                'n_cluster': [x+1 for x in range(self.model.n_clusters)],
            })
            self.eval_df['centroid'] = self.eval_df['n_cluster'].apply(lambda x: [])

            tmp_X = X[self.transform_cols].copy()
            index = 0
            for n_cluster in tqdm(self.eval_df['n_cluster'].values):
                model = copy.deepcopy(self.model)
                model.n_clusters = n_cluster
                model.fit(tmp_X)

                # Cluster centroid
                self.eval_df.at[index, 'centroid'] = model.cluster_centers_

                # Reference: https://blog.cambridgespark.com/how-to-determine-the-optimal-number-of-clusters-for-k-means-clustering-14f27070048f
                if self.eval_inertia:
                    inertias.append(model.inertia_)

                # Reference: https://towardsdatascience.com/clustering-metrics-better-than-the-elbow-method-6926e1f723a6
                if self.eval_silhouette:
                    silhouettes.append(np.nan if n_cluster <= 1 else silhouette_score(tmp_X, model.labels_, sample_size=self.eval_sample_size, metric='euclidean', random_state=model.random_state))

                # Reference: https://stats.stackexchange.com/questions/52838/what-is-an-acceptable-value-of-the-calinski-harabasz-ch-criterion
                if self.eval_chi:
                    chis.append(np.nan if n_cluster <= 1 else calinski_harabasz_score(tmp_X, model.labels_))

                # Reference: https://stackoverflow.com/questions/59279056/davies-bouldin-index-higher-or-lower-score-better
                if self.eval_dbi:
                    dbis.append(np.nan if n_cluster <= 1 else davies_bouldin_score(tmp_X, model.labels_))

                index += 1

            if self.eval_inertia:
                self.eval_df['inertia'] = inertias

            if self.eval_silhouette:
                self.eval_df['silhouette'] = silhouettes

            if self.eval_chi:
                self.eval_df['calinski_harabasz'] = chis

            if self.eval_dbi:
                self.eval_df['davies_bouldin'] = dbis

        # Train
        else:
            self.model.fit(X[self.transform_cols])

            self.centroid_df = pd.DataFrame(
                self.model.cluster_centers_,
                columns=self.transform_cols
            )
            self.centroid_df['Cluster'] = [f'Cluster {x}' for x in np.unique(self.model.labels_)]
            self.centroid_df.set_index('Cluster', inplace=True)
            self.centroid_df.index.name = None

        return self
    
    def predict(self, X):
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        new_X = X.copy()
        new_X[self.cluster_name] = self.model.predict(X[self.transform_cols])
        new_X[self.cluster_name] = 'Cluster ' + new_X[self.cluster_name].astype(str)

        return new_X

    def fit_predict(self, X, y=None):
        return self.fit(X).predict(X)

    def predict_proba(self, X):
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        # Measure distance to centroid
        prob_df = pd.DataFrame(
            DistanceMetric.get_metric('euclidean').pairwise(X[self.transform_cols], self.centroid_df),
            columns=[f'{self.cluster_name} Cluster {x}' for x in range(len(self.centroid_df))]
        )
        # Convert to probability
        prob_df = prob_df.divide(prob_df.sum(axis=1), axis=0)
        prob_df = 1 - prob_df
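        # note: each row now sums to n_clusters - 1, so these are relative
        # closeness scores rather than true probabilities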

        new_X = pd.concat([X, prob_df], axis=1)

        return new_X
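
A minimal usage sketch for DFKMedoids (the column names and n_clusters below
are illustrative, not from the original):

df = pd.DataFrame({'x1': [0.0, 0.1, 5.0, 5.1], 'x2': [0.0, 0.2, 5.0, 4.9]})
model = DFKMedoids(n_clusters=2, random_state=0)
clustered = model.fit_predict(df)  # adds a 'KMedoids' column with 'Cluster N' labels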