def elbow_analysis(sample_file_path, kmin, kmax, **kwargs):
    """Run the elbow method on a sample file, plot the WCE curve, show clusters.

    Reads the sample, estimates the most probable amount of clusters via the
    elbow method over K in [kmin, kmax), clusters with k-means and visualizes
    both the elbow curve (elbow point highlighted) and the final clusters.
    """
    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
    points = read_sample(sample_file_path)

    analyser = elbow(points, kmin, kmax, initializer=initializer)
    analyser.process()

    amount_clusters = analyser.get_amount()
    wce = analyser.get_wce()

    # Cluster with the amount of clusters suggested by the elbow method.
    start_centers = kmeans_plusplus_initializer(points, amount_clusters).initialize()
    clustering = kmeans(points, start_centers)
    clustering.process()
    found_clusters = clustering.get_clusters()
    final_centers = clustering.get_centers()

    print("Sample '%s': Obtained amount of clusters: '%d'." %
          (sample_file_path, amount_clusters))

    figure = plt.figure(1)
    axis = figure.add_subplot(111)
    axis.plot(range(kmin, kmax), wce, color='b', marker='.')
    # Mark the detected elbow point on the curve and label it.
    axis.plot(amount_clusters, wce[amount_clusters - kmin],
              color='r', marker='.', markersize=10)
    axis.annotate("Elbow",
                  (amount_clusters + 0.1, wce[amount_clusters - kmin] + 5))
    axis.grid(True)
    plt.ylabel("WCE")
    plt.xlabel("K")
    plt.show()

    kmeans_visualizer.show_clusters(points, found_clusters, final_centers)
Esempio n. 2
0
    def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
        """Check that the elbow method finds the answer file's cluster amount.

        The elbow method relies on random initial centers, so the check is
        retried several times before the test is considered failed; the wrong
        amounts seen along the way are reported in the failure message.
        """
        attempts = 10
        success = False

        initializer = kwargs.get('initializer', kmeans_plusplus_initializer)

        sample = read_sample(path_to_data)
        answer = answer_reader(path_to_answer)

        wrong_amounts = []

        for _ in range(attempts):
            instance = elbow(sample, kmin, kmax, ccore=ccore, initializer=initializer)
            instance.process()

            found_amount = instance.get_amount()
            found_wce = instance.get_wce()

            # Sanity checks: amount strictly inside (kmin, kmax), one WCE value
            # per tested K, and the curve decreasing overall (small epsilon).
            assertion.gt(found_amount, kmin)
            assertion.lt(found_amount, kmax)
            assertion.eq(len(found_wce), kmax - kmin)
            assertion.lt(found_wce[-1], found_wce[0] + 0.0000001)

            if found_amount == len(answer.get_clusters()):
                success = True
                break

            wrong_amounts.append(found_amount)

        message = str(len(answer.get_clusters())) + ": " + str(wrong_amounts)
        assertion.true(success, message=message)
    def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
        """Verify the elbow method against a known answer, retrying on bad seeds."""
        attempts = 5  # random initial centers -> retry before declaring failure
        passed = False

        initializer = kwargs.get('initializer', kmeans_plusplus_initializer)

        sample = read_sample(path_to_data)
        answer = answer_reader(path_to_answer)

        for _ in range(attempts):
            instance = elbow(sample, kmin, kmax, ccore=ccore, initializer=initializer)
            instance.process()

            found_amount = instance.get_amount()
            found_wce = instance.get_wce()

            # Amount strictly inside (kmin, kmax); WCE strictly decreasing overall.
            assertion.gt(found_amount, kmin)
            assertion.lt(found_amount, kmax)
            assertion.eq(len(found_wce), kmax - kmin)
            assertion.lt(found_wce[-1], found_wce[0])

            if found_amount == len(answer.get_clusters()):
                passed = True
                break

        assertion.true(passed)
Esempio n. 4
0
def kmediansWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS,
                      k_clusters, measure, kmin, kmax):
    """Cluster a data file with k-medians and compute clustering quality scores.

    Computes silhouette (plus its mean), Davies-Bouldin and Calinski-Harabasz
    scores for a k-medians clustering, then runs an elbow analysis over
    [kmin, kmax).

    NOTE(review): all results are computed but neither returned nor persisted;
    the witCSV calls that wrote them out are commented below — presumably the
    name* parameters were their target filenames. `measure` is unused here.
    Uses the module-level `root` as the data directory (Windows-style '\\\\' join).
    """
    data = read_sample(str(root) + '\\' + nameData)

    # Seed k-medians with k-means++-style initial medians.
    initial_medians = kppi(data, k_clusters).initialize()
    kmedians_instance = kmedians(data, initial_medians)
    kmedians_instance.process()

    clusters = kmedians_instance.get_clusters()
    #    final_medians = kmedians_instance.get_medians()

    # Per-point cluster labels, needed by the dbs/chs score functions below.
    predicted = kmedians_instance.predict(data)

    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    #wlitCSV(silhouetteScore, filenameSilhouette, '', root)
    #witCSV(meanSilhouetteScore, nameSilhouetteMean, '', root)

    dbsScore = dbs(data, predicted)
    #witCSV(dbsScore, nameDBS, '', root)

    chsScore = chs(data, predicted)
    #witCSV(chsScore, nameCHS, '', root)

    # Independent elbow analysis over the requested K range.
    elbow_instance = elbow(data, kmin, kmax)
    elbow_instance.process()
    amount_clusters = elbow_instance.get_amount(
    )  # most probable amount of clusters
    wce = elbow_instance.get_wce()
def subcluster(dataset):
    """Cluster *dataset* with Euclidean k-means, picking K via the elbow method.

    For a very narrow K range the midpoint is used instead of the elbow
    method; K is always clamped to the dataset size.
    """
    lower_k, upper_k = 1, 20
    if upper_k > len(dataset):
        upper_k = len(dataset)
    # Determining Clusters.
    # Might potentially be an inefficient technique: instead of elbow one
    # could repeat the main clustering over K values and keep the one with
    # the lowest error from calcError, but that would be very time intensive.
    if upper_k - lower_k <= 3:
        chosen_k = int((lower_k + upper_k) / 2)
    else:
        analyser = elbow(dataset, lower_k, upper_k)
        analyser.process()
        chosen_k = analyser.get_amount()
    chosen_k = min(chosen_k, len(dataset))
    seeds = kmeans_plusplus_initializer(dataset, chosen_k).initialize()
    clustering = kmeans(dataset, seeds,
                        metric=distance_metric(type_metric.EUCLIDEAN))
    clustering.process()

    return clustering.get_clusters()
    def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
        """Assert that the elbow method reproduces the answer file's cluster amount.

        Retries up to `repeat` times because the elbow method chooses random
        initial centers; on total failure reports the expected amount and the
        wrong amounts that were observed.
        """
        repeat = 10  # Elbow method randomly chooses initial centers therefore we need to repeat test if it fails.
        testing_result = False

        initializer = kwargs.get('initializer', kmeans_plusplus_initializer)

        sample = read_sample(path_to_data)
        answer = answer_reader(path_to_answer)

        # Collects each wrong amount for the failure message below.
        additional_info = []

        for _ in range(repeat):
            elbow_instance = elbow(sample, kmin, kmax, ccore=ccore, initializer=initializer)
            elbow_instance.process()

            actual_elbow = elbow_instance.get_amount()
            actual_wce = elbow_instance.get_wce()

            # Sanity: amount strictly inside (kmin, kmax), one WCE per tested K,
            # and the WCE curve decreasing overall (with a small epsilon).
            assertion.gt(actual_elbow, kmin)
            assertion.lt(actual_elbow, kmax)
            assertion.eq(len(actual_wce), kmax - kmin)
            assertion.lt(actual_wce[-1], actual_wce[0] + 0.0000001)

            if actual_elbow != len(answer.get_clusters()):
                additional_info.append(actual_elbow)
                #time.sleep(0.05)    # sleep to gain new seed for random generator
                continue

            testing_result = True
            break

        message = str(len(answer.get_clusters())) + ": " + str(additional_info)
        assertion.true(testing_result, message=message)
def calculateAppropriateNumberOfClusters(data,
                                         minimum,
                                         maximum,
                                         specInit=random_center_initializer):
    """Return (amount, wce) chosen by the elbow method over K in [minimum, maximum)."""
    analyser = elbow(data, minimum, maximum, initializer=specInit)
    analyser.process()
    return (analyser.get_amount(), analyser.get_wce())
Esempio n. 8
0
def elbow_analysis(sample_file_path, kmin, kmax, **kwargs):
    """Run the elbow method on a sample file, plot the WCE curve, show clusters.

    Estimates the most probable amount of clusters over K in [kmin, kmax),
    clusters the sample with k-means using that amount, prints the result and
    visualizes both the elbow curve and the clusters.
    """
    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
    sample = read_sample(sample_file_path)

    elbow_instance = elbow(sample, kmin, kmax, initializer=initializer)
    elbow_instance.process()

    amount_clusters = elbow_instance.get_amount()
    wce = elbow_instance.get_wce()

    # Cluster with the amount of clusters suggested by the elbow method.
    centers = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
    kmeans_instance = kmeans(sample, centers)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()

    print("Sample '%s': Obtained amount of clusters: '%d'." % (sample_file_path, amount_clusters))

    figure = plt.figure(1)
    ax = figure.add_subplot(111)
    ax.plot(range(kmin, kmax), wce, color='b', marker='.')
    # Highlight and label the detected elbow point on the curve.
    ax.plot(amount_clusters, wce[amount_clusters - kmin], color='r', marker='.', markersize=10)
    ax.annotate("Elbow", (amount_clusters + 0.1, wce[amount_clusters - kmin] + 5))
    ax.grid(True)
    plt.ylabel("WCE")
    plt.xlabel("K")
    plt.show()

    kmeans_visualizer.show_clusters(sample, clusters, centers)
Esempio n. 9
0
def calculateAppropriateNumberOfClusters(data, minimum, maximum):
    """Return the elbow-estimated number of clusters for *data* over [minimum, maximum)."""
    analyser = elbow(data, minimum, maximum,
                     initializer=random_center_initializer)
    analyser.process()
    return analyser.get_amount()
Esempio n. 10
0
    def random_state_fixed(path_to_data, kmin, kmax, ccore, **kwargs):
        """Check that elbow results are reproducible with identical arguments.

        Runs the elbow method twice with the same parameters (a fixed random
        state is expected via **kwargs) and asserts both runs agree on the
        cluster amount and the WCE curve.
        """
        repeat = kwargs.get('repeat', 1)

        for _ in range(repeat):
            sample = read_sample(path_to_data)

            first = elbow(sample, kmin, kmax, ccore=ccore,
                          **kwargs).process()
            second = elbow(sample, kmin, kmax, ccore=ccore,
                           **kwargs).process()

            assertion.eq(first.get_amount(), second.get_amount())
            assertion.eq(first.get_wce(), second.get_wce())
Esempio n. 11
0
def elbow_k_means(key_word, model_path):
    """Cluster the top-100 words most similar to *key_word* in 2-D PCA space.

    Loads a word-embedding model, projects the similar words' vectors to 2-D
    with PCA, picks the number of clusters with the elbow method (K in
    [1, 10)), clusters with k-means and returns a JSON-ready structure.

    Returns:
        (int, list): the number of clusters found, and a list of
        ``{"words": [{"word": {"text": ..., "correlation": ...}}, ...]}``
        dictionaries, one per cluster.
    """
    logger = Logger(model_path)
    model = logger.model
    result = model.most_similar(key_word, topn=100)

    word_vectors = []
    word_names = []
    word_correlation = []
    for r in result:
        word_vectors.append(model.wv[r[0]])
        word_names.append(r[0])
        word_correlation.append(r[1])

    # Reduce the embeddings to 2-D before clustering.
    # (Original code named this `tsne`, but it is a PCA projection.)
    reducer = PCA(n_components=2)
    points = reducer.fit_transform(word_vectors)

    kmin, kmax = 1, 10
    elbow_instance = elbow(points, kmin, kmax)
    elbow_instance.process()
    amount_clusters = elbow_instance.get_amount()

    centers = kmeans_plusplus_initializer(
        points,
        amount_clusters,
        amount_candidates=kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
    k_means_instance = kmeans(points, centers)
    k_means_instance.process()

    clusters = k_means_instance.get_clusters()

    # Build the serializable cluster -> words structure.
    cluster_list = []
    for c in clusters:
        words_list = []
        for i in c:
            word_dict = {"text": word_names[i],
                         "correlation": word_correlation[i]}
            words_list.append({"word": word_dict})
        cluster_list.append({"words": words_list})

    return len(clusters), cluster_list
Esempio n. 12
0
    def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore,
                        **kwargs):
        """Assert the elbow method finds the expected cluster amount, with K step.

        `path_to_answer` may be an answer-file path, the expected amount as an
        int, or None (then only the sanity checks run). Supports a `kstep`
        kwarg controlling the K sampling step; retries on random-seed misses.
        """
        repeat = 15  # Elbow method randomly chooses initial centers therefore we need to repeat test if it fails.
        testing_result = False
        kstep = kwargs.get('kstep', 1)

        sample = read_sample(path_to_data)

        expected_clusters_amount = None
        if path_to_answer is not None:
            if isinstance(path_to_answer, int):
                expected_clusters_amount = path_to_answer
            else:
                expected_clusters_amount = len(
                    answer_reader(path_to_answer).get_clusters())

        # Collects each wrong amount for the failure message below.
        additional_info = []

        for _ in range(repeat):
            elbow_instance = elbow(sample, kmin, kmax, ccore=ccore, **kwargs)
            elbow_instance.process()

            actual_elbow = elbow_instance.get_amount()
            actual_wce = elbow_instance.get_wce()

            # Sanity: amount strictly inside (kmin, kmax); one WCE value per
            # sampled K (range stepped by kstep); curve decreasing overall.
            assertion.gt(actual_elbow, kmin)
            assertion.lt(actual_elbow, kmax)
            assertion.eq(len(actual_wce),
                         math.floor((kmax - kmin) / kstep + 1))
            assertion.lt(actual_wce[-1], actual_wce[0] + 0.0000001)

            if (expected_clusters_amount is not None) and (
                    actual_elbow != expected_clusters_amount):
                additional_info.append(actual_elbow)
                continue

            testing_result = True
            break

        message = None
        if expected_clusters_amount is not None:
            message = str(expected_clusters_amount) + ": " + str(
                additional_info)

        assertion.true(testing_result, message=message)
Esempio n. 13
0
def run_elbow(data):
    """Pick K with the elbow method (K in [1, 10)) and visualize k-means clusters."""
    low, high = 1, 10
    analyser = elbow(data, low, high)
    analyser.process()
    best_k = analyser.get_amount()

    # Seed k-means with k-means++ using the farthest-center candidate strategy.
    seeds = kmeans_plusplus_initializer(
        data, best_k,
        amount_candidates=kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE,
    ).initialize()

    clustering = kmeans(data, seeds)
    clustering.process()

    kmeans_visualizer.show_clusters(data,
                                    clustering.get_clusters(),
                                    clustering.get_centers())
Esempio n. 14
0
def subcluster(dataset):
    """Cluster *dataset* with Euclidean k-means; K from elbow (or range midpoint).

    The K search runs from the point dimensionality up to the dataset size;
    when that range is very narrow the midpoint is used instead of the elbow
    method, and K is clamped to the dataset size.
    """
    low = len(dataset[0])   # search starts at the dimensionality of a point
    high = len(dataset)
    if high - low <= 3:
        best_k = int((low + high) / 2)
    else:
        analyser = elbow(dataset, low, high)
        analyser.process()
        best_k = analyser.get_amount()
    best_k = min(best_k, len(dataset))
    seeds = kmeans_plusplus_initializer(dataset, best_k).initialize()
    clustering = kmeans(dataset, seeds,
                        metric=distance_metric(type_metric.EUCLIDEAN))
    clustering.process()

    return clustering.get_clusters()
Esempio n. 15
0
def elbow_kmeans_optimizer(X, k=None, kmin=1, kmax=5, visualize=True):
    """k-means clustering with or without automatically determined cluster numbers. 
    Reference: https://pyclustering.github.io/docs/0.8.2/html/d3/d70/classpyclustering_1_1cluster_1_1elbow_1_1elbow.html
    
    # Arguments:
        X (numpy array-like): Input data matrix.
        k: Fixed number of clusters. If None, the amount is chosen by the
            elbow method over [kmin, kmax). Defaults to None.
        kmin: Minimum number of clusters to consider. Defaults to 1.
        kmax: Maximum number of clusters to consider. Defaults to 5.
        visualize: Whether to perform k-means visualization or not.
    
    # Returns:
        numpy arraylike: Clusters.
        numpy arraylike: Cluster centers.
    """
    # Only the pyclustering pieces actually used are imported here
    # (the original imported several unused modules).
    from pyclustering.cluster.kmeans import kmeans, kmeans_visualizer
    from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
    from pyclustering.cluster.elbow import elbow

    if k is not None:
        amount_clusters = k
    else:
        # Let the elbow method pick the most probable amount of clusters.
        elbow_instance = elbow(X, kmin, kmax)
        elbow_instance.process()
        amount_clusters = elbow_instance.get_amount()

    centers = kmeans_plusplus_initializer(X, amount_clusters).initialize()
    kmeans_instance = kmeans(X, centers)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()

    # Bug fix: honor the 'visualize' flag — previously the visualization
    # always ran regardless of the parameter value.
    if visualize:
        kmeans_visualizer.show_clusters(X, clusters, centers)
    return clusters, centers
Esempio n. 16
0
    c += 1

#Setting up training and testing data
X = np.array(list(job_data.values()))
job_data.clear()
load.clear()
rows.clear()
X_normed = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
Y = np.array(list(wait_time.values()))
R = np.array(list(run_time.values()))
X_train, X_test = X_normed[:20000], X_normed[20000 + 1:60000]
Y_train, Y_test = Y[:20000], Y[20000 + 1:60000]
R_train, R_test = R[:20000], R[20000 + 1:60000]

#Applying ELBOW
elbow_instance = elbow(X_train, 1, 50)
elbow_instance.process()
amount_clusters = elbow_instance.get_amount(
)  # most probable amount of clusters
print(amount_clusters)
kmeans = KMeans(n_clusters=amount_clusters, random_state=0).fit(X_train)
joblib.dump(kmeans, 'XGB/Kmeans.pkl')
reg_models = []
for i in range(amount_clusters):
    x = X_train[kmeans.labels_ == i]
    y = Y_train[kmeans.labels_ == i]
    x = x[:, :4]
    reg = xgb.XGBRegressor(objective='reg:squarederror',
                           eval_metric='mae',
                           colsample_bytree=0.75,
                           learning_rate=0.01,
def createvisual(fileNum, type):
    """Cluster the columns of a query-behavior matrix and save a visual diff.

    Reads a 0/1 characteristic matrix from a numbered text file, clusters its
    rows (i.e. matrix columns after the transpose below) with k-means using
    either a Hamming-like user metric (type == 0) or Euclidean distance,
    reorders the columns so clustered columns are adjacent, and saves a
    side-by-side image of the reordered vs. original matrix.

    NOTE(review): relies on module-level `calcError` and an open file `f`
    for the error-ratio log — confirm both exist at call time.
    """
    sample = read_sample("TestData/QueryBehaviorText/" + str(fileNum) + ".txt")

    # Sample is simply matrix holding values, can be accessed for values just like any other
    kmin, kmax = 1, len(sample)

    elbow_inst = elbow(sample, kmin, kmax)

    elbow_inst.process()

    optimal_clusters = elbow_inst.get_amount()

    initial_centers = kmeans_plusplus_initializer(
        sample, optimal_clusters).initialize()

    # user_function = lambda point1, point2: sum(l1 != 12 for l1, l2 in zip(point1, point2))

    # Hamming-style distance: number of coordinates where the points differ.
    user_function = lambda point1, point2: np.count_nonzero(
        np.array(point1) != np.array(point2))

    metricUser = distance_metric(type_metric.USER_DEFINED, func=user_function)

    # print(metricUser([0, 1, 1], [0, 0, 1]))

    # Default to Euclidean; switch to the user-defined metric for type == 0.
    metric = distance_metric(type_metric.EUCLIDEAN)
    if type == 0:
        metric = distance_metric(type_metric.USER_DEFINED, func=user_function)
    kmeans_instance = kmeans(sample, initial_centers, metric=metric)

    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    print(fileNum)
    print(clusters)
    print("\n\n\n")

    final_centers = kmeans_instance.get_centers()

    # mockDataArr[i]: which original column currently sits at position i.
    mockDataArr = []

    for i in range(len(sample)):
        mockDataArr.append(i)
    # mockDataPos[c]: current position of original column c (inverse of above).
    mockDataPos = {}
    for i in range(len(sample)):
        mockDataPos[i] = i
    # print("\n")
    # print("Position Mapping Hashmap: ", mockDataPos)
    # print("\n")
    # print("Initial Column Positions: ", mockDataArr, "\n")

    # Desired column order: cluster members laid out contiguously.
    mockDataClustered = []

    for cluster in clusters:
        mockDataClustered.extend(cluster)

    imageData = []

    origMulitDimen = np.array(sample, dtype=int)

    # print("Original Coordinates")

    # print(origMulitDimen)

    # Transpose so each clustered sample row becomes a matrix column.
    numpyChar = np.transpose(origMulitDimen)

    originalSave = np.copy(numpyChar)

    printNumpy = np.insert(numpyChar, 0, mockDataArr, 0)

    # In-place selection-style column permutation: move the column that should
    # be at position i into place, keeping the order array and position map in
    # sync with every swap. The last position is correct by elimination.
    for i in range(len(mockDataArr) - 1):
        if i != mockDataPos[mockDataClustered[i]]:
            temp = np.copy(numpyChar[:, i])

            realTemp = mockDataArr[i]

            mockDataArr[i] = mockDataArr[mockDataPos[mockDataClustered[i]]]

            mockDataArr[mockDataPos[mockDataClustered[i]]] = realTemp

            numpyChar[:, i] = numpyChar[:, mockDataPos[mockDataClustered[i]]]

            numpyChar[:, mockDataPos[mockDataClustered[i]]] = temp

            temp2 = mockDataPos[mockDataClustered[i]]

            mockDataPos[mockDataClustered[i]] = i

            mockDataPos[realTemp] = temp2

    printArray = np.insert(numpyChar, 0, np.array(mockDataArr), 0)

    # Log the clustered/original error ratio (percent) and render the figure.
    swappederror = calcError(numpyChar)
    defaulterror = calcError(originalSave)
    f.write(str(swappederror / defaulterror * 100) + "\n")
    print(swappederror / defaulterror)
    fig, ax = plt.subplots(1, 2)
    fig.suptitle('Clusters: ' + str(len(clusters)), fontsize=20)
    fig.text(.5, .05, 'Clustered Columns: ' + str(clusters), ha='center')
    fig.text(.5, .1, 'Original Error: ' + str(defaulterror), ha='center')
    fig.text(.5, .15, 'Clustered Error: ' + str(swappederror), ha='center')
    ax[0].imshow(numpyChar, cmap=plt.cm.Greys)

    ax[1].imshow(originalSave, cmap=plt.cm.Greys)

    ax[0].title.set_text('Clustered Characteristic Matrix')

    ax[1].title.set_text('Original Charecteristic Matrix')
    fig.set_size_inches(10, 7)
    # Output directory depends on the metric that was used.
    if type == 0:
        plt.savefig("TestData/QueryBehaviorVisualsHamming/" + str(fileNum) +
                    ".png")
    else:
        plt.savefig("TestData/QueryBehaviorVisualsEuclidean/" + str(fileNum) +
                    ".png")
Esempio n. 18
0
print('Successfully loaded all modules')

# load image encodings
# NOTE: the name `encodings` is first the filename, then rebound to the array.
print('Loading encodings...')
start = time.time()
encodings = "encodings.npy"
encodings = np.load(encodings)
stop = time.time()
print('Encodings loaded successfully', '[', round(stop - start, 2),
      'seconds ]')

# create elbow instance (K searched in [2, 100))
print('Creating elbow instance...')
start = time.time()
elbow_instance = elbow(encodings, 2, 100)
stop = time.time()
print('Elbow instance created', '[', round(stop - start, 2), 'seconds ]')

# find the optimal value for K (no. of groups) using elbow algorithm
print('Getting the optimal number of clusters using elbow...')
start = time.time()
elbow_instance.process()
K = elbow_instance.get_amount()
stop = time.time()
print(K, 'clusters should be formed according to Elbow', '[',
      round(stop - start, 2), 'seconds ]')

# load the distance matrix (similarity matrix)
print('Loading the similarity (distance) matrix...')
start = time.time()
Esempio n. 19
0
 def _optimal_cluster(self, kmax=50):
     """Return (amount, wce) from the elbow method over K in [1, kmax).

     Runs the elbow analysis on the trajectory values held by this object.
     """
     analyser = elbow(self.traj.values, 1, kmax)
     analyser.process()
     return analyser.get_amount(), analyser.get_wce()
Esempio n. 20
0
if maxClusters > len(mash_mat):
    print(
        "Warning: max number of clusters exceeds size of mash matrix. Reducing maxClusters."
    )
    maxClusters = len(mash_mat)

# collapse the distance matrix
X = distance.pdist(mash_mat).reshape(-1, 1)

# define the range of number of clusters to test
range_n_clusters = range(minClusters, maxClusters + 1)

# use the elbow method
if clusterMethod == "elbow":

    elbow_instance = elbow(X, range_n_clusters[0], range_n_clusters[-1] + 1)
    elbow_instance.process()
    wce = elbow_instance.get_wce()
    chosen_nClusters = elbow_instance.get_amount()

    if interactive:
        plt.plot(range_n_clusters, wce, 'bx-')
        plt.xlabel('#Clusters')
        plt.ylabel("Distortion")
        plt.title("Elbow Method showing optimal K")
        plt.draw()
        plt.pause(0.001)

        manual_clusters = input(
            "Chose {} clusters. OK? [y/n] ".format(chosen_nClusters))
        if manual_clusters in ["n", "N", "No", "no"]: