Example 1
import os

import matplotlib.pyplot as plt
from sklearn.cluster import Birch, MiniBatchKMeans
from sklearn.datasets import make_blobs
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# FIGURES (the output directory) is assumed to be defined elsewhere in the module.


def clustering(fname="clustering.png"):
    # Create side-by-side axes grid
    _, axes = plt.subplots(ncols=2, figsize=(18,6))
    X, y = make_blobs(centers=7)

    # Add K-Elbow to the left
    oz = KElbowVisualizer(MiniBatchKMeans(), k=(3,12), ax=axes[0])
    oz.fit(X, y)
    oz.finalize()

    # Add SilhouetteVisualizer to the right
    oz = SilhouetteVisualizer(Birch(n_clusters=5), ax=axes[1])
    oz.fit(X, y)
    oz.finalize()

    # Save figure
    path = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(path)
Example 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer


def makeK(d, ilist, title):
    d = np.array(d)
    kk = pd.DataFrame({
        'Variance': d[:, 0],
        'Skewness': d[:, 1],
        'Kurtosis': d[:, 2]
    })
    K = 20
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(1, K))
    visualizer.fit(kk)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
    kIdx = visualizer.elbow_value_  # the k chosen by the elbow method
    model = KMeans(n_clusters=kIdx).fit(kk)
    # scatter plot
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')  # Axes3D(fig) no longer attaches itself in recent matplotlib
    cmap = plt.get_cmap('gnuplot')
    clr = [cmap(i) for i in np.linspace(0, 1, kIdx)]
    for i in range(0, kIdx):
        ind = (model.labels_ == i)
        ax.scatter(d[ind, 2],
                   d[ind, 1],
                   d[ind, 0],
                   s=30,
                   c=clr[i],
                   label='Cluster %d' % i)

    ax.set_xlabel("Kurtosis")
    ax.set_ylabel("Skew")
    ax.set_zlabel("Variance")
    plt.title(title + ': KMeans clustering with K=%d' % kIdx)
    plt.legend()
    plt.savefig(title + "clustersnoises.png")
    plt.show()
    d = pd.DataFrame(
        {
            'Variance': d[:, 0],
            'Skewness': d[:, 1],
            'Kurtosis': d[:, 2],
            'Alpha': d[:, 3],
            'Beta': d[:, 4],
            "Psi": d[:, 5],
            "Cluster": model.labels_
        },
        index=ilist)
    return d
Example 3
    def elbow_test(self, df, k_values):

        # extract
        var_list = self.variable_list
        path_out = self.save_path

        # check if a directory for elbow test exists
        path_out = os.path.join(path_out, 'Elbow test results')
        if not os.path.exists(path_out):
            os.makedirs(path_out)

        # based on distortion score
        plt.figure()
        elbow_k = cluster.KMeans()
        visualizer = KElbowVisualizer(elbow_k,
                                      k=(min(k_values), max(k_values)))
        visualizer.fit(df[var_list])
        visualizer.show(
            outpath=os.path.join(path_out, "kelbow_distortion.jpg"))
        optimal_k = {'Distortion score': visualizer.elbow_value_}

        # based on calinski_harabasz
        plt.figure()
        visualizer = KElbowVisualizer(elbow_k,
                                      k=(min(k_values), max(k_values)),
                                      metric='calinski_harabasz',
                                      timings=False,
                                      locate_elbow=True)
        visualizer.fit(df[var_list])
        visualizer.show(
            outpath=os.path.join(path_out, "kelbow_calinski-harabasz.jpg"))
        optimal_k['Calinski-Harabasz score'] = visualizer.elbow_value_

        # based on silhouette score
        plt.figure()
        visualizer = KElbowVisualizer(elbow_k,
                                      k=(min(k_values), max(k_values)),
                                      metric='silhouette',
                                      timings=False,
                                      locate_elbow=True)
        visualizer.fit(df[var_list])
        visualizer.show(outpath=os.path.join(path_out, "silhouette.jpg"))
        optimal_k['Silhouette score'] = visualizer.elbow_value_  # get_params() returned hyperparameters, not the elbow

        # set the optimal values of k
        self.optimal_k = optimal_k
Example 4
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from yellowbrick.cluster import InterclusterDistance, KElbowVisualizer

clf = PCA(random_state=0)

print(clf)

results = clf.fit_transform(X_train)

model = KMeans(random_state=0)  # n_jobs was deprecated in scikit-learn 0.23 and later removed

# https://www.scikit-yb.org/en/latest/api/cluster/elbow.html
visualizer = KElbowVisualizer(model, k=(1, 20))

visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure (poof() is just a deprecated alias of show())
visualizer.show(outpath="charts/income.k-means.PCA.KElbowVisualizer.png")

model = KMeans(n_clusters=4, random_state=0)
visualizer = InterclusterDistance(model)

visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(outpath="charts/income.k-means.PCA.InterclusterDistance.png")
Example 5
# Finally, build the RFM dataframe

last_date = order_payment['order_delivered_carrier_date'].max() + timedelta(
    days=1)

rfm = order_payment.groupby('customer_id').agg({
    'order_delivered_carrier_date': lambda x: (last_date - x.max()).days,  # Recency
    'order_id': lambda x: len(x),                                          # Frequency
    'payment_value': 'sum'                                                 # Monetary value
})

rfm.dropna(inplace=True)
std = StandardScaler()
x_std = std.fit_transform(rfm)
model = KMeans()
visualizer = KElbowVisualizer(model, k=(4, 12))

visualizer.fit(x_std)
visualizer.show()

model_k = KMeans(n_clusters=4)
kmeans = model_k.fit(x_std)
rfm['cluster'] = kmeans.labels_

rfm.columns = ['Recency', 'Frequency', 'MonetaryValue', 'cluster']

rfm.head()
h5f.close()

h5f = h5py.File(
    '/models/mccikpc2/CPI-analysis/cnn/model_t5_epochs_50_dense64_3a_aux.h5',
    'r')
test_idx = h5f['test_idx'][:]
h5f.close()

model = KMeans()
plt.ion()
plt.show()

# elbow plot
plt.figure()
visualizer = KElbowVisualizer(model, k=(2, 30), timings=True, verbose=1)
visualizer.fit(cod1[test_idx])

# silhouette plot
plt.figure()
visualizer = KElbowVisualizer(model,
                              k=(2, 30),
                              metric='silhouette',
                              timings=True,
                              verbose=1)
visualizer.fit(cod1[test_idx])

# gaussian mixtures BIC & AIC
# https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html
plt.figure()
n_components = np.arange(1, 21)
models = [
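    # Hypothetical completion following the handbook example linked above;
    # assumes `from sklearn.mixture import GaussianMixture`
    GaussianMixture(n, covariance_type='full', random_state=0).fit(cod1[test_idx])
    for n in n_components
]
plt.plot(n_components, [m.bic(cod1[test_idx]) for m in models], label='BIC')
plt.plot(n_components, [m.aic(cod1[test_idx]) for m in models], label='AIC')
plt.legend()
plt.xlabel('n_components')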
Example 7
    #
    # centroids = model.cluster_centers_  # cluster centers
    # kmeans_plot(X_train, centroids)

    from sklearn.cluster import KMeans
    from sklearn.cluster import DBSCAN
    from yellowbrick.cluster import KElbowVisualizer

    xmin = np.min(X_train[:, 0])
    ymin = np.min(X_train[:, 1])
    xmax = np.max(X_train[:, 0])
    ymax = np.max(X_train[:, 1])

    kmeans = KElbowVisualizer(KMeans(), k=(2, 10))
    # kmeans = KMeans(n_clusters=n_clusters, init='random', n_init=1, random_state=0, max_iter=100)
    kmeans.fit(X_train)
    n_clusters = kmeans.elbow_value_
    kmeans = KMeans(n_clusters=n_clusters,
                    init='random',
                    n_init=1,
                    random_state=0,
                    max_iter=100)
    kmeans.fit(X_train)
    y_kmeans = kmeans.predict(X_train)  # cluster index for each observation
    centers = kmeans.cluster_centers_  # cluster center coordinates
    fig, ax = plt.subplots()
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_kmeans, s=5, cmap='summer')
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=100, alpha=0.5)

    from scipy.spatial import Voronoi, voronoi_plot_2d
    from scipy.spatial import ConvexHull, convex_hull_plot_2d
#plt.plot(K, distortions, 'm*-')
#plt.title('Elbow Method with distortion')
#plt.xlabel('Value of k')
#plt.ylabel('Distortion')
#plt.vlines(4,0,25000,colors='red',linestyles ="dashed")
#plt.grid()
#plt.show()

###############################################################################
#elbow 2

from yellowbrick.cluster import KElbowVisualizer
model = kmeans
visualizer = KElbowVisualizer(model, k=(4, 12))
plt.figure(11)
visualizer.fit(X)
visualizer.show()

#X= df.loc[df.index,['Position','Count']].to_numpy()
#num_clusters = 3
#kmeans = KMeans(n_clusters = num_clusters).fit(X)
#labels = kmeans.labels_
#n_clusters_ = kmeans.cluster_centers_
#
#
#################   1
#distortions = []
        plt.close()
        print('finished Part {}: data: {} PCA'.format(p, c))

        ######################
        # K-Means Clustering Baseline
        ######################

        fig = plt.figure(figsize=(17, 8))
        fig.suptitle('Part: {} Clustering Baseline data: {}'.format(p, c), size=16)

        # plot distortion: mean sum of squared distances to centers
        ax1 = plt.subplot(1, 3, 1)
        model = KMeans(n_init=1000)
        visualizer = KElbowVisualizer(model, k=(2,12),timings=False,metric='distortion', ax = ax1)
        visualizer.fit(data_p_Base[areas])        # Fit the data to the visualizer
        #visualizer.show()

        # plot silhouette: mean ratio of intra-cluster and nearest-cluster distance
        ax2 = plt.subplot(1, 3, 2)
        model = KMeans(n_init=1000)
        visualizer = KElbowVisualizer(model, k=(2,12),timings=False,metric='silhouette', ax = ax2)
        visualizer.fit(data_p_Base[areas])        # Fit the data to the visualizer
        #visualizer.show()

        # plot calinski_harabasz: ratio of within- to between-cluster dispersion
        ax3 = plt.subplot(1, 3, 3)
        model = KMeans(n_init=1000)
        visualizer = KElbowVisualizer(model, k=(2,12),timings=False,metric='calinski_harabasz', ax = ax3)
        visualizer.fit(np.array(data_p_Base[areas]))        # Fit the data to the visualizer
        #visualizer.show()
#%%
# Instantiate the clustering model and visualizer
if ask_user('Determine optimal number of clusters with elbow'):
    print('Determining the optimal number of clusters for KMeans!')

    vectorizer = TfidfVectorizer(stop_words=my_stopwords, max_features=300)
    X = vectorizer.fit_transform(documentstxtclean)
    Xdf = pd.DataFrame(X.toarray())

    model = KMeans()
    visualizer = KElbowVisualizer(model,
                                  k=range(2, 51, 1),
                                  metric='calinski_harabasz',
                                  timings=False)

    visualizer.fit(Xdf)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
else:
    print('Optimal number of clusters for KMeans not determined!')

#%% Add the derived clusters to the incident data
# The elbow result shows a break point at 10 and between 18 and 22

if ask_user('Add clusters to the data'):
    print('Cluster labels are being added to the dataset!')
    sse = {}
    k = 12
    kmeans = KMeans(n_clusters=k, max_iter=50).fit(Xdf)
    data['clusters'] = kmeans.labels_
    sse[k] = kmeans.inertia_  # Inertia: Sum of distances of samples to their closest cluster center
else:
Example 11
# clusters = 3
kmeans3 = KMeans(n_clusters=3, init='k-means++')
pred3 = kmeans3.fit_predict(selected)
print(pred3)
kmeans3.cluster_centers_

# clusters = 4
kmeans4 = KMeans(n_clusters=4, init='k-means++')
pred4 = kmeans4.fit_predict(selected)
kmeans4.cluster_centers_

# clusters = 5
kmeans5 = KMeans(n_clusters=5, init='k-means++')
pred5 = kmeans5.fit_predict(selected)
kmeans5.cluster_centers_

# correlation
c = selected.corr()

# visualization (each call below overplots the previous scatter; in a notebook,
# run them in separate cells to compare the three labelings)
array = selected.to_numpy()
plt.scatter(array[:, 0], array[:, 1], c=pred3, cmap='rainbow')
plt.scatter(array[:, 0], array[:, 1], c=pred4, cmap='rainbow')
plt.scatter(array[:, 0], array[:, 1], c=pred5, cmap='rainbow')

# elbow with yellow brick
from yellowbrick.cluster import KElbowVisualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 9))
visualizer.fit(array)
visualizer.show()
Example 12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str,
                        required=True)  # file we are reading
    parser.add_argument("--write_file", type=str, required=True)
    parser.add_argument("--stopwords", default=None,
                        type=str)  # stopwords file name
    parser.add_argument(
        "--min_threshold", type=int, default=10
    )  # cluster a word only if its number of appearances exceeds this threshold
    parser.add_argument(
        "--min_num_words", type=int, default=3
    )  # discard a word whose number of appearances is below this threshold
    parser.add_argument("--start", default=0, type=int)
    parser.add_argument("--end", default=-1, type=int)
    args = parser.parse_args()

    if args.stopwords:
        with open(args.stopwords, encoding="utf-8") as f:
            stopwords = f.readlines()
            stopwords = [st.strip() for st in stopwords]
    else:
        stopwords = []

    rootdir = args.input_file
    writefile = args.write_file
    list_dir = os.listdir(rootdir)

    if args.end == -1:
        args.end = len(list_dir)

    none_cluster = []
    discard_word = []
    for file_index, file_name in enumerate(tqdm(list_dir)):

        if not (file_index >= args.start and file_index < args.end):
            continue

        write_mode = "a"
        labels = []
        vectors = []

        if file_name[:-4] != '' and ".txt" in file_name:
            with open(rootdir + file_name) as f:
                line = f.readline()
                while line:
                    line = line.split("\t")
                    labels.append(line[0])
                    vectors.append([
                        line[0],
                        np.array(line[1].split(" "), dtype='float'),
                        np.array(line[2].split(" "), dtype='float')
                    ])  #[[label, vector_src, vector_tgt]]
                    line = f.readline()

            if len(vectors) < args.min_num_words:
                discard_word.append(file_name[:-4])
                continue

            if len(vectors) <= args.min_threshold or checkspecial(
                    file_name[:-4]) or file_name[:-4] in stopwords:
                cluster_src, cluster_tgt, cluster_label, cluster_entropy = get_mean_vector(
                    vectors, labels)
            else:
                vectors_src_all = np.vstack(list(map(lambda x: x[1], vectors)))
                model = KElbowVisualizer(KMeans(), k=(1, 8))
                model.fit(vectors_src_all)
                if model.elbow_value_ is None:
                    none_cluster.append(file_name[:-4])
                    cluster_src, cluster_tgt, cluster_label, cluster_entropy = get_mean_vector(
                        vectors, labels)
                else:
                    cluster_src, cluster_tgt, cluster_label, cluster_entropy = get_muti_mean_vector(
                        vectors_src_all, vectors, labels, model.elbow_value_)
            write_file(cluster_src, cluster_tgt, cluster_label,
                       cluster_entropy, write_mode, file_name, writefile)

    print("Number of None in clustering:", len(none_cluster))
    print("Number of words that have been discarded:", len(discard_word))
    print("List of file that not been clusted:", none_cluster)
    print("List of words that have been discarded:", discard_word)
Example 13
def main():
    """
    Using k-means for some data exploration and a potential solution for the license prediction problem
    """
    os.chdir('../../../all_files_generated')
    current_dir = os.getcwd()

    data_pickles_dir = os.path.join(current_dir, 'data_pickles')
    elbow_method_files_dir = os.path.join(current_dir, 'elbow_method_files')

    x_train_path = os.path.join(data_pickles_dir, 'x_train.pickle')
    x_validation_path = os.path.join(data_pickles_dir, 'x_validation.pickle')
    x_test_path = os.path.join(data_pickles_dir, 'x_test.pickle')
    y_train_path = os.path.join(data_pickles_dir, 'y_train.pickle')
    y_validation_path = os.path.join(data_pickles_dir, 'y_validation.pickle')
    y_test_path = os.path.join(data_pickles_dir, 'y_test.pickle')

    # read in all pickle files that may be required
    with open(x_train_path, 'rb') as data:
        x_train = pickle.load(data)

    with open(x_validation_path, 'rb') as data:
        x_validation = pickle.load(data)

    with open(x_test_path, 'rb') as data:
        x_test = pickle.load(data)

    with open(y_train_path, 'rb') as data:
        y_train = pickle.load(data)

    with open(y_validation_path, 'rb') as data:
        y_validation = pickle.load(data)

    with open(y_test_path, 'rb') as data:
        y_test = pickle.load(data)

    # combine all datasets
    x_train = sparse.vstack(
        (x_train, x_validation, x_test))  # scipy.sparse.csr matrix
    # Series.append was removed in pandas 2.0; use pd.concat instead
    y_train = pd.concat([y_train, pd.Series(y_validation), pd.Series(y_test)])  # pandas series

    use_yellowbrick = False

    if use_yellowbrick:
        license_classifier = KMeans()
        visualizer = KElbowVisualizer(license_classifier, k=(2, 100))
        visualizer.fit(x_train)
        visualizer.show()
    else:
        inertia = []
        k = range(2, 100)
        for i in k:
            license_classifier = KMeans(n_clusters=i)
            license_classifier.fit(x_train)
            inertia.append(license_classifier.inertia_)

        plt.plot(k, inertia)
        plt.xlabel('K')
        plt.ylabel('Inertia')
        plt.title('Elbow Method')

        elbow_method_path = os.path.join(
            elbow_method_files_dir, 'k_means_clustering_elbow_method.png')
        plt.savefig(elbow_method_path)

        plt.show()
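
if __name__ == "__main__":
    main()  # entry-point guard, added for completeness; not in the original excerpt
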
"""
finding optimal k using silhouette score
"""
# silhouette_scores = {}
k_range = list(range(4, 20))  # specify the range of k
for k in k_range:
    clusterer = KMeans(n_clusters=k, init='k-means++', random_state=20)
    cluster_labels = clusterer.fit_predict(df_kmean.values)  # compute cluster centers and predict the cluster index for each sample
    # print(cluster_labels)
    # the silhouette score gives the average value over all samples,
    # a perspective on the density and separation of the formed clusters
    silhouette_avg = silhouette_score(df_kmean.values, cluster_labels)
    # silhouette_scores[k] = silhouette_avg  # store each k with its silhouette score
    print(f"for k={k} clusters, the average silhouette_score is {silhouette_avg}")  # k >= 13 gives a good score

# I will use k=15 as the elbow, because when I compared the dissimilarity and
# similarity performance using optimal_k_compare, I saw that k=15 gives a better result
"""
finding optimal k using visualization (elbow)
"""
model = KMeans(init='k-means++')
visualizer = KElbowVisualizer(model, k=(4, 100), timings=False)  # k is the range of cluster counts to try
visualizer.fit(df_kmean.values)     # Fit the data to the visualizer
visualizer.show()                   # Finalize and render the figure



plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.ylabel('Distance')
plt.savefig('Figures/Cluster_analysis/Dendrogram_hierarch_clustering.png')
plt.close()

## Elbow Plot
'''Uses the Within-Cluster Sum-of-Squares and selects the model with the lowest
    sum of squares '''
### Initialize model and load package
from yellowbrick.cluster import KElbowVisualizer
sns.set(style = 'whitegrid', font_scale = 1.5)
model = KMeans()

### Make the Elbow plot
visualizer = KElbowVisualizer(model, k=(2, 15), timings=True, size=(1500, 900))
visualizer.fit(df[vars_tot])
visualizer.show('Figures/Cluster_analysis/WCSS.png')
plt.clf()

'''OLD
fig, ax = plt.subplots(figsize=(20,12)) 
ax.set(ylabel='Within-Cluster Sum-of-Squares (1e20)', xlabel = 'Number of Clusters')
ax.plot(np.arange(2,12,1),inertia / 1e20)
plt.xticks(np.arange(2,12,1))
plt.tight_layout()
fig.savefig('Figures/Cluster_analysis/WCSS.png') '''

'''NOTE There is a clear elbow at Five clusters. Proceed.
    '''
## Silhouette plot
### Initialize model
Example 16
K = range(1, 10)  # assumed range of k; the excerpt begins mid-script
ssd = []
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans_model = kmeans.fit(df)
    ssd.append(kmeans_model.inertia_)

plt.plot(K, ssd, "bx-")
plt.xlabel("Sum of residual distances for different values of K")
plt.title("Elbow Method for the Optimal Number of Clusters")
plt.show()

# When choosing the optimal number of clusters from the plot, focus on the
# largest break and create that many clusters (example = 3).

visu = KElbowVisualizer(kmeans, k=(2, 20))
visu_fit = visu.fit(df)
visu_fit.poof()
# visu finds and marks the best number of clusters on the plot for us

#Final Model
kmeans = KMeans(n_clusters=4)
kmeans_model = kmeans.fit(df)
print(kmeans_model)

kumeler = kmeans_model.labels_
kume = pd.DataFrame({"Eyaletler": df.index, "Kumeler": kumeler})
print(kume)

df["Kume_No"] = kumeler
print(df)
from warnings import filterwarnings
filterwarnings('ignore')
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
import pandas as pd

df = pd.read_csv("USArrests.csv", sep=',').copy()
df.index = df.iloc[:, 0]
df = df.iloc[:, 1:5]
df.index.name = None  # `del df.index.name` fails in newer pandas
kmeans = KMeans()
visualizer = KElbowVisualizer(kmeans, k=(2, 20))
visualizer.fit(df)
visualizer.poof()
kmeans = KMeans(n_clusters=4)
k_fit = kmeans.fit(df)
kumeler = k_fit.labels_
pd.DataFrame({"Eyaletler": df.index, "Kumeler": kumeler})
Example 18
def cluster_category_data(df,
                          scale_data='minmax',
                          dim_red_method='som',
                          use_elbow_method=True,
                          cluster_method='hierarchical',
                          n_clusters=None,
                          verbose=1,
                          perplexity=None):
    """
    :param df: dataframe containing all the columns belonging to a category to be used in clustering
    :param scale_data: method to be used to scale the dataset
    :param dim_red_method: options are 'som', 'umap', 'tsne', None. If  None, do clustering directly.
    :param use_elbow_method: if True, elbow method is used to find the optimum number of clusters. If False, n_clusters needs to be specified
    :param cluster_method: options are 'kmeans' and 'hierarchical'. In either case kmeans is used for the elbow method(because of the time required).
    :param n_clusters: If use_elbow_method is False, n_clusters needs to be given.
    :param verbose: If True, output the progress in clustering process
    :param perplexity: If method used is TSNE, perplexity nedds to be specified
    """
    t = time.time()

    if scale_data == 'minmax':
        X = MinMaxScaler().fit_transform(df)
    elif scale_data == 'standard':
        X = StandardScaler().fit_transform(df)
    else:
        X = df.values

    if verbose:
        print(f'number of features = {df.shape[1]}')

    if dim_red_method == 'som':
        if verbose:
            print(
                'Self Organising Maps is being used for dimensionality reduction...'
            )
        opt_k = 2
        max_s = -1
        f = 0
        for mapsize in [(30, 30)]:
            if verbose:
                print(f'map size = {mapsize}')
            sm = SOMFactory().build(X,
                                    normalization='var',
                                    initialization='pca',
                                    mapsize=mapsize)
            sm.train(n_job=1,
                     verbose=False,
                     train_rough_len=100,
                     train_finetune_len=500)
            if use_elbow_method:
                model = KElbowVisualizer(KMeans(), k=20, timings=False)
                elbow = model.fit(sm.codebook.matrix).elbow_value_
                if elbow and verbose:
                    print(f'elbow value = {elbow}')
                if not elbow:
                    if verbose:
                        print('elbow not found')
                    ms = -1
                    for k in range(2, 20):
                        km_labels = KMeans(k).fit_predict(sm.codebook.matrix)
                        s = silhouette_score(sm.codebook.matrix, km_labels)
                        if s > ms:
                            ms = s  # track the best score (the original never updated ms)
                            elbow = k
            else:
                elbow = n_clusters
            x = sm.project_data(X)
            labels, _, _ = sm.cluster(opt=elbow, cl_type=cluster_method)
            clabels = []
            for i in range(X.shape[0]):
                clabels.append(labels[x[i]])
            s_score = silhouette_score(X, clabels)
            if verbose:
                print(f'silhouette score = {round(s_score, 3)}')
            max_s = max(s_score, max_s)
            if (max_s == s_score):
                opt_k = elbow
                opt_labels = clabels
                opt_size = mapsize
            if (max_s > s_score):
                break
        if verbose:
            print(f'optimum mapsize = {opt_size}')
            print(
                f'optimum number of clusters = {opt_k} & silhouette score = {round(max_s,3)}'
            )
            print(f'time taken = {round(time.time()-t,1)}')
        return opt_labels, opt_k

    elif dim_red_method:
        if dim_red_method == 'umap':
            print('UMAP is being used for dimensionality reduction...')
            embedding = umap.UMAP(n_components=2,
                                  n_neighbors=5,
                                  min_dist=0.0001,
                                  metric='euclidean',
                                  random_state=1,
                                  spread=0.5,
                                  n_epochs=1000).fit_transform(X)
            print('UMAP embedding done...')
        elif dim_red_method == 'tsne':
            print('t-SNE is being used for dimensionality reduction...')
            embedding = TSNE(perplexity=perplexity).fit_transform(X)
            print('t-SNE embedding is done...')
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(embedding).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(elbow).fit_predict(embedding)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(elbow).fit_predict(embedding)
        if verbose:
            s_score = silhouette_score(X, opt_labels)
            print(
                f'number of clusters = {elbow} and silhouette_score = {s_score}'
            )
        return opt_labels, elbow

    else:
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(X).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(elbow).fit_predict(X)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(elbow).fit_predict(X)
        print(f'silhouette score = {round(silhouette_score(X,opt_labels),3)}')
        return opt_labels, elbow
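
# A minimal usage sketch for cluster_category_data; `df_cat` is a hypothetical
# dataframe holding one category's numeric columns (not part of the original):
# labels, k = cluster_category_data(df_cat,
#                                   scale_data='standard',
#                                   dim_red_method='umap',
#                                   use_elbow_method=True,
#                                   cluster_method='kmeans')
# df_cat['cluster'] = labels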
Example 19
def home2(request):
    global articles
    global sujets

    today = datetime.date.today()
    yesterday = today - datetime.timedelta(days=1)
    yesterday2 = today - datetime.timedelta(days=2)

    aujourd = '"' + str(today) + '"'
    yestday = '"' + str(yesterday) + '"'
    yestday2 = '"' + str(yesterday2) + '"'
    query = "today"
    url = "https://rapidapi.p.rapidapi.com/api/search/NewsSearchAPI"

    frames = []
    for date in [aujourd, yestday, yestday2]:
        print(date)
        querystring = {
            "pageSize": "100",
            "q": query,
            "autoCorrect": "true",
            "pageNumber": "1",
            "toPublishedDate": "null",
            "withThumbnails": "true",
            "fromPublishedDate": date,
            "safeSearch": "true"
        }

        headers = {
            'x-rapidapi-host':
            "contextualwebsearch-websearch-v1.p.rapidapi.com",
            'x-rapidapi-key':
            "a089200dbamshd00bb86da392cd7p19dd23jsn694f8679b489"
        }

        response = requests.request("GET",
                                    url,
                                    headers=headers,
                                    params=querystring)
        # collect each day's results; the original rebuilt an empty `detaille`
        # on every iteration, keeping only the last date's articles
        frames.append(json_normalize(response.json(), 'value'))

    detaille = pd.concat(frames)

    detaille.reset_index(drop=True, inplace=True)
    detaille_article = detaille[~(detaille.id.duplicated())]
    detaille_article = detaille_article[~detaille_article.title.isna()]
    detaille_article = detaille_article[~(detaille_article.body.str.isspace())]

    detaille_article.loc[:, 'complet'] = detaille_article["title"] + " " + detaille_article["title"] + " " + \
                                         detaille_article['body']

    tfidf = TfidfVectorizer(tokenizer=extract_entite_nomme)
    dtm = tfidf.fit_transform(detaille_article.complet)
    x = dtm.toarray()

    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(2, 40))

    visualizer.fit(dtm)  # Fit the data to the visualizer
    visualizer.show()
    nombre_cluster = visualizer.elbow_value_
    k_means = KMeans(n_clusters=nombre_cluster, random_state=42)

    k_means.fit(dtm)

    closest, _ = pairwise_distances_argmin_min(k_means.cluster_centers_, x)
    all_data = [i for i in range(detaille_article.id.size)]

    m_clusters = k_means.labels_.tolist()

    centers = np.array(k_means.cluster_centers_)

    closest_data = []
    for i in range(nombre_cluster):
        center_vec = centers[i]
        data_idx_within_i_cluster = [
            idx for idx, clu_num in enumerate(m_clusters) if clu_num == i
        ]

        one_cluster_tf_matrix = np.zeros(
            (len(data_idx_within_i_cluster), centers.shape[1]))
        for row_num, data_idx in enumerate(data_idx_within_i_cluster):
            one_row = x[data_idx]
            one_cluster_tf_matrix[row_num] = one_row

        closest, _ = pairwise_distances_argmin_min([center_vec],
                                                   one_cluster_tf_matrix)
        closest_idx_in_one_cluster_tf_matrix = closest[0]
        closest_data_row_num = data_idx_within_i_cluster[
            closest_idx_in_one_cluster_tf_matrix]
        data_id = all_data[closest_data_row_num]

        closest_data.append(data_id)

    closest_data = list(set(closest_data))
    detaille_article['id_cluster'] = k_means.labels_

    entities = {}
    for k in detaille_article.groupby("id_cluster").count().id.nlargest(
            20).index:
        # iterate over the representative articles directly; closest_data was
        # deduplicated above, so indexing it by cluster number could go out of range
        for idx in closest_data:
            if detaille_article.loc[idx, 'id_cluster'] == k:
                doc = nlp(detaille_article.loc[idx, 'title'])
                entity = "nothing"
                nombre_entity = 0
                if not doc.ents:
                    doc = nlp(detaille_article.loc[idx, 'body'])

                for ent in doc.ents:

                    if ((len(ent.text) > 2) & (ent.label_ not in [
                            'DATE', 'TIME', 'CARDINAL', 'ORDINAL', 'PERCENT',
                            'QUANTITY'
                    ])):
                        if (detaille_article[detaille_article.loc[:,
                                                                  'id_cluster']
                                             == k].complet.str.contains(
                                                 ent.text,
                                                 flags=re.IGNORECASE,
                                                 regex=True).sum() >
                                nombre_entity):
                            entity = ent.text
                            nombre_entity = detaille_article[
                                detaille_article.loc[:, 'id_cluster'] ==
                                k].body.str.contains(ent.text,
                                                     flags=re.IGNORECASE,
                                                     regex=True).sum()
                if entity != 'nothing':
                    entities[k] = entity

    detaille_article.sort_values("datePublished",
                                 axis=0,
                                 ascending=False,
                                 inplace=True)
    detaille_article.rename(columns={
        'image.url': 'thumbnail',
        'provider.name': 'source'
    },
                            inplace=True)
    articles = detaille_article
    sujets = entities
    print(entities)
    json_records = detaille_article.reset_index().to_json(orient='records')
    data = []
    data = json.loads(json_records)
    context = {'d': data, 'e': entities}
    return render(request, 'html2.html', context)
Example 20
def update_graph(element_column, data_dict, selectedData, mode):
    if element_column is None:
        raise dash.exceptions.PreventUpdate

    if selectedData is not None and mode == 'select-mode':
        df = pd.DataFrame.from_dict(data_dict, 'columns')
        x, y = vislogprob.logprob(df[element_column])

        X = np.array([x,y])
        visualizer = KElbowVisualizer(KMeans(), k=(1, 8))
        visualizer.fit(X.transpose())

        originalData = pd.DataFrame()
        originalData.insert(0, 'Relative Frequency (%)', x)
        originalData.insert(1, 'Value', y[::-1])

        selected_x = []
        for point in selectedData['points']:
            selected_x.append(point['x'])

        max_prob = np.max(selected_x)*0.01
        originalData['Class'] = originalData.apply(
            lambda row: 'Anomalous Sample' if row['Relative Frequency (%)'] <= max_prob else 'Background Sample', axis=1)

        probgraf_fig = px.scatter(x=originalData['Relative Frequency (%)']*100, y=originalData.Value, color=originalData.Class, log_y=True, log_x=True,
                                  labels={'x':'Relative Frequency (%) ', 'y':str(element_column)+''})
        probgraf_fig.update_layout(margin={'l': 10, 'b': 10, 't': 10, 'r': 10}, paper_bgcolor='#f9f9f9', legend_orientation="h", legend=dict(x=-.1, y=1.2))

        cluster_fig = px.line(x=visualizer.k_values_, y=visualizer.k_scores_, labels={'x':'Number of K clusters', 'y':'Distortion Score'}, range_y=[-5, np.max(visualizer.k_scores_)+np.mean(visualizer.k_scores_)/3])
        cluster_fig.update_traces(mode="markers+lines", hovertemplate=None)
        cluster_fig.add_shape(dict(type='line',
                                   x0=visualizer.elbow_value_,
                                   y0=-np.mean(visualizer.k_scores_),
                                   x1=visualizer.elbow_value_,
                                   y1=np.max(visualizer.k_scores_)+np.mean(visualizer.k_scores_),
                                   line=dict(dash='dashdot', color='#EF553B')))
        cluster_fig.update_layout(margin={'l': 10, 'b': 10, 't': 10, 'r': 10}, paper_bgcolor='#f9f9f9', legend_orientation="h")

        merged_df = df.merge(originalData, left_on=element_column, right_on='Value')
        merged_df = merged_df.drop(axis=1, labels=['Value', 'Relative Frequency (%)'])
        merged_df.drop_duplicates(inplace=True)
        merged_df.sort_values(axis=0, by=element_column, inplace=True)

        cluster_columns = [{"name": i, "id": i} for i in merged_df.columns]

        return probgraf_fig, cluster_fig, merged_df.to_dict('records'), cluster_columns

    else:
        df = pd.DataFrame.from_dict(data_dict, 'columns')
        x, y = vislogprob.logprob(df[element_column])
        X = np.array([x,y])

        visualizer = KElbowVisualizer(KMeans(), k=(1, 8))
        visualizer.fit(X.transpose())

        df_clustered = vislogprob.clustered_df(X.transpose(), visualizer.elbow_value_)

        probgraf_fig = px.scatter(x=df_clustered['Relative Frequency (%)'], y=df_clustered.Value, color=df_clustered.Class, log_y=True, log_x=True,
                                  labels={'x':'Relative Frequency (%) ', 'y':str(element_column)+''})
        probgraf_fig.update_layout(margin={'l': 10, 'b': 10, 't': 10, 'r': 10}, paper_bgcolor='#f9f9f9', legend_orientation="h", legend=dict(x=-.1, y=1.2))

        cluster_fig = px.line(x=visualizer.k_values_, y=visualizer.k_scores_, labels={'x':'Number of K clusters', 'y':'Distortion Score'}, range_y=[-5, np.max(visualizer.k_scores_)+np.mean(visualizer.k_scores_)/3])
        cluster_fig.update_traces(mode="markers+lines", hovertemplate=None)
        cluster_fig.add_shape(dict(type='line',
                                   x0=visualizer.elbow_value_,
                                   y0=-np.mean(visualizer.k_scores_),
                                   x1=visualizer.elbow_value_,
                                   y1=np.max(visualizer.k_scores_)+np.mean(visualizer.k_scores_),
                                   line=dict(dash='dashdot', color='#EF553B')))
        cluster_fig.update_layout(margin={'l': 10, 'b': 10, 't': 10, 'r': 10}, paper_bgcolor='#f9f9f9', legend_orientation="h")



        merged_df = df.merge(df_clustered, left_on=element_column, right_on='Value')
        merged_df = merged_df.drop(axis=1, labels=['Value', 'Relative Frequency (%)'])
        merged_df.drop_duplicates(inplace=True)
        merged_df.sort_values(axis=0, by=element_column, inplace=True)
        cluster_columns = [{"name": i, "id": i} for i in merged_df.columns]
        return probgraf_fig, cluster_fig, merged_df.to_dict('records'), cluster_columns
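
# A sketch of how this callback might be wired up; the component ids below are
# assumptions, since the original decorator is not shown:
import dash
from dash.dependencies import Input, Output

app = dash.Dash(__name__)

app.callback(
    [Output('probgraf', 'figure'), Output('cluster-graph', 'figure'),
     Output('cluster-table', 'data'), Output('cluster-table', 'columns')],
    [Input('element-column', 'value'), Input('data-store', 'data'),
     Input('probgraf', 'selectedData'), Input('mode-switch', 'value')],
)(update_graph)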
Example 21
df['a'] = df['a'].astype(object)
dummies = pd.get_dummies(df['a'], prefix='a')
bcd = df.iloc[:, 2:5]
 
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(bcd)
X_scaled = pd.DataFrame(x_scaled, columns=bcd.columns)
X_scaled = pd.concat([X_scaled, dummies], axis=1)

# Elbow method, variant 1
plt.figure(figsize=(12, 9))

model = KMeans()

visualizer = KElbowVisualizer(model, k=(1, 5))
visualizer.fit(X_scaled)       
visualizer.show()

# Elbow method, variant 2
SSE = []  # the sum of squared errors for each k
for k in range(1, 5):
    estimator = KMeans(n_clusters=k)  # build the clusterer
    estimator.fit(X_scaled)
    SSE.append(estimator.inertia_)  # inertia_ is the total within-cluster sum of squares
X = range(1, 5)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X, SSE, 'o-')
plt.show()

model=MiniBatchKMeans(n_clusters=2)
Example 22
    np.random.seed(5)
    X = np.array(read_excel('lab2.xlsx', 0))  # an .xlsx file needs read_excel, not read_csv (assumed pandas readers)
    # Y = np.array([i for i in range(1, 57)])

    # normalize the data attributes
    normalized_X = MinMaxScaler().fit_transform(X)

    pca = PCA(n_components=2)
    pca.fit(normalized_X)
    pca_X = pca.transform(normalized_X)
    print("pca_X")
    print(pca_X)

    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(2, 12))
    visualizer.fit(pca_X)
    # visualizer.show()

    kmeans = KMeans(n_clusters=visualizer.elbow_value_)
    kmeans.fit(pca_X)
    y_kmeans = kmeans.predict(pca_X)
    print(len(y_kmeans))

    plt.subplot(121)
    plt.scatter(pca_X[:, 0], pca_X[:, 1], c=y_kmeans, cmap="viridis")
    plt.title("K-means")
    # plt.show()

    # kmeans = KMeans(pca_X)
    # kmeans.run()
    # kmeans.plot(column_1_number=0, column_2_number=1)
#print(df.head(10))

for k in range(2, 15):
    kmeans = cluster.KMeans(n_clusters=k)
    kmeans.fit(x_scaled)
    clusters = kmeans.cluster_centers_
    #print(clusters)
    y_km = kmeans.fit_predict(x_scaled)
    unique, counts = np.unique(y_km, return_counts=True)
    print(f"Cluster counts for k = {k}: ", dict(zip(unique, counts)))

model = KMeans()
visualizer = KElbowVisualizer(model, k=(2, 15))
visualizer.fit(x_scaled)
visualizer.show()

visualizer2 = KElbowVisualizer(model,
                               k=(2, 15),
                               metric='calinski_harabasz',
                               timings=False)
visualizer2.fit(x_scaled)
visualizer2.show()

#print(kmeans)
#labels = kmeans.predict(df)
#print(labels)
#df_final["cluster"] = labels.tolist()
#print(df_final)
Example 24
df.head()
print(df.head())
# separate the data on a per-class basis

a = df.loc[df['Label'] != 'Label']

print("printing Classes from the CSV\n")
print(a['Label'].unique().tolist())

subset1 = df[df.Label == 'MalwareActivity']
subset2 = df.loc[df['Label'] == 'MalwareAttack']
subset3 = df.loc[df['Label'] == 'Benign']

print("printing subsets")
list_classes = ['MalwareActivity', 'MalwareAttack', 'Benign']  # assumed; not defined in this excerpt
labellist = [subset1, subset2, subset3]
for ele, category in zip(labellist, list_classes):
    print(ele)
    series = ele.iloc[:, 0:4].values
    my_title = "Elbow Method for K-means clustering for {}".format(category) 
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(1,10), title=my_title)

    visualizer.fit(series)        # Fit the data to the visualizer
    visualizer.show()        # Finalize and render the figure
    





Example 25
# IU - International University of Applied Science
# Machine Learning - Unsupervised Machine Learning
# Course Code: DLBDSMLUSL01

# Elbow criterion

#%% import libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

#%% create sample data
X = np.random.rand(50, 2)
Y = 2 + np.random.rand(50, 2)
Z = np.concatenate((X, Y))

#%% create a k-means model and an elbow visualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 8), timings=True)

#%% fit the visualizer and show the plot
visualizer.fit(Z)
visualizer.show()
Example 26
# Clustering Evaluation Imports
from functools import partial

from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs as sk_make_blobs

from yellowbrick.cluster import KElbowVisualizer

# Helpers for easy dataset creation
N_SAMPLES = 1000
N_FEATURES = 12
SHUFFLE = True

# Make blobs partial
make_blobs = partial(sk_make_blobs,
                     n_samples=N_SAMPLES,
                     n_features=N_FEATURES,
                     shuffle=SHUFFLE)

if __name__ == '__main__':
    # Make 8 blobs dataset
    X, y = make_blobs(centers=8)

    # Instantiate the clustering model and visualizer
    visualizer = KElbowVisualizer(MiniBatchKMeans(), k=(4, 12))

    visualizer.fit(X)  # Fit the training data to the visualizer
    visualizer.poof(outpath="images/elbow.png")  # Draw/show/poof the data
Example 27
from kmodes.kmodes import KModes
import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer
# prepare dataset filling null values with 0

df = pd.read_excel("Deloitte Team 1 Debtor segmentation.xlsx")
df = df.fillna(0)
df_copy = df.copy()
df5 = pd.DataFrame(df,
                   columns=[
                       'Amount Due', 'Active Bucket', '30<60', '61<90',
                       '91<120', '121+'
                   ])
model = KModes()
visualizer = KElbowVisualizer(model, k=(1, 7))
visualizer.fit(df5)  # Fit the data to the visualizer
visualizer.show()

# In[28]:

# binning by manually dividing the scale of amount into buckets and attaching labels to each bucket

df['Amount Due bin'] = pd.cut(df['Amount Due'],
                              [-1, 0, 1000, 3000, 5000, 10000, 50000, 1000000],
                              labels=[
                                  '0', '1-1000', '1000-3000', '3000-5000',
                                  '5000-10000', '10000-50000', '50000-1000000'
                              ])
df = df.drop('Amount Due', axis=1)
df['Active Bucket bin'] = pd.cut(df['Active Bucket'],
                                 [-1, 0, 1000, 3000, 5000, 10000, 20000],
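                                 labels=[
                                     '0', '1-1000', '1000-3000', '3000-5000',
                                     '5000-10000', '10000-20000'
                                 ])  # hypothetical completion: the excerpt cuts off mid-call; these labels mirror the 'Amount Due' binning above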
import pandas as pd
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from matplotlib import pyplot as plt

data = pd.read_csv("ClusterPlot.csv")

model = KMeans()

elbow_visualizer = KElbowVisualizer(model, k=(1, 10), timings=False)

elbow_visualizer.fit(data)
print("Number of Clusters: ", elbow_visualizer.elbow_value_)
elbow_visualizer.show()

x = data.copy()
kmeans = KMeans(elbow_visualizer.elbow_value_)
kmeans.fit(x)

clusters = x.copy()
clusters["cluster_pred"] = kmeans.fit_predict(x)
plt.scatter(data["V1"], data["V2"], c=clusters["cluster_pred"], cmap="rainbow")
plt.xlabel("V1")
plt.ylabel("V2")
plt.show()
Example 29
def explore_KMeans_clustering(
    df,
    num_cols=None,
    n_clusters=range(3, 5),
    include_silhouette=True,
    include_PCA=True,
    random_state=None,
):
    """create, fit and plot KMeans clustering on the dataset

    Parameters
    ----------
    df : pandas.DataFrame
        the dataset, should be transformed with StandardScaler
    num_cols : list, optional
        list of numeric column names, in case of None, get all numeric columns
    n_clusters : list, optional
        list of n_clusters hyperparams, by default range(3, 5)
    include_silhouette : bool, optional
        whether Silhouette plots should be generated, by default True
    include_PCA : bool, optional
        whether PCA plots should be generated, by default True
    random_state : int, optional
        a number that determines random number generation for centroid initialization, by default None

    Returns
    -------
    dict
        a dictionary with key=type of plot, value=list of plots

    Examples
    -------
    >>> original_df = pd.read_csv("/data/menu.csv")
    >>> numeric_features = eda.get_numeric_columns(original_df)
    >>> numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
    >>> preprocessor = make_column_transformer(
    >>>     (numeric_transformer, numeric_features)
    >>> )
    >>> df = pd.DataFrame(
    >>>     data=preprocessor.fit_transform(original_df), columns=numeric_features
    >>> )
    >>> explore_KMeans_clustering(df)
    """
    if num_cols is None:
        num_cols = get_numeric_columns(df)
    else:
        _verify_numeric_cols(df, num_cols)
    x = df[num_cols]
    results = {}
    if 1 in n_clusters:
        raise Exception("n_cluster cannot be 1")

    print("------------------------")
    print("K-MEANS CLUSTERING")
    print("------------------------")

    if len(n_clusters) > 1:
        print("Generating KElbow plot for KMeans.")
        # visualize using KElbowVisualizer
        kmeans = KMeans(random_state=random_state)

        plt.clf()
        fig, ax = plt.subplots()
        elbow_visualizer = KElbowVisualizer(kmeans, k=n_clusters, ax=ax)
        elbow_visualizer.fit(x)  # Fit the data to the visualizer
        elbow_visualizer.show()
        plt.close()
        elbow_visualizer.k = elbow_visualizer.elbow_value_  # fix printing issue
        results["KElbow"] = fig
    else:
        results["KElbow"] = None

    # visualize using SilhouetteVisualizer
    print("Generating Silhouette & PCA plots")
    silhouette_plots = []
    pca_plots = []
    for k in n_clusters:
        print(f"Number of clusters: {k}")

        kmeans = KMeans(k, random_state=random_state)

        if include_silhouette:
            fig, ax = plt.subplots()
            s_visualizer = SilhouetteVisualizer(kmeans, colors="yellowbrick", ax=ax)
            s_visualizer.fit(x)  # Fit the data to the visualizer
            s_visualizer.show()
            silhouette_plots.append(fig)
            # plt.clf()
            plt.close()

        else:
            silhouette_plots.append(None)

        # PCA plots
        if include_PCA:
            labels = kmeans.fit_predict(x)
            pca_fig = plot_pca_clusters(x, labels, random_state=random_state)
            pca_plots.append(pca_fig)
        else:
            pca_plots.append(None)

    results["Silhouette"] = silhouette_plots
    results["PCA"] = pca_plots

    return results
Example 30
dataset.describe(include="all")

features = dataset.iloc[:, 0:7]
target = dataset.iloc[:, -1]
'''
print('----- features')
print(features)
print('----- target')
print(target)
exit()
'''

model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 10))

visualizer.fit(features)  # Fit the data to the visualizer
visualizer.poof()  # Draw/show/poof the data

kmeans = KMeans(n_clusters=3)
cluster_labels = kmeans.fit_predict(features)  # fit and label in one step

kmeans.cluster_centers_

silhouette_avg = metrics.silhouette_score(features, cluster_labels)
print('silhouette coefficient for the above clustering = ', silhouette_avg)


def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    # purity: the fraction of samples assigned to the majority true class of their cluster
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)
Example 31
def elbow():
    X, _ = make_blobs(centers=8, n_features=12, shuffle=True)
    oz = KElbowVisualizer(KMeans(), k=(4, 12), ax=newfig())
    oz.fit(X)
    savefig(oz, "elbow")