Exemple #1
0
def run_km_em(perm_x, perm_y, dname, clstr):
    """Sweep K-Means and EM (GMM) over candidate cluster counts.

    For each k in `clstr`, fits both models once on `perm_x`, records
    internal scores (SSE, log-likelihood, silhouette, BIC), external
    scores against `perm_y` (accuracy, adjusted MI, homogeneity,
    completeness) and wall-clock time, then writes one row per k to
    ./results/cluster_<dname>.csv.

    Args:
        perm_x: Feature matrix.
        perm_y: Ground-truth labels used for the external metrics.
        dname (str): Dataset name used in the output filename.
        clstr (iterable[int]): Candidate cluster counts.
    """
    SSE_km_perm = []
    ll_em_perm = []
    acc_km_perm = []
    acc_em_perm = []
    adjMI_km_perm = []
    adjMI_em_perm = []
    homo_km_perm = []
    homo_em_perm = []
    comp_km_perm = []
    comp_em_perm = []
    silhou_km_perm = []
    bic_em_perm = []
    clk_time = []

    for k in clstr:
        st = clock()
        km = KMeans(n_clusters=k, random_state=10)
        gmm = GMM(n_components=k, random_state=10)

        # Fix: fit each estimator once and reuse its labels.  The original
        # called fit_predict() inside every metric call, refitting both
        # models ~7 times per k for identical results (random_state fixed).
        km_labels = km.fit_predict(perm_x)
        em_labels = gmm.fit_predict(perm_x)

        SSE_km_perm.append(-km.score(perm_x))  # the label arg was ignored anyway
        ll_em_perm.append(gmm.score(perm_x))
        acc_km_perm.append(cluster_acc(perm_y, km_labels))
        acc_em_perm.append(cluster_acc(perm_y, em_labels))
        adjMI_km_perm.append(ami(perm_y, km_labels))
        adjMI_em_perm.append(ami(perm_y, em_labels))
        homo_km_perm.append(metrics.homogeneity_score(perm_y, km_labels))
        homo_em_perm.append(metrics.homogeneity_score(perm_y, em_labels))
        comp_km_perm.append(metrics.completeness_score(perm_y, km_labels))
        comp_em_perm.append(metrics.completeness_score(perm_y, em_labels))
        silhou_km_perm.append(metrics.silhouette_score(perm_x, km_labels))
        bic_em_perm.append(gmm.bic(perm_x))
        clk_time.append(clock() - st)
        print(k, clock() - st)

    dbcluster = pd.DataFrame({
        'k': clstr,
        'SSE_km': SSE_km_perm,
        'll_em': ll_em_perm,
        'acc_km': acc_km_perm,
        'acc_em': acc_em_perm,
        'adjMI_km': adjMI_km_perm,
        'adjMI_em': adjMI_em_perm,
        'homo_km': homo_km_perm,
        'homo_em': homo_em_perm,
        'comp_km': comp_km_perm,
        'comp_em': comp_em_perm,
        'silhou_km': silhou_km_perm,
        'bic_em': bic_em_perm,
        'clk_time': clk_time
    })

    dbcluster.to_csv('./results/cluster_{}.csv'.format(dname), sep=',')
Exemple #2
0
def generate_cluster_plots(df, name, pdir):
    """Visualizes clusters using pre-processed 2D dataset.

    Renders three side-by-side scatter plots (K-Means labels, GMM labels,
    true class labels) and saves the figure as <name>_clusters.png.

    Args:
        df (Pandas.DataFrame): Dataset containing attributes and labels.
        name (str): Dataset name.
        pdir (str): Output folder for plots.

    """
    # get cols
    x1 = df['x1']
    x2 = df['x2']
    km = df['km']
    gmm = df['gmm']
    c = df['class']

    # single-argument print() is valid on both Python 2 and Python 3
    print("Accuracy Score for KMeans- {}".format(cluster_acc(c, km)))
    print("Accuracy Score for EM- {}".format(cluster_acc(c, gmm)))

    # plot cluster scatter plots
    # NOTE(review): ylabel is 'x1' and xlabel is 'x2' throughout -- looks
    # swapped relative to the scatter args; confirm intent before changing.
    fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(12, 3))
    ax1.scatter(x1, x2, marker='x', s=20, c=km, cmap='gist_rainbow')
    ax1.set_title('K-Means Clusters ({})'.format(name))
    ax1.set_ylabel('x1')
    ax1.set_xlabel('x2')
    ax1.grid(color='grey', linestyle='dotted')

    ax2.scatter(x1, x2, marker='x', s=20, c=gmm, cmap='gist_rainbow')
    ax2.set_title('GMM Clusters ({})'.format(name))
    ax2.set_ylabel('x1')
    ax2.set_xlabel('x2')
    ax2.grid(color='grey', linestyle='dotted')

    # change color map depending on dataset
    cmap = 'hsv' if name == 'digits' else 'summer'
    # Fix: the dataset-specific cmap above was computed but never used --
    # the original hard-coded "gist_rainbow" here.
    ax3.scatter(x1, x2, marker='o', s=20, c=c, cmap=cmap)
    ax3.set_title('Class Labels ({})'.format(name))
    ax3.set_ylabel('x1')
    ax3.set_xlabel('x2')
    ax3.grid(color='grey', linestyle='dotted')

    # change layout size, font size and width between subplots
    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)
    plt.subplots_adjust(wspace=0.3)

    # save figure
    plotdir = pdir
    plotpath = get_abspath('{}_clusters.png'.format(name), plotdir)
    plt.savefig(plotpath)
    plt.clf()
Exemple #3
0
def evaluate_kmeans(X, y, problem, out='./results/Clustering/'):
    """Evaluate K-Means and EM (GMM) for k in 2..6 on SMOTE-resampled data.

    Writes SSE, log-likelihood, accuracy and adjusted-MI CSVs under `out`
    and returns the raw tables plus the estimators (fitted at the last k).

    Args:
        X: Feature matrix.
        y: Labels (used for oversampling and the external metrics).
        problem (str): Dataset name used in column keys and file names.
        out (str): Output directory for the CSV files.

    Returns:
        (SSE, ll, acc, adjMI, km, gm) tuple of result tables and estimators.
    """
    sm = SMOTE()
    X_res, y_res = sm.fit_sample(X, y)

    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = KMeans(random_state=5)
    gm = GM(random_state=5)

    st = clock()
    clusters = [2, 3, 4, 5, 6]
    for k in clusters:
        print('now doing k=' + str(k))
        km.set_params(n_clusters=k)
        gm.set_params(n_components=k)
        km.fit(X_res)
        gm.fit(X_res)

        SSE[k][problem] = km.score(X_res)
        ll[k][problem] = gm.score(X_res)
        print('km score:', SSE[k][problem])
        print('gm score:', ll[k][problem])
        # predict once per fitted model and reuse the labels
        km_labels = km.predict(X_res)
        gm_labels = gm.predict(X_res)
        acc[k][problem]['Kmeans'] = cluster_acc(y_res, km_labels)
        acc[k][problem]['GM'] = cluster_acc(y_res, gm_labels)
        adjMI[k][problem]['Kmeans'] = metrics.adjusted_mutual_info_score(
            y_res, km_labels)
        adjMI[k][problem]['GM'] = metrics.adjusted_mutual_info_score(
            y_res, gm_labels)

    print(k, clock() - st)

    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    acc = pd.Panel(acc)
    adjMI = pd.Panel(adjMI)

    SSE.to_csv(out + problem + ' SSE.csv')
    ll.to_csv(out + problem + ' logliklihood.csv')
    # Fix: acc and adjMI were each written twice to the same path.
    acc.ix[:, :, problem].to_csv(out + problem + ' acc.csv')
    adjMI.ix[:, :, problem].to_csv(out + problem + ' adjMI.csv')

    return SSE, ll, acc, adjMI, km, gm
Exemple #4
0
def cluster_scores(X, y, pred):
    """Build a per-cluster summary table for a clustering assignment.

    Combines hlp.cluster_acc's per-cluster summary (size, class counts,
    entropy, accuracy) with each cluster's mean silhouette value, sorted
    by accuracy (descending).

    Args:
        X: Feature matrix the clustering was computed on.
        y: Ground-truth labels.
        pred: Predicted cluster labels aligned with X / y.

    Returns:
        pandas.DataFrame with columns ['Size', 'Pos', 'Neg', 'H', 'Acc']
        plus an added 'Silh' column.
    """
    acc, H0, H1, cluster_summary = hlp.cluster_acc(y, pred)
    df = pd.DataFrame(cluster_summary)[['Size', 'Pos', 'Neg', 'H', 'Acc']]
    # Pair each sample's silhouette value with its cluster label, then
    # average per cluster.
    df2 = pd.DataFrame(pred, silhouette_samples(X, pred)).reset_index()[[0, 'index']]
    df2.columns = ['label', 'silh']
    df['Silh'] = df2.groupby('label').mean()
    return df.sort_values(['Acc'], ascending=False)
    # Fix: ~30 lines of unreachable spliced code that followed this return
    # (referencing undefined names km/gmm/k/faultsX/bcX...) were removed.
Exemple #6
0
# Candidate cluster counts for the loans-dataset sweep.
clusters = [2, 3, 4, 5, 8, 12, 15, 18, 21, 25]

loans_km_acc = []
loans_gmm_acc = []
loans_km_score = []
loans_gmm_score = []
loans_km_ami = []
loans_gmm_ami = []
loans_km_silhouette = []
loans_gmm_silhouette = []

for k in clusters:
    # K-Means: fit, predict once, and reuse the label vector for every metric.
    km.set_params(n_clusters=k)
    km.fit(loansX_pca)
    km_pred = km.predict(loansX_pca)
    loans_km_acc.append(cluster_acc(loans_Y, km_pred))
    loans_km_score.append(km.score(loansX_pca))
    loans_km_ami.append(ami(loans_Y, km_pred))
    loans_km_silhouette.append(silhouette_score(loansX_pca, km_pred))

    # EM (GMM): same pattern.
    gmm.set_params(n_components=k)
    gmm.fit(loansX_pca)
    gmm_pred = gmm.predict(loansX_pca)
    loans_gmm_acc.append(cluster_acc(loans_Y, gmm_pred))
    loans_gmm_score.append(gmm.score(loansX_pca))
    loans_gmm_ami.append(ami(loans_Y, gmm_pred))
    loans_gmm_silhouette.append(silhouette_score(loansX_pca, gmm_pred))

loans_df= pd.DataFrame({'Kmeans acc': loans_km_acc, 'GMM acc': loans_gmm_acc,\
           'Kmeans score': loans_km_score, 'GMM score': loans_gmm_score,\
Exemple #7
0
def clustering_experiment(X, y, name, clusters, rdir):
    """Generate results CSVs for given datasets using the K-Means and EM
    clustering algorithms.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        clusters (list[int]): List of k values.
        rdir (str): Output directory.

    """
    sse = defaultdict(dict)  # sum of squared errors
    logl = defaultdict(dict)  # log-likelihood
    bic = defaultdict(dict)  # BIC for EM
    silhouette = defaultdict(dict)  # silhouette score
    acc = defaultdict(lambda: defaultdict(dict))  # accuracy scores
    adjmi = defaultdict(lambda: defaultdict(dict))  # adjusted mutual info
    km = KMeans(random_state=0)  # K-Means
    gmm = GMM(random_state=0)  # Gaussian Mixture Model (EM)

    # start loop for given values of k
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(X)
        gmm.fit(X)

        # predict once per fitted model; the original re-ran predict()
        # three times per estimator with identical results
        km_labels = km.predict(X)
        gmm_labels = gmm.predict(X)

        # calculate SSE, log-likelihood, accuracy, and adjusted mutual info
        sse[k][name] = km.score(X)
        logl[k][name] = gmm.score(X)
        acc[k][name]['km'] = cluster_acc(y, km_labels)
        acc[k][name]['gmm'] = cluster_acc(y, gmm_labels)
        adjmi[k][name]['km'] = ami(y, km_labels)
        adjmi[k][name]['gmm'] = ami(y, gmm_labels)

        # calculate silhouette score for K-Means
        silhouette[k][name] = silhouette_score(X, km_labels)

        # calculate BIC for EM
        bic[k][name] = gmm.bic(X)

    # generate output dataframes
    sse = (-pd.DataFrame(sse)).T
    sse.rename(columns={name: 'sse'}, inplace=True)
    logl = pd.DataFrame(logl).T
    logl.rename(columns={name: 'log-likelihood'}, inplace=True)
    bic = pd.DataFrame(bic).T
    bic.rename(columns={name: 'bic'}, inplace=True)
    silhouette = pd.DataFrame(silhouette).T
    silhouette.rename(columns={name: 'silhouette_score'}, inplace=True)
    acc = pd.Panel(acc)
    acc = acc.loc[:, :, name].T.rename(lambda x: '{}_acc'.format(x),
                                       axis='columns')
    adjmi = pd.Panel(adjmi)
    adjmi = adjmi.loc[:, :, name].T.rename(lambda x: '{}_adjmi'.format(x),
                                           axis='columns')

    # concatenate all results; use a fresh name so we don't shadow the
    # `metrics` module name used elsewhere in this file
    combined = pd.concat((sse, silhouette, logl, bic, acc, adjmi), axis=1)
    resfile = get_abspath('{}_metrics.csv'.format(name), rdir)
    combined.to_csv(resfile, index_label='k')
Exemple #8
0
# Nested result stores keyed metric[k][dataset][algorithm].
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = time.time()
print(len(clusters))
for k in clusters:
    # Re-parameterise the shared estimators for this k and refit.
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(perm_x)
    gmm.fit(perm_x)

    # Predict once per fitted model and reuse the label vectors.
    km_pred = km.predict(perm_x)
    gmm_pred = gmm.predict(perm_x)

    SSE[k]['perm'] = km.score(perm_x)
    ll[k]['perm'] = gmm.score(perm_x)
    acc[k]['perm']['Kmeans'] = cluster_acc(perm_y, km_pred)
    acc[k]['perm']['GMM'] = cluster_acc(perm_y, gmm_pred)
    adjMI[k]['perm']['Kmeans'] = ami(perm_y, km_pred)
    adjMI[k]['perm']['GMM'] = ami(perm_y, gmm_pred)

# Second sweep: the same shared estimators are re-fit on the housing data.
# NOTE(review): this fragment appears truncated -- the lines that follow it
# jump to a deeper indentation level from an unrelated scope.
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)

    km.fit(housing_x)
    gmm.fit(housing_x)
    SSE[k]['housing'] = km.score(housing_x)
    ll[k]['housing'] = gmm.score(housing_x)

    acc[k]['housing']['Kmeans'] = cluster_acc(housing_y, km.predict(housing_x))
    acc[k]['housing']['GMM'] = cluster_acc(housing_y, gmm.predict(housing_x))
            # NOTE(review): orphaned fragment -- the enclosing loop header
            # (over k) and the definitions of labels/sil/sil_samp/j/dataX
            # are not visible here; code kept byte-identical.
            labels[k]['GMM'] = gmm_labels

            # One overall silhouette score per algorithm, plus per-sample
            # silhouette values flattened into sil_samp rows of the form
            # [k, algorithm, value, cluster label].
            sil[k]['Kmeans'] = sil_score(dataX, km_labels)
            sil[k]['GMM'] = sil_score(dataX, gmm_labels)
            km_sil_samples = sil_samples(dataX, km_labels)
            gmm_sil_samples = sil_samples(dataX, gmm_labels)
            for i, x in enumerate(km_sil_samples):
                sil_samp[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
                j += 1
            for i, x in enumerate(gmm_sil_samples):
                sil_samp[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
                j += 1
            sse[k] = km.score(dataX)
            ll[k] = gmm.score(dataX)
            bic[k] = gmm.bic(dataX)
            acc[k]['Kmeans'] = cluster_acc(dataY,km.predict(dataX))
            acc[k]['GMM'] = cluster_acc(dataY,gmm.predict(dataX))
            adj_mi[k]['Kmeans'] = ami(dataY,km.predict(dataX))
            adj_mi[k]['GMM'] = ami(dataY,gmm.predict(dataX))



        # Collect the per-k label vectors into one DataFrame per algorithm
        # (one column per cluster count).
        gmm_clusters = pd.DataFrame()
        kmeans_clusters = pd.DataFrame()

        for i in clusters:
            gmm_clusters[i] = labels[i]['GMM']
            kmeans_clusters[i] = labels[i]['Kmeans']

        bic = pd.DataFrame(bic, index=[0]).T
        bic.index.name = 'k'
Exemple #10
0
    # Nested result stores keyed metric[k][dataset][algorithm].
    # NOTE(review): fragment starts mid-function -- SSE/ll/acc/clusters and
    # the enclosing def are defined above the visible region.
    adjMI = defaultdict(lambda: defaultdict(dict))
    adjRI = defaultdict(lambda: defaultdict(dict))
    bic = defaultdict(lambda: defaultdict(dict))
    silh = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    st = clock()
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(wineX)
        gmm.fit(wineX)
        SSE[k]['Wine'] = km.score(wineX)
        ll[k]['Wine'] = gmm.score(wineX)
        acc[k]['Wine']['Kmeans'] = cluster_acc(wineY.ravel(),
                                               km.predict(wineX))
        acc[k]['Wine']['GMM'] = cluster_acc(wineY.ravel(), gmm.predict(wineX))
        adjMI[k]['Wine']['Kmeans'] = ami(wineY.ravel(), km.predict(wineX))
        adjMI[k]['Wine']['GMM'] = ami(wineY.ravel(), gmm.predict(wineX))
        adjRI[k]['Wine']['Kmeans'] = ari(wineY.ravel(), km.predict(wineX))
        adjRI[k]['Wine']['GMM'] = ari(wineY.ravel(), gmm.predict(wineX))
        # NOTE(review): compute_bic is negated here, presumably to match the
        # sign convention of gmm.bic -- confirm against the helper.
        bic[k]['Wine']['Kmeans'] = -compute_bic(km, wineX)
        bic[k]['Wine']['GMM'] = gmm.bic(wineX)
        silh[k]['Wine']['Kmeans'] = silhouette_score(wineX, km.predict(wineX))
        silh[k]['Wine']['GMM'] = silhouette_score(wineX, gmm.predict(wineX))

        km.fit(digitX)
        gmm.fit(digitX)
        SSE[k]['Digit'] = km.score(digitX)
        ll[k]['Digit'] = gmm.score(digitX)
        acc[k]['Digit']['Kmeans'] = cluster_acc(digitY.ravel(),
Exemple #11
0
#%% Data for 1-3
clusters = [2, 3, 4, 5, 6, 8, 10, 12, 15, 20, 25, 30, 35, 40, 50]
scores = hlp.explore_clustering(datasets, clusters)

# Print Charts
hlp.plot_CE([scores], clusters, 'DR using RCA (RP)')

# play around -- notebook-style exploration; the bare expressions below are
# evaluated for their displayed values, not assigned.
km = KMeans(random_state=6, n_init=10)
gmm = GaussianMixture(random_state=6, n_init=1)
km.set_params(n_clusters=5)
km.fit(datasets['Titanic']['X_train'])
km.score(datasets['Titanic']['X_train'])
X_train = datasets['Titanic']['X_train']
y_train = datasets['Titanic']['y_train']
acc, H0, H1 = hlp.cluster_acc(y_train, km.predict(X_train))
H0 - H1
adjusted_mutual_info_score(y_train, km.predict(X_train))
gmm.set_params(n_components=5)
gmm.fit(datasets['Titanic']['X_train'])
gmm.score(datasets['Titanic']['X_train'])
gmm.bic(X_train)

# Silhouette plots
# Fix: scikit-learn removed the private module path
# sklearn.neighbors.nearest_centroid (deprecated 0.22, gone in 0.24);
# import from the public package instead.
from sklearn.neighbors import NearestCentroid
# set m = either km or gmm
gmm.set_params(n_components=35)
km.set_params(n_clusters=60)
m = gmm
# fit & plot
#X_train = datasets['Titanic']['X_train']
Exemple #12
0
# Metric stores keyed metric[k][dataset] (acc/adjMI add an algorithm level).
SSE = defaultdict(dict)
ll = defaultdict(dict)
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)

    # Biodeg dataset: fit, then predict once and reuse the labels.
    km.fit(biodegX)
    gmm.fit(biodegX)
    km_pred = km.predict(biodegX)
    gmm_pred = gmm.predict(biodegX)
    SSE[k]['Biodeg'] = km.score(biodegX)
    ll[k]['Biodeg'] = gmm.score(biodegX)
    acc[k]['Biodeg']['Kmeans'] = cluster_acc(biodegY, km_pred)
    acc[k]['Biodeg']['GMM'] = cluster_acc(biodegY, gmm_pred)
    adjMI[k]['Biodeg']['Kmeans'] = ami(biodegY, km_pred)
    adjMI[k]['Biodeg']['GMM'] = ami(biodegY, gmm_pred)

    # Digits dataset: same estimators re-fit.
    km.fit(digitsX)
    gmm.fit(digitsX)
    km_pred = km.predict(digitsX)
    gmm_pred = gmm.predict(digitsX)
    SSE[k]['Digits'] = km.score(digitsX)
    ll[k]['Digits'] = gmm.score(digitsX)
    acc[k]['Digits']['Kmeans'] = cluster_acc(digitsY, km_pred)
    acc[k]['Digits']['GMM'] = cluster_acc(digitsY, gmm_pred)
    adjMI[k]['Digits']['Kmeans'] = ami(digitsY, km_pred)
    adjMI[k]['Digits']['GMM'] = ami(digitsY, gmm_pred)
    print(k, clock()-st)
    
# Fresh metric stores for the wine / cancer sweep.
SSE = defaultdict(dict)
ll = defaultdict(dict)
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)

    # Wine dataset: fit, then predict once and reuse the labels.
    km.fit(wineX)
    gmm.fit(wineX)
    km_pred = km.predict(wineX)
    gmm_pred = gmm.predict(wineX)
    SSE[k]['wine'] = km.score(wineX)
    ll[k]['wine'] = gmm.score(wineX)
    acc[k]['wine']['Kmeans'] = cluster_acc(wineY, km_pred)
    acc[k]['wine']['GMM'] = cluster_acc(wineY, gmm_pred)
    adjMI[k]['wine']['Kmeans'] = ami(wineY, km_pred)
    adjMI[k]['wine']['GMM'] = ami(wineY, gmm_pred)

    # Cancer dataset: same estimators re-fit.
    km.fit(cancerX)
    gmm.fit(cancerX)
    km_pred = km.predict(cancerX)
    gmm_pred = gmm.predict(cancerX)
    SSE[k]['cancer'] = km.score(cancerX)
    ll[k]['cancer'] = gmm.score(cancerX)
    acc[k]['cancer']['Kmeans'] = cluster_acc(cancerY, km_pred)
    acc[k]['cancer']['GMM'] = cluster_acc(cancerY, gmm_pred)
    adjMI[k]['cancer']['Kmeans'] = ami(cancerY, km_pred)
    adjMI[k]['cancer']['GMM'] = ami(cancerY, gmm_pred)
    print(k, clock() - st)

## Adding cluster outputs for best parameters and saving at the end of the file
Exemple #14
0
def main_logic():
    """Run the clustering study on the Madelon and character datasets.

    Pipeline (all outputs written under `out`):
      1. For each k: fit K-Means and EM (GMM) on both standardized
         datasets; record SSE, log-likelihood, cluster accuracy,
         adjusted MI and silhouette, printing homogeneity/completeness.
      2. Train an MLP on cluster-transformed features (one pipeline per
         k), then grid-search NN hyperparameters over cluster counts.
      3. Embed both datasets in 2-D with t-SNE for the cluster charts.

    Relies on module-level names: kmeans, GMM, myGMM, cluster_acc, ami,
    hs, cs, silhouette_score, nn_reg, nn_arch, clock, time, etc.
    """
    out = './BASE/'
    # change the below value based on the readme.txt file instructions
    base = './BASE/'
    np.random.seed(0)

    madelon = pd.read_hdf(base + 'datasets.hdf', 'madelon')
    madelon_X = madelon.drop('Class', 1).copy().values
    madelon_Y = madelon['Class'].copy().values

    character = pd.read_hdf(base + 'datasets.hdf', 'character')
    character_X = character.drop('Class', 1).copy().values
    character_Y = character['Class'].copy().values

    np.random.seed(0)
    # clusters = [2]
    clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    madelon_X = StandardScaler().fit_transform(madelon_X)
    character_X = StandardScaler().fit_transform(character_X)

    # Data for 1-3
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    Silhouette_dict = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    for j in clusters:
        st = clock()
        km.set_params(n_clusters=j)
        gmm.set_params(n_components=j)
        km.fit(madelon_X)
        gmm.fit(madelon_X)

        SSE[j]['Madelon'] = km.score(madelon_X)
        ll[j]['Madelon'] = gmm.score(madelon_X)

        # NOTE(review): the K-Means call passes an extra `j` argument that
        # the GMM call omits -- confirm cluster_acc accepts the optional arg.
        acc[j]['Madelon']['Kmeans'] = cluster_acc(madelon_Y,
                                                  km.predict(madelon_X), j)
        acc[j]['Madelon']['GMM'] = cluster_acc(madelon_Y,
                                               gmm.predict(madelon_X))

        adjMI[j]['Madelon']['Kmeans'] = ami(madelon_Y, km.predict(madelon_X))
        adjMI[j]['Madelon']['GMM'] = ami(madelon_Y, gmm.predict(madelon_X))
        print("Homogenity Score ,{}, Kmeans,".format(j),
              hs(madelon_Y, km.labels_))
        print("Completeness Score ,{} ,Kmeans,".format(j),
              cs(madelon_Y, km.labels_))

        label = km.labels_
        sil_coeff = silhouette_score(madelon_X, label, metric='euclidean')
        Silhouette_dict[j]['Madelon'] = sil_coeff
        print("For n_clusters={}, The Silhouette Coefficient is {}".format(
            j, sil_coeff))

        km.fit(character_X)
        gmm.fit(character_X)
        SSE[j]['character'] = km.score(character_X)
        ll[j]['character'] = gmm.score(character_X)
        acc[j]['character']['Kmeans'] = cluster_acc(character_Y,
                                                    km.predict(character_X), j)
        acc[j]['character']['GMM'] = cluster_acc(character_Y,
                                                 gmm.predict(character_X))
        adjMI[j]['character']['Kmeans'] = ami(character_Y,
                                              km.predict(character_X))
        adjMI[j]['character']['GMM'] = ami(character_Y,
                                           gmm.predict(character_X))
        label = km.labels_
        sil_coeff = silhouette_score(character_X, label, metric='euclidean')
        Silhouette_dict[j]['character'] = sil_coeff
        print(j, clock() - st)
        print("Homogenity Score ,{}, Kmeans,".format(j),
              hs(character_Y, km.labels_))
        print("Completeness Score ,{} ,Kmeans,".format(j),
              cs(character_Y, km.labels_))
        print("For n_clusters={}, The Silhouette Coefficient is {}".format(
            j, sil_coeff))

    # Fix: the original rebound Silhouette_dict to to_csv()'s None return;
    # write the CSV without clobbering the dict.
    pd.DataFrame(Silhouette_dict).to_csv(out + 'Silhouette.csv')
    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    acc = pd.Panel(acc)
    adjMI = pd.Panel(adjMI)

    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'logliklihood.csv')
    acc.ix[:, :, 'character'].to_csv(out + 'character_acc.csv')
    acc.ix[:, :, 'Madelon'].to_csv(out + 'Madelon acc.csv')
    adjMI.ix[:, :, 'character'].to_csv(out + 'character_adjMI.csv')
    adjMI.ix[:, :, 'Madelon'].to_csv(out + 'Madelon adjMI.csv')

    # %% NN fit data (2,3)
    # (Each grid is now built immediately before the GridSearchCV that
    # uses it; the unused early/in-loop `grid` bindings were removed,
    # along with the unused locals `test`, `best` and `gmmm`.)
    madelon = pd.read_hdf(base + 'datasets.hdf', 'madelon')
    madelon_X = madelon.drop('Class', 1).copy().values
    madelon_Y = madelon['Class'].copy().values
    X_train, X_test, y_train, y_test = train_test_split(madelon_X,
                                                        madelon_Y,
                                                        test_size=0.3,
                                                        random_state=42)

    np.random.seed(0)

    for k in clusters:
        # K-Means features -> MLP, timed fit/predict per k.
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=5,
                            alpha=10**-5,
                            hidden_layer_sizes=(62, 62),
                            verbose=0)
        km = kmeans(random_state=5, n_clusters=k)
        pipe = Pipeline([('km', km), ('NN', mlp)])
        # gs = GridSearchCV(pipe, grid, verbose=10)
        tick = time.clock()
        pipe.fit(X_train, y_train)
        tock = time.clock() - tick

        print("Traning time , {}, k means dataset".format(k), ',', tock)
        tick = time.clock()
        y_pred = pipe.predict(X_test)
        tock = time.clock() - tick
        print("Testing time , {}, k means component".format(k), ',', tock)
        print("Accuracy Score ,  {}, kmeans Madelon".format(k), ',',
              accuracy_score(y_test, y_pred))

        # GMM features -> MLP, same timing protocol.
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=5,
                            verbose=0,
                            alpha=10**-5,
                            hidden_layer_sizes=(62, 62))
        gmm = myGMM(random_state=43, n_components=k)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
        # gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
        tick = time.clock()
        pipe.fit(X_train, y_train)
        tock = time.clock() - tick
        print("Traning time , {}, gmm dataset".format(k), ',', tock)
        tick = time.clock()
        y_pred = pipe.predict(X_test)
        tock = time.clock() - tick
        print("Testing time , {}, gmm means component".format(k), ',', tock)
        print("Accuracy Score , {}, gmm means Madelon".format(k), ',',
              accuracy_score(y_test, y_pred))

    # Madelon: grid-search NN over K-Means cluster counts.
    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    km = kmeans(random_state=5)
    pipe = Pipeline([('km', km), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10)

    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon cluster Kmeans.csv')

    # Madelon: grid-search NN over GMM component counts.
    grid = {
        'gmm__n_components': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    gmm = myGMM(random_state=5)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon cluster GMM.csv')

    # Character: grid-search NN over K-Means cluster counts.
    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    km = kmeans(random_state=5)
    pipe = Pipeline([('km', km), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_cluster_Kmeans.csv')

    # Character: grid-search NN over GMM component counts.
    grid = {
        'gmm__n_components': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    gmm = myGMM(random_state=5)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_cluster_GMM.csv')

    # %% For chart 4/5
    madelonX2D = TSNE(verbose=10, random_state=5).fit_transform(madelon_X)
    character_X2D = TSNE(verbose=10, random_state=5).fit_transform(character_X)

    madelon2D = pd.DataFrame(np.hstack(
        (madelonX2D, np.atleast_2d(madelon_Y).T)),
                             columns=['x', 'y', 'target'])
    character2D = pd.DataFrame(np.hstack(
        (character_X2D, np.atleast_2d(character_Y).T)),
                               columns=['x', 'y', 'target'])

    madelon2D.to_csv(out + 'madelon2D.csv')
    character2D.to_csv(out + 'character2D.csv')
Exemple #15
0
    # Metric stores keyed metric[k][dataset][algorithm].
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    st = clock()
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)

        # Diamonds dataset: fit, predict once, reuse labels for every metric.
        km.fit(diamondsX)
        gmm.fit(diamondsX)
        km_pred = km.predict(diamondsX)
        gmm_pred = gmm.predict(diamondsX)
        SSE[k]['Diamonds'] = km.score(diamondsX)
        ll[k]['Diamonds'] = gmm.score(diamondsX)
        acc[k]['Diamonds']['Kmeans'] = cluster_acc(diamondsY, km_pred)
        acc[k]['Diamonds']['GMM'] = cluster_acc(diamondsY, gmm_pred)
        adjMI[k]['Diamonds']['Kmeans'] = ami(diamondsY, km_pred)
        adjMI[k]['Diamonds']['GMM'] = ami(diamondsY, gmm_pred)

        # Digits dataset: same estimators re-fit.
        km.fit(digitsX)
        gmm.fit(digitsX)
        km_pred = km.predict(digitsX)
        gmm_pred = gmm.predict(digitsX)
        SSE[k]['Digits'] = km.score(digitsX)
        ll[k]['Digits'] = gmm.score(digitsX)
        acc[k]['Digits']['Kmeans'] = cluster_acc(digitsY, km_pred)
        acc[k]['Digits']['GMM'] = cluster_acc(digitsY, gmm_pred)
        adjMI[k]['Digits']['Kmeans'] = ami(digitsY, km_pred)
        adjMI[k]['Digits']['GMM'] = ami(digitsY, gmm_pred)
        print(k, clock() - st)
Exemple #16
0
# Metric stores keyed metric[k][dataset][algorithm].
SSE = defaultdict(dict)
ll = defaultdict(dict)
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)

    # Madelon dataset: fit, predict once, reuse labels for every metric.
    km.fit(madelonX)
    gmm.fit(madelonX)
    km_pred = km.predict(madelonX)
    gmm_pred = gmm.predict(madelonX)
    SSE[k]['Madelon'] = km.score(madelonX)
    ll[k]['Madelon'] = gmm.score(madelonX)
    acc[k]['Madelon']['Kmeans'] = cluster_acc(madelonY, km_pred)
    acc[k]['Madelon']['GMM'] = cluster_acc(madelonY, gmm_pred)
    adjMI[k]['Madelon']['Kmeans'] = ami(madelonY, km_pred)
    adjMI[k]['Madelon']['GMM'] = ami(madelonY, gmm_pred)

    # Digits dataset: same estimators re-fit.
    km.fit(digitsX)
    gmm.fit(digitsX)
    km_pred = km.predict(digitsX)
    gmm_pred = gmm.predict(digitsX)
    SSE[k]['Digits'] = km.score(digitsX)
    ll[k]['Digits'] = gmm.score(digitsX)
    acc[k]['Digits']['Kmeans'] = cluster_acc(digitsY, km_pred)
    acc[k]['Digits']['GMM'] = cluster_acc(digitsY, gmm_pred)
    adjMI[k]['Digits']['Kmeans'] = ami(digitsY, km_pred)
    adjMI[k]['Digits']['GMM'] = ami(digitsY, gmm_pred)
    print(k, clock() - st)

# Flip the sign of the collected K-Means scores and transpose to k-rows.
SSE = (-pd.DataFrame(SSE)).T
Exemple #17
0
# Per-k silhouette store plus the shared estimators.
silhouette = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = clock()
# 2-D t-SNE embeddings used below for the cluster-assignment dumps.
abaloneX2D = TSNE(verbose=10, random_state=5).fit_transform(abaloneX)
digitsX2D = TSNE(verbose=10, random_state=5).fit_transform(digitsX)

for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(abaloneX)
    gmm.fit(abaloneX)

    # Predict once per fitted model and reuse the label vectors
    # (silhouette for K-Means keeps using km.labels_, as before).
    km_pred = km.predict(abaloneX)
    gmm_pred = gmm.predict(abaloneX)

    SSE[k]['abalone'] = km.score(abaloneX)
    ll[k]['abalone'] = gmm.score(abaloneX)
    acc[k]['abalone']['Kmeans'] = cluster_acc(abaloneY, km_pred)
    acc[k]['abalone']['GMM'] = cluster_acc(abaloneY, gmm_pred)
    adjMI[k]['abalone']['Kmeans'] = ami(abaloneY, km_pred)
    adjMI[k]['abalone']['GMM'] = ami(abaloneY, gmm_pred)
    silhouette[k]['abalone']['Kmeans'] = silhouette_score(
        abaloneX, km.labels_, metric='euclidean')
    silhouette[k]['abalone']['GMM'] = silhouette_score(
        abaloneX, gmm_pred, metric='euclidean')

    # Dump the K-Means assignment over the t-SNE embedding for plotting.
    abalone2D = pd.DataFrame(
        np.hstack((abaloneX2D, np.atleast_2d(km_pred).T)),
        columns=['x', 'y', 'target'])
    abalone2D.to_csv(out + 'abalone2D_km_{}.csv'.format(k))
    abalone2D = pd.DataFrame(np.hstack(
# Metric stores keyed metric[k][dataset][algorithm].
SSE = defaultdict(dict)
ll = defaultdict(dict)
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)

    # contra dataset: fit, predict once, reuse labels for every metric.
    km.fit(contraX)
    gmm.fit(contraX)
    km_pred = km.predict(contraX)
    gmm_pred = gmm.predict(contraX)
    SSE[k]['contra'] = km.score(contraX)
    ll[k]['contra'] = gmm.score(contraX)
    acc[k]['contra']['Kmeans'] = cluster_acc(contraY, km_pred)
    acc[k]['contra']['GMM'] = cluster_acc(contraY, gmm_pred)
    adjMI[k]['contra']['Kmeans'] = ami(contraY, km_pred)
    adjMI[k]['contra']['GMM'] = ami(contraY, gmm_pred)

    # cancer dataset: same estimators re-fit.
    km.fit(cancerX)
    gmm.fit(cancerX)
    km_pred = km.predict(cancerX)
    gmm_pred = gmm.predict(cancerX)
    SSE[k]['cancer'] = km.score(cancerX)
    ll[k]['cancer'] = gmm.score(cancerX)
    acc[k]['cancer']['Kmeans'] = cluster_acc(cancerY, km_pred)
    acc[k]['cancer']['GMM'] = cluster_acc(cancerY, gmm_pred)
    adjMI[k]['cancer']['Kmeans'] = ami(cancerY, km_pred)
    adjMI[k]['cancer']['GMM'] = ami(cancerY, gmm_pred)
    print(k, clock() - st)

## Keith Mertan: Adding cluster outputs for best parameters and saving at the end of the file