Example #1
def run_km_em(perm_x, perm_y, dname, clstr):
    SSE_km_perm = []
    ll_em_perm = []
    acc_km_perm = []
    acc_em_perm = []
    adjMI_km_perm = []
    adjMI_em_perm = []
    homo_km_perm = []
    homo_em_perm = []
    comp_km_perm = []
    comp_em_perm = []
    silhou_km_perm = []
    bic_em_perm = []
    clk_time = []

    for k in clstr:
        st = clock()
        km = KMeans(n_clusters=k, random_state=10)
        gmm = GMM(n_components=k, random_state=10)

        # fit once per model and reuse the labels; re-calling fit_predict
        # would needlessly refit the same model for every metric
        km_labels = km.fit_predict(perm_x)
        gmm_labels = gmm.fit_predict(perm_x)

        SSE_km_perm.append(-km.score(perm_x))
        ll_em_perm.append(gmm.score(perm_x))
        acc_km_perm.append(cluster_acc(perm_y, km_labels))
        acc_em_perm.append(cluster_acc(perm_y, gmm_labels))
        adjMI_km_perm.append(ami(perm_y, km_labels))
        adjMI_em_perm.append(ami(perm_y, gmm_labels))
        homo_km_perm.append(metrics.homogeneity_score(perm_y, km_labels))
        homo_em_perm.append(metrics.homogeneity_score(perm_y, gmm_labels))
        comp_km_perm.append(metrics.completeness_score(perm_y, km_labels))
        comp_em_perm.append(metrics.completeness_score(perm_y, gmm_labels))
        silhou_km_perm.append(metrics.silhouette_score(perm_x, km_labels))
        bic_em_perm.append(gmm.bic(perm_x))
        clk_time.append(clock() - st)
        print(k, clock() - st)

    dbcluster = pd.DataFrame({
        'k': clstr,
        'SSE_km': SSE_km_perm,
        'll_em': ll_em_perm,
        'acc_km': acc_km_perm,
        'acc_em': acc_em_perm,
        'adjMI_km': adjMI_km_perm,
        'adjMI_em': adjMI_em_perm,
        'homo_km': homo_km_perm,
        'homo_em': homo_em_perm,
        'comp_km': comp_km_perm,
        'comp_em': comp_em_perm,
        'silhou_km': silhou_km_perm,
        'bic_em': bic_em_perm,
        'clk_time': clk_time
    })

    dbcluster.to_csv('./results/cluster_{}.csv'.format(dname), sep=',')
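
These snippets are excerpts from different projects and omit their imports. As a point of reference, here is a minimal setup under which Example #1 would run, with the aliases (ami, GMM, kmeans, clock) the later examples also rely on and a hypothetical cluster_acc helper (majority-vote accuracy is my assumption; the original projects define their own):

# Assumed setup, not part of the original snippets: common imports and aliases.
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from time import perf_counter as clock  # time.clock() was removed in Python 3.8
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.cluster import KMeans as kmeans
from sklearn.mixture import GaussianMixture as GMM
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.metrics import normalized_mutual_info_score as nmi


def cluster_acc(y_true, y_pred):
    # Hypothetical helper: map each cluster to its most frequent true label,
    # then score the induced labelling as plain accuracy.
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    assigned = np.empty_like(y_true)
    for c in np.unique(y_pred):
        mask = y_pred == c
        assigned[mask] = Counter(y_true[mask].tolist()).most_common(1)[0][0]
    return metrics.accuracy_score(y_true, assigned)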
Example #2
def run_adult_analysis(adultX, adultY):
    np.random.seed(0)
    clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20, 25, 30]

    print(
        'Part 1 - Running clustering algorithms on original datasets...adult')
    SSE = defaultdict(dict)
    BIC = defaultdict(dict)
    homo = defaultdict(lambda: defaultdict(dict))
    compl = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    st = clock()
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(adultX)
        gmm.fit(adultX)
        SSE[k]['Adult SSE'] = km.score(adultX)
        BIC[k]['Adult BIC'] = gmm.bic(adultX)  # Bayesian information criterion
        #true = np.transpose(adultY)
        #pred = np.transpose(km.predict(adultX))
        homo[k]['Adult']['Kmeans'] = homogeneity_score(
            adultY, km.predict(adultX))  # agreement of labels
        homo[k]['Adult']['GMM'] = homogeneity_score(adultY,
                                                    gmm.predict(adultX))
        compl[k]['Adult']['Kmeans'] = completeness_score(
            adultY, km.predict(adultX)
        )  # completeness: all members of a given class land in the same cluster
        compl[k]['Adult']['GMM'] = completeness_score(adultY,
                                                      gmm.predict(adultX))
        adjMI[k]['Adult']['Kmeans'] = ami(
            adultY, km.predict(adultX))  # adjusted mutual information
        adjMI[k]['Adult']['GMM'] = ami(adultY, gmm.predict(adultX))

        print(k, clock() - st)

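    # NOTE: pd.Panel and DataFrame.ix below were removed from pandas
    # (Panel in 0.25, .ix in 1.0); this snippet needs an older pandas release.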
    SSE = (-pd.DataFrame(SSE)).T
    BIC = pd.DataFrame(BIC).T
    homo = pd.Panel(homo)
    compl = pd.Panel(compl)
    adjMI = pd.Panel(adjMI)

    SSE.to_csv(
        './P1_Clustering_Algorithms_Non_Transformed/Adult_Cluster_Select_Kmeans.csv'
    )
    BIC.to_csv(
        './P1_Clustering_Algorithms_Non_Transformed/Adult_Cluster_Select_GMM.csv'
    )
    homo.ix[:, :, 'Adult'].to_csv(
        './P1_Clustering_Algorithms_Non_Transformed/Adult_homo.csv')
    compl.ix[:, :, 'Adult'].to_csv(
        './P1_Clustering_Algorithms_Non_Transformed/Adult_compl.csv')
    adjMI.ix[:, :, 'Adult'].to_csv(
        './P1_Clustering_Algorithms_Non_Transformed/Adult_adjMI.csv')
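
Homogeneity and completeness pull in opposite directions, which the comments above allude to; a toy illustration (my addition, not from the original code):

from sklearn.metrics import homogeneity_score, completeness_score

y_true = [0, 0, 1, 1]
y_pred = [0, 1, 2, 2]  # every cluster is pure, but class 0 is split in two
print(homogeneity_score(y_true, y_pred))   # 1.0: each cluster holds a single class
print(completeness_score(y_true, y_pred))  # < 1.0: class 0 spans clusters 0 and 1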
Example #3
def performance(encoder, models, K):
    mean_ami = dict(zip(models.keys(), list(np.zeros(len(models)))))
    mean_chs = dict(zip(models.keys(), list(np.zeros(len(models)))))
    mean_sil = dict(zip(models.keys(), list(np.zeros(len(models)))))

    tic = time.perf_counter()
    for i in range(K):
        features_enc = encoder.fit_transform(features, target)

        for key in models:
            model = models[key]
            
            y_predict = model.fit_predict(features_enc, target)

            mean_ami[key] += ami(target, y_predict)/K
            mean_chs[key] += chs(features_enc, y_predict)/K
            mean_sil[key] += sil(features_enc, y_predict, metric='euclidean')/K

    toc = time.perf_counter()

    # Write results to file
    with open('../results/' + name_prefix + '_results.txt', 'a') as res:
        res.write(type(encoder).__name__[0:-7] + ' Encoder\n')
        for key in mean_ami:
            res.write(' ' + key + ': ' + str(mean_ami[key]) + ', ' +
                      str(mean_chs[key]) + ', ' + str(mean_sil[key]) + '\n')
        res.write('Total time: ' + str(round(toc - tic, 3)) + '\n')

    print('Evaluation of', type(encoder).__name__[0:-7], 'Encoder completed in', round(toc-tic,3),'s')
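
performance() reads features, target and name_prefix from module scope and uses the aliases ami (adjusted_mutual_info_score), chs (calinski_harabasz_score) and sil (silhouette_score). A hypothetical call site, assuming those globals exist and using category_encoders for the encoder:

# Hypothetical driver; `features`, `target` and `name_prefix` are module globals here.
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from category_encoders import TargetEncoder  # any encoder with fit_transform(X, y) works

models = {
    'KMeans': KMeans(n_clusters=4, random_state=0),
    'GMM': GaussianMixture(n_components=4, random_state=0),
}
performance(TargetEncoder(), models, K=5)  # averages AMI/CHS/silhouette over 5 runs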
Example #4
def __iteration_summary(self, mcmc_iteration, intra_step_count,
                        temp_assignments):
    print(
        '----------------------------------------------------------------')
    print('iteration: %i' % mcmc_iteration)
    print(time.strftime('%H:%M:%S  %d/%m'))
    if self.gt_display is not None:
        gt = cp.deepcopy(self.gt_display)
        gt += np.abs(np.min(gt))
        bgt = np.bincount(gt)
    hist_num_groups = np.zeros(self.cfg.L)
    hist_num_groups[len(self.m[self.m > self.nump * 1E-2]) - 1] += 1
    valG = len(self.m[self.m > self.nump * 5E-4]) - 1
    selG = np.sort(self.m)[::-1]
    candidates = [
        obs_id for obs_id, n in enumerate(self.m) if n in selG[:valG + 1]
    ]
    # print(candidates)
    print('Number of active clusters: %i' % len(candidates))
    res = sorted(
        [int(itm) for itm in self.m[candidates] if itm > self.nump * 1E-4],
        reverse=True)
    print(res)
    if self.gt_display is not None:
        print(set(self.gt_display))
        print('GT: %s' % bgt[bgt > 0])
        print('AMI: %f' % ami(self.gt_display, temp_assignments))
    print('residue: %i' % (self.nump - np.sum(res)))
Example #5
def run_clustering(out, perm_x, perm_y, housing_x, housing_y):
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    st = clock()
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(perm_x)
        gmm.fit(perm_x)
        SSE[k]['perm'] = km.score(perm_x)
        ll[k]['perm'] = gmm.score(perm_x)
        acc[k]['perm']['Kmeans'] = cluster_acc(perm_y, km.predict(perm_x))
        acc[k]['perm']['GMM'] = cluster_acc(perm_y, gmm.predict(perm_x))
        adjMI[k]['perm']['Kmeans'] = ami(perm_y, km.predict(perm_x))
        adjMI[k]['perm']['GMM'] = ami(perm_y, gmm.predict(perm_x))

        km.fit(housing_x)
        gmm.fit(housing_x)
        SSE[k]['housing'] = km.score(housing_x)
        ll[k]['housing'] = gmm.score(housing_x)
        acc[k]['housing']['Kmeans'] = cluster_acc(housing_y,
                                                  km.predict(housing_x))
        acc[k]['housing']['GMM'] = cluster_acc(housing_y,
                                               gmm.predict(housing_x))
        adjMI[k]['housing']['Kmeans'] = ami(housing_y, km.predict(housing_x))
        adjMI[k]['housing']['GMM'] = ami(housing_y, gmm.predict(housing_x))
        print(k, clock() - st)

    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    acc = pd.Panel(acc)
    adjMI = pd.Panel(adjMI)

    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'logliklihood.csv')
    acc.ix[:, :, 'housing'].to_csv(out + 'Housing acc.csv')
    acc.ix[:, :, 'perm'].to_csv(out + 'Perm acc.csv')
    adjMI.ix[:, :, 'housing'].to_csv(out + 'Housing adjMI.csv')
    adjMI.ix[:, :, 'perm'].to_csv(out + 'Perm adjMI.csv')
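
Note that run_clustering() reads a module-level clusters list rather than taking it as a parameter; a minimal sketch of a driver, assuming the four arrays are already loaded:

# Hypothetical driver for the function above.
clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]  # module-level, read inside run_clustering
run_clustering('./results/', perm_x, perm_y, housing_x, housing_y)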
Example #6
def tensfact_baseline():
    n_clusters = 81
    f = open('buzz_user_tensor_45.npy')
    X_buzz = np.load(f)
    print X_buzz.shape

    X_buzz = X_buzz[buzz_ground.keys()]
    buzz_ground1 = buzz_ground.values()

    km = KMeans(n_clusters=81, init='k-means++', n_init=1, verbose=False)
    sc = 0.0
    sc1 = 0.0
    sc2 = 0.0
    for i in xrange(10):
        km.fit(X_buzz)
        sc += nmi(buzz_ground1, km.labels_)
        sc1 += ari(buzz_ground1, km.labels_)
        sc2 += ami(buzz_ground1, km.labels_)

    print "BUZZ"
    print "nmi score %f" % (sc / float(10))
    print "ari score %f" % (sc1 / float(10))
    print "ami score %f" % (sc2 / float(10))

    f = open('poli_user_tensor_75.npy')
    X_poli = np.load(f)
    print X_poli.shape
    X_poli = X_poli[poli_ground.keys()]
    poli_ground1 = poli_ground.values()
    km1 = KMeans(n_clusters=310, init='k-means++', n_init=1, verbose=False)
    sc = 0.0
    sc1 = 0.0
    sc2 = 0.0
    for i in xrange(10):
        km1.fit(X_poli)
        sc += nmi(poli_ground1, km1.labels_)
        sc1 += ari(poli_ground1, km1.labels_)
        sc2 += ami(poli_ground1, km1.labels_)

    print "poli"
    print "nmi score %f" % (sc / float(10))
    print "ari score %f" % (sc1 / float(10))
    print "ami score %f" % (sc2 / float(10))
Example #7
def comp_clusters_communities(embedding, labels_communities, algo = True, n_clusters = 5):
    X = StandardScaler().fit_transform(embedding) #rescaling of the data
    if algo: #choose which algo you want to find communities with
        db = DBSCAN().fit(X)
        labels_clusters = db.labels_
    else:
        kM = KMeans(n_clusters = n_clusters).fit(X)
        labels_clusters = kM.labels_
    return ami(labels_clusters, labels_communities) #adjusted mutual information between ground truth and communities discovered by the algorithm
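
A quick way to exercise comp_clusters_communities(), with made-up data (StandardScaler, DBSCAN, KMeans and ami are assumed to be imported; random inputs will score near 0):

import numpy as np

rng = np.random.default_rng(0)
emb = rng.normal(size=(200, 16))      # node embedding, one row per node
comms = rng.integers(0, 5, size=200)  # ground-truth community labels
print(comp_clusters_communities(emb, comms, algo=True))                 # DBSCAN
print(comp_clusters_communities(emb, comms, algo=False, n_clusters=5))  # KMeans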
Example #8
    def func(X_train, X_test, y_train, y_test, name, it):
        km = kmeans(random_state=5)
        gmm = GMM(random_state=5)
        km.set_params(n_clusters=it)
        gmm.set_params(n_components=it)
        km.fit(X_train)
        gmm.fit(X_train)

        if args[0] != 'BASE':
            file_it(name, 'km', X_train, y_train, km.predict(X_train), it=it)
            file_it(name, 'gmm', X_train, y_train, gmm.predict(X_train), it=it)

        SSE[it][name] = km.score(X_train)
        ll[it][name] = gmm.score(X_train)
        acc[it][name]['Kmeans'] = cluster_acc(y_test, km.predict(X_test))
        acc[it][name]['GMM'] = cluster_acc(y_test, gmm.predict(X_test))
        adjMI[it][name]['Kmeans'] = ami(y_train, km.predict(X_train))
        adjMI[it][name]['GMM'] = ami(y_train, gmm.predict(X_train))
        print(it, clock()-st)
Example #9
def prune_groups(groups, inverse=False):
    """
    Returns the index of informative levels after the nested_model has
    been run. It works by looking at level entropy and, moreover, checks if
    two consecutive levels have the same clustering
    """
    
    n_groups = groups.shape[1]
    
    mi_groups = np.array([ami(groups.iloc[:, x - 1], groups.iloc[:, x]) for x in range(1, n_groups)])
    
    if inverse:
        return groups.columns[np.where(mi_groups != 1)]
    
    return groups.columns[np.where(mi_groups == 1)]
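
For instance, with the hypothetical input below, level_0 duplicates level_1 and is flagged as redundant:

# Hypothetical input: one column of cluster labels per nested level.
import pandas as pd

groups = pd.DataFrame({
    'level_0': [0, 0, 1, 1, 2, 2],
    'level_1': [0, 0, 1, 1, 2, 2],   # same clustering as level_0 -> AMI == 1
    'level_2': [0, 0, 0, 1, 1, 1],
})
print(prune_groups(groups))                # Index(['level_0'], ...): duplicated level
print(prune_groups(groups, inverse=True))  # Index(['level_1'], ...): informative levels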
Example #10
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)

    km.fit(X_train)
    gmm.fit(X_train)

    SSE[k]['IncomeInertia'] = (km.inertia_)
    BIC[k]['IncomeBIC'] = gmm.bic(X_train)

    hScore[k]['KM'] = homogeneity_score(Y_train, km.predict(X_train))
    hScore[k]['GMM'] = homogeneity_score(Y_train, gmm.predict(X_train))

    cScore[k]['KM'] = completeness_score(Y_train, km.predict(X_train))
    cScore[k]['GMM'] = completeness_score(Y_train, gmm.predict(X_train))

    AMI[k]['KM'] = ami(Y_train, km.predict(X_train))
    AMI[k]['GMM'] = ami(Y_train, gmm.predict(X_train))

    a, b, vm = homogeneity_completeness_v_measure(Y_train, km.predict(X_train))
    VMeasure[k]['KM'] = vm

    a, b, vm = homogeneity_completeness_v_measure(Y_train,
                                                  gmm.predict(X_train))
    VMeasure[k]['GMM'] = vm

SSE = (pd.DataFrame(SSE)).T
BIC = pd.DataFrame(BIC).T

hScore = pd.DataFrame(hScore).T
cScore = pd.DataFrame(cScore).T
AMI = pd.DataFrame(AMI).T
Example #11
def run_credit_analysis_dim_red():
    algo_name = ['PCA', 'ICA', 'RP', 'RF']
    clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                12]  #, 13, 14, 16, 18, 20, 25, 30]
    # %% Part 3 - Run k-means and EM clustering algorithms on each dimensionally reduced dataset

    print(
        'Part 3 - Running clustering algorithms on dimensionally reduced datasets...credit'
    )
    for i in range(len(algo_name)):
        # load datasets
        credit = pd.read_hdf('datasets.hdf', 'credit_' + algo_name[i])
        creditX = credit.drop('Class', 1).copy().values
        creditY = credit['Class'].copy().values

        SSE = defaultdict(dict)
        BIC = defaultdict(dict)
        homo = defaultdict(lambda: defaultdict(dict))
        compl = defaultdict(lambda: defaultdict(dict))
        adjMI = defaultdict(lambda: defaultdict(dict))
        km = kmeans(random_state=5)
        gmm = GMM(random_state=5)

        st = clock()
        for k in clusters:
            km.set_params(n_clusters=k)
            gmm.set_params(n_components=k)
            km.fit(creditX)
            gmm.fit(creditX)
            SSE[k]['Credit SSE'] = km.score(creditX)
            BIC[k]['Credit BIC'] = gmm.bic(creditX)
            homo[k]['Credit']['Kmeans'] = homogeneity_score(
                creditY, km.predict(creditX))
            homo[k]['Credit']['GMM'] = homogeneity_score(
                creditY, gmm.predict(creditX))
            compl[k]['Credit']['Kmeans'] = completeness_score(
                creditY, km.predict(creditX))
            compl[k]['Credit']['GMM'] = completeness_score(
                creditY, gmm.predict(creditX))
            adjMI[k]['Credit']['Kmeans'] = ami(creditY, km.predict(creditX))
            adjMI[k]['Credit']['GMM'] = ami(creditY, gmm.predict(creditX))
            print(k, clock() - st)

        SSE = (-pd.DataFrame(SSE)).T
        BIC = pd.DataFrame(BIC).T
        homo = pd.Panel(homo)
        compl = pd.Panel(compl)
        adjMI = pd.Panel(adjMI)

        SSE.to_csv('./P3_Clustering_Algorithms_Reduced/Credit/Credit_SSE_' +
                   algo_name[i] + '.csv')
        BIC.to_csv('./P3_Clustering_Algorithms_Reduced/Credit/Credit_BIC_' +
                   algo_name[i] + '.csv')
        homo.ix[:, :, 'Credit'].to_csv(
            './P3_Clustering_Algorithms_Reduced/Credit/credit_' +
            algo_name[i] + '_homo.csv')
        compl.ix[:, :, 'Credit'].to_csv(
            './P3_Clustering_Algorithms_Reduced/Credit/credit_' +
            algo_name[i] + '_compl.csv')
        adjMI.ix[:, :, 'Credit'].to_csv(
            './P3_Clustering_Algorithms_Reduced/Credit/credit_' +
            algo_name[i] + '_adjMI.csv')
Example #12
loans_km_acc = []
loans_gmm_acc = []
loans_km_score = []
loans_gmm_score = []
loans_km_ami = []
loans_gmm_ami = []
loans_km_silhouette = []
loans_gmm_silhouette = []

for k in clusters:
    km.set_params(n_clusters=k)
    km.fit(loansX_pca)
    loans_km_acc.append(cluster_acc(loans_Y, km.predict(loansX_pca)))
    loans_km_score.append(km.score(loansX_pca))
    loans_km_ami.append(ami(loans_Y, km.predict(loansX_pca)))
    loans_km_silhouette.append(
        silhouette_score(loansX_pca, km.predict(loansX_pca)))

    gmm.set_params(n_components=k)
    gmm.fit(loansX_pca)
    loans_gmm_acc.append(cluster_acc(loans_Y, gmm.predict(loansX_pca)))
    loans_gmm_score.append(gmm.score(loansX_pca))
    loans_gmm_ami.append(ami(loans_Y, gmm.predict(loansX_pca)))
    loans_gmm_silhouette.append(
        silhouette_score(loansX_pca, gmm.predict(loansX_pca)))

loans_df = pd.DataFrame({'Kmeans acc': loans_km_acc, 'GMM acc': loans_gmm_acc,
                         'Kmeans score': loans_km_score, 'GMM score': loans_gmm_score,
                         'Kmeans ami': loans_km_ami, 'GMM ami': loans_gmm_ami,
                         'km avg silhouette': loans_km_silhouette,
                         'GMM avg silhouette': loans_gmm_silhouette})
Example #13
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(contraX)
    gmm.fit(contraX)
    SSE[k]['contra'] = km.score(contraX)
    ll[k]['contra'] = gmm.score(contraX)
    acc[k]['contra']['Kmeans'] = cluster_acc(contraY, km.predict(contraX))
    acc[k]['contra']['GMM'] = cluster_acc(contraY, gmm.predict(contraX))
    adjMI[k]['contra']['Kmeans'] = ami(contraY, km.predict(contraX))
    adjMI[k]['contra']['GMM'] = ami(contraY, gmm.predict(contraX))

    km.fit(cancerX)
    gmm.fit(cancerX)
    SSE[k]['cancer'] = km.score(cancerX)
    ll[k]['cancer'] = gmm.score(cancerX)
    acc[k]['cancer']['Kmeans'] = cluster_acc(cancerY, km.predict(cancerX))
    acc[k]['cancer']['GMM'] = cluster_acc(cancerY, gmm.predict(cancerX))
    adjMI[k]['cancer']['Kmeans'] = ami(cancerY, km.predict(cancerX))
    adjMI[k]['cancer']['GMM'] = ami(cancerY, gmm.predict(cancerX))
    print(k, clock() - st)

## Keith Mertan: Adding cluster outputs for best parameters and saving at the end of the file

## Cancer data first
Example #14
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(spamX)
    gmm.fit(spamX)
    SSE[k]['spam SSE'] = km.score(spamX)
    BIC[k]['spam BIC'] = gmm.bic(spamX)
    homo[k]['spam']['Kmeans'] = homogeneity_score(spamY, km.predict(spamX))
    homo[k]['spam']['GMM'] = homogeneity_score(spamY, gmm.predict(spamX))
    compl[k]['spam']['Kmeans'] = completeness_score(spamY, km.predict(spamX))
    compl[k]['spam']['GMM'] = completeness_score(spamY, gmm.predict(spamX))
    adjMI[k]['spam']['Kmeans'] = ami(spamY, km.predict(spamX))
    adjMI[k]['spam']['GMM'] = ami(spamY, gmm.predict(spamX))

    km.fit(letterX)
    gmm.fit(letterX)
    SSE[k]['letter'] = km.score(letterX)
    BIC[k]['letter BIC'] = gmm.bic(letterX)
    homo[k]['letter']['Kmeans'] = homogeneity_score(letterY,
                                                    km.predict(letterX))
    homo[k]['letter']['GMM'] = homogeneity_score(letterY, gmm.predict(letterX))
    compl[k]['letter']['Kmeans'] = completeness_score(letterY,
                                                      km.predict(letterX))
    compl[k]['letter']['GMM'] = completeness_score(letterY,
                                                   gmm.predict(letterX))
    adjMI[k]['letter']['Kmeans'] = ami(letterY, km.predict(letterX))
    adjMI[k]['letter']['GMM'] = ami(letterY, gmm.predict(letterX))
Example #15
    silh = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    st = clock()
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(wineX)
        gmm.fit(wineX)
        SSE[k]['Wine'] = km.score(wineX)
        ll[k]['Wine'] = gmm.score(wineX)
        acc[k]['Wine']['Kmeans'] = cluster_acc(wineY.ravel(),
                                               km.predict(wineX))
        acc[k]['Wine']['GMM'] = cluster_acc(wineY.ravel(), gmm.predict(wineX))
        adjMI[k]['Wine']['Kmeans'] = ami(wineY.ravel(), km.predict(wineX))
        adjMI[k]['Wine']['GMM'] = ami(wineY.ravel(), gmm.predict(wineX))
        adjRI[k]['Wine']['Kmeans'] = ari(wineY.ravel(), km.predict(wineX))
        adjRI[k]['Wine']['GMM'] = ari(wineY.ravel(), gmm.predict(wineX))
        bic[k]['Wine']['Kmeans'] = -compute_bic(km, wineX)
        bic[k]['Wine']['GMM'] = gmm.bic(wineX)
        silh[k]['Wine']['Kmeans'] = silhouette_score(wineX, km.predict(wineX))
        silh[k]['Wine']['GMM'] = silhouette_score(wineX, gmm.predict(wineX))

        km.fit(digitX)
        gmm.fit(digitX)
        SSE[k]['Digit'] = km.score(digitX)
        ll[k]['Digit'] = gmm.score(digitX)
        acc[k]['Digit']['Kmeans'] = cluster_acc(digitY.ravel(),
                                                km.predict(digitX))
        acc[k]['Digit']['GMM'] = cluster_acc(digitY.ravel(),
Example #16
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(madelonX)
    gmm.fit(madelonX)
    SSE[k]['Madelon'] = km.score(madelonX)
    ll[k]['Madelon'] = gmm.score(madelonX)
    acc[k]['Madelon']['Kmeans'] = cluster_acc(madelonY, km.predict(madelonX))
    acc[k]['Madelon']['GMM'] = cluster_acc(madelonY, gmm.predict(madelonX))
    adjMI[k]['Madelon']['Kmeans'] = ami(madelonY, km.predict(madelonX))
    adjMI[k]['Madelon']['GMM'] = ami(madelonY, gmm.predict(madelonX))

    km.fit(digitsX)
    gmm.fit(digitsX)
    SSE[k]['Digits'] = km.score(digitsX)
    ll[k]['Digits'] = gmm.score(digitsX)
    acc[k]['Digits']['Kmeans'] = cluster_acc(digitsY, km.predict(digitsX))
    acc[k]['Digits']['GMM'] = cluster_acc(digitsY, gmm.predict(digitsX))
    adjMI[k]['Digits']['Kmeans'] = ami(digitsY, km.predict(digitsX))
    adjMI[k]['Digits']['GMM'] = ami(digitsY, gmm.predict(digitsX))
    print(k, clock() - st)

SSE = (-pd.DataFrame(SSE)).T
SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
ll = pd.DataFrame(ll).T
Example #17
def tensfact_baseline():
	G_buzz, N_buzz, C_buzz, G_poli, N_poli, C_poli = parse_graphs()
	n_news1 = N_buzz.shape[0]
	n_news2 = N_poli.shape[0]
	y_buzz = [0] * n_news1
	y_poli = [0] * n_news2
	y_buzz = np.array(y_buzz)
	y_poli = np.array(y_poli)
	y_buzz[91:] = 1
	y_poli[120:] = 1
	n_clusters = 81
	if not os.path.isfile('tensor_buzz.npy'):
		T = np.zeros((N_buzz.shape[0], G_buzz.shape[0], C_buzz.shape[1]))
		n_users = G_buzz.shape[0]
		n_news = N_buzz.shape[0]
		n_comm = C_buzz.shape[1]
		for i in xrange(n_news):
			for j in xrange(n_users):
				for k in xrange(n_comm):
					T[i,j,k] = N_buzz[i,j] * C_buzz[j, k] 
		np.save('tensor_buzz.npy', T)
	else:
		f = open('tensor_buzz.npy')
		T_buzz = np.load(f)
		print T_buzz.shape
		print "Buzz tensor loaded"
		#T = dtensor(T_buzz)
		#print T.shape
		#factors = parafac(T_buzz, rank=25, init='random')
		#T_buzz = tl.tensor(T_buzz)
		# Best so far [50, 100, 5]
		core, factors = tucker(T_buzz, ranks=[45, 100, 5])
		print core.shape
		print factors[0].shape
		print factors[1].shape
		#P, fit, itr, exectimes = cp_als(T, 35, init='random')
		#P, F, D, A, fit, itr, exectimes = parafac2.parafac2(T, 10, init=42)
		# Extracting news embeddings
		#X_buzz = T_buzz
		X_buzz = factors[1]
		#X_buzz = P.U[0]
		F = open('buzz_lsi.npy', 'r')
		buzz_lsi = np.load(F)
		#X_buzz = np.hstack((X_buzz, buzz_lsi))
		print X_buzz.shape	
		#scaler = MinMaxScaler()
		#X_buzz = preprocessing.scale(X_buzz)
		#X_buzz = scaler.fit_transform(X_buzz)
		#assert np.where(np.isnan(X_buzz) == True)[0].shape[0] == 0
	
		#X_buzz = X_buzz[buzz_ground.keys()]

		buzz_ground1 = buzz_ground.values()

	
		km = KMeans(n_clusters=81, init='k-means++', n_init=1, verbose=False)
		print "Buzzfeed dataset's feat. extracted"
		#print X_buzz.shape 
        	#X_buzz, y_buzz = shuffle(X_buzz, y_buzz, random_state=42)
		sc = 0.0
		sc1 = 0.0
		sc2 = 0.0
		for i in xrange(10):
		        km.fit(X_buzz)
		        sc+=nmi(buzz_ground1, km.labels_)
		        sc1+=ari(buzz_ground1, km.labels_)
		        sc2+=ami(buzz_ground1, km.labels_)


		print "BUZZ"
		print "nmi score %f"%(sc/float(10))
		print "ari score %f"%(sc1/float(10))
		print "ami score %f"%(sc2/float(10))
		









	

	if not os.path.isfile('tensor_poli.npy'):
		T = np.zeros((N_poli.shape[0], G_poli.shape[0], C_poli.shape[1]))
		n_users = G_poli.shape[0]
		n_news = N_poli.shape[0]
		n_comm = C_poli.shape[1]
		for i in xrange(n_news):
			for j in xrange(n_users):
				for k in xrange(n_comm):
					T[i,j,k] = N_poli[i,j] * C_poli[j, k] 
		np.save('tensor_poli.npy', T)
	else:
		f = open('tensor_poli.npy')
		T_poli = np.load(f)
		print T_poli.shape
		print "Politifact tensor loaded"
		T = dtensor(T_poli)
		#factors = parafac(T_poli, rank=50)
		#P, fit, itr, exectimes = cp_als(T, 35,  init='random')
		# Best so far: [50, 100, 5]
		T_poli = tl.tensor(T_poli)
		core, factors = tucker(T_poli, ranks=[45, 100, 5])
		#print " Fit value, Itr and Exectimes are:"
		#print fit
		#print itr
		#print exectimes
		# Extracting news embeddings
		X_poli = factors[1]
		#X_poli = P.U[0]
		F = open('poli_lsi.npy', 'r')
		poli_lsi = np.load(F)
		
		
		#X_poli = X_poli[poli_ground.keys()]
		#X_poli = np.hstack((X_poli, poli_lsi))
		print X_poli.shape
		#X_buzz = preprocessing.scale(X_poli)
		#X_poli = scaler.fit_transform(X_poli)
		assert np.where(np.isnan(X_poli) == True)[0].shape[0] == 0
		print X_poli.shape
		print "Politifact news feats. extracted"
		poli_ground1 = poli_ground.values()
		km = KMeans(n_clusters=310, init='k-means++', n_init=1, verbose=False)
		print "Buzzfeed dataset's feat. extracted"
		#print X_buzz.shape 
        	#X_buzz, y_buzz = shuffle(X_buzz, y_buzz, random_state=42)
		sc = 0.0
		sc1 = 0.0
		sc2 = 0.0
		for i in xrange(10):
		        km.fit(X_poli)
		        sc+=nmi(poli_ground1, km.labels_)
		        sc1+=ari(poli_ground1, km.labels_)
		        sc2+=ami(poli_ground1, km.labels_)


		print "BUZZ"
		print "nmi score %f"%(sc/float(10))
		print "ari score %f"%(sc1/float(10))
		print "ami score %f"%(sc2/float(10))
Example #18
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = time.time()
print(len(clusters))
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(perm_x)
    gmm.fit(perm_x)
    SSE[k]['perm'] = km.score(perm_x)
    ll[k]['perm'] = gmm.score(perm_x)

    acc[k]['perm']['Kmeans'] = cluster_acc(perm_y, km.predict(perm_x))
    acc[k]['perm']['GMM'] = cluster_acc(perm_y, gmm.predict(perm_x))
    adjMI[k]['perm']['Kmeans'] = ami(perm_y, km.predict(perm_x))
    adjMI[k]['perm']['GMM'] = ami(perm_y, gmm.predict(perm_x))

for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)

    km.fit(housing_x)
    gmm.fit(housing_x)
    SSE[k]['housing'] = km.score(housing_x)
    ll[k]['housing'] = gmm.score(housing_x)

    acc[k]['housing']['Kmeans'] = cluster_acc(housing_y, km.predict(housing_x))
    acc[k]['housing']['GMM'] = cluster_acc(housing_y, gmm.predict(housing_x))
    adjMI[k]['housing']['Kmeans'] = ami(housing_y, km.predict(housing_x))
    adjMI[k]['housing']['GMM'] = ami(housing_y, gmm.predict(housing_x))
Example #19
poli_featvec = poli_featvec[poli_ground.keys()]

buzz_ground = buzz_ground.values()
poli_ground = poli_ground.values()

km = KMeans(n_clusters=81, n_init=1)
km1 = KMeans(n_clusters=310, n_init=1)

sc = 0.0
sc1 = 0.0
sc2 = 0.0
for i in xrange(10):
    km.fit(buzz_featvec)
    sc += nmi(buzz_ground, km.labels_)
    sc1 += ari(buzz_ground, km.labels_)
    sc2 += ami(buzz_ground, km.labels_)

print "BUZZ"
print "nmi score %f" % (sc / float(10))
print "ari score %f" % (sc1 / float(10))
print "ami score %f" % (sc2 / float(10))

sc = 0.0
sc1 = 0.0
sc2 = 0.0
for i in xrange(10):
    km1.fit(poli_featvec)
    sc += nmi(poli_ground, km1.labels_)
    sc1 += ari(poli_ground, km1.labels_)
    sc2 += ami(poli_ground, km1.labels_)
Example #20
    def __do_perform(self,
                     custom_out=None,
                     main_experiment=None
                     ):  # ./output/ICA/clustering//{}', ICAExperiment
        if custom_out is not None:
            # if not os.path.exists(custom_out):
            #     os.makedirs(custom_out)
            self._old_out = self._out  # './output/ICA/{}'
            self._out = custom_out  # ./output/ICA/clustering//{}'
        elif self._old_out is not None:
            self._out = self._old_out

        if main_experiment is not None:
            self.log("Performing {} as part of {}".format(
                self.experiment_name(),
                main_experiment.experiment_name()))  # 'clustering', 'ICA'
        else:
            self.log("Performing {}".format(self.experiment_name()))

        # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
        # %% Data for 1-3
        sse = defaultdict(list)
        ll = defaultdict(list)
        bic = defaultdict(list)
        sil = defaultdict(lambda: defaultdict(list))
        sil_s = np.empty(shape=(2 * len(self._clusters) *
                                self._details.ds.training_x.shape[0], 4),
                         dtype='<U21')
        acc = defaultdict(lambda: defaultdict(float))
        adj_mi = defaultdict(lambda: defaultdict(float))
        km = kmeans(random_state=self._details.seed)
        gmm = GMM(random_state=self._details.seed)

        st = clock()
        j = 0
        for k in self._clusters:
            km.set_params(n_clusters=k)
            gmm.set_params(n_components=k)
            km.fit(
                self._details.ds.training_x
            )  # cluster the ICA-transformed input features using kMeans with varying K
            gmm.fit(
                self._details.ds.training_x
            )  # cluster the ICA-transformed input features using GMM with varying k

            km_labels = km.predict(
                self._details.ds.training_x
            )  # give each ICA-transformed input feature a label
            gmm_labels = gmm.predict(self._details.ds.training_x)

            sil[k]['Kmeans'] = sil_score(
                self._details.ds.training_x, km_labels
            )  # compute mean silhouette score for all ICA-transformed input features
            sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)

            km_sil_samples = sil_samples(
                self._details.ds.training_x, km_labels
            )  # compute silhouette score for each ICA-transformed input feature
            gmm_sil_samples = sil_samples(self._details.ds.training_x,
                                          gmm_labels)
            # There has got to be a better way to do this, but I can't brain right now
            for i, x in enumerate(km_sil_samples):
                sil_s[j] = [
                    k, 'Kmeans', round(x, 6), km_labels[i]
                ]  # record the silhouette score x for each instance i given its label kn_labels[i] by kMeans with value k
                j += 1
            for i, x in enumerate(gmm_sil_samples):
                sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
                j += 1

            sse[k] = [
                km.score(self._details.ds.training_x)
            ]  # opposite of the value of X on the k-means objective, i.e. negative inertia
            ll[k] = [gmm.score(self._details.ds.training_x)
                     ]  # per-sample average log-likelihood
            bic[k] = [
                gmm.bic(self._details.ds.training_x)
            ]  # Bayesian information criterion on the input X (lower is better)

            acc[k]['Kmeans'] = cluster_acc(
                self._details.ds.training_y, km_labels
            )  # compute the accuracy of the clustering algorithm on the ICA-transformed data (against the original y-label) if it predicted the majority y-label for each cluster
            acc[k]['GMM'] = cluster_acc(self._details.ds.training_y,
                                        gmm_labels)

            adj_mi[k]['Kmeans'] = ami(
                self._details.ds.training_y, km_labels
            )  # compute the adjusted mutual information between the true labels and the cluster predicted labels (how well does clustering match truth)
            adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)

            self.log("Cluster: {}, time: {}".format(k, clock() - st))

        sse = (-pd.DataFrame(sse)).T
        sse.index.name = 'k'
        sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)
                       ]  # Bank sse (left)

        ll = pd.DataFrame(ll).T
        ll.index.name = 'k'
        ll.columns = [
            '{} log-likelihood'.format(self._details.ds_readable_name)
        ]  # Bank log-likelihood

        bic = pd.DataFrame(bic).T
        bic.index.name = 'k'
        bic.columns = ['{} BIC'.format(self._details.ds_readable_name)
                       ]  # Bank BIC

        sil = pd.DataFrame(sil).T
        sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score',
                                             'label']).set_index('k')  #.T
        # sil_s = sil_s.T
        acc = pd.DataFrame(acc).T
        adj_mi = pd.DataFrame(adj_mi).T

        sil.index.name = 'k'
        sil_s.index.name = 'k'
        acc.index.name = 'k'
        adj_mi.index.name = 'k'

        # write scores to files
        sse.to_csv(self._out.format('{}_sse.csv'.format(
            self._details.ds_name)))
        ll.to_csv(
            self._out.format('{}_logliklihood.csv'.format(
                self._details.ds_name)))
        bic.to_csv(self._out.format('{}_bic.csv'.format(
            self._details.ds_name)))
        sil.to_csv(
            self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
        sil_s.to_csv(
            self._out.format('{}_sil_samples.csv'.format(
                self._details.ds_name)))
        acc.to_csv(self._out.format('{}_acc.csv'.format(
            self._details.ds_name)))
        adj_mi.to_csv(
            self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

        # %% NN fit data (2,3)
        # train a NN on clustered data
        grid = {
            'km__n_clusters': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        km = kmeans(random_state=self._details.seed,
                    n_jobs=self._details.threads)
        pipe = Pipeline(
            [('km', km), ('NN', mlp)], memory=experiments.pipeline_memory
        )  # run a NN on the clustered data (only on the cluster labels, or input features + cluster labels???)
        gs, _ = self.gs_with_best_estimator(
            pipe, grid, type='kmeans')  # write the best NN to file
        self.log("KMmeans Grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_cluster_kmeans.csv'.format(
                self._details.ds_name))
        )  # write grid search results --> bank_cluster_kmeans.csv

        grid = {
            'gmm__n_components': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        gmm = CustomGMM(random_state=self._details.seed)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)],
                        memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(
            pipe, grid, type='gmm')  # write the best NN to file
        self.log("GMM search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_cluster_GMM.csv'.format(
                self._details.ds_name))
        )  # write grid search results --> bank_cluster_GMM.csv

        # %% For chart 4/5
        # project the training data to 2D with TSNE for visualization
        self._details.ds.training_x2D = TSNE(
            verbose=10, random_state=self._details.seed).fit_transform(
                self._details.ds.training_x)

        ds_2d = pd.DataFrame(
            np.hstack((self._details.ds.training_x2D,
                       np.atleast_2d(self._details.ds.training_y).T)),
            columns=['x', 'y', 'target']
        )  # prepare NN-learnable data using TSNE D.R'd input features + label
        ds_2d.to_csv(
            self._out.format('{}_2D.csv'.format(
                self._details.ds_name)))  # --> bank_2D.csv
        self.log("Done")
Example #21
X = np.append(X, noise, axis=1)
X = normalize(X)
# Y = SelfOrganizingSwarm(iterations=250, alpha=1, beta=0.9, delta=0.001, theta=3).fit_transform(X)
# Y = PCA(2).fit_transform(X)
# Y = TSNE().fit_transform(X)
Y = GSOM().fit_transform(X, lr=1.0, beta=0.5, sf=0.6, wd=0.175, fd=0.8)  # alt params: beta=0.0, sf=0.01, fd=0.75, wd=0.5
# fig = plt.figure()
# ax = Axes3D(fig)
# ax.scatter(X.T[0], X.T[1], X.T[2], c=color, alpha=0.5, edgecolors='none')
# plt.show()
plt.subplot(211)
# ax = fig.add_subplot(211)
plt.scatter(Y.T[0], Y.T[1], s=15, c=plt.cm.jet(color / (n_clusters * 1.0)),
            edgecolors='none', alpha=0.375)

labs = KMeans(n_clusters).fit(Y).labels_

plt.subplot(212)
plt.scatter(Y.T[0], Y.T[1], s=15, c=plt.cm.jet(labs / (n_clusters * 1.0)),
            edgecolors='none', alpha=0.375)


print 'ars ', ars(color,labs)
print 'ami ', ami(color, labs)


#
# Y = Isomap().fit_transform(X)
# ax2 = fig.add_subplot(121)
# ax2.scatter(Y.T[0], Y.T[1], c = color, edgecolors='none', alpha=0.5)

plt.show()
Example #22
            sil[k]['Kmeans'] = sil_score(dataX, km_labels)
            sil[k]['GMM'] = sil_score(dataX, gmm_labels)
            km_sil_samples = sil_samples(dataX, km_labels)
            gmm_sil_samples = sil_samples(dataX, gmm_labels)
            for i, x in enumerate(km_sil_samples):
                sil_samp[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
                j += 1
            for i, x in enumerate(gmm_sil_samples):
                sil_samp[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
                j += 1
            sse[k] = km.score(dataX)
            ll[k] = gmm.score(dataX)
            bic[k] = gmm.bic(dataX)
            acc[k]['Kmeans'] = cluster_acc(dataY, km.predict(dataX))
            acc[k]['GMM'] = cluster_acc(dataY, gmm.predict(dataX))
            adj_mi[k]['Kmeans'] = ami(dataY, km.predict(dataX))
            adj_mi[k]['GMM'] = ami(dataY, gmm.predict(dataX))

        gmm_clusters = pd.DataFrame()
        kmeans_clusters = pd.DataFrame()

        for i in clusters:
            gmm_clusters[i] = labels[i]['GMM']
            kmeans_clusters[i] = labels[i]['Kmeans']

        bic = pd.DataFrame(bic, index=[0]).T
        bic.index.name = 'k'
        bic.rename(columns= {bic.columns[0]: 'BIC'}, inplace=True)
        
Example #23
    #Sum of Squared Errors for K-means
    SSE[k]['Faults'] = km.score(faultsX)

    #Log-Likelihood for GMM
    ll[k]['Faults'] = gmm.score(faultsX)

    #Silhouette Score
    #The best value is 1 and the worst value is -1. Silhouette analysis can be used to study the separation distance between the resulting clusters.
    SS[k]['Faults']['Kmeans'] = ss(faultsX, km.predict(faultsX))
    SS[k]['Faults']['GMM'] = ss(faultsX, gmm.predict(faultsX))
    #Cluster Accuracy
    acc[k]['Faults']['Kmeans'] = cluster_acc(faultsY, km.predict(faultsX))
    acc[k]['Faults']['GMM'] = cluster_acc(faultsY, gmm.predict(faultsX))

    #Adjusted Mutual Information
    adjMI[k]['Faults']['Kmeans'] = ami(faultsY, km.predict(faultsX))
    adjMI[k]['Faults']['GMM'] = ami(faultsY, gmm.predict(faultsX))

    #Breast Cancer dataset
    km.fit(bcX)
    gmm.fit(bcX)
    SSE[k]['BreastC'] = km.score(bcX)
    ll[k]['BreastC'] = gmm.score(bcX)
    SS[k]['BreastC']['Kmeans'] = ss(bcX, km.predict(bcX))
    SS[k]['BreastC']['GMM'] = ss(bcX, gmm.predict(bcX))
    acc[k]['BreastC']['Kmeans'] = cluster_acc(bcY, km.predict(bcX))
    acc[k]['BreastC']['GMM'] = cluster_acc(bcY, gmm.predict(bcX))
    adjMI[k]['BreastC']['Kmeans'] = ami(bcY, km.predict(bcX))
    adjMI[k]['BreastC']['GMM'] = ami(bcY, gmm.predict(bcX))
    print(k, clock() - st)
Example #24
def clustering_experiment(X, y, name, clusters, rdir):
    """Generate results CSVs for given datasets using the K-Means and EM
    clustering algorithms.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        clusters (list[int]): List of k values.
        rdir (str): Output directory.

    """
    sse = defaultdict(dict)  # sum of squared errors
    logl = defaultdict(dict)  # log-likelihood
    bic = defaultdict(dict)  # BIC for EM
    silhouette = defaultdict(dict)  # silhouette score
    acc = defaultdict(lambda: defaultdict(dict))  # accuracy scores
    adjmi = defaultdict(lambda: defaultdict(dict))  # adjusted mutual info
    km = KMeans(random_state=0)  # K-Means
    gmm = GMM(random_state=0)  # Gaussian Mixture Model (EM)

    # start loop for given values of k
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(X)
        gmm.fit(X)

        # calculate SSE, log-likelihood, accuracy, and adjusted mutual info
        sse[k][name] = km.score(X)
        logl[k][name] = gmm.score(X)
        acc[k][name]['km'] = cluster_acc(y, km.predict(X))
        acc[k][name]['gmm'] = cluster_acc(y, gmm.predict(X))
        adjmi[k][name]['km'] = ami(y, km.predict(X))
        adjmi[k][name]['gmm'] = ami(y, gmm.predict(X))

        # calculate silhouette score for K-Means
        km_silhouette = silhouette_score(X, km.predict(X))
        silhouette[k][name] = km_silhouette

        # calculate BIC for EM
        bic[k][name] = gmm.bic(X)

    # generate output dataframes
    sse = (-pd.DataFrame(sse)).T
    sse.rename(columns={name: 'sse'}, inplace=True)
    logl = pd.DataFrame(logl).T
    logl.rename(columns={name: 'log-likelihood'}, inplace=True)
    bic = pd.DataFrame(bic).T
    bic.rename(columns={name: 'bic'}, inplace=True)
    silhouette = pd.DataFrame(silhouette).T
    silhouette.rename(columns={name: 'silhouette_score'}, inplace=True)
    acc = pd.Panel(acc)
    acc = acc.loc[:, :, name].T.rename(lambda x: '{}_acc'.format(x),
                                       axis='columns')
    adjmi = pd.Panel(adjmi)
    adjmi = adjmi.loc[:, :, name].T.rename(lambda x: '{}_adjmi'.format(x),
                                           axis='columns')

    # concatenate all results
    dfs = (sse, silhouette, logl, bic, acc, adjmi)
    metrics = pd.concat(dfs, axis=1)
    resfile = get_abspath('{}_metrics.csv'.format(name), rdir)
    metrics.to_csv(resfile, index_label='k')
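
A hypothetical invocation, assuming the project's get_abspath and cluster_acc helpers are on the path and a pandas old enough to still ship pd.Panel:

# Hypothetical driver for clustering_experiment.
from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)
clustering_experiment(X, y, name='digits', clusters=list(range(2, 11)), rdir='results')
# -> writes digits_metrics.csv with sse, silhouette, log-likelihood, bic, acc and adjmi columns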
Example #25
        g = gt.load_graph_from_csv(G.graph['edgelist'],
                                   directed=isDirected,
                                   csv_options={
                                       "delimiter": " ",
                                       "quotechar": '"'
                                   })
        block = gt.minimize_nested_blockmodel_dl(
            g,
            B_min=G.graph['number_communities'],
            B_max=G.graph['number_communities'])
        num_block = block.levels[0].get_B()
        block = block.levels[0].get_blocks()
        partition = [0 for i in range(G.number_of_nodes())]
        for i in range(G.number_of_nodes()):  #for every node
            partition[i] = block[i]
        zsbm.append(ami(partition, G.graph['labels_communities']))

        igraph = ig.Read_Edgelist(G.graph['edgelist'])
        part = igraph.community_infomap()
        partition = [0 for i in range(G.number_of_nodes())]
        for i in range(G.number_of_nodes()):
            for j in range(len(part)):
                if i in part[j]:
                    partition[i] = j
        zinfomap.append(ami(partition, G.graph['labels_communities']))

        Y = community.best_partition(G.to_undirected(
        ))  #https://perso.crans.org/aynaud/communities/api.html
        #uses Louvain heuristices
        partition = [0 for i in range(G.number_of_nodes())]
        for k in range(G.number_of_nodes()):
Example #26
gmm = GMM(random_state=5)

st = clock()
abaloneX2D = TSNE(verbose=10, random_state=5).fit_transform(abaloneX)
digitsX2D = TSNE(verbose=10, random_state=5).fit_transform(digitsX)

for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(abaloneX)
    gmm.fit(abaloneX)
    SSE[k]['abalone'] = km.score(abaloneX)
    ll[k]['abalone'] = gmm.score(abaloneX)
    acc[k]['abalone']['Kmeans'] = cluster_acc(abaloneY, km.predict(abaloneX))
    acc[k]['abalone']['GMM'] = cluster_acc(abaloneY, gmm.predict(abaloneX))
    adjMI[k]['abalone']['Kmeans'] = ami(abaloneY, km.predict(abaloneX))
    adjMI[k]['abalone']['GMM'] = ami(abaloneY, gmm.predict(abaloneX))
    silhouette[k]['abalone']['Kmeans'] = silhouette_score(abaloneX,
                                                          km.labels_,
                                                          metric='euclidean')
    silhouette[k]['abalone']['GMM'] = silhouette_score(abaloneX,
                                                       gmm.predict(abaloneX),
                                                       metric='euclidean')

    abalone2D = pd.DataFrame(np.hstack(
        (abaloneX2D, np.atleast_2d(km.predict(abaloneX)).T)),
                             columns=['x', 'y', 'target'])
    abalone2D.to_csv(out + 'abalone2D_km_{}.csv'.format(k))
    abalone2D = pd.DataFrame(np.hstack(
        (abaloneX2D, np.atleast_2d(gmm.predict(abaloneX)).T)),
                             columns=['x', 'y', 'target'])
Example #27
    gmm.set_params(n_components=k)
    km.fit(X_train)
    gmm.fit(X_train)
    #km.score = Opposite of the value of X on the K-means objective.
    #         =Sum of distances of samples to their closest cluster center
    SSE[k]['Diamond'] = km.score(X_train)
    ll[k]['Diamond'] = gmm.score(X_train)

    aic[k]['Diamond'] = gmm.aic(X_train)
    bic[k]['Diamond'] = gmm.bic(X_train)

    #clustering accuracy, evaluated on the held-out test split
    acc_[k]['Diamond']['Kmeans'] = cluster_acc(Y_test, km.predict(X_test))
    acc_[k]['Diamond']['GMM'] = cluster_acc(Y_test, gmm.predict(X_test))
    #mutual information score
    adjMI[k]['Diamond']['Kmeans'] = ami(Y_test, km.predict(X_test))
    adjMI[k]['Diamond']['GMM'] = ami(Y_test, gmm.predict(X_test))

    km.fit(X_train2)
    gmm.fit(X_train2)
    SSE[k]['CreditCard'] = km.score(X_train2)
    ll[k]['CreditCard'] = gmm.score(X_train2)
    aic[k]['CreditCard'] = gmm.aic(X_train2)
    bic[k]['CreditCard'] = gmm.bic(X_train2)

    acc_[k]['CreditCard']['Kmeans'] = cluster_acc(Y_test2, km.predict(X_test2))
    acc_[k]['CreditCard']['GMM'] = cluster_acc(Y_test2, gmm.predict(X_test2))
    adjMI[k]['CreditCard']['Kmeans'] = ami(Y_test2, km.predict(X_test2))
    adjMI[k]['CreditCard']['GMM'] = ami(Y_test2, gmm.predict(X_test2))
    print('cluster: ', k, 'Wall clock time', clock() - st)
Example #28
    def __do_perform(self, custom_out=None, main_experiment=None):
        if custom_out is not None:
            # if not os.path.exists(custom_out):
            #     os.makedirs(custom_out)
            self._old_out = self._out
            self._out = custom_out
        elif self._old_out is not None:
            self._out = self._old_out

        if main_experiment is not None:
            self.log("Performing {} as part of {}".format(self.experiment_name(), main_experiment.experiment_name()))
        else:
            self.log("Performing {}".format(self.experiment_name()))

        # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
        # %% Data for 1-3
        sse = defaultdict(list)
        ll = defaultdict(list)
        bic = defaultdict(list)
        sil = defaultdict(lambda: defaultdict(list))
        sil_s = np.empty(shape=(2*len(self._clusters)*self._details.ds.training_x.shape[0],4), dtype='<U21')
        acc = defaultdict(lambda: defaultdict(float))
        adj_mi = defaultdict(lambda: defaultdict(float))
        km = kmeans(random_state=self._details.seed)
        gmm = GMM(random_state=self._details.seed)

        st = clock()
        j = 0
        for k in self._clusters:
            km.set_params(n_clusters=k)
            gmm.set_params(n_components=k)
            km.fit(self._details.ds.training_x)
            gmm.fit(self._details.ds.training_x)

            km_labels = km.predict(self._details.ds.training_x)
            gmm_labels = gmm.predict(self._details.ds.training_x)

            sil[k]['Kmeans'] = sil_score(self._details.ds.training_x, km_labels)
            sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)

            km_sil_samples = sil_samples(self._details.ds.training_x, km_labels)
            gmm_sil_samples = sil_samples(self._details.ds.training_x, gmm_labels)
            # There has got to be a better way to do this, but I can't brain right now
            for i, x in enumerate(km_sil_samples):
                sil_s[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
                j += 1
            for i, x in enumerate(gmm_sil_samples):
                sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
                j += 1

            sse[k] = [km.score(self._details.ds.training_x)]
            ll[k] = [gmm.score(self._details.ds.training_x)]
            bic[k] = [gmm.bic(self._details.ds.training_x)]

            acc[k]['Kmeans'] = cluster_acc(self._details.ds.training_y, km_labels)
            acc[k]['GMM'] = cluster_acc(self._details.ds.training_y, gmm_labels)

            adj_mi[k]['Kmeans'] = ami(self._details.ds.training_y, km_labels)
            adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)

            self.log("Cluster: {}, time: {}".format(k, clock() - st))

        sse = (-pd.DataFrame(sse)).T
        sse.index.name = 'k'
        sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)]

        ll = pd.DataFrame(ll).T
        ll.index.name = 'k'
        ll.columns = ['{} log-likelihood'.format(self._details.ds_readable_name)]

        bic = pd.DataFrame(bic).T
        bic.index.name = 'k'
        bic.columns = ['{} BIC'.format(self._details.ds_readable_name)]

        sil = pd.DataFrame(sil).T
        sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score', 'label']).set_index('k')  #.T
        # sil_s = sil_s.T
        acc = pd.DataFrame(acc).T
        adj_mi = pd.DataFrame(adj_mi).T

        sil.index.name = 'k'
        sil_s.index.name = 'k'
        acc.index.name = 'k'
        adj_mi.index.name = 'k'

        sse.to_csv(self._out.format('{}_sse.csv'.format(self._details.ds_name)))
        ll.to_csv(self._out.format('{}_logliklihood.csv'.format(self._details.ds_name)))
        bic.to_csv(self._out.format('{}_bic.csv'.format(self._details.ds_name)))
        sil.to_csv(self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
        sil_s.to_csv(self._out.format('{}_sil_samples.csv'.format(self._details.ds_name)))
        acc.to_csv(self._out.format('{}_acc.csv'.format(self._details.ds_name)))
        adj_mi.to_csv(self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

        # %% NN fit data (2,3)
        grid = {'km__n_clusters': self._clusters, 'NN__alpha': self._nn_reg, 'NN__hidden_layer_sizes': self._nn_arch}
        mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=self._details.seed)
        km = kmeans(random_state=self._details.seed, n_jobs=self._details.threads)
        pipe = Pipeline([('km', km), ('NN', mlp)], memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(pipe, grid, type='kmeans')
        self.log("KMmeans Grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(self._out.format('{}_cluster_kmeans.csv'.format(self._details.ds_name)))

        grid = {'gmm__n_components': self._clusters, 'NN__alpha': self._nn_reg, 'NN__hidden_layer_sizes': self._nn_arch}
        mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=self._details.seed)
        gmm = CustomGMM(random_state=self._details.seed)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)], memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(pipe, grid, type='gmm')
        self.log("GMM search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(self._out.format('{}_cluster_GMM.csv'.format(self._details.ds_name)))

        # %% For chart 4/5
        self._details.ds.training_x2D = TSNE(verbose=10, random_state=self._details.seed).fit_transform(
            self._details.ds.training_x
        )

        ds_2d = pd.DataFrame(np.hstack((self._details.ds.training_x2D, np.atleast_2d(self._details.ds.training_y).T)),
                             columns=['x', 'y', 'target'])
        ds_2d.to_csv(self._out.format('{}_2D.csv'.format(self._details.ds_name)))
        self.log("Done")
Example #29
def run_clustering(out, cancer_x, cancer_y, housing_x, housing_y):
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    silhouette = defaultdict(lambda: defaultdict(dict))
    completeness = defaultdict(lambda: defaultdict(dict))
    homogeniety = defaultdict(lambda: defaultdict(dict))

    st = clock()
    for k in range(2, 20, 1):
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(cancer_x)
        gmm.fit(cancer_x)

        SSE[k]['cancer'] = km.score(cancer_x)
        ll[k]['cancer'] = gmm.score(cancer_x)

        acc[k]['cancer']['Kmeans'] = cluster_acc(cancer_y,
                                                 km.predict(cancer_x))
        acc[k]['cancer']['GMM'] = cluster_acc(cancer_y, gmm.predict(cancer_x))

        adjMI[k]['cancer']['Kmeans'] = ami(cancer_y, km.predict(cancer_x))
        adjMI[k]['cancer']['GMM'] = ami(cancer_y, gmm.predict(cancer_x))

        silhouette[k]['cancer']['Kmeans Silhouette'] = ss(
            cancer_x, km.predict(cancer_x))
        silhouette[k]['cancer']['GMM Silhouette'] = ss(cancer_x,
                                                       gmm.predict(cancer_x))

        completeness[k]['cancer']['Kmeans Completeness'] = cs(
            cancer_y, km.predict(cancer_x))
        completeness[k]['cancer']['GMM Completeness'] = cs(
            cancer_y, gmm.predict(cancer_x))

        homogeniety[k]['cancer']['Kmeans Homogeniety'] = hs(
            cancer_y, km.predict(cancer_x))
        homogeniety[k]['cancer']['GMM Homogeniety'] = hs(
            cancer_y, gmm.predict(cancer_x))

        km.fit(housing_x)
        gmm.fit(housing_x)
        SSE[k]['housing'] = km.score(housing_x)
        ll[k]['housing'] = gmm.score(housing_x)

        acc[k]['housing']['Kmeans'] = cluster_acc(housing_y,
                                                  km.predict(housing_x))
        acc[k]['housing']['GMM'] = cluster_acc(housing_y,
                                               gmm.predict(housing_x))

        adjMI[k]['housing']['Kmeans'] = ami(housing_y, km.predict(housing_x))
        adjMI[k]['housing']['GMM'] = ami(housing_y, gmm.predict(housing_x))

        silhouette[k]['housing']['Kmeans Silhouette'] = ss(
            housing_x, km.predict(housing_x))
        silhouette[k]['housing']['GMM Silhouette'] = ss(
            housing_x, gmm.predict(housing_x))

        completeness[k]['housing']['Kmeans Completeness'] = cs(
            housing_y, km.predict(housing_x))
        completeness[k]['housing']['GMM Completeness'] = cs(
            housing_y, gmm.predict(housing_x))

        homogeniety[k]['housing']['Kmeans Homogeniety'] = hs(
            housing_y, km.predict(housing_x))
        homogeniety[k]['housing']['GMM Homogeniety'] = hs(
            housing_y, gmm.predict(housing_x))

        print(k, clock() - st)
    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    acc = pd.Panel(acc)

    adjMI = pd.Panel(adjMI)

    silhouette = pd.Panel(silhouette)
    completeness = pd.Panel(completeness)
    homogeniety = pd.Panel(homogeniety)

    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'logliklihood.csv')
    acc.ix[:, :, 'housing'].to_csv(out + 'Housing acc.csv')
    acc.ix[:, :, 'cancer'].to_csv(out + 'Cancer acc.csv')

    adjMI.ix[:, :, 'housing'].to_csv(out + 'Housing adjMI.csv')
    adjMI.ix[:, :, 'cancer'].to_csv(out + 'Cancer adjMI.csv')

    silhouette.ix[:, :, 'cancer'].to_csv(out + 'Cancer silhouette.csv')
    completeness.ix[:, :, 'cancer'].to_csv(out + 'Cancer completeness.csv')
    homogeniety.ix[:, :, 'cancer'].to_csv(out + 'Cancer homogeniety.csv')

    silhouette.ix[:, :, 'housing'].to_csv(out + 'housing silhouette.csv')
    completeness.ix[:, :, 'housing'].to_csv(out + 'housing completeness.csv')
    homogeniety.ix[:, :, 'housing'].to_csv(out + 'housing homogeniety.csv')
Example #30
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(digitsX)
    gmm.fit(digitsX)
    SSE[k]['Digits'] = km.score(digitsX)
    ll[k]['Digits'] = gmm.score(digitsX)
    acc[k]['Digits']['Kmeans'] = cluster_acc(digitsY, km.predict(digitsX))
    acc[k]['Digits']['GMM'] = cluster_acc(digitsY, gmm.predict(digitsX))
    adjMI[k]['Digits']['Kmeans'] = ami(digitsY, km.predict(digitsX))
    adjMI[k]['Digits']['GMM'] = ami(digitsY, gmm.predict(digitsX))
    print(k, clock() - st)

SSE = (-pd.DataFrame(SSE)).T
SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
ll = pd.DataFrame(ll).T
ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
acc = pd.Panel(acc)
adjMI = pd.Panel(adjMI)

SSE.to_csv(out + 'SSE.csv')
ll.to_csv(out + 'logliklihood.csv')
acc.ix[:, :, 'Digits'].to_csv(out + 'Digits acc.csv')
adjMI.ix[:, :, 'Digits'].to_csv(out + 'Digits adjMI.csv')
Example #31
    ax1.plot(clusters, [m.bic(dataX) for m in models], color=color)
    ax1.tick_params(axis='y', labelcolor=color)
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:blue'
    ax2.set_ylabel('Log Likelihood',
                   color=color)  # we already handled the x-label with ax1
    ax2.plot(clusters, [m.score(dataX) for m in models], color=color)
    ax2.tick_params(axis='y', labelcolor=color)
    plt.title("Evaluation of E-M Metrics - {}".format(dataset))
    plt.savefig(out + "{}_EM_BIC_LL.png".format(dataset))

    print("Plotting adjusted mutual info...")
    plt.close()
    plt.figure()
    plot_ami = [
        ami(dataY, m.predict(dataX), average_method='arithmetic')
        for m in models
    ]
    plt.plot(clusters, plot_ami)
    plt.xlabel('Clusters')
    plt.ylabel('Adjusted Mutual Information')
    plt.title("Performance of E-M {}".format(dataset))
    plt.savefig(out + "{}_EM_AMI.png".format(dataset))

    print("Validating EM labels....")
    if dataset == 'QSAR':
        k = 5
    else:
        k = 10
    model = GaussianMixture(k, covariance_type='full', random_state=0)
    labels = model.fit_predict(dataX)
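
The plotting code above assumes models is a list of GaussianMixture instances already fitted to dataX, one per entry of clusters; a minimal sketch of that construction (my assumption, the original builds it earlier in the file):

# Assumed construction of the `models` list used by the BIC / log-likelihood plots.
from sklearn.mixture import GaussianMixture

clusters = list(range(2, 21))
models = [GaussianMixture(n_components=k, covariance_type='full',
                          random_state=0).fit(dataX) for k in clusters]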