def run_km_em(perm_x, perm_y, dname, clstr):
    SSE_km_perm = []
    ll_em_perm = []
    acc_km_perm = []
    acc_em_perm = []
    adjMI_km_perm = []
    adjMI_em_perm = []
    homo_km_perm = []
    homo_em_perm = []
    comp_km_perm = []
    comp_em_perm = []
    silhou_km_perm = []
    bic_em_perm = []
    clk_time = []
    for k in clstr:
        st = clock()
        km = KMeans(n_clusters=k, random_state=10)
        gmm = GMM(n_components=k, random_state=10)
        SSE_km_perm.append(-km.score(perm_x, km.fit_predict(perm_x)))
        ll_em_perm.append(gmm.score(perm_x, gmm.fit_predict(perm_x)))
        acc_km_perm.append(cluster_acc(perm_y, km.fit_predict(perm_x)))
        acc_em_perm.append(cluster_acc(perm_y, gmm.fit_predict(perm_x)))
        adjMI_km_perm.append(ami(perm_y, km.fit_predict(perm_x)))
        adjMI_em_perm.append(ami(perm_y, gmm.fit_predict(perm_x)))
        homo_km_perm.append(metrics.homogeneity_score(perm_y, km.fit_predict(perm_x)))
        homo_em_perm.append(metrics.homogeneity_score(perm_y, gmm.fit_predict(perm_x)))
        comp_km_perm.append(metrics.completeness_score(perm_y, km.fit_predict(perm_x)))
        comp_em_perm.append(metrics.completeness_score(perm_y, gmm.fit_predict(perm_x)))
        silhou_km_perm.append(metrics.silhouette_score(perm_x, km.fit_predict(perm_x)))
        bic_em_perm.append(gmm.bic(perm_x))
        clk_time.append(clock() - st)
        print(k, clock() - st)
    dbcluster = pd.DataFrame({
        'k': clstr,
        'SSE_km': SSE_km_perm,
        'll_em': ll_em_perm,
        'acc_km': acc_km_perm,
        'acc_em': acc_em_perm,
        'adjMI_km': adjMI_km_perm,
        'adjMI_em': adjMI_em_perm,
        'homo_km': homo_km_perm,
        'homo_em': homo_em_perm,
        'comp_km': comp_km_perm,
        'comp_em': comp_em_perm,
        'silhou_km': silhou_km_perm,
        'bic_em': bic_em_perm,
        'clk_time': clk_time
    })
    dbcluster.to_csv('./results/cluster_{}.csv'.format(dname), sep=',')
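# The snippets collected here lean on a shared set of aliases and helpers that are
# imported elsewhere in their repos. A minimal sketch of what they appear to assume
# (an educated guess, not the authors' exact headers):
import numpy as np
import pandas as pd
from collections import defaultdict
try:
    from time import clock          # removed in Python 3.8
except ImportError:
    from time import perf_counter as clock
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.cluster import KMeans as kmeans
from sklearn.mixture import GaussianMixture as GMM
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import (homogeneity_score, completeness_score,
                             silhouette_score, silhouette_samples)
sil_score, sil_samples = silhouette_score, silhouette_samples

# `cluster_acc` is another repo-local helper; a plausible majority-vote implementation:
def cluster_acc(y_true, y_pred):
    """Map each cluster to its most common true label, then report plain accuracy."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    mapped = np.empty_like(y_true)
    for cluster in np.unique(y_pred):
        mask = y_pred == cluster
        values, counts = np.unique(y_true[mask], return_counts=True)
        mapped[mask] = values[np.argmax(counts)]
    return float(np.mean(mapped == y_true))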
def run_adult_analysis(adultX, adultY):
    np.random.seed(0)
    clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20, 25, 30]
    print('Part 1 - Running clustering algorithms on original datasets...adult')
    SSE = defaultdict(dict)
    BIC = defaultdict(dict)
    homo = defaultdict(lambda: defaultdict(dict))
    compl = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)
    st = clock()
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(adultX)
        gmm.fit(adultX)
        SSE[k]['Adult SSE'] = km.score(adultX)
        BIC[k]['Adult BIC'] = gmm.bic(adultX)  # Bayesian information criterion
        # true = np.transpose(adultY)
        # pred = np.transpose(km.predict(adultX))
        homo[k]['Adult']['Kmeans'] = homogeneity_score(adultY, km.predict(adultX))  # agreement of labels
        homo[k]['Adult']['GMM'] = homogeneity_score(adultY, gmm.predict(adultX))
        # A clustering satisfies completeness if all members of a given class are assigned to the same cluster
        compl[k]['Adult']['Kmeans'] = completeness_score(adultY, km.predict(adultX))
        compl[k]['Adult']['GMM'] = completeness_score(adultY, gmm.predict(adultX))
        adjMI[k]['Adult']['Kmeans'] = ami(adultY, km.predict(adultX))  # adjusted mutual information
        adjMI[k]['Adult']['GMM'] = ami(adultY, gmm.predict(adultX))
        print(k, clock() - st)
    SSE = (-pd.DataFrame(SSE)).T
    BIC = pd.DataFrame(BIC).T
    homo = pd.Panel(homo)
    compl = pd.Panel(compl)
    adjMI = pd.Panel(adjMI)
    SSE.to_csv('./P1_Clustering_Algorithms_Non_Transformed/Adult_Cluster_Select_Kmeans.csv')
    BIC.to_csv('./P1_Clustering_Algorithms_Non_Transformed/Adult_Cluster_Select_GMM.csv')
    homo.ix[:, :, 'Adult'].to_csv('./P1_Clustering_Algorithms_Non_Transformed/Adult_homo.csv')
    compl.ix[:, :, 'Adult'].to_csv('./P1_Clustering_Algorithms_Non_Transformed/Adult_compl.csv')
    adjMI.ix[:, :, 'Adult'].to_csv('./P1_Clustering_Algorithms_Non_Transformed/Adult_adjMI.csv')
def performance(encoder, models, K): mean_ami = dict(zip(models.keys(), list(np.zeros(len(models))))) mean_chs = dict(zip(models.keys(), list(np.zeros(len(models))))) mean_sil = dict(zip(models.keys(), list(np.zeros(len(models))))) tic = time.perf_counter() for i in range(K): features_enc = encoder.fit_transform(features, target) for key in models: model = models[key] y_predict = model.fit_predict(features_enc, target) mean_ami[key] += ami(target, y_predict)/K mean_chs[key] += chs(features_enc, y_predict)/K mean_sil[key] += sil(features_enc, y_predict, metric='euclidean')/K toc = time.perf_counter() # Write results to file res = open('../results/'+name_prefix+'_results.txt', 'a') res.write(type(encoder).__name__[0:-7]+' Encoder\n') for key in mean_ami: res.write(' '+key+': '+str(mean_ami[key])+', '+str(mean_chs[key])+', '+str(mean_sil[key])+'\n') res.write('Total time: '+str(round(toc-tic,3))+'\n') res.close() print('Evaluation of', type(encoder).__name__[0:-7], 'Encoder completed in', round(toc-tic,3),'s')
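# Hypothetical usage sketch for performance() above. `features`, `target` and
# `name_prefix` are module-level globals in the original snippet; the encoder and
# model choices below are illustrative assumptions, not the authors' configuration.
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import OrdinalEncoder

models = {
    'kmeans': KMeans(n_clusters=5, random_state=0, n_init=10),
    'gmm': GaussianMixture(n_components=5, random_state=0),
}
# performance(OrdinalEncoder(), models, K=10)   # averages AMI/CH/silhouette over K runs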
def __iteration_summary(self, mcmc_iteration, intra_step_count, temp_assignments):
    print('----------------------------------------------------------------')
    print('iteration: %i' % mcmc_iteration)
    print(time.strftime('%H:%M:%S %d/%m'))
    if self.gt_display is not None:
        gt = cp.deepcopy(self.gt_display)
        gt += np.abs(np.min(gt))
        bgt = np.bincount(gt)
    hist_num_groups = np.zeros(self.cfg.L)
    hist_num_groups[len(self.m[self.m > self.nump * 1E-2]) - 1] += 1
    valG = len(self.m[self.m > self.nump * 5E-4]) - 1
    selG = np.sort(self.m)[::-1]
    candidates = [obs_id for obs_id, n in enumerate(self.m) if n in selG[:valG + 1]]
    # print(candidates)
    print('Number of active clusters: %i' % len(candidates))
    res = sorted([int(itm) for itm in self.m[candidates] if itm > self.nump * 1E-4],
                 reverse=True)
    print(res)
    if self.gt_display is not None:
        print(set(self.gt_display))
        print('GT: %s' % bgt[bgt > 0])
        print('AMI: %f' % ami(self.gt_display, temp_assignments))  # adjusted (not normalized) mutual information
    print('residue: %i' % (self.nump - np.sum(res)))
def run_clustering(out, perm_x, perm_y, housing_x, housing_y): SSE = defaultdict(dict) ll = defaultdict(dict) acc = defaultdict(lambda: defaultdict(dict)) adjMI = defaultdict(lambda: defaultdict(dict)) km = kmeans(random_state=5) gmm = GMM(random_state=5) st = clock() for k in clusters: km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(perm_x) gmm.fit(perm_x) SSE[k]['perm'] = km.score(perm_x) ll[k]['perm'] = gmm.score(perm_x) acc[k]['perm']['Kmeans'] = cluster_acc(perm_y, km.predict(perm_x)) acc[k]['perm']['GMM'] = cluster_acc(perm_y, gmm.predict(perm_x)) adjMI[k]['perm']['Kmeans'] = ami(perm_y, km.predict(perm_x)) adjMI[k]['perm']['GMM'] = ami(perm_y, gmm.predict(perm_x)) km.fit(housing_x) gmm.fit(housing_x) SSE[k]['housing'] = km.score(housing_x) ll[k]['housing'] = gmm.score(housing_x) acc[k]['housing']['Kmeans'] = cluster_acc(housing_y, km.predict(housing_x)) acc[k]['housing']['GMM'] = cluster_acc(housing_y, gmm.predict(housing_x)) adjMI[k]['housing']['Kmeans'] = ami(housing_y, km.predict(housing_x)) adjMI[k]['housing']['GMM'] = ami(housing_y, gmm.predict(housing_x)) print(k, clock() - st) SSE = (-pd.DataFrame(SSE)).T SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True) ll = pd.DataFrame(ll).T ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True) acc = pd.Panel(acc) adjMI = pd.Panel(adjMI) SSE.to_csv(out + 'SSE.csv') ll.to_csv(out + 'logliklihood.csv') acc.ix[:, :, 'housing'].to_csv(out + 'Housing acc.csv') acc.ix[:, :, 'perm'].to_csv(out + 'Perm acc.csv') adjMI.ix[:, :, 'housing'].to_csv(out + 'Housing adjMI.csv') adjMI.ix[:, :, 'perm'].to_csv(out + 'Perm adjMI.csv')
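# pd.Panel and DataFrame.ix (used in run_clustering above and in several later snippets)
# were removed in pandas 0.25 / 1.0. A sketch of an equivalent export path for the nested
# {k: {dataset: {algo: score}}} dicts, using a MultiIndex DataFrame instead (names and
# slicing below are illustrative assumptions):
import pandas as pd

def nested_to_frame(d):
    """Flatten {k: {dataset: {algo: value}}} into a DataFrame indexed by (k, dataset)."""
    frames = {k: pd.DataFrame(per_ds).T for k, per_ds in d.items()}  # rows: dataset, cols: algo
    return pd.concat(frames, names=['k', 'dataset'])

# acc_df = nested_to_frame(acc)
# acc_df.xs('housing', level='dataset').to_csv(out + 'Housing acc.csv')
# acc_df.xs('perm', level='dataset').to_csv(out + 'Perm acc.csv')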
def tensfact_baseline(): n_clusters = 81 f = open('buzz_user_tensor_45.npy') X_buzz = np.load(f) print X_buzz.shape X_buzz = X_buzz[buzz_ground.keys()] buzz_ground1 = buzz_ground.values() km = KMeans(n_clusters=81, init='k-means++', n_init=1, verbose=False) sc = 0.0 sc1 = 0.0 sc2 = 0.0 for i in xrange(10): km.fit(X_buzz) sc += nmi(buzz_ground1, km.labels_) sc1 += ari(buzz_ground1, km.labels_) sc2 += ami(buzz_ground1, km.labels_) print "BUZZ" print "nmi score %f" % (sc / float(10)) print "ari score %f" % (sc1 / float(10)) print "ami score %f" % (sc2 / float(10)) f = open('poli_user_tensor_75.npy') X_poli = np.load(f) print X_poli.shape X_poli = X_poli[poli_ground.keys()] poli_ground1 = poli_ground.values() sc = 0.0 sc1 = 0.0 km1 = KMeans(n_clusters=310, init='k-means++', n_init=1, verbose=False) sc = 0.0 sc1 = 0.0 sc2 = 0.0 for i in xrange(10): km1.fit(X_poli) sc += nmi(poli_ground1, km1.labels_) sc1 += ari(poli_ground1, km1.labels_) sc2 += ami(poli_ground1, km1.labels_) print "poli" print "nmi score %f" % (sc / float(10)) print "ari score %f" % (sc1 / float(10)) print "ami score %f" % (sc2 / float(10))
def comp_clusters_communities(embedding, labels_communities, algo = True, n_clusters = 5): X = StandardScaler().fit_transform(embedding) #rescaling of the data if algo: #choose which algo you want to find communities with db = DBSCAN().fit(X) labels_clusters = db.labels_ else: kM = KMeans(n_clusters = n_clusters).fit(X) labels_clusters = kM.labels_ return ami(labels_clusters, labels_communities) #adjusted mutual information between ground truth and communities discovered by the algorithm
def func(X_train, X_test, y_train, y_test, name, it): km = kmeans(random_state=5) gmm = GMM(random_state=5) km.set_params(n_clusters=it) gmm.set_params(n_components=it) km.fit(X_train) gmm.fit(X_train) if args[0] != 'BASE': file_it(name, 'km', X_train, y_train, km.predict(X_train), it=it) file_it(name, 'gmm', X_train, y_train, gmm.predict(X_train), it=it) SSE[it][name] = km.score(X_train) ll[it][name] = gmm.score(X_train) acc[it][name]['Kmeans'] = cluster_acc(y_test, km.predict(X_test)) acc[it][name]['GMM'] = cluster_acc(y_test, gmm.predict(X_test)) adjMI[it][name]['Kmeans'] = ami(y_train, km.predict(X_train)) adjMI[it][name]['GMM'] = ami(y_train, gmm.predict(X_train)) print(it, clock()-st)
def prune_groups(groups, inverse=False): """ Returns the index of informative levels after the nested_model has been run. It works by looking at level entropy and, moreover, checks if two consecutive levels have the same clustering """ n_groups = groups.shape[1] mi_groups = np.array([ami(groups.iloc[:, x - 1], groups.iloc[:, x]) for x in range(1, n_groups)]) if inverse: return groups.columns[np.where(mi_groups != 1)] return groups.columns[np.where(mi_groups == 1)]
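# Usage sketch for prune_groups (assumes the function above plus the `ami` alias for
# sklearn's adjusted_mutual_info_score). `groups` is hypothetical: one column of
# node-to-group assignments per nested level, so identical consecutive levels score AMI == 1.
import pandas as pd

groups = pd.DataFrame({
    'level_0': [0, 0, 1, 1, 2, 2],
    'level_1': [0, 0, 1, 1, 2, 2],   # identical to level_0
    'level_2': [0, 0, 0, 1, 1, 1],   # genuinely different clustering
})
print(prune_groups(groups))                # columns at positions where consecutive levels have AMI == 1
print(prune_groups(groups, inverse=True))  # columns at positions where consecutive levels differ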
km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(X_train) gmm.fit(X_train) SSE[k]['IncomeInertia'] = (km.inertia_) BIC[k]['IncomeBIC'] = gmm.bic(X_train) hScore[k]['KM'] = homogeneity_score(Y_train, km.predict(X_train)) hScore[k]['GMM'] = homogeneity_score(Y_train, gmm.predict(X_train)) cScore[k]['KM'] = completeness_score(Y_train, km.predict(X_train)) cScore[k]['GMM'] = completeness_score(Y_train, gmm.predict(X_train)) AMI[k]['KM'] = ami(Y_train, km.predict(X_train)) AMI[k]['GMM'] = ami(Y_train, gmm.predict(X_train)) a, b, vm = homogeneity_completeness_v_measure(Y_train, km.predict(X_train)) VMeasure[k]['KM'] = vm a, b, vm = homogeneity_completeness_v_measure(Y_train, gmm.predict(X_train)) VMeasure[k]['GMM'] = vm SSE = (pd.DataFrame(SSE)).T BIC = pd.DataFrame(BIC).T hScore = pd.DataFrame(hScore).T cScore = pd.DataFrame(cScore).T AMI = pd.DataFrame(AMI).T
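# Cross-check for the fragment above: V-measure is the harmonic mean of homogeneity and
# completeness, so VMeasure[k] should equal 2*h*c/(h+c) computed from hScore and cScore.
# A tiny self-contained illustration with made-up labels:
from sklearn.metrics import (homogeneity_score, completeness_score,
                             homogeneity_completeness_v_measure)

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 0, 1, 2, 2, 2]
h = homogeneity_score(y_true, y_pred)
c = completeness_score(y_true, y_pred)
_, _, v = homogeneity_completeness_v_measure(y_true, y_pred)
assert abs(v - 2 * h * c / (h + c)) < 1e-12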
def run_credit_analysis_dim_red():
    algo_name = ['PCA', 'ICA', 'RP', 'RF']
    clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]  # , 13, 14, 16, 18, 20, 25, 30]
    # %% Part 3 - Run k-means and EM clustering algorithms on each dimensionally reduced dataset
    print('Part 3 - Running clustering algorithms on dimensionally reduced datasets...credit')
    for i in range(len(algo_name)):
        # load datasets
        credit = pd.read_hdf('datasets.hdf', 'credit_' + algo_name[i])
        creditX = credit.drop('Class', 1).copy().values
        creditY = credit['Class'].copy().values
        SSE = defaultdict(dict)
        BIC = defaultdict(dict)
        homo = defaultdict(lambda: defaultdict(dict))
        compl = defaultdict(lambda: defaultdict(dict))
        adjMI = defaultdict(lambda: defaultdict(dict))
        km = kmeans(random_state=5)
        gmm = GMM(random_state=5)
        st = clock()
        for k in clusters:
            km.set_params(n_clusters=k)
            gmm.set_params(n_components=k)
            km.fit(creditX)
            gmm.fit(creditX)
            SSE[k]['Credit SSE'] = km.score(creditX)
            BIC[k]['Credit BIC'] = gmm.bic(creditX)
            homo[k]['Credit']['Kmeans'] = homogeneity_score(creditY, km.predict(creditX))
            homo[k]['Credit']['GMM'] = homogeneity_score(creditY, gmm.predict(creditX))
            compl[k]['Credit']['Kmeans'] = completeness_score(creditY, km.predict(creditX))
            compl[k]['Credit']['GMM'] = completeness_score(creditY, gmm.predict(creditX))
            adjMI[k]['Credit']['Kmeans'] = ami(creditY, km.predict(creditX))
            adjMI[k]['Credit']['GMM'] = ami(creditY, gmm.predict(creditX))
            print(k, clock() - st)
        SSE = (-pd.DataFrame(SSE)).T
        BIC = pd.DataFrame(BIC).T
        homo = pd.Panel(homo)
        compl = pd.Panel(compl)
        adjMI = pd.Panel(adjMI)
        SSE.to_csv('./P3_Clustering_Algorithms_Reduced/Credit/Credit_SSE_' + algo_name[i] + '.csv')
        BIC.to_csv('./P3_Clustering_Algorithms_Reduced/Credit/Credit_BIC_' + algo_name[i] + '.csv')
        homo.ix[:, :, 'Credit'].to_csv('./P3_Clustering_Algorithms_Reduced/Credit/credit_' + algo_name[i] + '_homo.csv')
        compl.ix[:, :, 'Credit'].to_csv('./P3_Clustering_Algorithms_Reduced/Credit/credit_' + algo_name[i] + '_compl.csv')
        adjMI.ix[:, :, 'Credit'].to_csv('./P3_Clustering_Algorithms_Reduced/Credit/credit_' + algo_name[i] + '_adjMI.csv')
loans_km_acc = [] loans_gmm_acc = [] loans_km_score = [] loans_gmm_score = [] loans_km_ami = [] loans_gmm_ami = [] loans_km_silhouette = [] loans_gmm_silhouette = [] for k in clusters: km.set_params(n_clusters=k) km.fit(loansX_pca) loans_km_acc.append(cluster_acc(loans_Y, km.predict(loansX_pca))) loans_km_score.append(km.score(loansX_pca)) loans_km_ami.append(ami(loans_Y, km.predict(loansX_pca))) loans_km_silhouette.append( silhouette_score(loansX_pca, km.predict(loansX_pca))) gmm.set_params(n_components=k) gmm.fit(loansX_pca) loans_gmm_acc.append(cluster_acc(loans_Y, gmm.predict(loansX_pca))) loans_gmm_score.append(gmm.score(loansX_pca)) loans_gmm_ami.append(ami(loans_Y, gmm.predict(loansX_pca))) loans_gmm_silhouette.append( silhouette_score(loansX_pca, gmm.predict(loansX_pca))) loans_df= pd.DataFrame({'Kmeans acc': loans_km_acc, 'GMM acc': loans_gmm_acc,\ 'Kmeans score': loans_km_score, 'GMM score': loans_gmm_score,\ 'Kmeans ami': loans_km_ami, 'GMM ami': loans_gmm_ami,\ 'km avg silhouette': loans_km_silhouette, 'GMM avg silhouette':loans_gmm_silhouette },\
acc = defaultdict(lambda: defaultdict(dict)) adjMI = defaultdict(lambda: defaultdict(dict)) km = kmeans(random_state=5) gmm = GMM(random_state=5) st = clock() for k in clusters: km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(contraX) gmm.fit(contraX) SSE[k]['contra'] = km.score(contraX) ll[k]['contra'] = gmm.score(contraX) acc[k]['contra']['Kmeans'] = cluster_acc(contraY, km.predict(contraX)) acc[k]['contra']['GMM'] = cluster_acc(contraY, gmm.predict(contraX)) adjMI[k]['contra']['Kmeans'] = ami(contraY, km.predict(contraX)) adjMI[k]['contra']['GMM'] = ami(contraY, gmm.predict(contraX)) km.fit(cancerX) gmm.fit(cancerX) SSE[k]['cancer'] = km.score(cancerX) ll[k]['cancer'] = gmm.score(cancerX) acc[k]['cancer']['Kmeans'] = cluster_acc(cancerY, km.predict(cancerX)) acc[k]['cancer']['GMM'] = cluster_acc(cancerY, gmm.predict(cancerX)) adjMI[k]['cancer']['Kmeans'] = ami(cancerY, km.predict(cancerX)) adjMI[k]['cancer']['GMM'] = ami(cancerY, gmm.predict(cancerX)) print(k, clock() - st) ## Keith Mertan: Adding cluster outputs for best parameters and saving at the end of the file ## Cancer data first
km = kmeans(random_state=5)
gmm = GMM(random_state=5)
st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(spamX)
    gmm.fit(spamX)
    SSE[k]['spam SSE'] = km.score(spamX)
    BIC[k]['spam BIC'] = gmm.bic(spamX)
    homo[k]['spam']['Kmeans'] = homogeneity_score(spamY, km.predict(spamX))
    homo[k]['spam']['GMM'] = homogeneity_score(spamY, gmm.predict(spamX))
    compl[k]['spam']['Kmeans'] = completeness_score(spamY, km.predict(spamX))
    compl[k]['spam']['GMM'] = completeness_score(spamY, gmm.predict(spamX))
    adjMI[k]['spam']['Kmeans'] = ami(spamY, km.predict(spamX))
    adjMI[k]['spam']['GMM'] = ami(spamY, gmm.predict(spamX))
    km.fit(letterX)
    gmm.fit(letterX)
    SSE[k]['letter'] = km.score(letterX)
    BIC[k]['letter BIC'] = gmm.bic(letterX)
    homo[k]['letter']['Kmeans'] = homogeneity_score(letterY, km.predict(letterX))
    homo[k]['letter']['GMM'] = homogeneity_score(letterY, gmm.predict(letterX))
    compl[k]['letter']['Kmeans'] = completeness_score(letterY, km.predict(letterX))
    compl[k]['letter']['GMM'] = completeness_score(letterY, gmm.predict(letterX))
    adjMI[k]['letter']['Kmeans'] = ami(letterY, km.predict(letterX))
    adjMI[k]['letter']['GMM'] = ami(letterY, gmm.predict(letterX))
silh = defaultdict(lambda: defaultdict(dict)) km = kmeans(random_state=5) gmm = GMM(random_state=5) st = clock() for k in clusters: km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(wineX) gmm.fit(wineX) SSE[k]['Wine'] = km.score(wineX) ll[k]['Wine'] = gmm.score(wineX) acc[k]['Wine']['Kmeans'] = cluster_acc(wineY.ravel(), km.predict(wineX)) acc[k]['Wine']['GMM'] = cluster_acc(wineY.ravel(), gmm.predict(wineX)) adjMI[k]['Wine']['Kmeans'] = ami(wineY.ravel(), km.predict(wineX)) adjMI[k]['Wine']['GMM'] = ami(wineY.ravel(), gmm.predict(wineX)) adjRI[k]['Wine']['Kmeans'] = ari(wineY.ravel(), km.predict(wineX)) adjRI[k]['Wine']['GMM'] = ari(wineY.ravel(), gmm.predict(wineX)) bic[k]['Wine']['Kmeans'] = -compute_bic(km, wineX) bic[k]['Wine']['GMM'] = gmm.bic(wineX) silh[k]['Wine']['Kmeans'] = silhouette_score(wineX, km.predict(wineX)) silh[k]['Wine']['GMM'] = silhouette_score(wineX, gmm.predict(wineX)) km.fit(digitX) gmm.fit(digitX) SSE[k]['Digit'] = km.score(digitX) ll[k]['Digit'] = gmm.score(digitX) acc[k]['Digit']['Kmeans'] = cluster_acc(digitY.ravel(), km.predict(digitX)) acc[k]['Digit']['GMM'] = cluster_acc(digitY.ravel(),
acc = defaultdict(lambda: defaultdict(dict)) adjMI = defaultdict(lambda: defaultdict(dict)) km = kmeans(random_state=5) gmm = GMM(random_state=5) st = clock() for k in clusters: km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(madelonX) gmm.fit(madelonX) SSE[k]['Madelon'] = km.score(madelonX) ll[k]['Madelon'] = gmm.score(madelonX) acc[k]['Madelon']['Kmeans'] = cluster_acc(madelonY, km.predict(madelonX)) acc[k]['Madelon']['GMM'] = cluster_acc(madelonY, gmm.predict(madelonX)) adjMI[k]['Madelon']['Kmeans'] = ami(madelonY, km.predict(madelonX)) adjMI[k]['Madelon']['GMM'] = ami(madelonY, gmm.predict(madelonX)) km.fit(digitsX) gmm.fit(digitsX) SSE[k]['Digits'] = km.score(digitsX) ll[k]['Digits'] = gmm.score(digitsX) acc[k]['Digits']['Kmeans'] = cluster_acc(digitsY, km.predict(digitsX)) acc[k]['Digits']['GMM'] = cluster_acc(digitsY, gmm.predict(digitsX)) adjMI[k]['Digits']['Kmeans'] = ami(digitsY, km.predict(digitsX)) adjMI[k]['Digits']['GMM'] = ami(digitsY, gmm.predict(digitsX)) print(k, clock() - st) SSE = (-pd.DataFrame(SSE)).T SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True) ll = pd.DataFrame(ll).T
def tensfact_baseline():
    G_buzz, N_buzz, C_buzz, G_poli, N_poli, C_poli = parse_graphs()
    n_news1 = N_buzz.shape[0]
    n_news2 = N_poli.shape[0]
    y_buzz = [0] * n_news1
    y_poli = [0] * n_news2
    y_buzz = np.array(y_buzz)
    y_poli = np.array(y_poli)
    y_buzz[91:] = 1
    y_poli[120:] = 1
    n_clusters = 81
    if not os.path.isfile('tensor_buzz.npy'):
        T = np.zeros((N_buzz.shape[0], G_buzz.shape[0], C_buzz.shape[1]))
        n_users = G_buzz.shape[0]
        n_news = N_buzz.shape[0]
        n_comm = C_buzz.shape[1]
        for i in xrange(n_news):
            for j in xrange(n_users):
                for k in xrange(n_comm):
                    T[i, j, k] = N_buzz[i, j] * C_buzz[j, k]
        np.save('tensor_buzz.npy', T)
    else:
        f = open('tensor_buzz.npy')
        T_buzz = np.load(f)
        print T_buzz.shape
        print "Buzz tensor loaded"
    #T = dtensor(T_buzz)
    #print T.shape
    #factors = parafac(T_buzz, rank=25, init='random')
    #T_buzz = tl.tensor(T_buzz)
    # Best so far [50, 100, 5]
    core, factors = tucker(T_buzz, ranks=[45, 100, 5])
    print core.shape
    print factors[0].shape
    print factors[1].shape
    #P, fit, itr, exectimes = cp_als(T, 35, init='random')
    #P, F, D, A, fit, itr, exectimes = parafac2.parafac2(T, 10, init=42)
    # Extracting news embeddings
    #X_buzz = T_buzz
    X_buzz = factors[1]
    #X_buzz = P.U[0]
    F = open('buzz_lsi.npy', 'r')
    buzz_lsi = np.load(F)
    #X_buzz = np.hstack((X_buzz, buzz_lsi))
    print X_buzz.shape
    #scaler = MinMaxScaler()
    #X_buzz = preprocessing.scale(X_buzz)
    #X_buzz = scaler.fit_transform(X_buzz)
    #assert np.where(np.isnan(X_buzz) == True)[0].shape[0] == 0
    #X_buzz = X_buzz[buzz_ground.keys()]
    buzz_ground1 = buzz_ground.values()
    km = KMeans(n_clusters=81, init='k-means++', n_init=1, verbose=False)
    print "Buzzfeed dataset's feat. extracted"
    #print X_buzz.shape
    #X_buzz, y_buzz = shuffle(X_buzz, y_buzz, random_state=42)
    sc = 0.0
    sc1 = 0.0
    sc2 = 0.0
    for i in xrange(10):
        km.fit(X_buzz)
        sc += nmi(buzz_ground1, km.labels_)
        sc1 += ari(buzz_ground1, km.labels_)
        sc2 += ami(buzz_ground1, km.labels_)
    print "BUZZ"
    print "nmi score %f" % (sc / float(10))
    print "ari score %f" % (sc1 / float(10))
    print "ami score %f" % (sc2 / float(10))
    if not os.path.isfile('tensor_poli.npy'):
        T = np.zeros((N_poli.shape[0], G_poli.shape[0], C_poli.shape[1]))
        n_users = G_poli.shape[0]
        n_news = N_poli.shape[0]
        n_comm = C_poli.shape[1]
        for i in xrange(n_news):
            for j in xrange(n_users):
                for k in xrange(n_comm):
                    T[i, j, k] = N_poli[i, j] * C_poli[j, k]
        np.save('tensor_poli.npy', T)
    else:
        f = open('tensor_poli.npy')
        T_poli = np.load(f)
        print T_poli.shape
        print "Politifact tensor loaded"
    T = dtensor(T_poli)
    #factors = parafac(T_poli, rank=50)
    #P, fit, itr, exectimes = cp_als(T, 35, init='random')
    # Best so far: [50, 100, 5]
    T_poli = tl.tensor(T_poli)
    core, factors = tucker(T_poli, ranks=[45, 100, 5])
    #print " Fit value, Itr and Exectimes are:"
    #print fit
    #print itr
    #print exectimes
    # Extracting news embeddings
    X_poli = factors[1]
    #X_poli = P.U[0]
    F = open('poli_lsi.npy', 'r')
    poli_lsi = np.load(F)
    #X_poli = X_poli[poli_ground.keys()]
    #X_poli = np.hstack((X_poli, poli_lsi))
    print X_poli.shape
    #X_poli = preprocessing.scale(X_poli)
    #X_poli = scaler.fit_transform(X_poli)
    assert np.where(np.isnan(X_poli) == True)[0].shape[0] == 0
    print X_poli.shape
    print "Politifact news feats. extracted"
    poli_ground1 = poli_ground.values()
    km = KMeans(n_clusters=310, init='k-means++', n_init=1, verbose=False)
    print "Politifact dataset's feat. extracted"
    #print X_poli.shape
    #X_poli, y_poli = shuffle(X_poli, y_poli, random_state=42)
    sc = 0.0
    sc1 = 0.0
    sc2 = 0.0
    for i in xrange(10):
        km.fit(X_poli)
        sc += nmi(poli_ground1, km.labels_)
        sc1 += ari(poli_ground1, km.labels_)
        sc2 += ami(poli_ground1, km.labels_)
    print "POLI"
    print "nmi score %f" % (sc / float(10))
    print "ari score %f" % (sc1 / float(10))
    print "ami score %f" % (sc2 / float(10))
km = kmeans(random_state=5) gmm = GMM(random_state=5) st = time.time() print(len(clusters)) for k in clusters: km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(perm_x) gmm.fit(perm_x) SSE[k]['perm'] = km.score(perm_x) ll[k]['perm'] = gmm.score(perm_x) acc[k]['perm']['Kmeans'] = cluster_acc(perm_y, km.predict(perm_x)) acc[k]['perm']['GMM'] = cluster_acc(perm_y, gmm.predict(perm_x)) adjMI[k]['perm']['Kmeans'] = ami(perm_y, km.predict(perm_x)) adjMI[k]['perm']['GMM'] = ami(perm_y, gmm.predict(perm_x)) for k in clusters: km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(housing_x) gmm.fit(housing_x) SSE[k]['housing'] = km.score(housing_x) ll[k]['housing'] = gmm.score(housing_x) acc[k]['housing']['Kmeans'] = cluster_acc(housing_y, km.predict(housing_x)) acc[k]['housing']['GMM'] = cluster_acc(housing_y, gmm.predict(housing_x)) adjMI[k]['housing']['Kmeans'] = ami(housing_y, km.predict(housing_x)) adjMI[k]['housing']['GMM'] = ami(housing_y, gmm.predict(housing_x))
poli_featvec = poli_featvec[poli_ground.keys()] buzz_ground = buzz_ground.values() poli_ground = poli_ground.values() km = KMeans(n_clusters=81, n_init=1) km1 = KMeans(n_clusters=310, n_init=1) sc = 0.0 sc1 = 0.0 sc2 = 0.0 for i in xrange(10): km.fit(buzz_featvec) sc += nmi(buzz_ground, km.labels_) sc1 += ari(buzz_ground, km.labels_) sc2 += ami(buzz_ground, km.labels_) print "BUZZ" print "nmi score %f" % (sc / float(10)) print "ari score %f" % (sc1 / float(10)) print "ami score %f" % (sc2 / float(10)) sc = 0.0 sc1 = 0.0 sc2 = 0.0 for i in xrange(10): km1.fit(poli_featvec) sc += nmi(poli_ground, km1.labels_) sc1 += ari(poli_ground, km1.labels_) sc2 += ami(poli_ground, km1.labels_)
def __do_perform(self, custom_out=None, main_experiment=None):  # './output/ICA/clustering/{}', ICAExperiment
    if custom_out is not None:
        # if not os.path.exists(custom_out):
        #     os.makedirs(custom_out)
        self._old_out = self._out  # './output/ICA/{}'
        self._out = custom_out     # './output/ICA/clustering/{}'
    elif self._old_out is not None:
        self._out = self._old_out
    if main_experiment is not None:
        self.log("Performing {} as part of {}".format(
            self.experiment_name(), main_experiment.experiment_name()))  # 'clustering', 'ICA'
    else:
        self.log("Performing {}".format(self.experiment_name()))

    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
    # %% Data for 1-3
    sse = defaultdict(list)
    ll = defaultdict(list)
    bic = defaultdict(list)
    sil = defaultdict(lambda: defaultdict(list))
    sil_s = np.empty(shape=(2 * len(self._clusters) * self._details.ds.training_x.shape[0], 4),
                     dtype='<U21')
    acc = defaultdict(lambda: defaultdict(float))
    adj_mi = defaultdict(lambda: defaultdict(float))
    km = kmeans(random_state=self._details.seed)
    gmm = GMM(random_state=self._details.seed)
    st = clock()
    j = 0
    for k in self._clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(self._details.ds.training_x)   # cluster the ICA-transformed input features using kMeans with varying k
        gmm.fit(self._details.ds.training_x)  # cluster the ICA-transformed input features using GMM with varying k
        km_labels = km.predict(self._details.ds.training_x)  # give each ICA-transformed instance a cluster label
        gmm_labels = gmm.predict(self._details.ds.training_x)
        sil[k]['Kmeans'] = sil_score(self._details.ds.training_x, km_labels)  # mean silhouette score over all instances
        sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)
        km_sil_samples = sil_samples(self._details.ds.training_x, km_labels)  # per-instance silhouette scores
        gmm_sil_samples = sil_samples(self._details.ds.training_x, gmm_labels)
        # There has got to be a better way to do this, but I can't brain right now
        for i, x in enumerate(km_sil_samples):
            # record the silhouette score x for each instance i given its label km_labels[i] by kMeans with value k
            sil_s[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
            j += 1
        for i, x in enumerate(gmm_sil_samples):
            sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
            j += 1
        sse[k] = [km.score(self._details.ds.training_x)]  # score = opposite of the value of X on the k-Means objective (what is the objective???)
        ll[k] = [gmm.score(self._details.ds.training_x)]  # per-sample average log-likelihood
        bic[k] = [gmm.bic(self._details.ds.training_x)]   # Bayesian information criterion (review ???) on the input X
        acc[k]['Kmeans'] = cluster_acc(self._details.ds.training_y, km_labels)  # accuracy against the original y-labels if each cluster predicted its majority y-label
        acc[k]['GMM'] = cluster_acc(self._details.ds.training_y, gmm_labels)
        adj_mi[k]['Kmeans'] = ami(self._details.ds.training_y, km_labels)  # adjusted mutual information between the true labels and the cluster labels (how well does clustering match truth)
        adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)
        self.log("Cluster: {}, time: {}".format(k, clock() - st))

    sse = (-pd.DataFrame(sse)).T
    sse.index.name = 'k'
    sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)]  # Bank sse (left)
    ll = pd.DataFrame(ll).T
    ll.index.name = 'k'
    ll.columns = ['{} log-likelihood'.format(self._details.ds_readable_name)]  # Bank log-likelihood
    bic = pd.DataFrame(bic).T
    bic.index.name = 'k'
    bic.columns = ['{} BIC'.format(self._details.ds_readable_name)]  # Bank BIC
    sil = pd.DataFrame(sil).T
    sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score', 'label']).set_index('k')  # .T
    # sil_s = sil_s.T
    acc = pd.DataFrame(acc).T
    adj_mi = pd.DataFrame(adj_mi).T
    sil.index.name = 'k'
    sil_s.index.name = 'k'
    acc.index.name = 'k'
    adj_mi.index.name = 'k'

    # write scores to files
    sse.to_csv(self._out.format('{}_sse.csv'.format(self._details.ds_name)))
    ll.to_csv(self._out.format('{}_logliklihood.csv'.format(self._details.ds_name)))
    bic.to_csv(self._out.format('{}_bic.csv'.format(self._details.ds_name)))
    sil.to_csv(self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
    sil_s.to_csv(self._out.format('{}_sil_samples.csv'.format(self._details.ds_name)))
    acc.to_csv(self._out.format('{}_acc.csv'.format(self._details.ds_name)))
    adj_mi.to_csv(self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

    # %% NN fit data (2,3)
    # train a NN on clustered data
    grid = {'km__n_clusters': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=self._details.seed)
    km = kmeans(random_state=self._details.seed, n_jobs=self._details.threads)
    # run a NN on the clustered data (only on the cluster labels, or input features + cluster labels???)
    pipe = Pipeline([('km', km), ('NN', mlp)], memory=experiments.pipeline_memory)
    gs, _ = self.gs_with_best_estimator(pipe, grid, type='kmeans')  # write the best NN to file
    self.log("KMeans Grid search complete")
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(self._out.format('{}_cluster_kmeans.csv'.format(self._details.ds_name)))  # write grid search results --> bank_cluster_kmeans.csv

    grid = {'gmm__n_components': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=self._details.seed)
    gmm = CustomGMM(random_state=self._details.seed)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)], memory=experiments.pipeline_memory)
    gs, _ = self.gs_with_best_estimator(pipe, grid, type='gmm')  # write the best NN to file
    self.log("GMM search complete")
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(self._out.format('{}_cluster_GMM.csv'.format(self._details.ds_name)))  # write grid search results --> bank_cluster_GMM.csv

    # %% For chart 4/5
    # perform TSNE D.R on training data (why???)
    self._details.ds.training_x2D = TSNE(verbose=10, random_state=self._details.seed).fit_transform(
        self._details.ds.training_x)
    ds_2d = pd.DataFrame(np.hstack((self._details.ds.training_x2D,
                                    np.atleast_2d(self._details.ds.training_y).T)),
                         columns=['x', 'y', 'target'])  # NN-learnable data: TSNE-reduced input features + label
    ds_2d.to_csv(self._out.format('{}_2D.csv'.format(self._details.ds_name)))  # --> bank_2D.csv
    self.log("Done")
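# Re: the "(what is the objective???)" note above -- KMeans.score(X) returns the
# negative of the k-means objective (inertia: the sum of squared distances to the
# nearest centroid), which is why these snippets negate it to report SSE; GMM.bic(X)
# is -2 * total log-likelihood + n_params * log(n_samples). A small sanity check:
import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(100, 4)
km = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X)
assert np.isclose(-km.score(X), km.inertia_)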
X = np.append(X, noise, axis=1)
X = normalize(X)
# Y = SelfOrganizingSwarm(iterations=250, alpha=1, beta=0.9, delta=0.001, theta=3).fit_transform(X)
# Y = PCA(2).fit_transform(X)
# Y = TSNE().fit_transform(X)
Y = GSOM().fit_transform(X, lr=1.0, beta=0.5, sf=0.6, wd=0.175, fd=0.8)  # X, lr=1.0, beta=0.0, sf=0.01, fd=0.75, wd=0.5
# fig = plt.figure()
# ax = Axes3D(fig)
# ax.scatter(X.T[0], X.T[1], X.T[2], c=color, alpha=0.5, edgecolors='none')
# plt.show()
plt.subplot(211)
# ax = fig.add_subplot(211)
plt.scatter(Y.T[0], Y.T[1], s=15, c=plt.cm.jet(color / (n_clusters * 1.0)),
            edgecolors='none', alpha=0.375)
labs = KMeans(n_clusters).fit(Y).labels_
plt.subplot(212)
plt.scatter(Y.T[0], Y.T[1], s=15, c=plt.cm.jet(labs / (n_clusters * 1.0)),
            edgecolors='none', alpha=0.375)
print 'ars ', ars(color, labs)
print 'ami ', ami(color, labs)
# Y = Isomap().fit_transform(X)
# ax2 = fig.add_subplot(121)
# ax2.scatter(Y.T[0], Y.T[1], c=color, edgecolors='none', alpha=0.5)
plt.show()
sil[k]['Kmeans'] = sil_score(dataX, km_labels) sil[k]['GMM'] = sil_score(dataX, gmm_labels) km_sil_samples = sil_samples(dataX, km_labels) gmm_sil_samples = sil_samples(dataX, gmm_labels) for i, x in enumerate(km_sil_samples): sil_samp[j] = [k, 'Kmeans', round(x, 6), km_labels[i]] j += 1 for i, x in enumerate(gmm_sil_samples): sil_samp[j] = [k, 'GMM', round(x, 6), gmm_labels[i]] j += 1 sse[k] = km.score(dataX) ll[k] = gmm.score(dataX) bic[k] = gmm.bic(dataX) acc[k]['Kmeans'] = cluster_acc(dataY,km.predict(dataX)) acc[k]['GMM'] = cluster_acc(dataY,gmm.predict(dataX)) adj_mi[k]['Kmeans'] = ami(dataY,km.predict(dataX)) adj_mi[k]['GMM'] = ami(dataY,gmm.predict(dataX)) gmm_clusters = pd.DataFrame() kmeans_clusters = pd.DataFrame() for i in clusters: gmm_clusters[i] = labels[i]['GMM'] kmeans_clusters[i] = labels[i]['Kmeans'] bic = pd.DataFrame(bic, index=[0]).T bic.index.name = 'k' bic.rename(columns= {bic.columns[0]: 'BIC'}, inplace=True)
#Sum of Squared Errors for K-means SSE[k]['Faults'] = km.score(faultsX) #Log-Likelihood for GMM ll[k]['Faults'] = gmm.score(faultsX) #Silhouette Score #The best value is 1 and the worst value is -1. Silhouette analysis can be used to study the separation distance between the resulting clusters. SS[k]['Faults']['Kmeans'] = ss(faultsX, km.predict(faultsX)) SS[k]['Faults']['GMM'] = ss(faultsX, gmm.predict(faultsX)) #Cluster Accuracy acc[k]['Faults']['Kmeans'] = cluster_acc(faultsY, km.predict(faultsX)) acc[k]['Faults']['GMM'] = cluster_acc(faultsY, gmm.predict(faultsX)) #Adjusted Mutual Information adjMI[k]['Faults']['Kmeans'] = ami(faultsY, km.predict(faultsX)) adjMI[k]['Faults']['GMM'] = ami(faultsY, gmm.predict(faultsX)) #Breast Cancer dataset km.fit(bcX) gmm.fit(bcX) SSE[k]['BreastC'] = km.score(bcX) ll[k]['BreastC'] = gmm.score(bcX) SS[k]['BreastC']['Kmeans'] = ss(bcX, km.predict(bcX)) SS[k]['BreastC']['GMM'] = ss(bcX, gmm.predict(bcX)) acc[k]['BreastC']['Kmeans'] = cluster_acc(bcY, km.predict(bcX)) acc[k]['BreastC']['GMM'] = cluster_acc(bcY, gmm.predict(bcX)) adjMI[k]['BreastC']['Kmeans'] = ami(bcY, km.predict(bcX)) adjMI[k]['BreastC']['GMM'] = ami(bcY, gmm.predict(bcX)) print(k, clock() - st)
def clustering_experiment(X, y, name, clusters, rdir): """Generate results CSVs for given datasets using the K-Means and EM clustering algorithms. Args: X (Numpy.Array): Attributes. y (Numpy.Array): Labels. name (str): Dataset name. clusters (list[int]): List of k values. rdir (str): Output directory. """ sse = defaultdict(dict) # sum of squared errors logl = defaultdict(dict) # log-likelihood bic = defaultdict(dict) # BIC for EM silhouette = defaultdict(dict) # silhouette score acc = defaultdict(lambda: defaultdict(dict)) # accuracy scores adjmi = defaultdict(lambda: defaultdict(dict)) # adjusted mutual info km = KMeans(random_state=0) # K-Means gmm = GMM(random_state=0) # Gaussian Mixture Model (EM) # start loop for given values of k for k in clusters: km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(X) gmm.fit(X) # calculate SSE, log-likelihood, accuracy, and adjusted mutual info sse[k][name] = km.score(X) logl[k][name] = gmm.score(X) acc[k][name]['km'] = cluster_acc(y, km.predict(X)) acc[k][name]['gmm'] = cluster_acc(y, gmm.predict(X)) adjmi[k][name]['km'] = ami(y, km.predict(X)) adjmi[k][name]['gmm'] = ami(y, gmm.predict(X)) # calculate silhouette score for K-Means km_silhouette = silhouette_score(X, km.predict(X)) silhouette[k][name] = km_silhouette # calculate BIC for EM bic[k][name] = gmm.bic(X) # generate output dataframes sse = (-pd.DataFrame(sse)).T sse.rename(columns={name: 'sse'}, inplace=True) logl = pd.DataFrame(logl).T logl.rename(columns={name: 'log-likelihood'}, inplace=True) bic = pd.DataFrame(bic).T bic.rename(columns={name: 'bic'}, inplace=True) silhouette = pd.DataFrame(silhouette).T silhouette.rename(columns={name: 'silhouette_score'}, inplace=True) acc = pd.Panel(acc) acc = acc.loc[:, :, name].T.rename(lambda x: '{}_acc'.format(x), axis='columns') adjmi = pd.Panel(adjmi) adjmi = adjmi.loc[:, :, name].T.rename(lambda x: '{}_adjmi'.format(x), axis='columns') # concatenate all results dfs = (sse, silhouette, logl, bic, acc, adjmi) metrics = pd.concat(dfs, axis=1) resfile = get_abspath('{}_metrics.csv'.format(name), rdir) metrics.to_csv(resfile, index_label='k')
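# Usage sketch for clustering_experiment above (it relies on pd.Panel, so this assumes
# pandas < 0.25, plus the cluster_acc / ami / get_abspath helpers from its repo):
from sklearn.datasets import load_digits

digits = load_digits()
clustering_experiment(digits.data, digits.target, name='digits',
                      clusters=list(range(2, 21)), rdir='results/clustering')
# -> writes results/clustering/digits_metrics.csv with one row per k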
g = gt.load_graph_from_csv(G.graph['edgelist'], directed=isDirected, csv_options={ "delimiter": " ", "quotechar": '"' }) block = gt.minimize_nested_blockmodel_dl( g, B_min=G.graph['number_communities'], B_max=G.graph['number_communities']) num_block = block.levels[0].get_B() block = block.levels[0].get_blocks() partition = [0 for i in range(G.number_of_nodes())] for i in range(G.number_of_nodes()): #for every node partition[i] = block[i] zsbm.append(ami(partition, G.graph['labels_communities'])) igraph = ig.Read_Edgelist(G.graph['edgelist']) part = igraph.community_infomap() partition = [0 for i in range(G.number_of_nodes())] for i in range(G.number_of_nodes()): for j in range(len(part)): if i in part[j]: partition[i] = j zinfomap.append(ami(partition, G.graph['labels_communities'])) Y = community.best_partition(G.to_undirected( )) #https://perso.crans.org/aynaud/communities/api.html #uses Louvain heuristices partition = [0 for i in range(G.number_of_nodes())] for k in range(G.number_of_nodes()):
gmm = GMM(random_state=5) st = clock() abaloneX2D = TSNE(verbose=10, random_state=5).fit_transform(abaloneX) digitsX2D = TSNE(verbose=10, random_state=5).fit_transform(digitsX) for k in clusters: km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(abaloneX) gmm.fit(abaloneX) SSE[k]['abalone'] = km.score(abaloneX) ll[k]['abalone'] = gmm.score(abaloneX) acc[k]['abalone']['Kmeans'] = cluster_acc(abaloneY, km.predict(abaloneX)) acc[k]['abalone']['GMM'] = cluster_acc(abaloneY, gmm.predict(abaloneX)) adjMI[k]['abalone']['Kmeans'] = ami(abaloneY, km.predict(abaloneX)) adjMI[k]['abalone']['GMM'] = ami(abaloneY, gmm.predict(abaloneX)) silhouette[k]['abalone']['Kmeans'] = silhouette_score(abaloneX, km.labels_, metric='euclidean') silhouette[k]['abalone']['GMM'] = silhouette_score(abaloneX, gmm.predict(abaloneX), metric='euclidean') abalone2D = pd.DataFrame(np.hstack( (abaloneX2D, np.atleast_2d(km.predict(abaloneX)).T)), columns=['x', 'y', 'target']) abalone2D.to_csv(out + 'abalone2D_km_{}.csv'.format(k)) abalone2D = pd.DataFrame(np.hstack( (abaloneX2D, np.atleast_2d(gmm.predict(abaloneX)).T)), columns=['x', 'y', 'target'])
gmm.set_params(n_components=k) km.fit(X_train) gmm.fit(X_train) #km.score = Opposite of the value of X on the K-means objective. # =Sum of distances of samples to their closest cluster center SSE[k]['Diamond'] = km.score(X_train) ll[k]['Diamond'] = gmm.score(X_train) aic[k]['Diamond'] = gmm.aic(X_train) bic[k]['Diamond'] = gmm.bic(X_train) #training accuracy acc_[k]['Diamond']['Kmeans'] = cluster_acc(Y_test, km.predict(X_test)) acc_[k]['Diamond']['GMM'] = cluster_acc(Y_test, gmm.predict(X_test)) #mutual information score adjMI[k]['Diamond']['Kmeans'] = ami(Y_test, km.predict(X_test)) adjMI[k]['Diamond']['GMM'] = ami(Y_test, gmm.predict(X_test)) km.fit(X_train2) gmm.fit(X_train2) SSE[k]['CreditCard'] = km.score(X_train2) ll[k]['CreditCard'] = gmm.score(X_train2) aic[k]['CreditCard'] = gmm.aic(X_train2) bic[k]['CreditCard'] = gmm.bic(X_train2) acc_[k]['CreditCard']['Kmeans'] = cluster_acc(Y_test2, km.predict(X_test2)) acc_[k]['CreditCard']['GMM'] = cluster_acc(Y_test2, gmm.predict(X_test2)) adjMI[k]['CreditCard']['Kmeans'] = ami(Y_test2, km.predict(X_test2)) adjMI[k]['CreditCard']['GMM'] = ami(Y_test2, gmm.predict(X_test2)) print('cluster: ', k, 'Wall clock time', clock() - st)
def __do_perform(self, custom_out=None, main_experiment=None): if custom_out is not None: # if not os.path.exists(custom_out): # os.makedirs(custom_out) self._old_out = self._out self._out = custom_out elif self._old_out is not None: self._out = self._old_out if main_experiment is not None: self.log("Performing {} as part of {}".format(self.experiment_name(), main_experiment.experiment_name())) else: self.log("Performing {}".format(self.experiment_name())) # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py # %% Data for 1-3 sse = defaultdict(list) ll = defaultdict(list) bic = defaultdict(list) sil = defaultdict(lambda: defaultdict(list)) sil_s = np.empty(shape=(2*len(self._clusters)*self._details.ds.training_x.shape[0],4), dtype='<U21') acc = defaultdict(lambda: defaultdict(float)) adj_mi = defaultdict(lambda: defaultdict(float)) km = kmeans(random_state=self._details.seed) gmm = GMM(random_state=self._details.seed) st = clock() j = 0 for k in self._clusters: km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(self._details.ds.training_x) gmm.fit(self._details.ds.training_x) km_labels = km.predict(self._details.ds.training_x) gmm_labels = gmm.predict(self._details.ds.training_x) sil[k]['Kmeans'] = sil_score(self._details.ds.training_x, km_labels) sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels) km_sil_samples = sil_samples(self._details.ds.training_x, km_labels) gmm_sil_samples = sil_samples(self._details.ds.training_x, gmm_labels) # There has got to be a better way to do this, but I can't brain right now for i, x in enumerate(km_sil_samples): sil_s[j] = [k, 'Kmeans', round(x, 6), km_labels[i]] j += 1 for i, x in enumerate(gmm_sil_samples): sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]] j += 1 sse[k] = [km.score(self._details.ds.training_x)] ll[k] = [gmm.score(self._details.ds.training_x)] bic[k] = [gmm.bic(self._details.ds.training_x)] acc[k]['Kmeans'] = cluster_acc(self._details.ds.training_y, km_labels) acc[k]['GMM'] = cluster_acc(self._details.ds.training_y, gmm_labels) adj_mi[k]['Kmeans'] = ami(self._details.ds.training_y, km_labels) adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels) self.log("Cluster: {}, time: {}".format(k, clock() - st)) sse = (-pd.DataFrame(sse)).T sse.index.name = 'k' sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)] ll = pd.DataFrame(ll).T ll.index.name = 'k' ll.columns = ['{} log-likelihood'.format(self._details.ds_readable_name)] bic = pd.DataFrame(bic).T bic.index.name = 'k' bic.columns = ['{} BIC'.format(self._details.ds_readable_name)] sil = pd.DataFrame(sil).T sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score', 'label']).set_index('k') #.T # sil_s = sil_s.T acc = pd.DataFrame(acc).T adj_mi = pd.DataFrame(adj_mi).T sil.index.name = 'k' sil_s.index.name = 'k' acc.index.name = 'k' adj_mi.index.name = 'k' sse.to_csv(self._out.format('{}_sse.csv'.format(self._details.ds_name))) ll.to_csv(self._out.format('{}_logliklihood.csv'.format(self._details.ds_name))) bic.to_csv(self._out.format('{}_bic.csv'.format(self._details.ds_name))) sil.to_csv(self._out.format('{}_sil_score.csv'.format(self._details.ds_name))) sil_s.to_csv(self._out.format('{}_sil_samples.csv'.format(self._details.ds_name))) acc.to_csv(self._out.format('{}_acc.csv'.format(self._details.ds_name))) adj_mi.to_csv(self._out.format('{}_adj_mi.csv'.format(self._details.ds_name))) # %% NN fit data (2,3) grid = {'km__n_clusters': self._clusters, 'NN__alpha': self._nn_reg, 
'NN__hidden_layer_sizes': self._nn_arch} mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=self._details.seed) km = kmeans(random_state=self._details.seed, n_jobs=self._details.threads) pipe = Pipeline([('km', km), ('NN', mlp)], memory=experiments.pipeline_memory) gs, _ = self.gs_with_best_estimator(pipe, grid, type='kmeans') self.log("KMmeans Grid search complete") tmp = pd.DataFrame(gs.cv_results_) tmp.to_csv(self._out.format('{}_cluster_kmeans.csv'.format(self._details.ds_name))) grid = {'gmm__n_components': self._clusters, 'NN__alpha': self._nn_reg, 'NN__hidden_layer_sizes': self._nn_arch} mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=self._details.seed) gmm = CustomGMM(random_state=self._details.seed) pipe = Pipeline([('gmm', gmm), ('NN', mlp)], memory=experiments.pipeline_memory) gs, _ = self.gs_with_best_estimator(pipe, grid, type='gmm') self.log("GMM search complete") tmp = pd.DataFrame(gs.cv_results_) tmp.to_csv(self._out.format('{}_cluster_GMM.csv'.format(self._details.ds_name))) # %% For chart 4/5 self._details.ds.training_x2D = TSNE(verbose=10, random_state=self._details.seed).fit_transform( self._details.ds.training_x ) ds_2d = pd.DataFrame(np.hstack((self._details.ds.training_x2D, np.atleast_2d(self._details.ds.training_y).T)), columns=['x', 'y', 'target']) ds_2d.to_csv(self._out.format('{}_2D.csv'.format(self._details.ds_name))) self.log("Done")
def run_clustering(out, cancer_x, cancer_y, housing_x, housing_y): SSE = defaultdict(dict) ll = defaultdict(dict) acc = defaultdict(lambda: defaultdict(dict)) adjMI = defaultdict(lambda: defaultdict(dict)) km = kmeans(random_state=5) gmm = GMM(random_state=5) silhouette = defaultdict(lambda: defaultdict(dict)) completeness = defaultdict(lambda: defaultdict(dict)) homogeniety = defaultdict(lambda: defaultdict(dict)) st = clock() for k in range(2, 20, 1): km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(cancer_x) gmm.fit(cancer_x) SSE[k]['cancer'] = km.score(cancer_x) ll[k]['cancer'] = gmm.score(cancer_x) acc[k]['cancer']['Kmeans'] = cluster_acc(cancer_y, km.predict(cancer_x)) acc[k]['cancer']['GMM'] = cluster_acc(cancer_y, gmm.predict(cancer_x)) adjMI[k]['cancer']['Kmeans'] = ami(cancer_y, km.predict(cancer_x)) adjMI[k]['cancer']['GMM'] = ami(cancer_y, gmm.predict(cancer_x)) silhouette[k]['cancer']['Kmeans Silhouette'] = ss( cancer_x, km.predict(cancer_x)) silhouette[k]['cancer']['GMM Silhouette'] = ss(cancer_x, gmm.predict(cancer_x)) completeness[k]['cancer']['Kmeans Completeness'] = cs( cancer_y, km.predict(cancer_x)) completeness[k]['cancer']['GMM Completeness'] = cs( cancer_y, gmm.predict(cancer_x)) homogeniety[k]['cancer']['Kmeans Homogeniety'] = hs( cancer_y, km.predict(cancer_x)) homogeniety[k]['cancer']['GMM Homogeniety'] = hs( cancer_y, gmm.predict(cancer_x)) km.fit(housing_x) gmm.fit(housing_x) SSE[k]['housing'] = km.score(housing_x) ll[k]['housing'] = gmm.score(housing_x) acc[k]['housing']['Kmeans'] = cluster_acc(housing_y, km.predict(housing_x)) acc[k]['housing']['GMM'] = cluster_acc(housing_y, gmm.predict(housing_x)) adjMI[k]['housing']['Kmeans'] = ami(housing_y, km.predict(housing_x)) adjMI[k]['housing']['GMM'] = ami(housing_y, gmm.predict(housing_x)) silhouette[k]['housing']['Kmeans Silhouette'] = ss( housing_x, km.predict(housing_x)) silhouette[k]['housing']['GMM Silhouette'] = ss( housing_x, gmm.predict(housing_x)) completeness[k]['housing']['Kmeans Completeness'] = cs( housing_y, km.predict(housing_x)) completeness[k]['housing']['GMM Completeness'] = cs( housing_y, gmm.predict(housing_x)) homogeniety[k]['housing']['Kmeans Homogeniety'] = hs( housing_y, km.predict(housing_x)) homogeniety[k]['housing']['GMM Homogeniety'] = hs( housing_y, gmm.predict(housing_x)) print(k, clock() - st) SSE = (-pd.DataFrame(SSE)).T SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True) ll = pd.DataFrame(ll).T ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True) acc = pd.Panel(acc) adjMI = pd.Panel(adjMI) silhouette = pd.Panel(silhouette) completeness = pd.Panel(completeness) homogeniety = pd.Panel(homogeniety) SSE.to_csv(out + 'SSE.csv') ll.to_csv(out + 'logliklihood.csv') acc.ix[:, :, 'housing'].to_csv(out + 'Housing acc.csv') acc.ix[:, :, 'cancer'].to_csv(out + 'Perm acc.csv') adjMI.ix[:, :, 'housing'].to_csv(out + 'Housing adjMI.csv') adjMI.ix[:, :, 'cancer'].to_csv(out + 'Perm adjMI.csv') silhouette.ix[:, :, 'cancer'].to_csv(out + 'Perm silhouette.csv') completeness.ix[:, :, 'cancer'].to_csv(out + 'Perm completeness.csv') homogeniety.ix[:, :, 'cancer'].to_csv(out + 'Perm homogeniety.csv') silhouette.ix[:, :, 'housing'].to_csv(out + 'housing silhouette.csv') completeness.ix[:, :, 'housing'].to_csv(out + 'housing completeness.csv') homogeniety.ix[:, :, 'housing'].to_csv(out + 'housing homogeniety.csv')
acc = defaultdict(lambda: defaultdict(dict)) adjMI = defaultdict(lambda: defaultdict(dict)) km = kmeans(random_state=5) gmm = GMM(random_state=5) st = clock() for k in clusters: km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(digitsX) gmm.fit(digitsX) SSE[k]['Digits'] = km.score(digitsX) ll[k]['Digits'] = gmm.score(digitsX) acc[k]['Digits']['Kmeans'] = cluster_acc(digitsY, km.predict(digitsX)) acc[k]['Digits']['GMM'] = cluster_acc(digitsY, gmm.predict(digitsX)) adjMI[k]['Digits']['Kmeans'] = ami(digitsY, km.predict(digitsX)) adjMI[k]['Digits']['GMM'] = ami(digitsY, gmm.predict(digitsX)) print(k, clock() - st) SSE = (-pd.DataFrame(SSE)).T SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True) ll = pd.DataFrame(ll).T ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True) acc = pd.Panel(acc) adjMI = pd.Panel(adjMI) SSE.to_csv(out + 'SSE.csv') ll.to_csv(out + 'logliklihood.csv') acc.ix[:, :, 'Digits'].to_csv(out + 'Digits acc.csv') adjMI.ix[:, :, 'Digits'].to_csv(out + 'Digits adjMI.csv')
ax1.plot(clusters, [m.bic(dataX) for m in models], color=color) ax1.tick_params(axis='y', labelcolor=color) ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis color = 'tab:blue' ax2.set_ylabel('Log Likelihood', color=color) # we already handled the x-label with ax1 ax2.plot(clusters, [m.score(dataX) for m in models], color=color) ax2.tick_params(axis='y', labelcolor=color) plt.title("Evaluation of E-M Metrics - {}".format(dataset)) plt.savefig(out + "{}_EM_BIC_LL.png".format(dataset)) print("Plotting adjusted mutual info...") plt.close() plt.figure() plot_ami = [ ami(dataY, m.predict(dataX), average_method='arithmetic') for m in models ] plt.plot(clusters, plot_ami) plt.xlabel('Clusters') plt.ylabel('Adjusted Mutual Information') plt.title("Performance of E-M {}".format(dataset)) plt.savefig(out + "{}_EM_AMI.png".format(dataset)) print("Validating EM labels....") if dataset == 'QSAR': k = 5 else: k = 10 model = GaussianMixture(k, covariance_type='full', random_state=0) labels = model.fit_predict(dataX)
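# A sketch of how the `models` list plotted above might be built, and how the
# BIC-minimizing k would be read off (dataX and clusters are assumed from the
# surrounding script; lower BIC is better):
import numpy as np
from sklearn.mixture import GaussianMixture

models = [GaussianMixture(n_components=k, covariance_type='full', random_state=0).fit(dataX)
          for k in clusters]
best_k = clusters[int(np.argmin([m.bic(dataX) for m in models]))]
print("k with lowest BIC:", best_k)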