def computeWithScaling():
    """Cluster five rescaled feature sets and plot/score each of them.

    The variants, in order, are: unit-variance-scaled LSI (first 100 columns
    of the module-level ``svd_matrix``), scaled NMF, log(1 + NMF),
    scaled log(1 + NMF) and log(1 + scaled NMF).  Every variant is projected
    to two components for plotting only; k-means is always fitted on the
    un-projected features.

    Reads the module-level ``svd_matrix``, ``tfidf_matrix`` and ``labels``.
    Returns nothing; output is the printed banners, whatever
    ``hlp.getStats`` reports, and the files ``hlp.plot20Clusters`` is asked
    to write.
    """

    def _plot_and_score(points_2d, model, png_name):
        # Shared tail of every experiment: draw the projection, then report
        # the clustering metrics against the reference labels.
        hlp.plot20Clusters(points_2d, model, png_name)
        hlp.getStats(labels, model.labels_)

    print("-------------- Scaled SVD----------------")
    # with_mean=False: columns are divided by their std but not centred.
    lsi_scaled = preprocessing.scale(svd_matrix[:, 0:100], with_mean=False)
    clusterer = hlp.getKmeans(20)
    projector = hlp.getSVD(2)
    projection = projector.fit_transform(lsi_scaled)
    clusterer.fit(lsi_scaled)
    _plot_and_score(projection, clusterer,
                    "clusters_svd_scaled_20classes.png")

    print("--------------Scaled NMF----------------")
    base_nmf = hlp.getNMF(10)
    topic_weights = base_nmf.fit_transform(tfidf_matrix)
    nmf_scaled = preprocessing.scale(topic_weights, with_mean=False)
    # A fresh k-means estimator is created here and then reused (refitted)
    # by all of the remaining NMF variants below.
    clusterer = hlp.getKmeans(20)
    projector = hlp.getNMF(2)
    projection = projector.fit_transform(nmf_scaled)
    clusterer.fit(nmf_scaled)
    _plot_and_score(projection, clusterer,
                    "clusters_nmf_scaled_20classes.png")

    print("--------------Logarithmic NMF----------------")
    # The rank-10 NMF is refitted for every variant, as in the original.
    topic_weights = base_nmf.fit_transform(tfidf_matrix)
    nmf_logged = np.log(topic_weights + 1)
    projector = hlp.getNMF(2)
    projection = projector.fit_transform(nmf_logged)
    clusterer.fit(nmf_logged)
    _plot_and_score(projection, clusterer,
                    "clusters_nmf_log_20classes.png")

    print("--------------Log scaled NMF----------------")
    topic_weights = base_nmf.fit_transform(tfidf_matrix)
    nmf_logged = np.log(topic_weights + 1)
    logged_then_scaled = preprocessing.scale(nmf_logged, with_mean=False)
    projector = hlp.getNMF(2)
    projection = projector.fit_transform(logged_then_scaled)
    clusterer.fit(logged_then_scaled)
    _plot_and_score(projection, clusterer,
                    "clusters_nmf_log_scaled_20classes.png")

    print("--------------Scaled log NMF----------------")
    topic_weights = base_nmf.fit_transform(tfidf_matrix)
    nmf_scaled = preprocessing.scale(topic_weights, with_mean=False)
    scaled_then_logged = np.log(nmf_scaled + 1)
    projector = hlp.getNMF(2)
    projection = projector.fit_transform(scaled_then_logged)
    clusterer.fit(scaled_then_logged)
    _plot_and_score(projection, clusterer,
                    "clusters_nmf_scaled_log_20classes.png")
def compute4a():
    """Cluster the LSI and NMF features and plot each in two dimensions.

    k-means is fitted on the higher-dimensional features (first 100 columns
    of the module-level ``svd_matrix``, then a rank-10 NMF of the
    module-level ``tfidf_matrix``); a separate two-component reduction is
    computed only to give ``hlp.plot20Clusters`` something to draw.

    Reads the module-level ``svd_matrix``, ``tfidf_matrix`` and ``labels``.
    Returns nothing.
    """
    # Convert the high-dimensional features to 2-D for plotting.
    print("........With LSI........")
    lsi_features = svd_matrix[:, 0:100]
    clusterer = hlp.getKmeans(20)
    lsi_projector = hlp.getSVD(2)
    lsi_points = lsi_projector.fit_transform(lsi_features)
    clusterer.fit(lsi_features)
    hlp.plot20Clusters(lsi_points, clusterer,
                       "clusters_2d_svd_best_20classes.png")
    hlp.getStats(labels, clusterer.labels_)

    print(".........With NMF.......")
    nmf_model = hlp.getNMF(10)
    nmf_features = nmf_model.fit_transform(tfidf_matrix)
    nmf_projector = hlp.getNMF(2)
    nmf_points = nmf_projector.fit_transform(nmf_features)
    # The same k-means estimator is refitted on the NMF features.
    clusterer.fit(nmf_features)
    hlp.plot20Clusters(nmf_points, clusterer,
                       "clusters_2d_nmf_best_20classes.png")
    hlp.getStats(labels, clusterer.labels_)
predicted_ovo = classifier_ovo.predict(test) predicted_ovr = classifier_ovr.predict(test) print "One vs one" hlp.getStats(twenty_test.target, predicted_ovo) print "One vs Rest" hlp.getStats(twenty_test.target, predicted_ovr) for min_df in [2, 5]: print "Calculating for min_df = ", min_df count_vect = hlp.get_CountVectorizer(min_df) tfidf_transformer = TfidfTransformer(sublinear_tf=True) mysvd = TruncatedSVD(n_components=50) mynmf = hlp.getNMF() #for train data vectorizer-> tfidf transformer -> svd/nmf X_train_counts = count_vect.fit_transform(twenty_train.data) X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) X_train_svd = mysvd.fit_transform(X_train_tfidf) X_train_nmf = mynmf.fit_transform(X_train_tfidf) #for test data vectorizer-> tfidf transformer -> svd/nmf X_test_counts = count_vect.transform(twenty_test.data) X_test_tfidf = tfidf_transformer.transform(X_test_counts) X_test_svd = mysvd.transform(X_test_tfidf) X_test_nmf = mynmf.transform(X_test_tfidf) print "################# For SVM ##################" print "****** For SVD *******"
import helper as hlp
import task1 as t1
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

# Script: cluster the data set into two groups, once on SVD features and once
# on NMF features, plotting and scoring each result.
# NOTE(review): the names bound below are module-level globals; keep them
# stable in case other code in this module reads them.

# Load the data and derive the reference labels.
# NOTE(review): classify_into_two_class presumably relabels `dataset` in
# place (its return value is discarded) -- confirm in helper.py.
dataset = hlp.fetch_data()
hlp.classify_into_two_class(dataset)
labels = hlp.fetch_labels(dataset)

# NOTE(review): the meaning of the second argument (3) is defined in
# task1.getTFIDF_matrix -- confirm there.
tfidf_matrix = t1.getTFIDF_matrix(dataset, 3)

# SVD branch: reduce with getSVD(3), fit k-means, then plot and score.
kmeans = hlp.getKmeans(2)
svd = hlp.getSVD(3)
svd_matrix = svd.fit_transform(tfidf_matrix)
kmeans.fit(svd_matrix)
hlp.plotClusters(svd_matrix, kmeans, "clusters_2d_svd_best.png")
hlp.getStats(labels, kmeans.labels_)

# NMF branch: the same k-means estimator is refitted on the getNMF(10) output.
nmf = hlp.getNMF(10)
nmf_matrix = nmf.fit_transform(tfidf_matrix)
kmeans.fit(nmf_matrix)
hlp.plotClusters(nmf_matrix, kmeans, "clusters_2d_nmf_best.png")
hlp.getStats(labels, kmeans.labels_)
def getBestR(tfidf_matrix, num):
    """Sweep the reduction rank and chart clustering scores for LSI and NMF.

    For every rank ``r`` in a fixed list, ``tfidf_matrix`` is reduced with
    ``hlp.getSVD(r)`` and with ``hlp.getNMF(r)``; each reduced matrix is
    clustered with the estimator returned by ``hlp.getKmeans(num)`` and the
    five values returned by ``hlp.getStats`` are recorded.  After the sweep,
    one plot per metric is shown for LSI and then for NMF, with the rank on
    the x axis.

    Args:
        tfidf_matrix: matrix passed unchanged to each reducer's
            ``fit_transform``.
        num: forwarded to ``hlp.getKmeans`` (presumably the number of
            clusters -- confirm in helper.py).

    Returns:
        None.  Results are only printed and plotted.

    Reads the module-level ``labels`` as the reference labelling.
    """
    rank_list = [1, 2, 3, 5, 10, 20, 50, 100, 300]
    # Order matches the tuple returned by hlp.getStats.
    metric_labels = (
        'Homogeneity Score',
        'Completeness Score',
        'V-measure Score',
        'Adjusted rand Score',
        'Adjusted Mutual Info Score',
    )
    svd_scores = [[] for _ in metric_labels]
    nmf_scores = [[] for _ in metric_labels]

    def _cluster_and_record(model, features, score_lists):
        # Fit k-means on `features` and append its five scores.
        model.fit(features)
        # Explicit unpacking keeps the "exactly five values" check: a
        # different arity from getStats raises ValueError instead of being
        # silently truncated.
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(
            labels, model.labels_)
        for bucket, value in zip(
                score_lists, (homo, comp, vscore, adjscore, infoscore)):
            bucket.append(value)

    def _show_curves(score_lists):
        # One figure per metric, shown in the fixed metric order.
        for y_label, scores in zip(metric_labels, score_lists):
            plt.plot(rank_list, scores)
            plt.ylabel(y_label)
            plt.show()

    for r in rank_list:
        # Single-argument print(...) produces the same output under the
        # Python 2 print statement and the Python 3 print function.
        print("................. For r= %s ......................\n" % r)
        svd_features = hlp.getSVD(r).fit_transform(tfidf_matrix)
        nmf_features = hlp.getNMF(r).fit_transform(tfidf_matrix)
        km = hlp.getKmeans(num)

        print("*******With LSI********")
        _cluster_and_record(km, svd_features, svd_scores)
        print("")
        print("*******With NMF********")
        # The same estimator is refitted on the NMF features.
        _cluster_and_record(km, nmf_features, nmf_scores)

    # Plotting happens after the sweep: each score list then has exactly one
    # entry per rank, which plt.plot(rank_list, ...) requires.
    print("*******With LSI********")
    _show_curves(svd_scores)
    print("*******With NMF********")
    _show_curves(nmf_scores)
twenty_train, twenty_test, min_df) tfidfListTrain.append(tfidf_train_matrix) tfidfListTest.append(tfidf_test_matrix) svd = hlp.getSVD() svd_matrix_train = svd.fit_transform(tfidf_train_matrix) svd_matrix_test = svd.transform(tfidf_test_matrix) svdListTrain.append(svd_matrix_train) svdListTest.append(svd_matrix_test) print "Shape of svd matrix for min_df = ", min_df, " is ", svd.components_.shape ############### For NMF ################# nmfModel = hlp.getNMF() W_train = nmfModel.fit_transform(tfidf_train_matrix) W_test = nmfModel.transform(tfidf_test_matrix) nmfListTrain.append(W_train) nmfListTest.append(W_test) nmf = nmfModel.components_ print "Shape of nmf matrix for min_df = ", min_df, " is ", nmf.shape def getsvdListTrain(): return svdListTrain def getnmfListTrain():
def computeBestR():
    """Sweep the reduction rank for 20-way clustering and plot the scores.

    For every rank ``r`` in the module-level ``rank_list``, k-means (from
    ``hlp.getKmeans(20)``) is fitted on the first ``r`` columns of the
    module-level ``svd_matrix`` and on a fresh ``hlp.getNMF(r)`` reduction of
    the module-level ``tfidf_matrix``.  The five values returned by
    ``hlp.getStats`` are appended to the module-level ``*_list_svd`` /
    ``*_list_nmf`` lists, which are then plotted against ``rank_list``.

    Returns:
        None.  The module-level score lists are extended in place, so
        calling this twice without clearing them makes their lengths differ
        from ``rank_list``.
    """
    # Module-level accumulators, in the order hlp.getStats returns its values.
    svd_lists = (homo_list_svd, comp_list_svd, vscore_list_svd,
                 adjscore_list_svd, infoscore_list_svd)
    nmf_lists = (homo_list_nmf, comp_list_nmf, vscore_list_nmf,
                 adjscore_list_nmf, infoscore_list_nmf)
    metric_labels = (
        'Homogeneity Score',
        'Completeness Score',
        'V-measure Score',
        'Adjusted rand Score',
        'Adjusted Mutual Info Score',
    )

    def _cluster_and_record(model, features, score_lists):
        # Fit k-means on `features` and append its five scores.
        model.fit(features)
        # Explicit unpacking keeps the "exactly five values" check.
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(
            labels, model.labels_)
        for bucket, value in zip(
                score_lists, (homo, comp, vscore, adjscore, infoscore)):
            bucket.append(value)

    def _show_curves(score_lists):
        # One figure per metric, shown in the fixed metric order.
        for y_label, scores in zip(metric_labels, score_lists):
            plt.plot(rank_list, scores)
            plt.ylabel(y_label)
            plt.show()

    for r in rank_list:
        # Single-argument print(...) produces the same output under the
        # Python 2 print statement and the Python 3 print function.
        print("................. For r= %s ......................\n" % r)
        # LSI features are a column slice of the precomputed SVD output.
        # NOTE(review): a slice silently yields fewer than r columns when
        # svd_matrix is narrower than r -- confirm it is wide enough.
        svd_features = svd_matrix[:, 0:r]
        nmf_features = hlp.getNMF(r).fit_transform(tfidf_matrix)
        km = hlp.getKmeans(20)

        print("*******With LSI********")
        _cluster_and_record(km, svd_features, svd_lists)
        print("")
        print("*******With NMF********")
        # The same estimator is refitted on the NMF features.
        _cluster_and_record(km, nmf_features, nmf_lists)

    # Plotting happens after the sweep so each list has one entry per rank.
    print(".............With LSI.............")
    _show_curves(svd_lists)
    print("............With NMF.............")
    _show_curves(nmf_lists)