Example #1
0
def _clusterPlotAndReport(features, reducer, kmeans, filename):
    """Cluster *features*, plot the clusters in 2-D and report the scores.

    features -- matrix that k-means is fitted on.
    reducer  -- unfitted estimator with ``fit_transform``; used only to
                project *features* down for plotting.
    kmeans   -- estimator with ``fit`` and ``labels_``; it is refitted here.
    filename -- file name handed to ``hlp.plot20Clusters``.

    Reads the module-level ``labels`` (ground truth) and ``hlp`` helper.
    """
    projected = reducer.fit_transform(features)
    # Clustering runs on the full feature matrix; the projection above is
    # only used by the plot.
    kmeans.fit(features)
    hlp.plot20Clusters(projected, kmeans, filename)
    hlp.getStats(labels, kmeans.labels_)


def computeWithScaling():
    """Compare 20-cluster k-means under several feature transformations.

    Five variants are clustered, plotted and scored in turn: scaled SVD,
    scaled NMF, log NMF, log-then-scaled NMF and scaled-then-log NMF.

    Relies on module-level names defined elsewhere in the file:
    ``svd_matrix``, ``tfidf_matrix``, ``labels``, ``hlp``, ``preprocessing``
    and ``np``.  Returns nothing; the results are the plots and whatever
    ``hlp.getStats`` reports.
    """
    print("-------------- Scaled SVD----------------")
    # Only the first 100 columns of the precomputed SVD matrix are used.
    svd_old = svd_matrix[:, 0:100]
    # with_mean=False: rescale without centering.
    scaled_svd_matrix = preprocessing.scale(svd_old, with_mean=False)
    kmeans = hlp.getKmeans(20)
    _clusterPlotAndReport(scaled_svd_matrix, hlp.getSVD(2), kmeans,
                          "clusters_svd_scaled_20classes.png")

    # The 10-component NMF factorisation of the TF-IDF matrix is the common
    # starting point of every NMF variant below, so it is computed once
    # instead of being refitted on identical input for each variant.
    nmf_matrix = hlp.getNMF(10).fit_transform(tfidf_matrix)

    print("--------------Scaled NMF----------------")
    scaled_nmf_matrix = preprocessing.scale(nmf_matrix, with_mean=False)
    # A fresh k-means estimator is created here and reused (refitted) by the
    # remaining NMF variants.
    kmeans = hlp.getKmeans(20)
    _clusterPlotAndReport(scaled_nmf_matrix, hlp.getNMF(2), kmeans,
                          "clusters_nmf_scaled_20classes.png")

    print("--------------Logarithmic NMF----------------")
    # log1p(x) == log(1 + x), but stays accurate for values close to zero.
    log_matrix = np.log1p(nmf_matrix)
    _clusterPlotAndReport(log_matrix, hlp.getNMF(2), kmeans,
                          "clusters_nmf_log_20classes.png")

    print("--------------Log scaled NMF----------------")
    nmf_matrix_scaled = preprocessing.scale(log_matrix, with_mean=False)
    _clusterPlotAndReport(nmf_matrix_scaled, hlp.getNMF(2), kmeans,
                          "clusters_nmf_log_scaled_20classes.png")

    print("--------------Scaled log NMF----------------")
    log_scaled_nmf = np.log1p(scaled_nmf_matrix)
    _clusterPlotAndReport(log_scaled_nmf, hlp.getNMF(2), kmeans,
                          "clusters_nmf_scaled_log_20classes.png")
Example #2
0
def compute4a():
    """Cluster into 20 groups with LSI and NMF features and plot them in 2-D.

    For each representation the k-means model is fitted on the
    higher-dimensional features, while a separate 2-component reduction of
    the same features is used only for the plot.

    Relies on module-level ``svd_matrix``, ``tfidf_matrix``, ``labels`` and
    ``hlp`` defined elsewhere in the file.  Returns nothing.
    """
    print("........With LSI........")
    lsi_features = svd_matrix[:, 0:100]
    clusterer = hlp.getKmeans(20)
    lsi_projector = hlp.getSVD(2)
    lsi_2d = lsi_projector.fit_transform(lsi_features)
    clusterer.fit(lsi_features)
    hlp.plot20Clusters(
        lsi_2d, clusterer, "clusters_2d_svd_best_20classes.png")
    hlp.getStats(labels, clusterer.labels_)

    print(".........With NMF.......")
    nmf_model = hlp.getNMF(10)
    nmf_features = nmf_model.fit_transform(tfidf_matrix)
    nmf_projector = hlp.getNMF(2)
    nmf_2d = nmf_projector.fit_transform(nmf_features)
    # The same k-means estimator is refitted on the NMF features.
    clusterer.fit(nmf_features)
    hlp.plot20Clusters(
        nmf_2d, clusterer, "clusters_2d_nmf_best_20classes.png")
    hlp.getStats(labels, clusterer.labels_)
Example #3
0
    predicted_ovo = classifier_ovo.predict(test)
    predicted_ovr = classifier_ovr.predict(test)

    print "One vs one"
    hlp.getStats(twenty_test.target, predicted_ovo)

    print "One vs Rest"
    hlp.getStats(twenty_test.target, predicted_ovr)


# Repeat the whole feature-extraction pipeline once per min_df setting.
for min_df in [2, 5]:
    print "Calculating for min_df = ", min_df
    # Fresh, unfitted transformers for this min_df value.
    count_vect = hlp.get_CountVectorizer(min_df)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    mysvd = TruncatedSVD(n_components=50)
    mynmf = hlp.getNMF()

    # Train data: vectorizer -> tfidf transformer -> svd/nmf.
    # Every stage is fitted here, on the training documents only.
    X_train_counts = count_vect.fit_transform(twenty_train.data)
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_train_svd = mysvd.fit_transform(X_train_tfidf)
    X_train_nmf = mynmf.fit_transform(X_train_tfidf)

    # Test data: vectorizer -> tfidf transformer -> svd/nmf.
    # Only transform() is called, so the test documents reuse the stages
    # fitted on the training data above.
    X_test_counts = count_vect.transform(twenty_test.data)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    X_test_svd = mysvd.transform(X_test_tfidf)
    X_test_nmf = mynmf.transform(X_test_tfidf)

    print "################# For SVM ##################"
    print "****** For SVD *******"
Example #4
0
import helper as hlp
import task1 as t1
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

# Two-cluster k-means on SVD and NMF reductions of the TF-IDF matrix.
dataset = hlp.fetch_data()

# The return value is discarded, so this presumably relabels `dataset` in
# place -- TODO confirm against helper.classify_into_two_class.
hlp.classify_into_two_class(dataset)
labels = hlp.fetch_labels(dataset)

# NOTE(review): the meaning of the second argument (3) is not visible here;
# verify against task1.getTFIDF_matrix.
tfidf_matrix = t1.getTFIDF_matrix(dataset, 3)
kmeans = hlp.getKmeans(2)

# SVD reduction with hlp.getSVD(3), then cluster, plot and score.
# NOTE(review): the output file name says "2d" although getSVD is called
# with 3 -- confirm what plotClusters actually draws.
svd = hlp.getSVD(3)
svd_matrix = svd.fit_transform(tfidf_matrix)
kmeans.fit(svd_matrix)
hlp.plotClusters(svd_matrix, kmeans, "clusters_2d_svd_best.png")
hlp.getStats(labels, kmeans.labels_)

# NMF reduction with hlp.getNMF(10); the same k-means estimator is refitted
# on the NMF features before plotting and scoring.
nmf = hlp.getNMF(10)
nmf_matrix = nmf.fit_transform(tfidf_matrix)
kmeans.fit(nmf_matrix)
hlp.plotClusters(nmf_matrix, kmeans, "clusters_2d_nmf_best.png")
hlp.getStats(labels, kmeans.labels_)
Example #5
0
def getBestR(tfidf_matrix, num):
    """Sweep the reduction rank and plot clustering scores for LSI and NMF.

    For every rank r in a fixed list, *tfidf_matrix* is reduced to r
    components with both ``hlp.getSVD(r)`` and ``hlp.getNMF(r)``, k-means
    with *num* clusters is fitted on each reduction, and the five values
    returned by ``hlp.getStats`` are recorded.  Afterwards one plot per
    score is shown, first for LSI and then for NMF.

    tfidf_matrix -- matrix passed to the ``fit_transform`` of both reducers.
    num          -- number of clusters passed to ``hlp.getKmeans``.

    Reads the module-level ``labels`` (ground truth), ``hlp`` and ``plt``.
    Returns nothing.
    """
    rank_list = [1, 2, 3, 5, 10, 20, 50, 100, 300]

    # Y-axis labels, in the order in which hlp.getStats returns the scores.
    score_names = [
        'Homogeneity Score',
        'Completeness Score',
        'V-measure Score',
        'Adjusted rand Score',
        'Adjusted Mutual Info Score',
    ]
    # One history list per score, parallel to score_names.
    svd_scores = [[] for _ in score_names]
    nmf_scores = [[] for _ in score_names]

    for r in rank_list:
        print("................. For r= {0} ......................\n".format(r))
        svd = hlp.getSVD(r)
        svd_matrix = svd.fit_transform(tfidf_matrix)

        nmf = hlp.getNMF(r)
        nmf_matrix = nmf.fit_transform(tfidf_matrix)

        # A single estimator is fitted twice; the second fit replaces the
        # first, so the LSI scores must be read before the NMF fit.
        km = hlp.getKmeans(num)

        print("*******With LSI********")
        km.fit(svd_matrix)
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(
            labels, km.labels_)
        for history, value in zip(
                svd_scores, (homo, comp, vscore, adjscore, infoscore)):
            history.append(value)
        print("")

        print("*******With NMF********")
        km.fit(nmf_matrix)
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(
            labels, km.labels_)
        for history, value in zip(
                nmf_scores, (homo, comp, vscore, adjscore, infoscore)):
            history.append(value)

    print("*******With LSI********")
    for history, ylabel in zip(svd_scores, score_names):
        plt.plot(rank_list, history)
        plt.ylabel(ylabel)
        plt.show()

    print("*******With NMF********")
    for history, ylabel in zip(nmf_scores, score_names):
        plt.plot(rank_list, history)
        plt.ylabel(ylabel)
        plt.show()
Example #6
0
        twenty_train, twenty_test, min_df)

    tfidfListTrain.append(tfidf_train_matrix)
    tfidfListTest.append(tfidf_test_matrix)

    svd = hlp.getSVD()
    svd_matrix_train = svd.fit_transform(tfidf_train_matrix)
    svd_matrix_test = svd.transform(tfidf_test_matrix)

    svdListTrain.append(svd_matrix_train)
    svdListTest.append(svd_matrix_test)

    print "Shape of svd matrix for min_df = ", min_df, " is ", svd.components_.shape

    ############### For NMF #################
    nmfModel = hlp.getNMF()
    W_train = nmfModel.fit_transform(tfidf_train_matrix)
    W_test = nmfModel.transform(tfidf_test_matrix)

    nmfListTrain.append(W_train)
    nmfListTest.append(W_test)

    nmf = nmfModel.components_
    print "Shape of nmf matrix for min_df = ", min_df, " is ", nmf.shape


def getsvdListTrain():
    """Return the module-level ``svdListTrain`` list.

    The list object itself is returned, not a copy.
    """
    return svdListTrain


def getnmfListTrain():
Example #7
0
def computeBestR():
    """Sweep the reduction rank for 20-cluster k-means and plot every score.

    For each r in ``rank_list`` the LSI features are the first r columns of
    the precomputed ``svd_matrix`` and the NMF features come from a fresh
    ``hlp.getNMF(r)`` fit on ``tfidf_matrix``.  K-means with 20 clusters is
    fitted on each, and the five values returned by ``hlp.getStats`` are
    appended to the matching accumulator lists.  One plot per score is then
    shown, first for LSI and then for NMF.

    Relies on module-level names defined elsewhere in the file:
    ``rank_list``, ``svd_matrix``, ``tfidf_matrix``, ``labels``, ``hlp``,
    ``plt`` and the ten ``*_list_svd`` / ``*_list_nmf`` accumulators, which
    are mutated in place.  Returns nothing.

    NOTE(review): the accumulators are never cleared here, so a second call
    would leave them longer than ``rank_list`` -- confirm this function is
    only run once per session.
    """
    for r in rank_list:
        print("................. For r= {0} ......................\n".format(r))
        svd = svd_matrix[:, 0:r]
        nmf = hlp.getNMF(r)
        nmf_matrix = nmf.fit_transform(tfidf_matrix)

        # A single estimator is fitted twice; the second fit replaces the
        # first, so the LSI scores must be read before the NMF fit.
        km = hlp.getKmeans(20)

        print("*******With LSI********")
        km.fit(svd)
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(
            labels, km.labels_)
        homo_list_svd.append(homo)
        comp_list_svd.append(comp)
        vscore_list_svd.append(vscore)
        adjscore_list_svd.append(adjscore)
        infoscore_list_svd.append(infoscore)
        print("")

        print("*******With NMF********")
        km.fit(nmf_matrix)
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(
            labels, km.labels_)
        homo_list_nmf.append(homo)
        comp_list_nmf.append(comp)
        vscore_list_nmf.append(vscore)
        adjscore_list_nmf.append(adjscore)
        infoscore_list_nmf.append(infoscore)

    print(".............With LSI.............")
    for scores, ylabel in (
            (homo_list_svd, 'Homogeneity Score'),
            (comp_list_svd, 'Completeness Score'),
            (vscore_list_svd, 'V-measure Score'),
            (adjscore_list_svd, 'Adjusted rand Score'),
            (infoscore_list_svd, 'Adjusted Mutual Info Score'),
    ):
        plt.plot(rank_list, scores)
        plt.ylabel(ylabel)
        plt.show()

    print("............With NMF.............")
    for scores, ylabel in (
            (homo_list_nmf, 'Homogeneity Score'),
            (comp_list_nmf, 'Completeness Score'),
            (vscore_list_nmf, 'V-measure Score'),
            (adjscore_list_nmf, 'Adjusted rand Score'),
            (infoscore_list_nmf, 'Adjusted Mutual Info Score'),
    ):
        plt.plot(rank_list, scores)
        plt.ylabel(ylabel)
        plt.show()