Exemple #1
0
def classifyLR(train, test):
    """Fit the module-level ``classifier`` on ``train`` and report on ``test``.

    The model is trained against ``twenty_train.target``. Its hard
    predictions for ``test`` are passed to ``hlp.getStats`` and the second
    column of its ``predict_proba`` output to ``hlp.plot_roc`` together with
    the label 'Logistic Regression'; both are compared against
    ``twenty_test.target``.

    ``classifier``, ``twenty_train``, ``twenty_test`` and ``hlp`` must exist
    at module level. Nothing is returned.
    """
    classifier.fit(train, twenty_train.target)

    test_predictions = classifier.predict(test)
    test_probabilities = classifier.predict_proba(test)

    hlp.getStats(twenty_test.target, test_predictions)
    # Only column 1 of the probability matrix is plotted.
    hlp.plot_roc(
        twenty_test.target,
        test_probabilities[:, 1],
        'Logistic Regression',
    )
Exemple #2
0
def classifyWithSVC(valC):
    """Train and evaluate a linear SVC on the SVD and NMF feature matrices.

    A single ``svm.SVC`` (linear kernel, ``probability=True``,
    ``random_state=42``) is refitted for each min_df setting (2, then 5) and
    for each feature set (SVD, then NMF). After every fit the test
    predictions are passed to ``hlp.getStats`` and column 1 of
    ``predict_proba`` to ``hlp.plot_roc`` with the label 'SVM'.

    Args:
        valC: value forwarded to ``svm.SVC`` as its ``C`` argument.

    Relies on the module-level names ``svm``, ``td``, ``hlp``,
    ``twenty_train`` and ``twenty_test``. Nothing is returned.
    """
    clf = svm.SVC(C=valC, probability=True, kernel='linear', random_state=42)

    svdListTrain = td.getsvdListTrain()
    nmfListTrain = td.getnmfListTrain()
    svdListTest = td.getsvdListTest()
    nmfListTest = td.getnmfListTest()

    # Entry 0 of every list is used for min_df == 2 and entry 1 for
    # min_df == 5, so the position in this list doubles as the list index.
    for position, min_df in enumerate([2, 5]):
        # A single pre-joined string prints identically under the Python 2
        # print statement and the Python 3 print function.
        print(" ".join([".......... With min_df = ", str(min_df), "..........."]))

        feature_sets = (
            ("With SVD", svdListTrain[position], svdListTest[position]),
            ("With NMF", nmfListTrain[position], nmfListTest[position]),
        )

        for heading, train_matrix, test_matrix in feature_sets:
            print(heading)
            clf.fit(train_matrix, twenty_train.target)
            predicted = clf.predict(test_matrix)
            probabilities = clf.predict_proba(test_matrix)
            hlp.getStats(twenty_test.target, predicted)
            hlp.plot_roc(twenty_test.target, probabilities[:, 1], 'SVM')
Exemple #3
0
def computeWithScaling():
    """Cluster scaled / log-transformed SVD and NMF features into 20 groups.

    Five variants are run in sequence, each announced by a printed banner:

    1. the first 100 columns of ``svd_matrix``, scaled with
       ``preprocessing.scale(..., with_mean=False)``
    2. a 10-component NMF of ``tfidf_matrix``, scaled the same way
    3. ``log(NMF + 1)``
    4. ``log(NMF + 1)`` followed by scaling
    5. scaling followed by ``log(x + 1)``

    For every variant the features are reduced to two components (SVD for
    variant 1, NMF otherwise) for ``hlp.plot20Clusters``, k-means is fitted
    on the unreduced features, and ``hlp.getStats`` is called with
    ``labels`` and the fitted ``kmeans.labels_``.

    Relies on the module-level names ``svd_matrix``, ``tfidf_matrix``,
    ``labels``, ``preprocessing``, ``np`` and ``hlp``. Nothing is returned.
    """

    def _cluster_and_report(features, reducer, kmeans, png_name):
        # Reduce first, then cluster: the same call order in every variant.
        reduced = reducer.fit_transform(features)
        kmeans.fit(features)
        hlp.plot20Clusters(reduced, kmeans, png_name)
        hlp.getStats(labels, kmeans.labels_)

    print("-------------- Scaled SVD----------------")
    svd_top = svd_matrix[:, 0:100]
    scaled_svd = preprocessing.scale(svd_top, with_mean=False)
    kmeans = hlp.getKmeans(20)
    _cluster_and_report(
        scaled_svd, hlp.getSVD(2), kmeans,
        "clusters_svd_scaled_20classes.png")

    print("--------------Scaled NMF----------------")
    nmf_old = hlp.getNMF(10)
    nmf_features = nmf_old.fit_transform(tfidf_matrix)
    scaled_nmf = preprocessing.scale(nmf_features, with_mean=False)
    # This k-means instance is reused (refitted) by every variant below.
    kmeans = hlp.getKmeans(20)
    _cluster_and_report(
        scaled_nmf, hlp.getNMF(2), kmeans,
        "clusters_nmf_scaled_20classes.png")

    # NOTE(review): the 10-component NMF is refitted for each remaining
    # variant. Whether hlp.getNMF is seeded is not visible here, so the refit
    # is kept and not hoisted, to avoid changing the results.
    print("--------------Logarithmic NMF----------------")
    nmf_features = nmf_old.fit_transform(tfidf_matrix)
    logged = np.log(nmf_features + 1)
    _cluster_and_report(
        logged, hlp.getNMF(2), kmeans,
        "clusters_nmf_log_20classes.png")

    print("--------------Log scaled NMF----------------")
    nmf_features = nmf_old.fit_transform(tfidf_matrix)
    logged = np.log(nmf_features + 1)
    logged_then_scaled = preprocessing.scale(logged, with_mean=False)
    _cluster_and_report(
        logged_then_scaled, hlp.getNMF(2), kmeans,
        "clusters_nmf_log_scaled_20classes.png")

    print("--------------Scaled log NMF----------------")
    nmf_features = nmf_old.fit_transform(tfidf_matrix)
    scaled = preprocessing.scale(nmf_features, with_mean=False)
    scaled_then_logged = np.log(scaled + 1)
    _cluster_and_report(
        scaled_then_logged, hlp.getNMF(2), kmeans,
        "clusters_nmf_scaled_log_20classes.png")
Exemple #4
0
def classify(train, test, obj):
    """Compare one-vs-one and one-vs-rest wrappers around ``obj``.

    ``obj`` is wrapped in both ``OneVsOneClassifier`` and
    ``OneVsRestClassifier``. Both wrappers are fitted on ``train`` against
    ``twenty_train.target``, then both predict on ``test``, and each
    prediction is passed to ``hlp.getStats`` with ``twenty_test.target``.

    Args:
        train: training feature matrix.
        test: test feature matrix.
        obj: the estimator handed to both multiclass wrappers.

    Relies on the module-level names ``OneVsOneClassifier``,
    ``OneVsRestClassifier``, ``hlp``, ``twenty_train`` and ``twenty_test``.
    Nothing is returned.
    """
    classifier_ovo = OneVsOneClassifier(obj)
    classifier_ovr = OneVsRestClassifier(obj)

    # Both models are trained before "Testing" is announced.
    for model in (classifier_ovo, classifier_ovr):
        model.fit(train, twenty_train.target)

    # Parenthesised single-string prints behave the same on Python 2 and 3.
    print("Testing")
    predicted_ovo = classifier_ovo.predict(test)
    predicted_ovr = classifier_ovr.predict(test)

    print("One vs one")
    hlp.getStats(twenty_test.target, predicted_ovo)

    print("One vs Rest")
    hlp.getStats(twenty_test.target, predicted_ovr)
Exemple #5
0
def api_stats():
    """Return statistics for a measurement database as a JSON response.

    Reads the ``database`` query parameter, expands a leading ``~``, loads
    the whole ``measurement`` table into a pandas DataFrame and returns
    ``helper.getStats`` of that frame through ``flask.jsonify``.

    Aborts with 400 when the ``database`` parameter is absent and with 404
    when the resolved path is not an existing file.
    """
    from contextlib import closing

    requested = flask.request.args.get('database')
    if requested is None:
        # os.path.expanduser(None) would raise and surface as an opaque 500.
        return flask.abort(400)

    # NOTE(security): the path comes straight from the query string, so any
    # SQLite file readable by the server process can be queried. Confine it
    # to a known directory if this endpoint is reachable by untrusted clients.
    database = os.path.expanduser(requested)
    if not os.path.isfile(database):
        return flask.abort(404)

    # sqlite3's own context manager only commits or rolls back; it does not
    # close the connection. closing() releases the handle on every request.
    with closing(sqlite3.connect(database)) as conn:
        data = pandas.read_sql_query('select * from measurement;', conn)

    stats = helper.getStats(data)
    return flask.jsonify(stats)
Exemple #6
0
def compute4a():
    """Run 20-cluster k-means on LSI and NMF features and plot each in 2-D.

    LSI: the first 100 columns of ``svd_matrix`` are clustered; a
    2-component SVD of the same columns is used only for the plot.
    NMF: a 10-component NMF of ``tfidf_matrix`` is clustered; a 2-component
    NMF of those features is used only for the plot.

    After each fit ``hlp.plot20Clusters`` writes the figure and
    ``hlp.getStats`` is called with ``labels`` and ``kmeans.labels_``.

    Relies on the module-level names ``svd_matrix``, ``tfidf_matrix``,
    ``labels`` and ``hlp``. Nothing is returned.
    """
    # Parenthesised single-string prints behave the same on Python 2 and 3.
    print("........With LSI........")
    lsi_features = svd_matrix[:, 0:100]
    kmeans = hlp.getKmeans(20)
    # Convert the high-dimensional features to 2-D for the plot.
    lsi_reducer = hlp.getSVD(2)
    lsi_2d = lsi_reducer.fit_transform(lsi_features)
    kmeans.fit(lsi_features)
    hlp.plot20Clusters(lsi_2d, kmeans, "clusters_2d_svd_best_20classes.png")
    hlp.getStats(labels, kmeans.labels_)

    print(".........With NMF.......")
    nmf_model = hlp.getNMF(10)
    nmf_features = nmf_model.fit_transform(tfidf_matrix)
    nmf_reducer = hlp.getNMF(2)
    nmf_2d = nmf_reducer.fit_transform(nmf_features)
    # The same k-means instance is refitted on the NMF features.
    kmeans.fit(nmf_features)
    hlp.plot20Clusters(nmf_2d, kmeans, "clusters_2d_nmf_best_20classes.png")
    hlp.getStats(labels, kmeans.labels_)
Exemple #7
0
def getDataPerYear():
    """Return one year's data as CSV and record a correlation for two columns.

    Query parameters (all strings):
        year: inserted into the file name ``Data<year>.csv``; default '1970'.
        attr: base name of the first column; default 'Sex'. 'Immigrant' is
            mapped to 'Native', and '_Ratio' is always appended.
        profiler: name of the second column; default 'PerCapitaIncome'.

    The pair returned by ``helper.getStats(df, attr1, attr2)`` is treated as
    (correlation, p-value): the p-value is printed with a significance
    verdict at the 0.05 level and both values are written to ``stats.txt``.

    Returns:
        The data frame serialised with ``df.to_csv()``.

    Responds with 400 when ``year`` contains a path separator or NUL byte.
    """
    year = request.args.get('year', default='1970', type=str)
    # The value is spliced into a file name, so reject anything that could
    # step outside the data directory.
    if '/' in year or '\\' in year or '\0' in year:
        # NOTE(review): ``request`` is assumed to be Flask's request object
        # (it uses the ``args.get(..., type=...)`` API) - confirm.
        from flask import abort
        abort(400)
    filename = "Data" + year + ".csv"

    df = helper.getDataFrameBasedOnYear(filename)

    attr1 = request.args.get('attr', default='Sex', type=str)
    if attr1 == 'Immigrant':
        attr1 = "Native"
    attr1 += "_Ratio"

    attr2 = request.args.get('profiler', default='PerCapitaIncome', type=str)
    corr, pval = helper.getStats(df, attr1, attr2)

    if pval < 0.05:
        verdict = ". STATISTICALLY SIGNIFICANT."
    else:
        verdict = ". STATISTICALLY INSIGNIFICANT."
    print("P - value : " + str(pval) + verdict)

    helper.writeToFile("stats.txt", corr, pval)
    # NOTE(review): this calls a module-level getStats() defined outside this
    # block; its return value is discarded. Its purpose is not visible here.
    getStats()
    return df.to_csv()
Exemple #8
0
nmfListTest = td.getnmfListTest()

# One MultinomialNB instance is refitted for every feature set below.
classifier = MultinomialNB()

# Entry 0 of each list is used for min_df == 2 and entry 1 for min_df == 5,
# so the position in this list doubles as the list index.
for list_index, min_df in enumerate([2, 5]):
    # A single pre-joined string prints identically under the Python 2 print
    # statement and the Python 3 print function.
    print(" ".join(["WIth min_df = ", str(min_df)]))
    nmf_matrix_train = nmfListTrain[list_index]
    nmf_matrix_test = nmfListTest[list_index]
    tfidf_matrix_train = tfidfListTrain[list_index]
    tfidf_matrix_test = tfidfListTest[list_index]

    # NOTE(review): the banner says "SVD" but the features used here are the
    # TF-IDF matrices. The printed text is left unchanged - confirm which
    # label is intended.
    print(".......... With SVD .........")
    classifier.fit(tfidf_matrix_train, twenty_train.target)
    predicted = classifier.predict(tfidf_matrix_test)
    probabilities = classifier.predict_proba(tfidf_matrix_test)
    hlp.getStats(twenty_test.target, predicted)
    hlp.plot_roc(twenty_test.target, probabilities[:, 1], 'MultinomialNB')

    print(".......... With NMF ..........")
    classifier.fit(nmf_matrix_train, twenty_train.target)
    predicted = classifier.predict(nmf_matrix_test)
    probabilities = classifier.predict_proba(nmf_matrix_test)
    hlp.getStats(twenty_test.target, predicted)
    hlp.plot_roc(twenty_test.target, probabilities[:, 1], 'MultinomialNB')
Exemple #9
0
import helper as hlp
import task1 as t1

# Script: 2-cluster k-means directly on the TF-IDF matrix.
dataset = hlp.fetch_data()
# The return value is discarded, so this presumably relabels ``dataset`` in
# place - verify against helper.classify_into_two_class.
hlp.classify_into_two_class(dataset)
labels = hlp.fetch_labels(dataset)

# NOTE(review): the meaning of the second argument (3) is defined by
# task1.getTFIDF_matrix and is not visible here.
tfidf_matrix = t1.getTFIDF_matrix(dataset, 3)
km = hlp.getKmeans(2)
km.fit(tfidf_matrix)
# Compare the fitted cluster assignments with the fetched labels.
hlp.getStats(labels, km.labels_)
Exemple #10
0
import helper as hlp
import task1 as t1
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

# Script: 2-cluster k-means on SVD-reduced and NMF-reduced TF-IDF features.
dataset = hlp.fetch_data()

# The return value is discarded, so this presumably relabels ``dataset`` in
# place - verify against helper.classify_into_two_class.
hlp.classify_into_two_class(dataset)
labels = hlp.fetch_labels(dataset)

# NOTE(review): the meaning of the second argument (3) is defined by
# task1.getTFIDF_matrix and is not visible here.
tfidf_matrix = t1.getTFIDF_matrix(dataset, 3)
# The same k-means instance is fitted twice below (SVD first, then NMF).
kmeans = hlp.getKmeans(2)

# SVD: reduce with hlp.getSVD(3), cluster, plot, report.
svd = hlp.getSVD(3)
svd_matrix = svd.fit_transform(tfidf_matrix)
kmeans.fit(svd_matrix)
hlp.plotClusters(svd_matrix, kmeans, "clusters_2d_svd_best.png")
hlp.getStats(labels, kmeans.labels_)

# NMF: reduce with hlp.getNMF(10), refit the same k-means, plot, report.
nmf = hlp.getNMF(10)
nmf_matrix = nmf.fit_transform(tfidf_matrix)
kmeans.fit(nmf_matrix)
hlp.plotClusters(nmf_matrix, kmeans, "clusters_2d_nmf_best.png")
hlp.getStats(labels, kmeans.labels_)
Exemple #11
0
def getBestR(tfidf_matrix, num):
    """Sweep the reduction rank r and plot clustering scores for LSI and NMF.

    For every r in [1, 2, 3, 5, 10, 20, 50, 100, 300] the TF-IDF matrix is
    reduced with ``hlp.getSVD(r)`` and with ``hlp.getNMF(r)``. A fresh
    ``hlp.getKmeans(num)`` model is fitted on each reduction in turn, and
    the five values returned by ``hlp.getStats(labels, km.labels_)`` are
    collected. They are unpacked as homogeneity, completeness, V-measure,
    adjusted Rand and adjusted mutual information.

    Afterwards each metric is plotted against the rank list in its own
    figure: all LSI plots first, then all NMF plots.

    Args:
        tfidf_matrix: feature matrix passed to both reducers.
        num: value forwarded to ``hlp.getKmeans``.

    Relies on the module-level names ``hlp``, ``plt`` and ``labels``.
    Nothing is returned.
    """
    rank_list = [1, 2, 3, 5, 10, 20, 50, 100, 300]
    metric_titles = [
        'Homogeneity Score',
        'Completeness Score',
        'V-measure Score',
        'Adjusted rand Score',
        'Adjusted Mutual Info Score',
    ]
    # One score series per metric, in the order hlp.getStats returns them.
    svd_scores = [[] for _ in metric_titles]
    nmf_scores = [[] for _ in metric_titles]

    for r in rank_list:
        # A single pre-joined string prints identically under the Python 2
        # print statement and the Python 3 print function.
        print(" ".join(
            ["................. For r=", str(r), "......................\n"]))
        svd_model = hlp.getSVD(r)
        svd_features = svd_model.fit_transform(tfidf_matrix)

        nmf_model = hlp.getNMF(r)
        nmf_features = nmf_model.fit_transform(tfidf_matrix)

        km = hlp.getKmeans(num)

        print("*******With LSI********")
        km.fit(svd_features)
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(
            labels, km.labels_)
        for series, value in zip(
                svd_scores, (homo, comp, vscore, adjscore, infoscore)):
            series.append(value)
        print("")

        print("*******With NMF********")
        # The same k-means instance is refitted on the NMF features.
        km.fit(nmf_features)
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(
            labels, km.labels_)
        for series, value in zip(
                nmf_scores, (homo, comp, vscore, adjscore, infoscore)):
            series.append(value)

    print("*******With LSI********")
    for title, series in zip(metric_titles, svd_scores):
        plt.plot(rank_list, series)
        plt.ylabel(title)
        plt.show()

    print("*******With NMF********")
    for title, series in zip(metric_titles, nmf_scores):
        plt.plot(rank_list, series)
        plt.ylabel(title)
        plt.show()
Exemple #12
0
def computeBestR():
    """Sweep the reduction rank r over ``rank_list`` and plot 20-cluster scores.

    For every r the first r columns of the module-level ``svd_matrix`` and an
    r-component NMF of ``tfidf_matrix`` are clustered with a fresh
    ``hlp.getKmeans(20)`` model. The five values returned by
    ``hlp.getStats(labels, km.labels_)`` are unpacked as homogeneity,
    completeness, V-measure, adjusted Rand and adjusted mutual information,
    and appended to the module-level ``*_list_svd`` / ``*_list_nmf``
    accumulators.

    Afterwards each accumulator is plotted against ``rank_list`` in its own
    figure: all LSI plots first, then all NMF plots.

    Relies on the module-level names ``rank_list``, ``svd_matrix``,
    ``tfidf_matrix``, ``labels``, ``hlp``, ``plt`` and the ten score lists.
    The lists are appended to, not reset, so calling this twice accumulates
    two sweeps. Nothing is returned.
    """
    for r in rank_list:
        # A single pre-joined string prints identically under the Python 2
        # print statement and the Python 3 print function.
        print(" ".join(
            ["................. For r=", str(r), "......................\n"]))
        svd_features = svd_matrix[:, 0:r]
        nmf_model = hlp.getNMF(r)
        nmf_features = nmf_model.fit_transform(tfidf_matrix)

        km = hlp.getKmeans(20)

        print("*******With LSI********")
        km.fit(svd_features)
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(
            labels, km.labels_)
        homo_list_svd.append(homo)
        comp_list_svd.append(comp)
        vscore_list_svd.append(vscore)
        adjscore_list_svd.append(adjscore)
        infoscore_list_svd.append(infoscore)
        print("")

        print("*******With NMF********")
        # The same k-means instance is refitted on the NMF features.
        km.fit(nmf_features)
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(
            labels, km.labels_)
        homo_list_nmf.append(homo)
        comp_list_nmf.append(comp)
        vscore_list_nmf.append(vscore)
        adjscore_list_nmf.append(adjscore)
        infoscore_list_nmf.append(infoscore)

    metric_titles = (
        'Homogeneity Score',
        'Completeness Score',
        'V-measure Score',
        'Adjusted rand Score',
        'Adjusted Mutual Info Score',
    )
    lsi_series = (homo_list_svd, comp_list_svd, vscore_list_svd,
                  adjscore_list_svd, infoscore_list_svd)
    nmf_series = (homo_list_nmf, comp_list_nmf, vscore_list_nmf,
                  adjscore_list_nmf, infoscore_list_nmf)

    print(".............With LSI.............")
    for title, series in zip(metric_titles, lsi_series):
        plt.plot(rank_list, series)
        plt.ylabel(title)
        plt.show()

    print("............With NMF.............")
    for title, series in zip(metric_titles, nmf_series):
        plt.plot(rank_list, series)
        plt.ylabel(title)
        plt.show()