def getDataSets(normalize=False, selected=False):
    """Load the stored train/test feature matrices and value vectors.

    Each ``X`` matrix is the ``featureArray`` file with the matching
    ``socialVector`` file appended through ``np.hstack``; each ``y`` is the
    ``valueVector`` file.  File names are assembled from the module-level
    ``feature_set_path`` and ``tag``.

    ``normalize`` and ``selected`` are accepted but never read in this body.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    def _load(kind, suffix):
        # Every file is named <feature_set_path><kind><tag><suffix>.
        return load_numpy_matrix(feature_set_path + kind + tag + suffix)

    train_suffix = '_train.npy'
    test_suffix = '_test.npy'

    targets_train = _load(r'valueVector', train_suffix)
    targets_test = _load(r'valueVector', test_suffix)

    # The feature array comes first, the social vector is appended after it.
    features_train = np.hstack((_load(r'featureArray', train_suffix),
                                _load(r'socialVector', train_suffix)))
    features_test = np.hstack((_load(r'featureArray', test_suffix),
                               _load(r'socialVector', test_suffix)))

    return features_train, features_test, targets_train, targets_test
def getDataSets(normalize=False, selected=False):
    """Read the train and test sets for the current ``tag`` from disk.

    For each split the ``featureArray`` matrix and the ``socialVector``
    matrix are loaded and joined with ``np.hstack`` to form ``X``; the
    ``valueVector`` file supplies ``y``.  Paths are built from the
    module-level ``feature_set_path`` and ``tag``.

    The ``normalize`` and ``selected`` arguments are not used by this body.

    Returns:
        ``(X_train, X_test, y_train, y_test)``
    """
    endings = ('_train.npy', '_test.npy')
    base = feature_set_path

    # Value vectors are read first, train before test.
    y_by_split = []
    for ending in endings:
        y_by_split.append(
            load_numpy_matrix(base + r'valueVector' + tag + ending))

    # Then, per split, the feature array followed by the social vector.
    X_by_split = []
    for ending in endings:
        base_features = load_numpy_matrix(
            base + r'featureArray' + tag + ending)
        social = load_numpy_matrix(base + r'socialVector' + tag + ending)
        X_by_split.append(np.hstack((base_features, social)))

    X_train, X_test = X_by_split
    y_train, y_test = y_by_split
    return X_train, X_test, y_train, y_test
Exemple #3
0
                           verbose=1,
                           n_jobs=2).fit(Xn, yn)

        print("Best parameters set found on development set:")
        print
        print(clf.best_estimator_)
        print(clf.best_score_)
        print()
        print confusion_matrix(yn, clf.predict(Xn))


# Script entry point: the body runs only when this file is executed directly.
# NOTE(review): this excerpt may continue beyond the lines shown here.
if __name__ == '__main__':

    # manual_tests, tag, perc and feature_set_path are defined outside this
    # excerpt.
    if manual_tests:
        # Placeholder value; Xn is replaced by the np.hstack result below.
        Xn = csr_matrix(np.array((0, 0)))
        # Contents of the valueVector training file; passed as the second
        # argument to fit_transform below.
        yn = load_numpy_matrix(feature_set_path + 'valueVector' + tag +
                               '_train.npy')
        # Presumably collections.Counter: prints how often each value occurs.
        print Counter(yn)

        # Marker string rather than a real path; how it is consumed is not
        # visible in this excerpt.
        filepath = 'MANUAL'
        # Print the shapes of the two matrices that are stacked next.
        print load_numpy_matrix(feature_set_path + 'featureArray' + tag +
                                '_train.npy').shape
        print load_numpy_matrix(feature_set_path + 'socialVector' + tag +
                                '_train.npy').shape
        # Join the featureArray and socialVector matrices horizontally.
        # Both files are loaded a second time here.
        Xn = np.hstack((load_numpy_matrix(feature_set_path + 'featureArray' +
                                          tag + '_train.npy'),
                        load_numpy_matrix(feature_set_path + 'socialVector' +
                                          tag + '_train.npy')))

        # Presumably sklearn's SelectPercentile: keeps the best-scoring
        # `perc` percent of the columns according to f_classif.
        Xn = SelectPercentile(score_func=f_classif,
                              percentile=perc).fit_transform(Xn, yn)
from sklearn.linear_model.coordinate_descent import Lasso, ElasticNet
from sklearn.metrics.regression import mean_squared_error
from sklearn.preprocessing.data import normalize, Normalizer
from sklearn.svm.classes import SVR

from RatingPrediction.main import normalize_sets_sparse
from config import feature_set_path
import numpy as np
from sklearn.metrics.metrics import r2_score

import matplotlib.pyplot as plt

# Suffix inserted into every data file name loaded below.
tag='_slashdot'


# Build the training matrix: featureArray with socialVector appended
# horizontally.  sd and sd2 remain bound as module-level names.
X_train = load_numpy_matrix(feature_set_path +  r'featureArray'+tag+'_train.npy')
sd = load_numpy_matrix(feature_set_path +  r'socialVector'+tag+'_train.npy')
X_train =  np.hstack((X_train,sd))
# Same construction for the test matrix.
X_test = load_numpy_matrix(feature_set_path +  r'featureArray'+tag+'_test.npy')
sd2 = load_numpy_matrix(feature_set_path +  r'socialVector'+tag+'_test.npy')
X_test =  np.hstack((X_test,sd2))
# The bare string below is never executed as code; it holds a disabled
# alternative that would load the binaryWordData .npz files instead.
'''

X_train = load_sparse_csr(feature_set_path +  r'binaryWordData'+tag+'_train.npz')  
X_test = load_sparse_csr(feature_set_path +  r'binaryWordData'+tag+'_test.npz')  
'''
            
# Contents of the valueVector files for the train and test splits.
y_train = load_numpy_matrix(feature_set_path +  r'valueVector'+tag+'_train.npy')
y_test = load_numpy_matrix(feature_set_path +  r'valueVector'+tag+'_test.npy')

Exemple #5
0
from sklearn.metrics.metrics import classification_report, f1_score
from sklearn.svm.classes import SVC, LinearSVC

from RatingPrediction.main import runClassificationTest
from config import feature_set_path
import numpy as np
from sklearn.multiclass import OneVsRestClassifier

# Module-level settings.  Neither perc nor scale is read in the lines shown
# here; presumably perc is a feature-selection percentile -- TODO confirm.
perc = 50
scale = 1
# Dataset suffix; the commented-out lines are the alternative datasets.
#tag = "_toy"
#tag = '_main'
tag = '_slashdot'

# Placeholder value for Xn; it is not reassigned in the lines shown here.
Xn = csr_matrix(np.array((0, 0)))
# Contents of the valueVector training file for the selected tag.
yn = load_numpy_matrix(feature_set_path + 'valueVector' + tag + '_train.npy')
#yn = load_numpy_matrix(feature_set_path+ 'sentenceValueVector.npy')[:,valueV]
# The bare string below is never executed as code; it holds a disabled
# variant that sets filepath to 'MANUAL' and builds Xn from the featureArray
# and socialVector files.
'''
filepath = 'MANUAL'
print load_numpy_matrix(feature_set_path+ 'featureArray' + tag + '_train.npy').shape
print load_numpy_matrix(feature_set_path+ 'socialVector' + tag + '_train.npy').shape
Xn = np.hstack((load_numpy_matrix(feature_set_path+ 'featureArray' + tag + '_train.npy'),load_numpy_matrix(feature_set_path+ 'socialVector' + tag + '_train.npy') ))
'''
# Exactly one filepath assignment is active (tfidfWordData); the others are
# kept commented out as alternative word-data files.
#filepath = feature_set_path+ 'binaryWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'freqWordData' + tag + '_train.npz'
filepath = feature_set_path + 'tfidfWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'bigramBinaryWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'bigramTfidfWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'trigramBinaryWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'trigramTfidfWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'quadgramBinaryWordData' + tag + '_train.npz'
Exemple #6
0
from sklearn.linear_model import LinearRegression
from sklearn.linear_model.coordinate_descent import Lasso, ElasticNet
from sklearn.metrics.regression import mean_squared_error
from sklearn.preprocessing.data import normalize, Normalizer
from sklearn.svm.classes import SVR

from RatingPrediction.main import normalize_sets_sparse
from config import feature_set_path
import numpy as np
from sklearn.metrics.metrics import r2_score

import matplotlib.pyplot as plt

# Suffix inserted into every data file name loaded below.
tag = '_slashdot'

# Build the training matrix: featureArray with socialVector appended
# horizontally.  sd and sd2 remain bound as module-level names.
X_train = load_numpy_matrix(feature_set_path + r'featureArray' + tag +
                            '_train.npy')
sd = load_numpy_matrix(feature_set_path + r'socialVector' + tag + '_train.npy')
X_train = np.hstack((X_train, sd))
# Same construction for the test matrix.
X_test = load_numpy_matrix(feature_set_path + r'featureArray' + tag +
                           '_test.npy')
sd2 = load_numpy_matrix(feature_set_path + r'socialVector' + tag + '_test.npy')
X_test = np.hstack((X_test, sd2))
# The bare string below is never executed as code; it holds a disabled
# alternative that would load the binaryWordData .npz files instead.
'''

X_train = load_sparse_csr(feature_set_path +  r'binaryWordData'+tag+'_train.npz')  
X_test = load_sparse_csr(feature_set_path +  r'binaryWordData'+tag+'_test.npz')  
'''

y_train = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
                            '_train.npy')
y_test = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
Exemple #7
0
        articleList, commentList, parentList, commentCount = read_toy_comments(
            comment_data_path + 'trainTestDataSet.txt',
            comment_data_path + 'toyComments.csv')
        tag = '_toy'
    elif set == 3:
        articleList, commentList, commentCount = read_slashdot_comments(
            comment_data_path + 'slashdotDataSet.txt', limit=100000)
        tag = '_slashdot'

    processed_comment_list = []
    for art in commentList.items():
        for comm in art[1]:
            processed_comment_list.append(comm.body.decode('ascii', 'ignore'))
    features = vectorizer.transform(processed_comment_list)

    y_train = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
                                '_train.npy')
    y_test = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
                               '_test.npy')

    print features.shape
    print y_train.shape
    print y_test.shape

    valueVector = np.concatenate([y_train, y_test])
    print
    print valueVector.shape

    # train_list = [' '.join(sent) for sent in train_list]
    # test_list = [' '.join(sent) for sent in test_list]
    predicted = [float(v) for v in clf.predict(features)]
Exemple #8
0
# Dataset selector read by the if/elif chain below:
# 1 -> '_main', 2 -> '_toy', 3 -> '_slashdot'.
datatype = 3
# Module-level default; the featureV == 0 branch below overrides it with 80.
perc = 50

# Script entry point: the body runs only when this file is executed directly.
# NOTE(review): this excerpt may continue beyond the lines shown here.
if __name__ == '__main__':
    # Choose the data-file suffix (tag) and model_tag for the dataset.
    # There is no else branch: any other datatype value leaves tag unbound
    # and the loads below would raise NameError.
    if datatype == 1:
        tag = '_main'
        model_tag = '_news24'
    elif datatype == 2:
        tag = "_toy"
        model_tag = '_news24'
    elif datatype == 3:
        tag = '_slashdot'
        model_tag = '_slashdot'

    # Only feature set 8 is listed, so the featureV == 0 branch shown below
    # does not run with this list.
    for featureV in [8]:
        # The valueVector files are reloaded on every iteration.
        y_train = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
                                    '_train.npy')
        y_test = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
                                   '_test.npy')

        if featureV == 0:
            # Feature set 0: featureArray with socialVector appended
            # horizontally, for the train and test splits.
            X_train = load_numpy_matrix(feature_set_path + r'featureArray' +
                                        tag + '_train.npy')
            sd = load_numpy_matrix(feature_set_path + r'socialVector' + tag +
                                   '_train.npy')
            X_train = np.hstack((X_train, sd))
            X_test = load_numpy_matrix(feature_set_path + r'featureArray' +
                                       tag + '_test.npy')
            sd2 = load_numpy_matrix(feature_set_path + r'socialVector' + tag +
                                    '_test.npy')
            X_test = np.hstack((X_test, sd2))
            # Rebinds the module-level perc for this feature set.
            perc = 80
def saveIt(filename, matrix, header, delimiter=","):
    """Write ``matrix`` to ``filename`` as delimited text under a header row.

    Args:
        filename: Path of the file to create or overwrite.
        matrix: Iterable of rows; each row is an iterable of numbers.
        header: Iterable of column-name strings, joined with ``delimiter``.
        delimiter: Separator placed between fields (default ``","``).

    Whole-number values are written without a fractional part (``3.0``
    becomes ``3``); every other value is written with six decimal places.
    Each line, including the header, ends with a newline character.
    """
    def _format(value):
        # Test via float() so that plain and NumPy integers, which may not
        # provide is_integer(), are accepted alongside floats.
        if float(value).is_integer():
            return str(int(value))
        return "%.6f" % value

    with open(filename, 'w') as fh:
        fh.write(delimiter.join(header) + "\n")
        for row in matrix:
            fh.write(delimiter.join(_format(value) for value in row) + "\n")
            
if __name__ == '__main__':
    #get_new_test_X()
    #get_new_test_Y()
    tempCWD = os.getcwd()
    
    os.chdir('D:\Workspace\NLTK comments\src\FeatureExtraction')    
         
    X_orig = load_numpy_matrix("featureArray.npy")
    y_orig = load_numpy_matrix("valueVector.npy")
    #wd = load_sparse_csr("freqWordData.npz").todense()
    
    print X_orig.shape
    print y_orig[:,3].shape
    #print wd.shape
    Xn = np.c_[X_orig, y_orig[:,3]]
    #wd = np.c_[wd, y_orig[:,3]]
    
    
    head = ["Lengthiness",
            "Questionfrequency",
            "Exclamationfrequency",
            "CapitalFrequency",
            "SentenceCapitalFrequency",
    if LINEAR:
        clf = GridSearchCV(estimator=SVC(C=1, cache_size=1000), param_grid=linear_parameters, cv=cv, scoring='accuracy', verbose=1, n_jobs=2).fit(Xn, yn)


        print("Best parameters set found on development set:")
        print
        print(clf.best_estimator_)
        print(clf.best_score_)
        print()
        print confusion_matrix(yn, clf.predict(Xn))

# Script entry point: the body runs only when this file is executed directly.
# NOTE(review): this excerpt may continue beyond the lines shown here.
if __name__ == '__main__':

    # manual_tests, split, tag, perc and feature_set_path are defined outside
    # this excerpt.
    if manual_tests:
        # Placeholder value; Xn is replaced by the np.hstack result below.
        Xn = csr_matrix(np.array((0,0)))
        # Contents of the valueVector training file; passed as the second
        # argument to fit_transform below.
        yn = load_numpy_matrix(feature_set_path+ 'valueVector' + tag + '_train.npy')
        # Presumably collections.Counter: prints how often each value occurs.
        print Counter(yn)

        # Marker string rather than a real path; how it is consumed is not
        # visible in this excerpt.
        filepath = 'MANUAL'
        # Print the shapes of the two matrices that are stacked next.
        print load_numpy_matrix(feature_set_path+ 'featureArray' + tag + '_train.npy').shape
        print load_numpy_matrix(feature_set_path+ 'socialVector' + tag + '_train.npy').shape
        # Join the featureArray and socialVector matrices horizontally.
        # Both files are loaded a second time here.
        Xn = np.hstack((load_numpy_matrix(feature_set_path+ 'featureArray' + tag + '_train.npy'),load_numpy_matrix(feature_set_path+ 'socialVector' + tag + '_train.npy') ))

        # Presumably sklearn's SelectPercentile: keeps the best-scoring
        # `perc` percent of the columns according to f_classif.
        Xn = SelectPercentile(score_func=f_classif, percentile=perc).fit_transform(Xn,yn)


        if split:
            # Subsample the data: only the `train` indices of each split are
            # kept and `test` is ignored.
            # NOTE(review): the positional 1 looks like the iteration count of
            # the old sklearn.cross_validation API, and test_size=0.85 would
            # then keep about 15% of the rows -- confirm against the sklearn
            # version in use.
            sss = StratifiedShuffleSplit(yn, 1, test_size=0.85, random_state=42)
            for train, test in sss:
                Xn , yn = Xn[train], yn[train]
        fh.write(delimiter.join(header) + "\n")
        for row in matrix:
            line = delimiter.join(
                str(int(value)) if value.is_integer() else "%.6f" % value
                for value in row)
            fh.write(line + "\n")


if __name__ == '__main__':
    #get_new_test_X()
    #get_new_test_Y()
    tempCWD = os.getcwd()

    os.chdir('D:\Workspace\NLTK comments\src\FeatureExtraction')

    X_orig = load_numpy_matrix("featureArray.npy")
    y_orig = load_numpy_matrix("valueVector.npy")
    #wd = load_sparse_csr("freqWordData.npz").todense()

    print X_orig.shape
    print y_orig[:, 3].shape
    #print wd.shape
    Xn = np.c_[X_orig, y_orig[:, 3]]
    #wd = np.c_[wd, y_orig[:,3]]

    head = [
        "Lengthiness", "Questionfrequency", "Exclamationfrequency",
        "CapitalFrequency", "SentenceCapitalFrequency", "SpelledCorrectly",
        "SpelledFreq", "BadWords", "BadWordsFreq", "spelledPerc*badWordsPerc",
        "Complexity", "Readibility", "AvgTermFreq", "Sentiment", "SubjObj",
        "VerbCount", "NounCount", "ThreadRelevance", "ArticleRelevence",
from sklearn.svm.classes import SVC, LinearSVC

from RatingPrediction.main import  runClassificationTest
from config import feature_set_path
import numpy as np
from sklearn.multiclass import OneVsRestClassifier


# Module-level settings.  Neither perc nor scale is read in the lines shown
# here; presumably perc is a feature-selection percentile -- TODO confirm.
perc = 50
scale = 1
# Dataset suffix; the commented-out lines are the alternative datasets.
#tag = "_toy"
#tag = '_main'
tag = '_slashdot'

# Placeholder value for Xn; it is not reassigned in the lines shown here.
Xn = csr_matrix(np.array((0,0)))
# Contents of the valueVector training file for the selected tag.
yn = load_numpy_matrix(feature_set_path+ 'valueVector' + tag + '_train.npy')
#yn = load_numpy_matrix(feature_set_path+ 'sentenceValueVector.npy')[:,valueV]

# The bare string below is never executed as code; it holds a disabled
# variant that sets filepath to 'MANUAL' and builds Xn from the featureArray
# and socialVector files.
'''
filepath = 'MANUAL'
print load_numpy_matrix(feature_set_path+ 'featureArray' + tag + '_train.npy').shape
print load_numpy_matrix(feature_set_path+ 'socialVector' + tag + '_train.npy').shape
Xn = np.hstack((load_numpy_matrix(feature_set_path+ 'featureArray' + tag + '_train.npy'),load_numpy_matrix(feature_set_path+ 'socialVector' + tag + '_train.npy') ))
'''
# Exactly one filepath assignment is active (tfidfWordData); the others are
# kept commented out as alternative word-data files.
#filepath = feature_set_path+ 'binaryWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'freqWordData' + tag + '_train.npz'
filepath = feature_set_path+ 'tfidfWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'bigramBinaryWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'bigramTfidfWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'trigramBinaryWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'trigramTfidfWordData' + tag + '_train.npz'