def getDataSets(normalize=False, selected=False):
    """Load the train and test splits and return them as four arrays.

    Every file is read with ``load_numpy_matrix`` from the path
    ``feature_set_path + <name> + tag + <suffix>``.  ``feature_set_path``,
    ``tag`` and ``load_numpy_matrix`` are resolved outside this function.
    For each split, the ``featureArray`` data and the ``socialVector`` data
    are joined with ``np.hstack``.

    Args:
        normalize: Accepted but not read anywhere in this function.
        selected: Accepted but not read anywhere in this function.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """

    def _read(stem, suffix):
        # One path expression shared by every file; only the stem and the
        # split suffix vary between calls.
        return load_numpy_matrix(feature_set_path + stem + tag + suffix)

    # Targets are read first, then the features of each split; within a
    # split the feature array is read before the social vector.
    y_train = _read('valueVector', '_train.npy')
    y_test = _read('valueVector', '_test.npy')

    X_train = np.hstack((
        _read('featureArray', '_train.npy'),
        _read('socialVector', '_train.npy'),
    ))
    X_test = np.hstack((
        _read('featureArray', '_test.npy'),
        _read('socialVector', '_test.npy'),
    ))

    return X_train, X_test, y_train, y_test
def getDataSets(normalize=False, selected=False):
    """Return ``(X_train, X_test, y_train, y_test)`` read from disk.

    File paths are built as ``feature_set_path + <name> + tag + <suffix>``
    and read through ``load_numpy_matrix``; all three of those names come
    from outside this function.  The ``valueVector`` files supply the two
    ``y`` results.  Each ``X`` result is the ``featureArray`` data stacked
    with the ``socialVector`` data of the same split via ``np.hstack``.

    Args:
        normalize: Present in the signature but unused by this body.
        selected: Present in the signature but unused by this body.

    Returns:
        A 4-tuple ordered as ``X_train, X_test, y_train, y_test``.
    """
    split_suffixes = ('_train.npy', '_test.npy')

    # Both target vectors are loaded before any feature data is touched.
    targets = []
    for suffix in split_suffixes:
        targets.append(
            load_numpy_matrix(feature_set_path + 'valueVector' + tag + suffix))

    # Per split: feature array first, social vector second, then stack.
    stacked = []
    for suffix in split_suffixes:
        features = load_numpy_matrix(
            feature_set_path + 'featureArray' + tag + suffix)
        social = load_numpy_matrix(
            feature_set_path + 'socialVector' + tag + suffix)
        stacked.append(np.hstack((features, social)))

    X_train, X_test = stacked
    y_train, y_test = targets
    return X_train, X_test, y_train, y_test
verbose=1, n_jobs=2).fit(Xn, yn) print("Best parameters set found on development set:") print print(clf.best_estimator_) print(clf.best_score_) print() print confusion_matrix(yn, clf.predict(Xn)) if __name__ == '__main__': if manual_tests: Xn = csr_matrix(np.array((0, 0))) yn = load_numpy_matrix(feature_set_path + 'valueVector' + tag + '_train.npy') print Counter(yn) filepath = 'MANUAL' print load_numpy_matrix(feature_set_path + 'featureArray' + tag + '_train.npy').shape print load_numpy_matrix(feature_set_path + 'socialVector' + tag + '_train.npy').shape Xn = np.hstack((load_numpy_matrix(feature_set_path + 'featureArray' + tag + '_train.npy'), load_numpy_matrix(feature_set_path + 'socialVector' + tag + '_train.npy'))) Xn = SelectPercentile(score_func=f_classif, percentile=perc).fit_transform(Xn, yn)
# NOTE(review): the deep sklearn module paths below
# (linear_model.coordinate_descent, metrics.regression, metrics.metrics,
# preprocessing.data, svm.classes) look like legacy/private locations --
# confirm they still resolve on the scikit-learn version this project pins.
from sklearn.linear_model.coordinate_descent import Lasso, ElasticNet
from sklearn.metrics.regression import mean_squared_error
from sklearn.preprocessing.data import normalize, Normalizer
from sklearn.svm.classes import SVR
from RatingPrediction.main import normalize_sets_sparse
from config import feature_set_path
import numpy as np
from sklearn.metrics.metrics import r2_score
import matplotlib.pyplot as plt

# Infix inserted into every file name loaded below.
tag='_slashdot'

# NOTE(review): load_numpy_matrix is not imported in the lines visible here;
# presumably it is imported earlier in the file -- confirm.

# Training features: the feature array and the social vector are loaded
# separately and then joined with np.hstack.
X_train = load_numpy_matrix(feature_set_path + r'featureArray'+tag+'_train.npy')
sd = load_numpy_matrix(feature_set_path + r'socialVector'+tag+'_train.npy')
X_train = np.hstack((X_train,sd))

# Test features are assembled the same way from the '_test' files.
X_test = load_numpy_matrix(feature_set_path + r'featureArray'+tag+'_test.npy')
sd2 = load_numpy_matrix(feature_set_path + r'socialVector'+tag+'_test.npy')
X_test = np.hstack((X_test,sd2))

# The bare string literal below is an expression statement with no runtime
# effect; its text is an alternative loader that reads the 'binaryWordData'
# .npz files through load_sparse_csr instead.
''' X_train = load_sparse_csr(feature_set_path + r'binaryWordData'+tag+'_train.npz') X_test = load_sparse_csr(feature_set_path + r'binaryWordData'+tag+'_test.npz') '''

# Target values for the train and test splits.
y_train = load_numpy_matrix(feature_set_path + r'valueVector'+tag+'_train.npy')
y_test = load_numpy_matrix(feature_set_path + r'valueVector'+tag+'_test.npy')
from sklearn.metrics.metrics import classification_report, f1_score from sklearn.svm.classes import SVC, LinearSVC from RatingPrediction.main import runClassificationTest from config import feature_set_path import numpy as np from sklearn.multiclass import OneVsRestClassifier perc = 50 scale = 1 #tag = "_toy" #tag = '_main' tag = '_slashdot' Xn = csr_matrix(np.array((0, 0))) yn = load_numpy_matrix(feature_set_path + 'valueVector' + tag + '_train.npy') #yn = load_numpy_matrix(feature_set_path+ 'sentenceValueVector.npy')[:,valueV] ''' filepath = 'MANUAL' print load_numpy_matrix(feature_set_path+ 'featureArray' + tag + '_train.npy').shape print load_numpy_matrix(feature_set_path+ 'socialVector' + tag + '_train.npy').shape Xn = np.hstack((load_numpy_matrix(feature_set_path+ 'featureArray' + tag + '_train.npy'),load_numpy_matrix(feature_set_path+ 'socialVector' + tag + '_train.npy') )) ''' #filepath = feature_set_path+ 'binaryWordData' + tag + '_train.npz' #filepath = feature_set_path+ 'freqWordData' + tag + '_train.npz' filepath = feature_set_path + 'tfidfWordData' + tag + '_train.npz' #filepath = feature_set_path+ 'bigramBinaryWordData' + tag + '_train.npz' #filepath = feature_set_path+ 'bigramTfidfWordData' + tag + '_train.npz' #filepath = feature_set_path+ 'trigramBinaryWordData' + tag + '_train.npz' #filepath = feature_set_path+ 'trigramTfidfWordData' + tag + '_train.npz' #filepath = feature_set_path+ 'quadgramBinaryWordData' + tag + '_train.npz'
from sklearn.linear_model import LinearRegression from sklearn.linear_model.coordinate_descent import Lasso, ElasticNet from sklearn.metrics.regression import mean_squared_error from sklearn.preprocessing.data import normalize, Normalizer from sklearn.svm.classes import SVR from RatingPrediction.main import normalize_sets_sparse from config import feature_set_path import numpy as np from sklearn.metrics.metrics import r2_score import matplotlib.pyplot as plt tag = '_slashdot' X_train = load_numpy_matrix(feature_set_path + r'featureArray' + tag + '_train.npy') sd = load_numpy_matrix(feature_set_path + r'socialVector' + tag + '_train.npy') X_train = np.hstack((X_train, sd)) X_test = load_numpy_matrix(feature_set_path + r'featureArray' + tag + '_test.npy') sd2 = load_numpy_matrix(feature_set_path + r'socialVector' + tag + '_test.npy') X_test = np.hstack((X_test, sd2)) ''' X_train = load_sparse_csr(feature_set_path + r'binaryWordData'+tag+'_train.npz') X_test = load_sparse_csr(feature_set_path + r'binaryWordData'+tag+'_test.npz') ''' y_train = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '_train.npy') y_test = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
articleList, commentList, parentList, commentCount = read_toy_comments( comment_data_path + 'trainTestDataSet.txt', comment_data_path + 'toyComments.csv') tag = '_toy' elif set == 3: articleList, commentList, commentCount = read_slashdot_comments( comment_data_path + 'slashdotDataSet.txt', limit=100000) tag = '_slashdot' processed_comment_list = [] for art in commentList.items(): for comm in art[1]: processed_comment_list.append(comm.body.decode('ascii', 'ignore')) features = vectorizer.transform(processed_comment_list) y_train = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '_train.npy') y_test = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '_test.npy') print features.shape print y_train.shape print y_test.shape valueVector = np.concatenate([y_train, y_test]) print print valueVector.shape # train_list = [' '.join(sent) for sent in train_list] # test_list = [' '.join(sent) for sent in test_list] predicted = [float(v) for v in clf.predict(features)]
datatype = 3 perc = 50 if __name__ == '__main__': if datatype == 1: tag = '_main' model_tag = '_news24' elif datatype == 2: tag = "_toy" model_tag = '_news24' elif datatype == 3: tag = '_slashdot' model_tag = '_slashdot' for featureV in [8]: y_train = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '_train.npy') y_test = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '_test.npy') if featureV == 0: X_train = load_numpy_matrix(feature_set_path + r'featureArray' + tag + '_train.npy') sd = load_numpy_matrix(feature_set_path + r'socialVector' + tag + '_train.npy') X_train = np.hstack((X_train, sd)) X_test = load_numpy_matrix(feature_set_path + r'featureArray' + tag + '_test.npy') sd2 = load_numpy_matrix(feature_set_path + r'socialVector' + tag + '_test.npy') X_test = np.hstack((X_test, sd2)) perc = 80
def saveIt(filename, matrix, header, delimiter=","): with open(filename, 'w') as fh: fh.write(delimiter.join(header) +"\n") for row in matrix: line = delimiter.join(str(int(value)) if value.is_integer() else "%.6f" % value for value in row) fh.write(line+"\n") if __name__ == '__main__': #get_new_test_X() #get_new_test_Y() tempCWD = os.getcwd() os.chdir('D:\Workspace\NLTK comments\src\FeatureExtraction') X_orig = load_numpy_matrix("featureArray.npy") y_orig = load_numpy_matrix("valueVector.npy") #wd = load_sparse_csr("freqWordData.npz").todense() print X_orig.shape print y_orig[:,3].shape #print wd.shape Xn = np.c_[X_orig, y_orig[:,3]] #wd = np.c_[wd, y_orig[:,3]] head = ["Lengthiness", "Questionfrequency", "Exclamationfrequency", "CapitalFrequency", "SentenceCapitalFrequency",
if LINEAR: clf = GridSearchCV(estimator=SVC(C=1, cache_size=1000), param_grid=linear_parameters, cv=cv, scoring='accuracy', verbose=1, n_jobs=2).fit(Xn, yn) print("Best parameters set found on development set:") print print(clf.best_estimator_) print(clf.best_score_) print() print confusion_matrix(yn, clf.predict(Xn)) if __name__ == '__main__': if manual_tests: Xn = csr_matrix(np.array((0,0))) yn = load_numpy_matrix(feature_set_path+ 'valueVector' + tag + '_train.npy') print Counter(yn) filepath = 'MANUAL' print load_numpy_matrix(feature_set_path+ 'featureArray' + tag + '_train.npy').shape print load_numpy_matrix(feature_set_path+ 'socialVector' + tag + '_train.npy').shape Xn = np.hstack((load_numpy_matrix(feature_set_path+ 'featureArray' + tag + '_train.npy'),load_numpy_matrix(feature_set_path+ 'socialVector' + tag + '_train.npy') )) Xn = SelectPercentile(score_func=f_classif, percentile=perc).fit_transform(Xn,yn) if split: sss = StratifiedShuffleSplit(yn, 1, test_size=0.85, random_state=42) for train, test in sss: Xn , yn = Xn[train], yn[train]
fh.write(delimiter.join(header) + "\n") for row in matrix: line = delimiter.join( str(int(value)) if value.is_integer() else "%.6f" % value for value in row) fh.write(line + "\n") if __name__ == '__main__': #get_new_test_X() #get_new_test_Y() tempCWD = os.getcwd() os.chdir('D:\Workspace\NLTK comments\src\FeatureExtraction') X_orig = load_numpy_matrix("featureArray.npy") y_orig = load_numpy_matrix("valueVector.npy") #wd = load_sparse_csr("freqWordData.npz").todense() print X_orig.shape print y_orig[:, 3].shape #print wd.shape Xn = np.c_[X_orig, y_orig[:, 3]] #wd = np.c_[wd, y_orig[:,3]] head = [ "Lengthiness", "Questionfrequency", "Exclamationfrequency", "CapitalFrequency", "SentenceCapitalFrequency", "SpelledCorrectly", "SpelledFreq", "BadWords", "BadWordsFreq", "spelledPerc*badWordsPerc", "Complexity", "Readibility", "AvgTermFreq", "Sentiment", "SubjObj", "VerbCount", "NounCount", "ThreadRelevance", "ArticleRelevence",
from sklearn.svm.classes import SVC, LinearSVC from RatingPrediction.main import runClassificationTest from config import feature_set_path import numpy as np from sklearn.multiclass import OneVsRestClassifier perc = 50 scale = 1 #tag = "_toy" #tag = '_main' tag = '_slashdot' Xn = csr_matrix(np.array((0,0))) yn = load_numpy_matrix(feature_set_path+ 'valueVector' + tag + '_train.npy') #yn = load_numpy_matrix(feature_set_path+ 'sentenceValueVector.npy')[:,valueV] ''' filepath = 'MANUAL' print load_numpy_matrix(feature_set_path+ 'featureArray' + tag + '_train.npy').shape print load_numpy_matrix(feature_set_path+ 'socialVector' + tag + '_train.npy').shape Xn = np.hstack((load_numpy_matrix(feature_set_path+ 'featureArray' + tag + '_train.npy'),load_numpy_matrix(feature_set_path+ 'socialVector' + tag + '_train.npy') )) ''' #filepath = feature_set_path+ 'binaryWordData' + tag + '_train.npz' #filepath = feature_set_path+ 'freqWordData' + tag + '_train.npz' filepath = feature_set_path+ 'tfidfWordData' + tag + '_train.npz' #filepath = feature_set_path+ 'bigramBinaryWordData' + tag + '_train.npz' #filepath = feature_set_path+ 'bigramTfidfWordData' + tag + '_train.npz' #filepath = feature_set_path+ 'trigramBinaryWordData' + tag + '_train.npz' #filepath = feature_set_path+ 'trigramTfidfWordData' + tag + '_train.npz'