import numpy as np
from collections import Counter
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.feature_selection import SelectKBest, chi2

# Pick one sparse feature set to tune on by uncommenting its path:
# filepath = feature_set_path + 'trigramOnlyBinaryWordData' + tag + '_train.npz'
# filepath = feature_set_path + 'trigramOnlyTfidfWordData' + tag + '_train.npz'
# filepath = feature_set_path + 'quadgramOnlyBinaryWordData' + tag + '_train.npz'
# filepath = feature_set_path + 'quadgramOnlyTfidfWordData' + tag + '_train.npz'
# filepath = feature_set_path + 'binaryCharacterData_train.npz'
# filepath = feature_set_path + 'tfidfCharacterData_train.npz'
# filepath = feature_set_path + 'binaryCharacterSkipgramData_train.npz'
# filepath = feature_set_path + 'tfidfCharacterSkipgramData_train.npz'
# filepath = feature_set_path + 'binaryLexicalBigramsData_train.npz'
# filepath = feature_set_path + 'tfidfLexicalBigramsData_train.npz'
if filepath != 'MANUAL':
    Xn = load_sparse_csr(filepath)

# Dense word-embedding feature sets use the numpy loader instead:
# filepath = feature_set_path + 'Basic300_TfidfFeatures_train.npy'
# filepath = feature_set_path + 'Basic300_BOCFeatures_train.npy'
# filepath = feature_set_path + 'google_model_TfidfFeatures_train.npy'
# filepath = feature_set_path + 'google_model_BOCFeatures_train.npy'
'''
if filepath != 'MANUAL':
    Xn = load_numpy_matrix(filepath)
'''

print filepath
# Keep a stratified 10% subsample of the rows to speed up tuning.
sss = StratifiedShuffleSplit(yn, 1, test_size=0.90, random_state=0)
for train, test in sss:
    Xn, yn = Xn[train], yn[train]
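# load_numpy_matrix() and load_sparse_csr() are defined elsewhere in this
# project. Below is a minimal sketch of compatible implementations, assuming
# the common recipe of saving a CSR matrix's data/indices/indptr/shape arrays
# into one .npz archive; the exact layout is an assumption, not confirmed here.
import numpy as np
from scipy.sparse import csr_matrix

def load_numpy_matrix(filepath):
    # Dense arrays are stored as plain .npy files (assumed).
    return np.load(filepath)

def load_sparse_csr(filepath):
    # Rebuild the CSR matrix from its three component arrays plus its shape.
    loader = np.load(filepath)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])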
if __name__ == '__main__':
    for featureV in [1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13]:
        y_train = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '_train.npy')
        y_test = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '_test.npy')

        # Select which cached feature matrix to evaluate.
        if featureV == 0:
            # Hand-crafted features plus social-metadata features, stacked.
            X_train = load_numpy_matrix(feature_set_path + r'featureArray' + tag + '_train.npy')
            sd = load_numpy_matrix(feature_set_path + r'socialVector' + tag + '_train.npy')
            X_train = np.hstack((X_train, sd))
            X_test = load_numpy_matrix(feature_set_path + r'featureArray' + tag + '_test.npy')
            sd2 = load_numpy_matrix(feature_set_path + r'socialVector' + tag + '_test.npy')
            X_test = np.hstack((X_test, sd2))
            perc = 50
        elif featureV == 1:
            X_train = load_sparse_csr(feature_set_path + r'binaryWordData' + tag + '_train.npz')
            X_test = load_sparse_csr(feature_set_path + r'binaryWordData' + tag + '_test.npz')
        elif featureV == 2:
            X_train = load_sparse_csr(feature_set_path + r'freqWordData' + tag + '_train.npz')
            X_test = load_sparse_csr(feature_set_path + r'freqWordData' + tag + '_test.npz')
        elif featureV == 3:
            X_train = load_sparse_csr(feature_set_path + r'tfidfWordData' + tag + '_train.npz')
            X_test = load_sparse_csr(feature_set_path + r'tfidfWordData' + tag + '_test.npz')
        elif featureV == 4:
            X_train = load_sparse_csr(feature_set_path + r'bigramBinaryWordData' + tag + '_train.npz')
            X_test = load_sparse_csr(feature_set_path + r'bigramBinaryWordData' + tag + '_test.npz')
        elif featureV == 5:
            X_train = load_sparse_csr(feature_set_path + r'bigramTfidfWordData' + tag + '_train.npz')
            X_test = load_sparse_csr(feature_set_path + r'bigramTfidfWordData' + tag + '_test.npz')
        elif featureV == 6:
            X_train = load_sparse_csr(feature_set_path + r'trigramBinaryWordData' + tag + '_train.npz')
            X_test = load_sparse_csr(feature_set_path + r'trigramBinaryWordData' + tag + '_test.npz')
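# Design note: the if/elif ladder above could be collapsed into a lookup table
# keyed on featureV. A minimal sketch, reusing feature_set_path, tag and
# load_sparse_csr from this script; SPARSE_FEATURE_SETS and load_train_test
# are illustrative names, not part of the original code.
SPARSE_FEATURE_SETS = {
    1: 'binaryWordData',
    2: 'freqWordData',
    3: 'tfidfWordData',
    4: 'bigramBinaryWordData',
    5: 'bigramTfidfWordData',
    6: 'trigramBinaryWordData',
}

def load_train_test(featureV):
    # Map a feature-set id to its cached train/test matrix pair.
    stem = feature_set_path + SPARSE_FEATURE_SETS[featureV] + tag
    return (load_sparse_csr(stem + '_train.npz'),
            load_sparse_csr(stem + '_test.npz'))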
# filepaths.append(feature_set_path + 'trigramBinaryWordData' + tag + '_train.npz')
# filepaths.append(feature_set_path + 'trigramTfidfWordData' + tag + '_train.npz')
filepaths.append(feature_set_path + 'bigramOnlyBinaryWordData' + tag + '_train.npz')
# filepaths.append(feature_set_path + 'bigramOnlyTfidfWordData' + tag + '_train.npz')
# filepaths.append(feature_set_path + 'trigramOnlyBinaryWordData' + tag + '_train.npz')
# filepaths.append(feature_set_path + 'trigramOnlyTfidfWordData' + tag + '_train.npz')

for fpath in filepaths:
    print fpath
    print tag
    yn = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '_train.npy')
    print Counter(yn)  # class distribution of the labels
    Xn = load_sparse_csr(fpath)
    # Keep the top perc% of features by chi-squared score, capped at 200000.
    Xn = SelectKBest(score_func=chi2,
                     k=min(200000, int(Xn.shape[1] * (perc / 100.0)))).fit_transform(Xn, yn)
    if split:
        # Keep a stratified 25% subsample of the rows to speed up tuning.
        sss = StratifiedShuffleSplit(yn, 1, test_size=0.75)
        for train, test in sss:
            Xn, yn = Xn[train], yn[train]
    parameter_tuning(Xn, yn, scale=-1)

if sparse_2_tests:
    filepaths = list()
    # filepaths.append(feature_set_path + 'binaryCharacterData' + tag + '_train.npz')
    # filepaths.append(feature_set_path + 'tfidfCharacterData' + tag + '_train.npz')
    # filepaths.append(feature_set_path + 'binaryCharacterSkipgramData' + tag + '_train.npz')
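# parameter_tuning() is defined elsewhere in this project. A minimal sketch of
# a compatible implementation, assuming a grid search over a linear SVM; the
# estimator, parameter grid and scoring below are assumptions, and only the
# call signature matches the usage above.
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC

def parameter_tuning(X, y, scale=-1):
    # scale < 0 is taken here to mean "skip feature scaling" (assumed semantics).
    grid = GridSearchCV(LinearSVC(),
                        param_grid={'C': [0.01, 0.1, 1, 10, 100]},
                        scoring='f1', cv=5)
    grid.fit(X, y)
    print 'Best parameters:', grid.best_params_
    print 'Best CV score:', grid.best_score_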