def svmTrain(datafile, featureNum, fold=10):
    """Train a linear SVM on *datafile*, tuning C by cross-validation.

    Parameters
    ----------
    datafile : str
        Prefix handed to ``loaddata`` to locate the train/test data.
    featureNum : int
        Number of leading feature columns to use; clamped to the number
        of columns actually present in the training matrix.
    fold : int, optional
        Number of cross-validation folds (default 10).

    Returns
    -------
    The accuracy value reported by ``cross_validation``.
    """
    train, test = loaddata(datafile)
    # Clamp featureNum so the column slice below never over-runs the data.
    _, col = train['counts'].shape
    if col < featureNum:
        featureNum = col
    X_train = train['counts'][:, 0:featureNum]
    y_train = train['labels'][0, :]
    X_test = test['counts'][:, 0:featureNum]
    y_test = test['labels'][0, :]
    # Regularization strengths to grid-search over.
    tuned_parameters = [{'C': [1, 10, 100, 1000, 10000]}]
    model = LinearSVC(dual=True, tol=1e-3)
    categories = train['category']
    feature_names = np.array([k.strip() for k in train['feature_names']])
    data = [X_train, y_train, X_test, y_test, categories, feature_names,
            featureNum, model, tuned_parameters, fold]
    clf, accuracy = cross_validation(*data)
    return accuracy
def NBTrain(datafile, featureNum, fold=10):
    """Train a multinomial Naive Bayes classifier, tuning alpha by CV.

    Prints the top-weighted features per category as a side effect.

    Parameters
    ----------
    datafile : str
        Prefix handed to ``loaddata`` to locate the train/test data.
    featureNum : int
        Number of leading feature columns to use; clamped to the number
        of columns actually present in the training matrix.
    fold : int, optional
        Number of cross-validation folds (default 10).

    Returns
    -------
    The accuracy value reported by ``cross_validation``.
    """
    train, test = loaddata(datafile)
    # Clamp featureNum so the column slice below never over-runs the data.
    _, col = train['counts'].shape
    if col < featureNum:
        featureNum = col
    X_train = train['counts'][:, 0:featureNum]
    y_train = train['labels'][0, :]
    X_test = test['counts'][:, 0:featureNum]
    y_test = test['labels'][0, :]
    # Smoothing strengths to grid-search over.
    tuned_parameters = [{'alpha': [0.01, 0.05, 1, 2, 5]}]
    model = MultinomialNB(fit_prior=True)
    categories = train['category']
    feature_names = np.array([k.strip() for k in train['feature_names']])
    data = [X_train, y_train, X_test, y_test, categories, feature_names,
            featureNum, model, tuned_parameters, fold]
    clf, accuracy = cross_validation(*data)
    # Report the highest-weighted features for each category.
    # NOTE(review): MultinomialNB.coef_ was removed in scikit-learn >= 1.1;
    # newer versions require feature_log_prob_ instead -- confirm the
    # pinned scikit-learn version before upgrading.
    for c in range(len(categories)):
        index = np.argsort(clf.best_estimator_.coef_[c])
        if len(index) >= featureNum:
            topfeatures = index[-featureNum:]
        else:
            topfeatures = index
        print('%s:%s' % (categories[c], ' '.join(feature_names[topfeatures])))
    return accuracy
def svmTrain(datafile, featureNum, fold=10):
    """Train a linear SVM on *datafile*, tuning C by cross-validation.

    Duplicate of the other ``svmTrain`` definition in this file; kept in
    place but cleaned up (unused import, dead commented-out code removed).

    Parameters
    ----------
    datafile : str
        Prefix handed to ``loaddata`` to locate the train/test data.
    featureNum : int
        Number of leading feature columns to use; clamped to what is
        actually available.
    fold : int, optional
        Number of cross-validation folds (default 10).

    Returns
    -------
    The accuracy value reported by ``cross_validation``.
    """
    train, test = loaddata(datafile)
    # Never slice more columns than the training matrix provides.
    _, col = train['counts'].shape
    if col < featureNum:
        featureNum = col
    X_train = train['counts'][:, 0:featureNum]
    y_train = train['labels'][0, :]
    X_test = test['counts'][:, 0:featureNum]
    y_test = test['labels'][0, :]
    # Regularization strengths to grid-search over.
    tuned_parameters = [{'C': [1, 10, 100, 1000, 10000]}]
    model = LinearSVC(dual=True, tol=1e-3)
    categories = train['category']
    feature_names = np.array([k.strip() for k in train['feature_names']])
    data = [X_train, y_train, X_test, y_test, categories, feature_names,
            featureNum, model, tuned_parameters, fold]
    clf, accuracy = cross_validation(*data)
    return accuracy
def preprocess(prefix, totalfeatures):
    """Apply chi-square feature selection and persist the reduced splits.

    Loads the train/test pair for *prefix*, reduces both to the selected
    features, writes them to ``<prefix>_chi_train.mat`` /
    ``<prefix>_chi_test.mat``, and returns the new prefix.
    """
    train, test = loaddata(prefix)
    X_train, X_test, feature_names = featureselection((train, test), totalfeatures)
    # Overwrite the counts and feature names with the selected subset,
    # then save each split under the new '_chi' prefix.
    train['counts'] = X_train
    train['feature_names'] = feature_names
    io.savemat(prefix + '_chi_train.mat', train)
    test['counts'] = X_test
    test['feature_names'] = feature_names
    print(feature_names)
    io.savemat(prefix + '_chi_test.mat', test)
    return prefix + '_chi'
def preprocess(prefix, totalfeatures):
    """Select *totalfeatures* features via chi-square and save new splits.

    Produces ``<prefix>_chi_train.mat`` and ``<prefix>_chi_test.mat`` and
    returns the ``<prefix>_chi`` prefix for downstream training calls.
    """
    train, test = loaddata(prefix)
    selected = featureselection((train, test), totalfeatures)
    reduced_train, reduced_test, names = selected
    out_train = prefix + '_chi_train.mat'
    out_test = prefix + '_chi_test.mat'
    # Replace the raw counts/feature names with the reduced versions.
    train['counts'] = reduced_train
    train['feature_names'] = names
    io.savemat(out_train, train)
    test['counts'] = reduced_test
    test['feature_names'] = names
    print(names)
    io.savemat(out_test, test)
    return prefix + '_chi'
def knnTrain(datafile, featureNum, fold=10):
    """Train a k-nearest-neighbors classifier, tuning k by cross-validation.

    Parameters
    ----------
    datafile : str
        Prefix handed to ``loaddata`` to locate the train/test data.
    featureNum : int
        Number of leading feature columns to use; clamped to what is
        actually available.
    fold : int, optional
        Number of cross-validation folds (default 10).

    Returns
    -------
    The accuracy value reported by ``cross_validation``.
    """
    train, test = loaddata(datafile)
    # Never slice more columns than the training matrix provides.
    _, col = train['counts'].shape
    if col < featureNum:
        featureNum = col
    X_train = train['counts'][:, 0:featureNum]
    y_train = train['labels'][0, :]
    X_test = test['counts'][:, 0:featureNum]
    y_test = test['labels'][0, :]
    # Neighborhood sizes to grid-search over.
    tuned_parameters = [{'n_neighbors': [2, 3, 4, 6, 10, 15, 18, 20, 30, 40, 50]}]
    model = knn(n_neighbors=1)
    categories = train['category']
    feature_names = np.array([k.strip() for k in train['feature_names']])
    data = [X_train, y_train, X_test, y_test, categories, feature_names,
            featureNum, model, tuned_parameters, fold]
    clf, accuracy = cross_validation(*data)
    return accuracy
def knnTrain(datafile, featureNum, fold=10):
    """Train a k-nearest-neighbors classifier, tuning k by cross-validation.

    Duplicate of the other ``knnTrain`` definition in this file; cleaned up
    in place (unused import removed).

    Parameters
    ----------
    datafile : str
        Prefix handed to ``loaddata`` to locate the train/test data.
    featureNum : int
        Number of leading feature columns to use; clamped to what is
        actually available.
    fold : int, optional
        Number of cross-validation folds (default 10).

    Returns
    -------
    The accuracy value reported by ``cross_validation``.
    """
    train, test = loaddata(datafile)
    # Clamp featureNum so the column slice below never over-runs the data.
    _, col = train['counts'].shape
    if col < featureNum:
        featureNum = col
    X_train = train['counts'][:, 0:featureNum]
    y_train = train['labels'][0, :]
    X_test = test['counts'][:, 0:featureNum]
    y_test = test['labels'][0, :]
    # Neighborhood sizes to grid-search over.
    tuned_parameters = [{'n_neighbors': [2, 3, 4, 6, 10, 15, 18, 20, 30, 40, 50]}]
    model = knn(n_neighbors=1)
    categories = train['category']
    feature_names = np.array([k.strip() for k in train['feature_names']])
    data = [X_train, y_train, X_test, y_test, categories, feature_names,
            featureNum, model, tuned_parameters, fold]
    clf, accuracy = cross_validation(*data)
    return accuracy
def pca_pipeline(prefix, n_components, model):
    """Project the data onto principal components, save, and train *model*.

    Parameters
    ----------
    prefix : str
        Prefix handed to ``loaddata`` to locate the train/test data.
    n_components : int
        Number of principal components to keep.
    model : callable
        Training function (e.g. ``svmTrain``) invoked as
        ``model(outprefix, n_components)``.

    Returns
    -------
    The accuracy value returned by *model*.
    """
    train, test = loaddata(prefix)
    # FIX: the original called pca_analysis twice and discarded the first
    # result, doing the decomposition work twice for nothing.
    X_train, X_test, pcs = pca_analysis(train['counts'], test['counts'],
                                        n_components)
    train['counts'] = X_train
    test['counts'] = X_test
    # Relabel the feature axes as principal components: pc_0, pc_1, ...
    categories = ['pc_' + str(i) for i in range(n_components)]
    train['category'] = categories
    test['category'] = categories
    outprefix = prefix + '_pca'
    io.savemat(outprefix + '_train.mat', train)
    io.savemat(outprefix + '_test.mat', test)
    accuracy = model(outprefix, n_components)
    return accuracy
def pca_pipeline(prefix, n_components, model):
    """Project the data onto principal components, save, and train *model*.

    Duplicate of the other ``pca_pipeline`` definition in this file; the
    same redundant-call defect is fixed here.

    Parameters
    ----------
    prefix : str
        Prefix handed to ``loaddata`` to locate the train/test data.
    n_components : int
        Number of principal components to keep.
    model : callable
        Training function invoked as ``model(outprefix, n_components)``.

    Returns
    -------
    The accuracy value returned by *model*.
    """
    train, test = loaddata(prefix)
    # FIX: pca_analysis was invoked twice with the first result discarded;
    # a single call suffices.
    X_train, X_test, pcs = pca_analysis(train['counts'], test['counts'],
                                        n_components)
    train['counts'] = X_train
    test['counts'] = X_test
    # Relabel the feature axes as principal components: pc_0, pc_1, ...
    categories = ['pc_' + str(i) for i in range(n_components)]
    train['category'] = categories
    test['category'] = categories
    outprefix = prefix + '_pca'
    io.savemat(outprefix + '_train.mat', train)
    io.savemat(outprefix + '_test.mat', test)
    accuracy = model(outprefix, n_components)
    return accuracy
def NBTrain(datafile, featureNum, fold=10):
    """Train a multinomial Naive Bayes classifier, tuning alpha by CV.

    Duplicate of the other ``NBTrain`` definition in this file; cleaned up
    in place (unused import removed). Prints the top-weighted features per
    category as a side effect.

    Parameters
    ----------
    datafile : str
        Prefix handed to ``loaddata`` to locate the train/test data.
    featureNum : int
        Number of leading feature columns to use; clamped to what is
        actually available.
    fold : int, optional
        Number of cross-validation folds (default 10).

    Returns
    -------
    The accuracy value reported by ``cross_validation``.
    """
    train, test = loaddata(datafile)
    # Never slice more columns than the training matrix provides.
    _, col = train['counts'].shape
    if col < featureNum:
        featureNum = col
    X_train = train['counts'][:, 0:featureNum]
    y_train = train['labels'][0, :]
    X_test = test['counts'][:, 0:featureNum]
    y_test = test['labels'][0, :]
    # Smoothing strengths to grid-search over.
    tuned_parameters = [{'alpha': [0.01, 0.05, 1, 2, 5]}]
    model = MultinomialNB(fit_prior=True)
    categories = train['category']
    feature_names = np.array([k.strip() for k in train['feature_names']])
    data = [X_train, y_train, X_test, y_test, categories, feature_names,
            featureNum, model, tuned_parameters, fold]
    clf, accuracy = cross_validation(*data)
    # Report the highest-weighted features for each category.
    # NOTE(review): MultinomialNB.coef_ was removed in scikit-learn >= 1.1;
    # newer versions require feature_log_prob_ instead -- confirm version.
    for c in range(len(categories)):
        index = np.argsort(clf.best_estimator_.coef_[c])
        if len(index) >= featureNum:
            topfeatures = index[-featureNum:]
        else:
            topfeatures = index
        print('%s:%s' % (categories[c], ' '.join(feature_names[topfeatures])))
    return accuracy
# print(choice.shape,remaining.shape,split.shape) if first: #labeled = np.concatenate((labeled,data[choice,:]),axis = 0) labeled = vstack((labeled,data[choice,:])) unlabeled =vstack((unlabeled,data[remaining,:])) y_labeled = np.concatenate((y_labeled,label[choice])) y_unlabeled = np.concatenate((y_unlabeled,label[remaining])) else: labeled = data[choice,:] unlabeled = data[remaining,:] y_labeled = label[choice] y_unlabeled = label[remaining] first=True return ((labeled,y_labeled),(unlabeled,y_unlabeled)) if __name__ == '__main__': np.random.seed(511) snb = SemiNB() prefix = '../features/bagofword' data = loaddata(prefix) print(data[0]['counts'].shape,data[1]['counts'].shape) labeled,unlabeled = splitDataByClass(data[0]['counts'],data[0]['labels'][0,:],0.5) td,delta =dataTransformation(labeled[0],labeled[1]) print(td.shape,delta.shape) snb.train(td,delta) test_td,test_delta = dataTransformation(data[1]['counts'],data[1]['labels']) print(test_td.shape) result = snb.predict_all(np.transpose(test_td) ) print(result)