def testOthersByDir(fileName, algorithmName, flag=1):
    """Run 5-fold evaluation of an sklearn-style classifier on pre-split data files."""
    precisions = []
    recalls = []
    accuracys = []
    allPredictLabels = []
    allRealLabels = []
    model = None
    for i in range(5):
        # Load fold i: normalised training/test matrices plus their labels.
        normTrainingMat, normTestMat, hmLabels, \
            chmLabels, minVals, maxVals = loadFileData(fileName, i)
        if algorithmName == 'SVM':
            model = ar.testSVM(flag)
        elif algorithmName == 'RF':
            model = ar.testRandomForest()
        elif algorithmName == 'Bayes':
            model = ar.testBayes(flag)
        elif algorithmName == 'Tree':
            model = ar.testDecisionTree()
        if model is None:
            raise NameError('algorithm input error')
        model.fit(normTrainingMat, hmLabels)
        predictLabels = model.predict(normTestMat)
        precision, recall, accuracy = evaluate.evaluateClassifier(
            predictLabels, chmLabels)
        allPredictLabels.extend(predictLabels)
        allRealLabels.extend(chmLabels)
        precisions.append(precision)
        recalls.append(recall)
        accuracys.append(accuracy)
    # Average the per-fold metrics.
    avgPrecision, avgRecall, avgAccuracy = evaluate.output(
        precisions, recalls, accuracys)
    return avgPrecision, avgRecall, avgAccuracy, allPredictLabels, allRealLabels
def testKNNByFile(fileName, algorithmName, kValue=5):
    """Run 10-fold evaluation of a KNN-variant classifier on a single data file."""
    dataMat, labels, features = file2matrix(fileName)
    precisions = []
    recalls = []
    accuracys = []
    allPredictLabels = []
    allRealLabels = []
    classifierResult = None
    for j in range(10):
        # Split fold j and normalise both sets with the training set's min/max.
        normTrainingMat, normTestMat, hmLabels, chmLabels = crossAuth(
            dataMat, labels, j)
        minVals = normTrainingMat.min(0)
        maxVals = normTrainingMat.max(0)
        normTrainingMat = autoNorm(minVals, maxVals, normTrainingMat)
        normTestMat = autoNorm(minVals, maxVals, normTestMat)
        errorCount = 0.0
        m = len(normTestMat)
        predictLabels = []
        for i in range(m):
            if algorithmName == 'KNN':
                classifierResult = ar.kNNClassify(
                    normTestMat[i, :], normTrainingMat, hmLabels, kValue)
            elif algorithmName == 'IPDC_KNN':
                classifierResult = ar.IPDCKNNClassify(
                    normTestMat[i, :], normTrainingMat, hmLabels, kValue)
            elif algorithmName == 'IPDS_KNN':
                classifierResult = ar.IPDSKNNClassify(
                    normTestMat[i, :], normTrainingMat, hmLabels, kValue)
            elif algorithmName == 'IPDCS_KNN':
                classifierResult = ar.IPDCSKNNClassify(
                    normTestMat[i, :], normTrainingMat, hmLabels, kValue)
            elif algorithmName == 'IPNC_KNN':
                classifierResult = ar.IPNCKNNClassify(
                    normTestMat[i, :], normTrainingMat, hmLabels, kValue)
            if classifierResult is None:
                raise NameError('algorithm input error')
            if classifierResult != chmLabels[i]:
                errorCount += 1.0
            predictLabels.append(classifierResult)
        precision, recall, accuracy = evaluate.evaluateClassifier(
            predictLabels, chmLabels)
        allPredictLabels.extend(predictLabels)
        allRealLabels.extend(chmLabels)
        precisions.append(precision)
        recalls.append(recall)
        accuracys.append(accuracy)
    avgPrecision, avgRecall, avgAccuracy = evaluate.output(
        precisions, recalls, accuracys)
    return avgPrecision, avgRecall, avgAccuracy, allPredictLabels, allRealLabels
def testKNNByDir(fileName, algorithmName, kValue=5):
    """Run 5-fold evaluation of a KNN-variant classifier on pre-split data files."""
    precisions = []
    recalls = []
    accuracys = []
    allPredictLabels = []
    allRealLabels = []
    classifierResult = None
    for j in range(5):
        # Load fold j: the data files are already normalised.
        normTrainingMat, normTestMat, hmLabels, \
            chmLabels, minVals, maxVals = loadFileData(fileName, j)
        errorCount = 0.0
        m = len(normTestMat)
        predictLabels = []
        for i in range(m):
            if algorithmName == 'KNN':
                classifierResult = ar.kNNClassify(
                    normTestMat[i, :], normTrainingMat, hmLabels, kValue)
            elif algorithmName == 'IPDC_KNN':
                classifierResult = ar.IPDCKNNClassify(
                    normTestMat[i, :], normTrainingMat, hmLabels, kValue)
            elif algorithmName == 'IPDS_KNN':
                classifierResult = ar.IPDSKNNClassify(
                    normTestMat[i, :], normTrainingMat, hmLabels, kValue)
            elif algorithmName == 'IPDCS_KNN':
                classifierResult = ar.IPDCSKNNClassify(
                    normTestMat[i, :], normTrainingMat, hmLabels, kValue)
            elif algorithmName == 'IPNC_KNN':
                classifierResult = ar.IPNCKNNClassify(
                    normTestMat[i, :], normTrainingMat, hmLabels, kValue)
            if classifierResult is None:
                raise NameError('algorithm input error')
            if classifierResult != chmLabels[i]:
                errorCount += 1.0
            predictLabels.append(classifierResult)
        precision, recall, accuracy = evaluate.evaluateClassifier(
            predictLabels, chmLabels)
        allPredictLabels.extend(predictLabels)
        allRealLabels.extend(chmLabels)
        precisions.append(precision)
        recalls.append(recall)
        accuracys.append(accuracy)
    avgPrecision, avgRecall, avgAccuracy = evaluate.output(
        precisions, recalls, accuracys)
    return avgPrecision, avgRecall, avgAccuracy, allPredictLabels, allRealLabels
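A minimal usage sketch for these harnesses. The data directory and the choice of algorithms and k are illustrative assumptions, not values taken from the original experiments; the algorithm names themselves match the strings dispatched on above.

# Hypothetical driver: 'data/folds' is a placeholder path for the pre-split fold files.
if __name__ == '__main__':
    dataDir = 'data/folds'
    for name in ('KNN', 'IPDC_KNN'):
        p, r, a, pred, real = testKNNByDir(dataDir, name, kValue=5)
    for name in ('SVM', 'RF'):
        p, r, a, pred, real = testOthersByDir(dataDir, name)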
def cotraining(model_one, model_two, n_iter=100):
    """Semi-supervised co-training of two classifiers on the labelled/unlabelled split."""
    (data, train_number, val_number, test_number,
     unlabel_number, label, uid) = datahandler.clean_data()

    train = data[:train_number, :]
    validation = data[train_number:train_number + val_number, :]
    test = data[train_number + val_number:-unlabel_number, :]
    unlabel = data[-unlabel_number:, :]

    train, validation, test, unlabel = decomposition.gbdt_dimreduce_threshold(
        train, label, validation, test, unlabel)
    # train, validation, test, unlabel = split.split_continuum_value_tvt(train, validation, test, unlabel)

    # train_number = 100
    # unlabel_number = 1000
    # train = train[:100, :]
    # unlabel = unlabel[:1000, :]
    # label = label[:100]

    # Each model keeps its own copy of the labelled pool; co-training grows the
    # two pools with the unlabelled examples the other model labels.
    train_one = copy.deepcopy(train)
    label_one = copy.deepcopy(label)
    train_two = copy.deepcopy(train)
    label_two = copy.deepcopy(label)

    model_one.fit(train_one, label_one)
    model_two.fit(train_two, label_two)

    for iter in range(1, n_iter + 1):
        logging.info('#%d iter for co-training :' % iter)
        unlabel_label = [-1] * unlabel_number
        unlabel_index = list(range(unlabel_number))
        step = 0
        while len(unlabel_index) > 0:
            step += 1
            logging.info('co-training step #%d , remaining unlabel: %d'
                         % (step, len(unlabel_index)))

            (model_one, model_two, unlabel_label, unlabel_index,
             train_two, label_two) = training(model_one, model_two, unlabel,
                                              unlabel_label, unlabel_index,
                                              train_two, label_two)
            (model_two, model_one, unlabel_label, unlabel_index,
             train_one, label_one) = training(model_two, model_one, unlabel,
                                              unlabel_label, unlabel_index,
                                              train_one, label_one)

            evaluate.get_auc(model_one.predict_proba(validation)[:, 1])
            evaluate.get_auc(model_two.predict_proba(validation)[:, 1])
            evaluate.get_auc((model_one.predict_proba(validation)[:, 1]
                              + model_two.predict_proba(validation)[:, 1]) / 2.0)

            joblib.dump(model_one, ROOT + '/result/model/model_one_%d_%d.pkl' % (iter, step))
            joblib.dump(model_two, ROOT + '/result/model/model_two_%d_%d.pkl' % (iter, step))

            evaluate.output(uid, (model_one.predict_proba(test)[:, 1]
                                  + model_two.predict_proba(test)[:, 1]) / 2.0,
                            ROOT + '/result/predict/cotraining_%d_%d.csv' % (iter, step))
            evaluate.output(uid, model_one.predict_proba(test)[:, 1],
                            ROOT + '/result/predict/model_one_%d_%d.csv' % (iter, step))
            evaluate.output(uid, model_two.predict_proba(test)[:, 1],
                            ROOT + '/result/predict/model_two_%d_%d.csv' % (iter, step))
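A minimal sketch of how cotraining might be invoked. The two estimators and their hyperparameters below are illustrative choices, not values taken from the original experiments; co-training is usually seeded with two base learners that differ in bias.

# Illustrative seeding only; any pair of sklearn-style classifiers with
# fit/predict_proba would work here.
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

if __name__ == '__main__':
    model_one = GradientBoostingClassifier(n_estimators=200, max_depth=3)
    model_two = RandomForestClassifier(n_estimators=500, max_depth=6, n_jobs=2)
    cotraining(model_one, model_two, n_iter=10)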
    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]

    train_data, validation, test, unlabel = feature_extract(
        train_data, train_label, validation, test, unlabel)
    # print new_train_data.shape
    train_data, validation, test, unlabel = feature_handler(
        train_data, validation, test, unlabel)

    rf = RandomForestClassifier(warm_start=True, n_jobs=2, n_estimators=2000,
                                max_depth=3, min_samples_split=50)
    rf.fit(train_data, train_label)
    # joblib.dump(rf, ROOT + '/result/rf.pkl')
    evaluate.get_auc(rf.predict_proba(validation)[:, 1])
    # Predict on the held-out test set, matching the other solvers.
    return rf.predict_proba(test)[:, 1]


if __name__ == "__main__":
    data, train_number, val_number, test_number, unlabel_number, label, uid = \
        datahandler.clean_data()
    assert data.shape[0] == train_number + test_number + val_number + unlabel_number
    predict = rf_solver(
        data[:train_number, :],
        label,
        data[train_number:train_number + val_number, :],
        data[train_number + val_number:-unlabel_number, :],
        data[-unlabel_number:, :],
        decomposition.gbdt_dimreduce_threshold,
        split.undo,
    )
    evaluate.output(uid, predict, ROOT + "/result/rf.csv")
import sys

sys.path.insert(0, '../..')
from configure import *


def knn_solver(train_data, train_label, validation, test, dimreduce, convertbinary):
    """Train a k-nearest-neighbours classifier and predict test-set probabilities."""
    logging.info('begin to train the knn classifier')
    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]

    train_data, validation, test = dimreduce(train_data, train_label, validation, test)
    # print new_train_data.shape
    # train_data, validation, test = convertbinary(train_data, validation, test)

    knn = KNeighborsClassifier(algorithm='auto', n_neighbors=10, p=3)
    knn.fit(train_data, train_label)
    tools.get_auc(knn.predict_proba(validation)[:, 1])
    return knn.predict_proba(test)[:, 1]


if __name__ == "__main__":
    data, train_number, val_number, test_number, label, uid = datahandler.clean_data()
    assert data.shape[0] == train_number + test_number + val_number
    predict = knn_solver(data[:train_number, :], label,
                         data[train_number:-test_number, :],
                         data[-test_number:, :],
                         decomposition.gbdt_dimreduce_threshold,
                         split.split_continuum_value_tvt)
    evaluate.output(uid, predict, ROOT + '/result/knn.csv')
""" This evaluates how a differing symmetric window size affects the evaluation results. """ from config import base import evaluate as e config = base.get_config() config['test_filepath'] = 'resources/test/teddev/data-with-doc.csv' window_sizes = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (10, 10)] for window_size in window_sizes: print("Running {}".format(window_size)) config['window_size'] = window_size config['no_embeddings'] = 2 * (window_size[0] + window_size[1]) + 1 + config['n_tags'] * 2 predictions = e.evaluate(config) test_data = e.load_data(config['test_filepath']) e.output(predictions, test_data, config['classes'], 'results/base.dev.window_size.{}+{}.txt'.format(window_size[0], window_size[1])) print("Saving {}".format(window_size))
""" This evaluates how the number of preceding POS tags affects the evaluation results. """ from config import base import evaluate as e config = base.get_config() config['test_filepath'] = 'resources/test/teddev/data-with-doc.csv' ignores = ['ignore_pos_tags', 'ignore_target_context', 'ignore_source_context'] n_embeddings = {'ignore_pos_tags': config['n_tags'] * 2, 'ignore_target_context': config['window_size'][0] + config['window_size'][1], 'ignore_source_context': config['window_size'][0] + config['window_size'][1] + 1} for ignore in ignores: print("Running {}".format(ignore)) config[ignore] = True config['no_embeddings'] = sum(n_embeddings[feature] for feature in n_embeddings if not config.get(feature, False)) print("no_embeddings: {}".format(config['no_embeddings'])) print(config) predictions = e.evaluate(config) test_data = e.load_data(config['test_filepath']) e.output(predictions, test_data, config['classes'], 'results/base.dev.ignore.{}.txt'.format(ignore)) print("Saving {}".format(ignore)) config[ignore] = False
""" This evaluates how the number of preceding POS tags affects the evaluation results. """ from config import base import evaluate as e config = base.get_config() config['test_filepath'] = 'resources/test/teddev/data-with-doc.csv' n_tags = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] for n_tag in n_tags: print("Running {}".format(n_tag)) config['n_tags'] = n_tag config['no_embeddings'] = 2 * (config['window_size'][0] + config['window_size'][1]) + 1 + n_tag * 2 predictions = e.evaluate(config) test_data = e.load_data(config['test_filepath']) e.output(predictions, test_data, config['classes'], 'results/base.dev.n_tags.{}.txt'.format(n_tag)) print("Saving {}".format(n_tag))
from configure import *


def nb_solver(train_data, train_label, validation, test, classifier, dimreduce, convertbinary):
    """Train a naive Bayes classifier and predict test-set probabilities."""
    logging.info('begin to train the naive bayes classifier')
    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]

    train_data, validation, test = dimreduce(train_data, train_label, validation, test)
    # print new_train_data.shape
    train_data, validation, test = convertbinary(train_data, validation, test)

    nb = classifier()
    nb.fit(train_data, train_label)
    evaluate.get_auc(nb.predict_proba(validation)[:, 1])
    return nb.predict_proba(test)[:, 1]


if __name__ == "__main__":
    data, train_number, val_number, test_number, label, uid = datahandler.clean_data()
    assert data.shape[0] == train_number + test_number + val_number
    predict = nb_solver(data[:train_number, :], label,
                        data[train_number:-test_number, :],
                        data[-test_number:, :],
                        BernoulliNB,
                        decomposition.undo,
                        split.undo)
    evaluate.output(uid, predict, ROOT + '/result/naivebayes.csv')
""" This runs the final configuration as reported in the paper. """ from config import base import evaluate as e config = base.get_config() output_path = 'results/final.output.txt' print("Running configuration: {}".format(config)) predictions = e.evaluate(config) test_data = e.load_data(config['test_filepath']) e.output(predictions, test_data, config['classes'], output_path) print("Saved output to {}".format(output_path))