Example #1
0
def testOthersByDir(fileName, algorithmName, flag=1):
    precisions = []
    recalls = []
    accuracys = []
    allPredictLabels = []
    allRealLabels = []
    model = None
    for i in range(5):
        normTrainingMat, normTestMat, hmLabels, \
            chmLabels, minVals, maxVals = loadFileData(fileName, i)
        if algorithmName == 'SVM':
            model = ar.testSVM(flag)
        elif algorithmName == 'RF':
            model = ar.testRandomForest()
        elif algorithmName == 'Bayes':
            model = ar.testBayes(flag)
        elif algorithmName == 'Tree':
            model = ar.testDecisionTree()
        else:
            pass
        if model is None:
            raise NameError('algorithm input error')
        model.fit(normTrainingMat, hmLabels)
        predictLabels = model.predict(normTestMat)
        precision, recall, accuracy = evaluate.evaluateClassifier(
            predictLabels, chmLabels)
        allPredictLabels.extend(predictLabels)
        allRealLabels.extend(chmLabels)
        precisions.append(precision)
        recalls.append(recall)
        accuracys.append(accuracy)
    avgPrecision, avgRecall, avgAccuracy = evaluate.output(
        precisions, recalls, accuracys)
    return \
        avgPrecision, avgRecall, avgAccuracy, allPredictLabels, allRealLabels
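The evaluate.evaluateClassifier and evaluate.output helpers used above are not part of the snippet. A minimal sketch of what they might look like, assuming binary 0/1 labels with 1 as the positive class, is:

def evaluateClassifier(predictLabels, realLabels):
    # Per-fold metrics, assuming binary labels with 1 as the positive class.
    tp = sum(1 for p, r in zip(predictLabels, realLabels) if p == 1 and r == 1)
    fp = sum(1 for p, r in zip(predictLabels, realLabels) if p == 1 and r != 1)
    fn = sum(1 for p, r in zip(predictLabels, realLabels) if p != 1 and r == 1)
    correct = sum(1 for p, r in zip(predictLabels, realLabels) if p == r)
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    accuracy = correct / float(len(realLabels))
    return precision, recall, accuracy


def output(precisions, recalls, accuracys):
    # Average the per-fold metrics collected across the cross-validation folds.
    def avg(values):
        return sum(values) / float(len(values))
    return avg(precisions), avg(recalls), avg(accuracys)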
Example #2
0
def testKNNByFile(fileName, algorithmName, kValue=5):
    dataMat, labels, features = file2matrix(fileName)
    precisions = []
    recalls = []
    accuracys = []
    allPredictLabels = []
    allRealLabels = []
    classifierResult = None
    for j in range(10):
        normTrainingMat, normTestMat, hmLabels, chmLabels = crossAuth(
            dataMat, labels, j)
        minVals = normTrainingMat.min(0)
        maxVals = normTrainingMat.max(0)
        normTrainingMat = autoNorm(minVals, maxVals, normTrainingMat)
        normTestMat = autoNorm(minVals, maxVals, normTestMat)
        errorCount = 0.0
        m = len(normTestMat)
        predictLabels = []
        for i in range(m):
            if algorithmName == 'KNN':
                classifierResult = ar.kNNClassify(normTestMat[i, :],
                                                  normTrainingMat, hmLabels,
                                                  kValue)
            elif algorithmName == 'IPDC_KNN':
                classifierResult = ar.IPDCKNNClassify(normTestMat[i, :],
                                                      normTrainingMat,
                                                      hmLabels, kValue)
            elif algorithmName == 'IPDS_KNN':
                classifierResult = ar.IPDSKNNClassify(normTestMat[i, :],
                                                      normTrainingMat,
                                                      hmLabels, kValue)
            elif algorithmName == 'IPDCS_KNN':
                classifierResult = ar.IPDCSKNNClassify(normTestMat[i, :],
                                                       normTrainingMat,
                                                       hmLabels, kValue)
            elif algorithmName == 'IPNC_KNN':
                classifierResult = ar.IPNCKNNClassify(normTestMat[i, :],
                                                      normTrainingMat,
                                                      hmLabels, kValue)
            else:
                pass
            if classifierResult is None:
                raise NameError('algorithm input error')
            if (classifierResult != chmLabels[i]):
                errorCount += 1.0
            predictLabels.append(classifierResult)
        precision, recall, accuracy = evaluate.evaluateClassifier(
            predictLabels, chmLabels)
        allPredictLabels.extend(predictLabels)
        allRealLabels.extend(chmLabels)
        precisions.append(precision)
        recalls.append(recall)
        accuracys.append(accuracy)
    avgPrecision, avgRecall, avgAccuracy = evaluate.output(
        precisions, recalls, accuracys)
    return \
        avgPrecision, avgRecall, avgAccuracy, allPredictLabels, allRealLabels
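autoNorm is not defined in this snippet; the sketch below assumes it performs column-wise min-max scaling with the training-set extrema (NumPy arrays assumed):

import numpy as np

def autoNorm(minVals, maxVals, dataMat):
    # Scale every column into [0, 1] using the extrema computed on the training matrix.
    minVals = np.asarray(minVals, dtype=float)
    ranges = np.asarray(maxVals, dtype=float) - minVals
    ranges[ranges == 0] = 1.0  # avoid division by zero for constant columns
    return (np.asarray(dataMat, dtype=float) - minVals) / ranges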
Example #3
0
def testKNNByDir(fileName, algorithmName, kValue=5):
    precisions = []
    recalls = []
    accuracys = []
    allPredictLabels = []
    allRealLabels = []
    classifierResult = None
    for j in range(5):
        normTrainingMat, normTestMat, hmLabels, \
            chmLabels, minVals, maxVals = loadFileData(fileName, j)
        errorCount = 0.0
        m = len(normTestMat)
        predictLabels = []
        for i in range(m):
            if algorithmName == 'KNN':
                classifierResult = ar.kNNClassify(normTestMat[i, :],
                                                  normTrainingMat, hmLabels,
                                                  kValue)
            elif algorithmName == 'IPDC_KNN':
                classifierResult = ar.IPDCKNNClassify(normTestMat[i, :],
                                                      normTrainingMat,
                                                      hmLabels, kValue)
            elif algorithmName == 'IPDS_KNN':
                classifierResult = ar.IPDSKNNClassify(normTestMat[i, :],
                                                      normTrainingMat,
                                                      hmLabels, kValue)
            elif algorithmName == 'IPDCS_KNN':
                classifierResult = ar.IPDCSKNNClassify(normTestMat[i, :],
                                                       normTrainingMat,
                                                       hmLabels, kValue)
            elif algorithmName == 'IPNC_KNN':
                classifierResult = ar.IPNCKNNClassify(normTestMat[i, :],
                                                      normTrainingMat,
                                                      hmLabels, kValue)
            else:
                pass
            if classifierResult is None:
                raise NameError('algorithm input error')
            if (classifierResult != chmLabels[i]):
                errorCount += 1.0
            predictLabels.append(classifierResult)
        precision, recall, accuracy = evaluate.evaluateClassifier(
            predictLabels, chmLabels)
        allPredictLabels.extend(predictLabels)
        allRealLabels.extend(chmLabels)
        precisions.append(precision)
        recalls.append(recall)
        accuracys.append(accuracy)
    avgPrecision, avgRecall, avgAccuracy = evaluate.output(
        precisions, recalls, accuracys)
    return \
        avgPrecision, avgRecall, avgAccuracy, allPredictLabels, allRealLabels
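A hypothetical call (the data directory below is only a placeholder) would look like:

# Hypothetical usage; 'data/keystroke' is a placeholder path.
avgP, avgR, avgA, predicted, real = testKNNByDir('data/keystroke', 'IPDC_KNN', kValue=7)
print('precision=%.3f recall=%.3f accuracy=%.3f' % (avgP, avgR, avgA))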
Example #4
0
def cotraining(model_one, model_two, n_iter=100):
    """Co-train two classifiers, each labelling confident unlabeled examples for the other."""
    data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data ()

    train = data[:train_number,:]
    validation = data[train_number:train_number+val_number,:]
    test = data[train_number+val_number:-unlabel_number,:]
    unlabel = data[-unlabel_number:,:]

    train, validation, test, unlabel = decomposition.gbdt_dimreduce_threshold (train, label, validation, test, unlabel) 
    # train, validation, test, unlabel = split.split_continuum_value_tvt (train, validation, test, unlabel)

#    train_number = 100
#    unlabel_number = 1000
#
#    train = train[:100,:]
#    unlabel = unlabel[:1000,:]
#    label = label[:100]

    train_one = copy.deepcopy (train)
    label_one = copy.deepcopy (label)
    train_two = copy.deepcopy (train)
    label_two = copy.deepcopy (label)

    model_one.fit (train_one, label_one)
    model_two.fit (train_two, label_two)

    for iter in range(1, n_iter + 1):
        logging.info ('#%d iter for co-training :' % iter)

        unlabel_label = [-1] * unlabel_number
        unlabel_index = range (0, unlabel_number)
        step = 0
        while len (unlabel_index) > 0 :
            step += 1
            logging.info ('co-training step #%d , remaining unlabel: %d' % (step, len (unlabel_index)))
            model_one, model_two, unlabel_label, unlabel_index, train_two, label_two = training (model_one, model_two, unlabel, unlabel_label, unlabel_index, train_two, label_two)
            model_two, model_one, unlabel_label, unlabel_index, train_one, label_one = training (model_two, model_one, unlabel, unlabel_label, unlabel_index, train_one, label_one)
            
            evaluate.get_auc (model_one.predict_proba (validation)[:,1])
            evaluate.get_auc (model_two.predict_proba (validation)[:,1])
            evaluate.get_auc ((model_one.predict_proba (validation)[:,1] + model_two.predict_proba (validation)[:,1]) / 2.0)

            joblib.dump (model_one, ROOT + '/result/model/model_one_%d_%d.pkl' % (iter, step))
            joblib.dump (model_two, ROOT + '/result/model/model_two_%d_%d.pkl' % (iter, step))
    
            evaluate.output (uid, (model_one.predict_proba (test)[:,1] + model_two.predict_proba (test)[:,1]) / 2.0, ROOT + '/result/predict/cotraining_%d_%d.csv' % (iter, step))
            evaluate.output (uid, model_one.predict_proba (test)[:,1], ROOT + '/result/predict/model_one_%d_%d.csv' % (iter, step))
            evaluate.output (uid, model_two.predict_proba (test)[:,1], ROOT + '/result/predict/model_two_%d_%d.csv' % (iter, step))
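The training helper called twice per step is not included in this snippet. The sketch below is one plausible implementation, under the assumption that each call pseudo-labels a small batch of the most confident still-unlabeled rows for the other model (the batch parameter is hypothetical):

import numpy as np

def training(model_a, model_b, unlabel, unlabel_label, unlabel_index,
             train_b, label_b, batch=10):
    # model_a scores the still-unlabeled rows; its most confident predictions
    # become pseudo-labels and are appended to the *other* model's training set.
    index = np.asarray(unlabel_index)
    proba = model_a.predict_proba(unlabel[index])[:, 1]
    order = np.argsort(-np.abs(proba - 0.5))[:batch]  # most confident rows first
    picked = index[order]
    for pos in order:
        unlabel_label[index[pos]] = int(proba[pos] >= 0.5)
    train_b = np.vstack([train_b, unlabel[picked]])
    label_b = np.concatenate([np.asarray(label_b), [unlabel_label[i] for i in picked]])
    model_b.fit(train_b, label_b)
    picked_set = set(picked.tolist())
    unlabel_index = [i for i in unlabel_index if i not in picked_set]
    return model_a, model_b, unlabel_label, unlabel_index, train_b, label_b

Under this reading, each model only ever consumes pseudo-labels produced by its peer, which is the standard co-training idea.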
Example #5
0
def rf_solver(train_data, train_label, validation, test, unlabel, feature_extract, feature_handler):
    # NOTE: this signature is inferred from the call in __main__ below.
    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test, unlabel = feature_extract(train_data, train_label, validation, test, unlabel)
    # print new_train_data.shape
    train_data, validation, test, unlabel = feature_handler(train_data, validation, test, unlabel)

    rf = RandomForestClassifier(warm_start=True, n_jobs=2, n_estimators=2000, max_depth=3, min_samples_split=50)
    rf.fit(train_data, train_label)
    # joblib.dump (rf, ROOT + '/result/rf.pkl')
    evaluate.get_auc(rf.predict_proba(validation)[:, 1])
    return rf.predict_proba(test)[:, 1]


if __name__ == "__main__":
    data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data()
    assert data.shape[0] == train_number + test_number + val_number + unlabel_number
    predict = rf_solver(
        data[:train_number, :],
        label,
        data[train_number : train_number + val_number, :],
        data[train_number + val_number : -unlabel_number, :],
        data[-unlabel_number:, :],
        decomposition.gbdt_dimreduce_threshold,
        split.undo,
    )

    evaluate.output(uid, predict, ROOT + "/result/rf.csv")
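In this project evaluate.output receives (uid, predict, path); a minimal sketch of such a writer, assuming one (uid, score) row per test example, could be:

import csv

def output(uid, predict, path):
    # Assumed CSV layout: one (uid, predicted probability) row per test example.
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['uid', 'score'])
        for u, p in zip(uid, predict):
            writer.writerow([u, p])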
Example #6
0
sys.path.insert(0, '../..')
from configure import *


def knn_solver(train_data, train_label, validation, test, dimreduce, convertbinary):
    """
    """
    logging.info('begin to train the knn classifier')

    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test = dimreduce(train_data, train_label, validation, test)
    # print new_train_data.shape
    # train_data, validation, test = convertbinary(train_data, validation, test)

    knn = KNeighborsClassifier(algorithm='auto', n_neighbors=10, p=3)
    knn.fit(train_data, train_label)
    tools.get_auc(knn.predict_proba(validation)[:, 1])
    return knn.predict_proba(test)[:, 1]

if __name__ == "__main__" :
    data, train_number, val_number, test_number, label, uid = datahandler.clean_data ()
    assert data.shape[0] == train_number + test_number + val_number
    predict = knn_solver (data[:train_number,:], label, data[train_number:-test_number,:], data[-test_number:,:],  decomposition.gbdt_dimreduce_threshold, split.split_continuum_value_tvt)

    evaluate.output (uid, predict, ROOT + '/result/knn.csv')


"""
This evaluates how a differing symmetric window size affects the evaluation results.
"""

from config import base
import evaluate as e

config = base.get_config()
config['test_filepath'] = 'resources/test/teddev/data-with-doc.csv'

window_sizes = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (10, 10)]
for window_size in window_sizes:
    print("Running {}".format(window_size))
    config['window_size'] = window_size
    config['no_embeddings'] = 2 * (window_size[0] + window_size[1]) + 1 + config['n_tags'] * 2
    predictions = e.evaluate(config)
    test_data = e.load_data(config['test_filepath'])
    e.output(predictions, test_data, config['classes'],
             'results/base.dev.window_size.{}+{}.txt'.format(window_size[0], window_size[1]))
    print("Saving {}".format(window_size))
Example #8
0
"""
This evaluates how the number of preceding POS tags affects the evaluation results.
"""

from config import base
import evaluate as e

config = base.get_config()
config['test_filepath'] = 'resources/test/teddev/data-with-doc.csv'

ignores = ['ignore_pos_tags', 'ignore_target_context', 'ignore_source_context']
n_embeddings = {'ignore_pos_tags': config['n_tags'] * 2,
                'ignore_target_context': config['window_size'][0] + config['window_size'][1],
                'ignore_source_context': config['window_size'][0] + config['window_size'][1] + 1}
for ignore in ignores:
    print("Running {}".format(ignore))
    config[ignore] = True
    config['no_embeddings'] = sum(n_embeddings[feature] for feature in n_embeddings if not config.get(feature, False))
    print("no_embeddings: {}".format(config['no_embeddings']))
    print(config)
    predictions = e.evaluate(config)
    test_data = e.load_data(config['test_filepath'])
    e.output(predictions, test_data, config['classes'],
             'results/base.dev.ignore.{}.txt'.format(ignore))
    print("Saving {}".format(ignore))
    config[ignore] = False
Example #9
0
"""
This evaluates how the number of preceding POS tags affects the evaluation results.
"""

from config import base
import evaluate as e

config = base.get_config()
config['test_filepath'] = 'resources/test/teddev/data-with-doc.csv'

n_tags = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

for n_tag in n_tags:
    print("Running {}".format(n_tag))
    config['n_tags'] = n_tag
    config['no_embeddings'] = 2 * (config['window_size'][0] + config['window_size'][1]) + 1 + n_tag * 2
    predictions = e.evaluate(config)
    test_data = e.load_data(config['test_filepath'])
    e.output(predictions, test_data, config['classes'],
             'results/base.dev.n_tags.{}.txt'.format(n_tag))
    print("Saving {}".format(n_tag))
Example #10
0
from configure import *

def nb_solver(train_data, train_label, validation, test, classifier, dimreduce, convertbinary):
    """
    """
    logging.info('begin to train the naive bayes classifier')

    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test = dimreduce(train_data, train_label, validation, test)
    # print new_train_data.shape
    train_data, validation, test = convertbinary(train_data, validation, test)

    nb = classifier()
    nb.fit(train_data, train_label)
    evaluate.get_auc(nb.predict_proba(validation)[:, 1])
    return nb.predict_proba(test)[:, 1]

if __name__ == "__main__" :
    data, train_number, val_number, test_number, label, uid = datahandler.clean_data ()
    assert data.shape[0] == train_number + test_number + val_number
    predict = nb_solver (data[:train_number,:], label, data[train_number:-test_number,:], data[-test_number:,:], BernoulliNB, decomposition.undo, split.undo)

    evaluate.output (uid, predict, ROOT + '/result/naivebayes.csv')
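get_auc is only handed the validation scores in these scripts, so the true labels presumably live inside the evaluate module; the sketch below passes them explicitly to stay self-contained and assumes scikit-learn's roc_auc_score:

import logging

from sklearn.metrics import roc_auc_score

def get_auc(predict_proba, validation_label):
    # Log and return the AUC of the predicted probabilities on the validation split.
    auc = roc_auc_score(validation_label, predict_proba)
    logging.info('validation auc: %.4f', auc)
    return auc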




Example #11
0
"""
This runs the final configuration as reported in the paper.
"""

from config import base
import evaluate as e

config = base.get_config()
output_path = 'results/final.output.txt'
print("Running configuration: {}".format(config))

predictions = e.evaluate(config)
test_data = e.load_data(config['test_filepath'])
e.output(predictions, test_data, config['classes'],
         output_path)
print("Saved output to {}".format(output_path))