# --- Example #1 (score: 0) ---
def main():
    """ main """
    DATASET_LOCATION = '../datasets/'

    POS_DATASET_NAME = 'brown_pos_dataset.hdf5'

    POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)
    WORD_BY_WORD_MATRIX = 'brown.word-by-word.normalized.npy'
    VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX)

    CUTOFF = 10000

    MODELS_PATH = "../trained_models/"
    MODEL_NAME = "mlp_model.pkl"
    classifier = os.path.join(MODELS_PATH, MODEL_NAME)
    test_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                                which_sets=['test'], cutoff=CUTOFF, cast_y=False)

    test_X, test_y = test_dataset
    test_y = test_y.get_value().astype('int32')
    predictions = predict(classifier, test_X.get_value())

    CORPUS_INDICES = 'brown_pos_dataset.indices'


    with open(os.path.join(DATASET_LOCATION, CORPUS_INDICES)) as indices_file:
        corpus_indices = cPickle.load(indices_file)

    # map tag ids back to strings
    y_test_actual = [corpus_indices['idx2tag'][tag_idx] for tag_idx in test_y]
    y_test_hat = [corpus_indices['idx2tag'][tag_idx] for tag_idx in predictions]

    # Quick Evaluation
    acc = sum([y==p for y,p in zip(predictions, test_y)]) / float(len(predictions))
    print "ACC: {}".format((acc))
    # get class names
    class_names = list(set(y_test_actual))

    # Compute confusion matrix
    cm = confusion_matrix(y_test_actual, y_test_hat, labels=class_names)

    # Normalize the confusion matrix by row (i.e by the number of samples in each class)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.figure()
    plot_confusion_matrix(cm_normalized, class_names, title='Normalized confusion matrix')

    plt.show()
# --- Example #2 (score: 0) ---
def main():
    CUTOFF = 100000

    DATASET_LOCATION = "../datasets/"
    POS_DATASET_NAME = "brown_pos_dataset.hdf5"
    POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)

    WORD_BY_WORD_MATRIX = "brown.word-by-word.normalized.npy"
    VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX)

    MODELS_PATH = "../trained_models/"
    MODEL_NAME = "logistic_regression_model.pkl"
    classifier = os.path.join(MODELS_PATH, MODEL_NAME)

    test_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH, which_sets=["test"], cutoff=CUTOFF, cast_y=False)

    test_X, test_y = test_dataset
    test_y = test_y.get_value().astype("int32")
    predictions = predict(classifier, test_X.get_value())

    CORPUS_INDICES = "brown_pos_dataset.indices"
    # Indexes for mapping words and tags <--> ints
    with open(os.path.join(DATASET_LOCATION, CORPUS_INDICES)) as indices_file:
        corpus_indices = cPickle.load(indices_file)

    # map tag ids back to strings
    y_test_actual = [corpus_indices["idx2tag"][tag_idx] for tag_idx in test_y]
    y_test_hat = [corpus_indices["idx2tag"][tag_idx] for tag_idx in predictions]

    # quick check of our accuracy on the test dataset
    acc = sum([y == p for y, p in zip(predictions, test_y)]) / float(len(predictions))
    print "ACC: {}".format(acc)

    # get class names
    class_names = list(set(y_test_actual))

    # Compute confusion matrix
    cm = confusion_matrix(y_test_actual, y_test_hat, labels=class_names)

    cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
    plt.figure()
    plot_confusion_matrix(cm_normalized, class_names, title="Normalized confusion matrix")

    plt.show()

    f1_scores = sklearn.metrics.f1_score(y_test_actual, y_test_hat, labels=class_names, average=None)
    for class_name, score in zip(class_names, f1_scores):
        print "{}:\t{}".format(class_name, score)