def test_pandas_confusion_normalized():
    """Check that the normalized confusion matrix sums to its number of rows.

    The row sums of the normalized frame are added up and compared with
    ``len`` of the non-normalized frame.
    """
    y_true = [2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2]
    y_pred = [0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2]
    cm = ConfusionMatrix(y_true, y_pred)
    df = cm.to_dataframe()
    df_norm = cm.to_dataframe(normalized=True)
    # The normalized values are presumably floating-point ratios, so compare
    # with a tolerance instead of relying on exact float equality.
    total = df_norm.sum(axis=1).sum()
    assert abs(total - len(df)) < 1e-9
def test_pandas_confusion_cm_stats_integers():
    """Integer class labels: stats() yields an OrderedDict and print_stats() runs."""
    actual = [600, 200, 200, 200, 200, 200, 200, 200, 500, 500,
              500, 200, 200, 200, 200, 200, 200, 200, 200, 200]
    predicted = [100, 200, 200, 100, 100, 200, 200, 200, 100, 200,
                 500, 100, 100, 100, 100, 100, 100, 100, 500, 200]
    print("y_true: %s" % actual)
    print("y_pred: %s" % predicted)

    matrix = ConfusionMatrix(actual, predicted)
    stats = matrix.stats()
    assert isinstance(stats, OrderedDict)
    matrix.print_stats()
def test_pandas_confusion_max_min():
    """The largest cell of the animal confusion matrix is 3, the smallest 0."""
    actual = ['rabbit', 'cat', 'rabbit', 'rabbit', 'cat', 'dog',
              'dog', 'rabbit', 'rabbit', 'cat', 'dog', 'rabbit']
    predicted = ['cat', 'cat', 'rabbit', 'dog', 'cat', 'rabbit',
                 'dog', 'cat', 'rabbit', 'cat', 'rabbit', 'rabbit']
    print("y_true: %s" % actual)
    print("y_pred: %s" % predicted)

    matrix = ConfusionMatrix(actual, predicted)
    largest = matrix.max()
    assert largest == 3
    smallest = matrix.min()
    assert smallest == 0
def test_pandas_confusion_cm_int():
    """Integer classes with explicit label names: shared checks plus matrix size."""
    class_names = ["ant", "bird", "cat"]
    actual = [2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2]
    predicted = [0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2]

    matrix = ConfusionMatrix(actual, predicted, labels=class_names)
    print("Confusion matrix:\n%s" % matrix)

    # Shared consistency checks live in the ``asserts`` helper.
    asserts(actual, predicted, matrix)
    expected_size = len(class_names)
    assert matrix.len() == expected_size
def test_pandas_confusion_cm_empty_row():
    """Class 1 is predicted but never appears in the true values."""
    actual = [2, 0, 2, 2, 0, 0]
    predicted = [0, 0, 2, 2, 1, 2]

    # A labelled matrix is constructed first and then replaced; only the
    # unlabelled matrix built afterwards is printed and checked.
    matrix = ConfusionMatrix(actual, predicted, labels=["ant", "bird", "cat"])
    matrix = ConfusionMatrix(actual, predicted)

    print("Confusion matrix:\n%s" % matrix)
    asserts(actual, predicted, matrix)
def test_value_counts():
    """The matrix's true/pred totals agree with pandas ``value_counts``."""
    heights = [150, 150, 151, 151, 152, 155, 155, 157, 157, 157, 157, 158, 158, 159, 159, 159, 160, 160, 162, 162, 163, 164, 165, 168, 169, 169, 169, 170, 171, 171, 173, 173, 174, 176, 177, 177, 179, 179, 179, 179, 179, 181, 181, 182, 183, 184, 186, 190, 190]
    weights = [54, 55, 55, 47, 58, 53, 59, 60, 56, 55, 62, 56, 55, 55, 64, 61, 59, 59, 63, 66, 64, 62, 66, 66, 72, 65, 75, 71, 70, 70, 75, 65, 79, 78, 83, 75, 84, 78, 74, 75, 74, 90, 80, 81, 90, 81, 91, 87, 100]
    sizes = ['S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'XL', 'XL', 'XL', 'XL', 'XL', 'XL', 'XL', 'XL']
    predicted_sizes = ['S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'L', 'M', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'XL', 'L', 'L', 'XL', 'L', 'XL', 'XL', 'XL']
    df = pd.DataFrame({
        'Height': heights,
        'Weight': weights,
        'Size': sizes,
        'SizePred': predicted_sizes,
    })

    cm = ConfusionMatrix(df["Size"], df["SizePred"])

    # The summed differences against value_counts must be zero.
    true_delta = cm.true - df["Size"].value_counts()
    assert true_delta.sum() == 0
    pred_delta = cm.pred - df["SizePred"].value_counts()
    assert pred_delta.sum() == 0

    cm.print_stats()
Esempio n. 7
0
 def accuracy_info(self):
     """Print the test-set confusion matrix and its first three diagonal cells."""
     # Each answer row is turned into the position of its first ``1``.
     one_hot_rows = self.testing_answer.tolist()
     true_classes = []
     for row in one_hot_rows:
         true_classes.append(row.index(1))

     feeds = {self.data_placeholder: self.testing_data,
              self.answer_placeholder: self.testing_answer}
     predicted_classes = self.sess.run(self.predict_op, feed_dict=feeds)

     cm = ConfusionMatrix(true_classes, predicted_classes)
     cells = cm.to_array('a')
     diagonal = [cells[0][0], cells[1][1], cells[2][2]]
     print(cm)
     print(diagonal)
Esempio n. 8
0
def train_test_and_evaluate(pipeline, X_train, y_train, X_test, y_test):
    """Fit ``pipeline``, predict the test set and print evaluation output.

    Prints the confusion matrix followed by the macro-averaged F1,
    precision and recall scores.

    :param pipeline: estimator exposing ``fit`` and ``predict``.
    :param X_train: training features.
    :param y_train: training targets.
    :param X_test: test features.
    :param y_test: test targets.
    :return: tuple of the fitted pipeline, the confusion matrix as a
        DataFrame, and the predicted classes for ``X_test``.
    """
    pipeline.fit(X_train, y_train)
    y_pred_class = pipeline.predict(X_test)
    # NOTE(review): ``labels`` receives two strings that read like axis
    # titles rather than class names -- confirm this is the intended use.
    matrix = ConfusionMatrix(y_test,
                             y_pred_class,
                             labels=['True Value', 'Predicted Value'])
    print('-' * 75 + '\nConfusion Matrix\n')
    print(matrix)
    print('f1_score', f1_score(y_test, y_pred_class, average="macro"))
    print('precision', precision_score(y_test, y_pred_class, average="macro"))
    print('recall', recall_score(y_test, y_pred_class, average="macro"))

    return pipeline, matrix.to_dataframe(), y_pred_class
def test_pandas_confusion_cm_binarize():
    """Binarizing on a subset of classes keeps the matrix total unchanged."""
    actual = ['rabbit', 'cat', 'rabbit', 'rabbit', 'cat', 'dog',
              'dog', 'rabbit', 'rabbit', 'cat', 'dog', 'rabbit']
    predicted = ['cat', 'cat', 'rabbit', 'dog', 'cat', 'rabbit',
                 'dog', 'cat', 'rabbit', 'cat', 'rabbit', 'rabbit']

    full_cm = ConfusionMatrix(actual, predicted)
    print("Confusion matrix:\n%s" % full_cm)

    chosen = ['cat', 'dog']
    print("Binarize with %s" % chosen)
    two_class_cm = full_cm.binarize(chosen)
    print("Binary confusion matrix:\n%s" % two_class_cm)

    full_total = full_cm.sum()
    assert full_total == two_class_cm.sum()
Esempio n. 10
0
def write_score_file(score_file, f1_mean, f1, model, Y_test_indices, y_prediction_classes):
    """Write F1 scores, the confusion matrix and the model summary to a file.

    :param score_file: path of the text file to (over)write.
    :param f1_mean: averaged F1 score; presumably the unweighted mean of the
        per-class scores in ``f1`` (verify against callers).
    :param f1: individual per-class F1 scores.
    :param model: object whose ``summary(print_fn=...)`` emits one line per call.
    :param Y_test_indices: true class labels.
    :param y_prediction_classes: predicted class labels.
    """
    with open(score_file, 'w') as score_handler:
        # A plain mean of per-class F1 scores is the macro average, so the
        # label says "Macro" (it previously said "Micro").
        score_handler.write("Macro-averaged F1: {}\n".format(f1_mean))
        score_handler.write("Individual scores: {}\n".format(f1))
        score_handler.write("Confusion matrix:\n")
        score_handler.write(str(ConfusionMatrix(Y_test_indices, y_prediction_classes)))
        score_handler.write("\n\n\nModel summary\n")
        # ``summary`` calls print_fn once per line; add the newline ourselves.
        model.summary(print_fn=lambda x: score_handler.write(x + '\n'))
def test_pandas_confusion_normalized_issue1():
    """Regression test for issue 1: normalization when a class has no true samples.

    'health' is predicted but never occurs in ``y_true``; its normalized row
    sum is allowed to be NaN and is counted as 1 before totalling.
    See http://stackoverflow.com/questions/19233771/sklearn-plot-confusion-matrix-with-labels/31720054#31720054
    """
    y_true = ['business', 'business', 'business', 'business', 'business',
              'business', 'business', 'business', 'business', 'business',
              'business', 'business', 'business', 'business', 'business',
              'business', 'business', 'business', 'business', 'business']

    y_pred = ['health', 'business', 'business', 'business', 'business',
              'business', 'health', 'health', 'business', 'business', 'business',
              'business', 'business', 'business', 'business', 'business',
              'health', 'health', 'business', 'health']

    cm = ConfusionMatrix(y_true, y_pred)
    df = cm.to_dataframe()
    df_norm = cm.to_dataframe(normalized=True)
    # skipna=False keeps NaN row sums visible so that fillna(1) can replace them.
    row_sums = df_norm.sum(axis=1, skipna=False).fillna(1)
    # Compare with a tolerance instead of relying on exact float equality.
    assert abs(row_sums.sum() - len(df)) < 1e-9
Esempio n. 12
0
def train_test_and_evaluate(pipeline, X_train, y_train, X_test, y_test):
    """Fit the pipeline, predict the test set and print matrix and report.

    Returns the fitted pipeline together with the confusion matrix.
    """
    pipeline.fit(X_train, y_train)
    predicted = pipeline.predict(X_test)

    matrix = ConfusionMatrix(list(y_test), list(predicted))
    display_accuracy_difference(y_test, predicted)
    report = matrix.classification_report

    rule = '-' * 75
    print(rule + '\nConfusion Matrix\n')
    print(matrix)
    print(rule + '\nClassification Report\n')
    print(report)

    return pipeline, matrix
def evaluateModel(pp, preds, fold):
    """Threshold predictions at 0.5, print metrics and save a confusion-matrix plot.

    ``preds`` is binarised in place. The plot is saved as
    ``cmpre<fold>.png`` with tick labels taken from the module-level
    ``classes``.
    """
    # Binarise the raw predictions (modifies the caller's array).
    preds[preds >= 0.5] = 1
    preds[preds < 0.5] = 0

    print("F1 SCORE:")
    per_class_f1 = f1_score(pp.test_y, preds, average=None)
    print(per_class_f1)
    print("Hamming Loss:")
    hamming = hamming_loss(pp.test_y, preds)
    print(hamming)
    print("Zero-one loss:")
    zero_one = zero_one_loss(pp.test_y, preds)
    print(zero_one)

    # Convert both label sets from the "plus" encoding to one-hot, then take
    # the arg-max along axis 1 to get a single class index per sample.
    true_one_hot = utils.from_plus_to_one_hot(np.array(pp.test_y))
    pred_one_hot = utils.from_plus_to_one_hot(np.array(preds))
    true_index = np.array(true_one_hot).argmax(1)
    pred_index = np.array(pred_one_hot).argmax(1)

    cm = ConfusionMatrix(true_index, pred_index)
    print(cm)

    axes = cm.plot()
    axes.set_xticklabels(classes, rotation="vertical")
    axes.set_yticklabels(classes)
    plt.savefig("cmpre{0}.png".format(fold))
Esempio n. 14
0
def number_pred(save, show):
    """Print, plot and report stats for a confusion matrix of numeric labels.

    :param save: when truthy, save the plot as ``numbers.png`` in the
        ``screenshots`` directory next to the module-level ``basepath``.
    :param show: when truthy, display the plot.
    """
    actual = [600, 200, 200, 200, 200, 200, 200, 200, 500, 500,
              500, 200, 200, 200, 200, 200, 200, 200, 200, 200]
    predicted = [100, 200, 200, 100, 100, 200, 200, 200, 100, 200,
                 500, 100, 100, 100, 100, 100, 100, 100, 500, 200]
    matrix = ConfusionMatrix(actual, predicted)

    print(matrix)

    matrix.plot()
    if save:
        target = os.path.join(basepath, '..', 'screenshots', 'numbers.png')
        plt.savefig(target)
    if show:
        plt.show()

    matrix.print_stats()
def test_pandas_confusion_get():
    """``get`` with one class equals the diagonal cell; two classes address any cell."""
    actual = ['rabbit', 'cat', 'rabbit', 'rabbit', 'cat', 'dog',
              'dog', 'rabbit', 'rabbit', 'cat', 'dog', 'rabbit']
    predicted = ['cat', 'cat', 'rabbit', 'dog', 'cat', 'rabbit',
                 'dog', 'cat', 'rabbit', 'cat', 'rabbit', 'rabbit']
    print("y_true: %s" % actual)
    print("y_pred: %s" % predicted)

    matrix = ConfusionMatrix(actual, predicted)

    # The single-argument form is shorthand for the diagonal cell.
    assert matrix.get("cat") == matrix.get("cat", "cat")

    # Diagonal cells.
    assert matrix.get("cat") == 3
    assert matrix.get("dog") == 1
    assert matrix.get("rabbit") == 3

    # One off-diagonal cell.
    assert matrix.get("dog", "rabbit") == 2
def get_confusion_matrix(_results_file):
    """Plot, save and print the confusion matrix for a results file.

    The file is read as tab-separated values without a header row; column 1
    supplies the true labels and column 2 the predicted labels. The plot is
    saved under the same path with ``.txt`` replaced by ``_cm.jpg``.
    Per-class recall and precision and the overall accuracy are printed as
    percentages.

    :param _results_file: path of the tab-separated results file.
    """
    # Read the file that was passed in rather than a module-level
    # ``results_file`` that may point somewhere else.
    df = pd.read_csv(_results_file, sep='\t', header=None)
    true_lbls = df[1]
    pred_lbls = df[2]
    confusion_matrix = ConfusionMatrix(true_lbls, pred_lbls)
    confusion_matrix.plot()
    cm_file = _results_file.replace('.txt', '_cm.jpg')
    plt.savefig(cm_file)

    print()
    print(confusion_matrix)
    print()
    cm = confusion_matrix.to_dataframe()
    # pandas_confusion puts actual classes on the rows and predicted classes
    # on the columns: row totals are the recall denominators, column totals
    # the precision denominators.
    actual_totals = cm.sum(axis=1)
    predicted_totals = cm.sum(axis=0)
    correct = 0
    for i in range(cm.shape[0]):
        # Purely positional access: the class labels may themselves be
        # integers, in which case ``series[i]`` would be a label lookup.
        hits = cm.iloc[i, i]
        correct += hits
        recall = hits * 100 / actual_totals.iloc[i]
        prec = hits * 100 / predicted_totals.iloc[i]
        print('Class %s recall = %.4f precision = %.4f' %
              (cm.columns[i], recall, prec))
    print('Overall accuracy = %.4f' %
          float(correct * 100 / predicted_totals.sum()))
Esempio n. 17
0
def eval_fish_net(weights, test_iters=10):
    """Evaluate the fish classifier and print its confusion matrix.

    The first 30 samples of one forward pass are classified by their
    highest class probability and compared against the module-level
    ``actual`` labels. Accuracy is then averaged over ``test_iters``
    further forward passes.

    :param weights: weights file handed to ``caffe.Net``.
    :param test_iters: number of forward passes averaged for the accuracy.
    :return: tuple of the test net and the mean accuracy.
    """
    # Order matches the columns read from the 'probs' output.
    class_names = ("rainbow_trout", "bluegill", "chain_pickerel")
    test_net = caffe.Net(fish_net(train=False), weights, caffe.TEST)

    # Run a single forward pass and read all three class probabilities of a
    # sample from that same result; calling forward() once per class would
    # compare values that come from three different passes.
    probs = test_net.forward()['probs']
    prediction = []
    # NOTE(review): 30 is presumably the batch size and len(actual) -- confirm.
    for i in range(30):
        scores = [probs[i][k] for k in range(len(class_names))]
        # Always append exactly one label (the first maximum on ties) so that
        # ``prediction`` stays aligned with ``actual``.
        prediction.append(class_names[scores.index(max(scores))])

    accuracy = 0
    for _ in range(test_iters):
        accuracy += test_net.forward()['acc']
    accuracy /= test_iters
    # ``actual`` is not defined in this function; it must exist at module level.
    confusion_matrix = ConfusionMatrix(actual, prediction)
    print("Confusion matrix:\n%s" % confusion_matrix)
    return test_net, accuracy
Esempio n. 18
0
def size_pred(save, show):
    """Print, report stats for and plot the clothing-size confusion matrix.

    :param save: when truthy, save the plot as ``size.png`` in the
        ``screenshots`` directory next to the module-level ``basepath``.
    :param show: when truthy, display the plot.
    """
    heights = [150, 150, 151, 151, 152, 155, 155, 157, 157, 157, 157, 158, 158, 159, 159, 159, 160, 160, 162, 162, 163, 164, 165, 168, 169, 169, 169, 170, 171, 171, 173, 173, 174, 176, 177, 177, 179, 179, 179, 179, 179, 181, 181, 182, 183, 184, 186, 190, 190]
    weights = [54, 55, 55, 47, 58, 53, 59, 60, 56, 55, 62, 56, 55, 55, 64, 61, 59, 59, 63, 66, 64, 62, 66, 66, 72, 65, 75, 71, 70, 70, 75, 65, 79, 78, 83, 75, 84, 78, 74, 75, 74, 90, 80, 81, 90, 81, 91, 87, 100]
    sizes = ['S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'XL', 'XL', 'XL', 'XL', 'XL', 'XL', 'XL', 'XL']
    predicted_sizes = ['S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'L', 'M', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'XL', 'L', 'L', 'XL', 'L', 'XL', 'XL', 'XL']
    df = pd.DataFrame({
        'Height': heights,
        'Weight': weights,
        'Size': sizes,
        'SizePred': predicted_sizes,
    })

    matrix = ConfusionMatrix(df["Size"], df["SizePred"])
    print(matrix)
    matrix.print_stats()

    matrix.plot()
    if save:
        target = os.path.join(basepath, '..', 'screenshots', 'size.png')
        plt.savefig(target)
    if show:
        plt.show()
Esempio n. 19
0
       'business', 'health', 'health', 'business', 'business', 'business',\
              'business', 'business', 'business', 'business', 'business',\
'health', 'health', 'business', 'health']
"""
y_test = [2, 1, 1, 5, 5, 2, 7, 5, 3, 3, 1, 1, 5, 1, 3, 7, 5, 7, 3, 2, 1, 7, 1,\
3, 2, 5, 2, 7, 2, 1, 7, 3, 3, 3, 7, 5, 7, 5, 7, 1, 7, 2, 5, 1, 2, 2, 1, 7, 5,\
3, 7, 3, 7, 3, 3, 2, 7, 3, 1, 7, 2, 1, 7, 5, 7, 3, 2, 5, 1, 2, 3, 2, 7, 7, 3,\
7, 1, 3, 5, 1, 7, 1, 7, 1, 7, 7, 5, 3, 7, 2, 1, 5, 7, 1, 3, 7, 2, 5, 2, 1, 3,\
5, 2, 5, 2, 5, 3, 1, 7, 3, 1, 2, 3, 2, 5, 5, 7, 1, 1, 3, 5, 2, 3, 7, 7]
y_pred = [2, 5, 1, 5, 5, 2, 7, 5, 3, 3, 1, 1, 5, 1, 3, 7, 5, 7, 3, 2, 1, 7, 1,\
3, 2, 5, 2, 5, 2, 1, 7, 3, 3, 3, 7, 5, 7, 5, 3, 5, 7, 2, 5, 1, 2, 2, 1, 7, 5,\
3, 7, 3, 7, 3, 3, 2, 7, 3, 1, 7, 2, 1, 7, 5, 7, 3, 2, 5, 1, 2, 3, 2, 7, 7, 3,\
7, 1, 2, 5, 1, 7, 3, 7, 3, 7, 7, 5, 3, 7, 2, 1, 5, 5, 1, 3, 1, 2, 5, 2, 1, 3,\
5, 2, 5, 2, 1, 3, 2, 7, 3, 1, 2, 1, 2, 5, 5, 7, 1, 1, 3, 5, 2, 3, 7, 7]
"""
y_test = [600, 200, 200, 200, 200, 200, 200, 200, 500, 500, 500, 200, 200,200,\
200, 200, 200, 200, 200, 200]
y_pred = [100, 200, 200, 100, 100, 200, 200, 200, 100, 200, 500, 100, 100,100,\
100, 100, 100, 100, 500, 200]
"""

# Leftover debugging / string-conversion experiments, kept disabled:
#print type(y_pred)
#test_str = map(str,y_test)
#pred_str = map(str,y_pred)

# Build the confusion matrix from whichever y_test / y_pred lists are
# active above, then print the matrix followed by its statistics.
cm = ConfusionMatrix(y_test, y_pred)
#cm = ConfusionMatrix(test_str, pred_str)
# Python 2 print statement.
print cm

cm.print_stats()
Esempio n. 20
0
def benchmark(subtask, config_parameters):
    """Train a Keras sentence model for one subtask and evaluate it.

    Steps: load the train/test JSON files (optionally with a pickled
    embedding cache), build the model named in the configuration, fit it
    while checkpointing the best model on ``val_acc``, save the last model,
    then for both the best and the last model log the score, accuracy,
    F1 scores and confusion matrix and write prediction and score files.

    :param subtask: subtask identifier; ``'A'`` uses 2 classes, anything
        else 3.
    :param config_parameters: mapping read for the keys
        ``embeddings_cache_name``, ``padding_length``,
        ``keras_model_parameters``, ``keras_model_name``, ``evaluation_ID``,
        ``batch_size`` and ``epochs``.
    """
    t0 = time.time()
    logger = logging.getLogger()
    np.random.seed(14021993)
    # Step 1) Load dataset
    train_path = 'data/THF/sentence/subtask{}_v3_train.json'.format(subtask)
    test_path = 'data/THF/sentence/subtask{}_v3_test.json'.format(subtask)
    number_of_classes = 2 if subtask == 'A' else 3
    # config_parameters['number_of_classes'] = number_of_classes
    embedding_cache = None
    if config_parameters['embeddings_cache_name']:
        embedding_cache_path = 'data/embedding_cache/{}'.format(config_parameters['embeddings_cache_name'])
        logger.info('Loading embedding cache: {}'.format(embedding_cache_path))
        # Close the cache file deterministically instead of leaking the handle.
        # NOTE: unpickling can execute arbitrary code; only load trusted caches.
        with open(embedding_cache_path, "rb") as cache_file:
            embedding_cache = pickle.load(cache_file)
        logger.info('Embedding cache loaded')
    logger.debug('Create mapping')
    word_to_index_mapping, index_to_embedding_mapping = vocabulary_builder.create_mappings(train_path=train_path,
                                                                                           test_path=test_path,
                                                                                           word_to_embedding_cache=embedding_cache)
    logger.info('The embedding layer has {} entries'.format(len(word_to_index_mapping)))
    logger.debug('Loading train and test set')
    X_train, Y_train, train_unique_ids, Y_train_indices = load_dataset(train_path, word_to_index_mapping,
                                                                       subtask,
                                                                       config_parameters['padding_length'])

    X_test, Y_test, test_unique_ids, Y_test_indices = load_dataset(test_path, word_to_index_mapping,
                                                                   subtask,
                                                                   config_parameters['padding_length'])
    # Step 2) Create model with parameters
    model_parameters = config_parameters['keras_model_parameters']
    logger.info(config_parameters)
    if config_parameters['embeddings_cache_name']:
        model_parameters['index_to_embedding_mapping'] = index_to_embedding_mapping
    model = model_selector.get_model(config_parameters['keras_model_name'], number_of_classes, model_parameters)

    # Step 3) Train the model
    logger.info('Train...')
    current_time = time.strftime('%Y%m%d_%H%M%S')
    model_save_path = 'results/sentence_deeplearning/temp/{}_{}_{}_{}'.format(subtask,
                                                                              config_parameters['keras_model_name'],
                                                                              '{:03}'.format(
                                                                                  config_parameters['evaluation_ID']),
                                                                              current_time)
    checkpoint_save_path = model_save_path + "_best.hdf5"
    checkpoint = ModelCheckpoint(checkpoint_save_path,
                                 monitor='val_acc', verbose=0,
                                 save_best_only=True, mode='auto')
    # The test set doubles as validation data for checkpoint selection.
    model.fit(X_train, Y_train,
              batch_size=config_parameters['batch_size'],
              epochs=config_parameters['epochs'],
              verbose=1,
              callbacks=[checkpoint],
              validation_data=(X_test, Y_test))

    # Step 4) Save the last model
    model.save(model_save_path + "_last.hdf5")
    model.summary()
    # Calculate results for the best and the last model
    saved_models = [{'name': 'best', 'extension': '_best.hdf5'}, {'name': 'last', 'extension': '_last.hdf5'}]
    for saved_model in saved_models:
        model_load_path = model_save_path + saved_model['extension']
        logger.info('Loading model for prediction: {}'.format(model_load_path))
        model = load_model(model_load_path)

        # Step 5) Predict the test set
        score, acc = model.evaluate(X_test, Y_test, batch_size=config_parameters['batch_size'])
        y_prediction = model.predict(X_test, batch_size=config_parameters['batch_size'])
        y_prediction_classes = np.argmax(y_prediction, axis=1)
        # Step 6) Print results
        logger.info(y_prediction_classes)
        logger.info('Test score: {}'.format(score))
        logger.info('Test accuracy: {}'.format(acc))
        # Per-class F1; their plain mean is the macro average.
        f1 = f1_score(Y_test_indices, y_prediction_classes, average=None)
        f1_mean = np.mean(f1)
        logger.info("Macro-averaged F1: {}".format(f1_mean))
        logger.info("Individual scores: {}".format(f1))
        logger.info("Confusion matrix:")
        logger.info(ConfusionMatrix(Y_test_indices, y_prediction_classes))

        # Same prefix as the saved models, suffixed with "best" / "last".
        output_path_base = '{}_{}'.format(model_save_path, saved_model['name'])

        # Step 7) Print results to the file system
        utils.write_prediction_file(path=output_path_base + '.predictions', test_unique_ids=test_unique_ids,
                                    Y_test_indices=Y_test_indices, y_prediction_classes=y_prediction_classes)
        utils.write_score_file(score_file=output_path_base + '.score', f1_mean=f1_mean, f1=f1, model=model,
                               Y_test_indices=Y_test_indices, y_prediction_classes=y_prediction_classes)

    print("Total execution time in %0.3fs" % (time.time() - t0))
    print("*****************************************")
Esempio n. 21
0
mean_fare = data_test_clean['Fare'].dropna().mean()
data_test_clean.iloc[:, 5][data_test_clean.iloc[:, 5].isnull()] = mean_fare

# data_test_clean = data_test_clean.dropna()
data_test_clean['Age'] = data_test_clean['Age'].astype('int64')
data_test_clean['Fare'] = data_test_clean['Fare'].astype('int64')
data_test_clean['Sex'] = lb.fit_transform(data_test_clean['Sex'])
data_test_clean['Embarked'] = lb2.fit_transform(data_test_clean['Embarked'])

# LOGISTIC REGRESSION
logr = LogisticRegression(class_weight='balanced')
logr = logr.fit(x_train, y_train)
logr_predictions_train = logr.predict(x_train)
accuracy_train = accuracy_score(y_train, logr_predictions_train)
print accuracy_train  #0.78675
cm1 = ConfusionMatrix(y_train, logr_predictions_train)
# cm1.print_stats()

logr_predictions_test = logr.predict(data_test_clean)
data_test_id = data_test['PassengerId'].values

prediction_file = open("RESULTS_LR4.csv", "wb")
prediction_file_object = csv.writer(prediction_file)
prediction_file_object.writerow(["PassengerId", "Survived"])
print 'This needs to be 418: ', len(data_test_id)
print 'This needs to be 418: ', len(logr_predictions_test)
prediction_file_object.writerows(zip(data_test_id, logr_predictions_test))
prediction_file.close()
#Accuracy = 0.73206

# REGULARIZATION - search for the best parameter
Esempio n. 22
0
def main():
    """Train an ID3 decision tree on ``mushroom.csv`` and report its quality.

    Reads and shuffles the data, splits it 75% / 25% into training and test
    sets, prints the per-feature information gain, builds and prints a tree
    limited to ``maxlevel=4``, then prints the confusion matrix, accuracy,
    precision and recall on the test set, followed by the early-stopping
    performance table.
    """
    filename = "mushroom.csv"

    idtree = id3_tree.id3_tree()

    # Split the data into data features, output classes and label names.
    data, classes, feature_names = idtree.read_data(filename)
    data, classes = shuffle(data, classes, random_state=0)

    target_label = dict(Counter(classes))
    # list(...) keeps the positional access working where dict.values() is a
    # view.  NOTE(review): which count is "Poisonous" and which "Edible"
    # depends on the dict's iteration order -- confirm.
    label_counts = list(target_label.values())
    print("target datase contains Poisonous: %d and Edible: %d" % (label_counts[0], label_counts[1]))

    # Split mushroom data into training and test data set:
    # 3/4 training and 1/4 test dataset.
    ntrain = int(0.75 * np.shape(data)[0])

    train_data = data[:ntrain]
    test_data = data[ntrain:]
    train_target = classes[:ntrain]
    test_target = classes[ntrain:]

    print("\n\nThe data split into train data: (%s, %s)" % np.shape(train_data))
    print("and Test dataset: : (%s, % s) \n\n" % np.shape(test_data))

    feature = np.shape(feature_names)[0]
    feature_entropy = []
    # NOTE(review): every pass overwrites ``feature_entropy`` instead of
    # accumulating, and the result is indexed per feature below -- confirm
    # what ``calc_information_gain`` returns before changing this.
    for i in range(feature):
        feature_entropy = idtree.calc_information_gain(data, classes, i)

    feature_info_gain = [[feature_names[i], feature_entropy[i]]
                         for i in range(feature)]
    print("Information gain for the whole features in mushroom dataset\n")
    print(tabulate(feature_info_gain, headers=("Feature Name", "Information Gain"),
                   tablefmt="orgtbl") + "\n \n \n ")

    # Train on the training data and create the tree from it.
    mtree = idtree.make_tree(train_data, train_target, feature_names, maxlevel=4)

    idtree.printTree(mtree, '')

    predicted_output = idtree.classifyAll(mtree, test_data)
    confusion_matrix = ConfusionMatrix(test_target, predicted_output)
    print("\nConfusion matrix:\n\n%s" % confusion_matrix)

    accuracyScore = accuracy_score(test_target, predicted_output)
    print("\nAccurecy Score: %0.2f " % (accuracyScore * 100) + "%")

    # Precision and recall for the same predictions.
    precision, recall = calc_precision_recall(test_target, predicted_output)

    print("\nprecision and recall of the classifier is: \nPrecision %0.2f \nRecall %0.4f " % (precision, recall))

    print("")
    accuracyscores, perf_measure = early_stopping(train_data, train_target, test_data, test_target, feature_names)

    print("Performance of the classifier with early stopping")
    print(tabulate(perf_measure[:6], headers=("TP", "FP", "TN", "FN"),
                   tablefmt="orgtbl") + "\n \n \n ")
def test_pandas_confusion_cm_strings():
    """String class labels pass the shared ``asserts`` consistency checks."""
    actual = ['rabbit', 'cat', 'rabbit', 'rabbit', 'cat', 'dog',
              'dog', 'rabbit', 'rabbit', 'cat', 'dog', 'rabbit']
    predicted = ['cat', 'cat', 'rabbit', 'dog', 'cat', 'rabbit',
                 'dog', 'cat', 'rabbit', 'cat', 'rabbit', 'rabbit']

    matrix = ConfusionMatrix(actual, predicted)
    print("Confusion matrix:\n%s" % matrix)

    asserts(actual, predicted, matrix)
Esempio n. 24
0
## Import the test data that was created in R.
gname = "NewsDataTest.csv"
full_file_test = path + "/" + gname

# Skip the header row and load only the columns listed in cols[1:].
data_test = np.loadtxt(full_file_test,
                       delimiter=",",
                       skiprows=1,
                       usecols=tuple(cols[1:]))
# NOTE(review): the features stop two columns before the end while the
# target is the last column, so the second-to-last column is unused -- confirm.
X_test = data_test[:, 1:-2]
y_test = data_test[:, -1]

# Fit a logistic regression with balanced class weights on the training
# data (X, y are defined earlier in the script) and predict the test set.
logistic = linear_model.LogisticRegression(class_weight='balanced')
logistic = logistic.fit(X, y)
y_predicted = logistic.predict(X_test)  # predicted class
cm = ConfusionMatrix(y_test, y_predicted)
cm.print_stats()
acc = accuracy_score(y_test, y_predicted)
#print(classification_report(y_test, y_predicted))
cmatrix = confusion_matrix(y_test, y_predicted)
# Weighted sum of the four confusion-matrix cells: +100 for [1, 1],
# +15 for [0, 0], -15 for [0, 1] and -30 for [1, 0].
ROI = cmatrix[1, 1] * 100 + cmatrix[0, 0] * 15 + cmatrix[0, 1] * (
    -15) + cmatrix[1, 0] * (-30)

# Next, try regularization: grid-search the regularization parameter C.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
clf = GridSearchCV(LogisticRegression(penalty='l2'), param_grid)
GridSearchCV(cv=None,
             estimator=LogisticRegression(C=1.0,
                                          intercept_scaling=1,
                                          dual=False,
                                          fit_intercept=True,
Esempio n. 25
0
if __name__ == '__main__':
    # flag selects the experiment: 1 = random 80/20 split with LinearSVC,
    # 2 = hold out one app at a time, per category.
    flag = 2
    # save_feature()
    df = pd.read_csv(path.join(feature_data), header=0)

    if flag == 1:
        # Every column except 'label' and 'app' is used as a feature.
        features = df[df.columns.difference(['label', 'app'])].values
        labels = df['label'].values

        X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
        clf = LinearSVC(C=1.0, loss='squared_hinge', penalty='l2',multi_class='ovr')
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print("Accuracy: {0:0.1f}%".format(accuracy_score(y_test, y_pred) * 100))

        confusion_matrix = ConfusionMatrix(y_test, y_pred, display_sum=False)
        print(confusion_matrix)
    elif flag ==2:
        cates = ['other', 'video', 'live', 'audio', 'radio']
        for cate in cates:
            # NOTE(review): these accumulators are not used in the visible
            # part of the loop; the fragment appears to be cut off below.
            total_num = 0.0
            wright_num = 0
            accuracy_list = []
            # All apps that have at least one row labelled with this category.
            apps = np.unique(df[df.label == cate]['app'].values)
            for i, app in enumerate(apps):
                # Train on every other app, test on the held-out app.
                df_train = df[df.app != app]
                df_test = df[df.app == app]
                x_train, y_train = df_train[df.columns.difference(['label', 'app'])].values, df_train['label'].values
                x_test, y_test = df_test[df.columns.difference(['label', 'app'])].values, df_test['label'].values
                # Standardize the features, then fit an SVC with default settings.
                clf = Pipeline([('std', StandardScaler()), ('clf', SVC())])
                clf.fit(x_train, y_train)
Esempio n. 26
0
def main():
	"""Evaluate the first-level classifier and print its confusion matrix.

	Reads a JSON configuration whose path is given as the first command-line
	argument, loads the model weights and the pickled test set, evaluates the
	model, saves the raw predictions to ``prediction.npy`` and prints a
	confusion matrix built from the encoded true and predicted labels.
	"""
	path=sys.argv[1]
	with open(path) as f:
		config=json.load(f)
	batch_size=int(config['batch_size'])
	nb_classes=int(config['nb_classes'])
	weight_path=config['weights']

	
	
	

	##################### First level of classification ################################

	##### load model
	model=None

	model=new.load_model(nb_classes,weight_path)
		


	####### specify the optimizer and compile with the loss function
	sgd = SGD(lr=0.00005, decay = 1e-5, momentum=0.99, nesterov=True)
	#sgd = SGD(lr=0.00005, decay = 1e-6, momentum=0.9, nesterov=True)
	model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
	#model.compile(loss='sparse_categorical_crossentropy', optimizer=sgd, metrics=[metrics.mae,  metrics.sparse_categorical_accuracy])



	######## load data
	test={}
	# NOTE: unpickling can execute arbitrary code; only load trusted files.
	with open(config['data_path']+'/'+config['dataset_name']+'.test','rb') as f:
		test=pickle.load(f)
	
	x_test,y_test,imgname=test['data'],test['labels'],test['imgname']
	# Reshape to (N, 227, 227, 1) for the model input.
	x_ts = x_test.reshape((-1,227,227,1))
	
	print(x_ts.shape, 'test samples')
	print(y_test.shape, 'test sample labels')



	##### evaluation, prediction and confusion matrix construction
	scores=model.evaluate(x_ts,y_test,batch_size=batch_size,verbose=0)
	print("model %s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
	prediction= model.predict_classes(x_ts,verbose=1)
	#print(prediction)
	np.save('prediction.npy', prediction)
	pre=np.array(prediction)
	# Presumably turns each predicted class index into an indicator row so it
	# can be compared with the rows of y_test -- TODO confirm.
	pre=MultiLabelBinarizer().fit_transform(pre.reshape(-1, 1))
	orig=y_test
	print('')
	print('')
	print('score for first level classification:   ',scores)
	'''
	count = 0
	for i in range(0,len(pre)):
		if not np.array_equal(orig[i],pre[i]):
			print(imgname[i],"_",orig[i],"_",pre[i],"_False")
			count = count + 1 
	print (count)
	'''
	aa=[0,1]
	aa = np.array(aa)
	print('')
	print('')
	print(MultiLabelBinarizer().fit_transform(aa.reshape(-1, 1)))
	print("0-Nontumor    1-Tumor")
	# Row patterns used below: [0, 1] is recorded as class 1, [1, 0] as class 0.
	a=[0,1]
	a=np.array(a)
	b=[1,0]
	b=np.array(b)
	y_true = []
	y_pred = []
	print(range(len(prediction)))

	# NOTE(review): rows matching neither pattern are skipped, so y_true and
	# y_pred can end up with different lengths.
	for i in range(len(prediction)):
		if np.array_equal(orig[i],a):
			y_true.append(1)
		elif np.array_equal(orig[i],b):
			y_true.append(0)
		
	for i in range(len(prediction)):
		if np.array_equal(pre[i],a):
			y_pred.append(1)
		elif np.array_equal(pre[i],b):
			y_pred.append(0)

	cm = ConfusionMatrix(y_true, y_pred)
	print('')
	print('')
	print('*****************************Confusion Matrix for first level Classification****************************')
	print(cm)

	print('')

	print('')
	



############################ Second level Classification ###############################
	'''
def test_pandas_confusion_cm_stats_animals():
    """String labels: stats type, population and per-class true positives."""
    actual = ['rabbit', 'cat', 'rabbit', 'rabbit', 'cat', 'dog',
              'dog', 'rabbit', 'rabbit', 'cat', 'dog', 'rabbit']
    predicted = ['cat', 'cat', 'rabbit', 'dog', 'cat', 'rabbit',
                 'dog', 'cat', 'rabbit', 'cat', 'rabbit', 'rabbit']
    print("y_true: %s" % actual)
    print("y_pred: %s" % predicted)

    matrix = ConfusionMatrix(actual, predicted)
    assert isinstance(matrix.stats(), OrderedDict)
    assert matrix.population == len(actual)  # 12
    matrix.print_stats()
    cm_stats = matrix.stats()  # noqa

    # The binarized true-positive count equals the diagonal cell of the class.
    assert matrix.binarize("cat").TP == matrix.get("cat")  # same as get("cat", "cat")
    assert matrix.binarize("cat").TP == 3
    for animal in ("dog", "rabbit"):  # diagonal cells: 1 and 3
        assert matrix.binarize(animal).TP == matrix.get(animal)
# Plot the cross-validation scores against c on a logarithmic x axis
# (c and cv_scores come from earlier cells).
plt.plot(c, cv_scores, '-o')
plt.xscale('log')


# In[ ]:

# Accuracy of the fitted classifier on the test set.
predicted = clf.predict(X_test)
expected = y_test
print(accuracy_score(expected, predicted))


# In[ ]:

# Log loss computed from the predicted class probabilities.
predicted_probs = clf.predict_proba(X_test)
print(log_loss(y_test, predicted_probs))


# In[ ]:

cm = ConfusionMatrix(expected, predicted)
# Divide every row by its own sum, then write the result to CSV.
cm_stats = cm.to_dataframe().apply(lambda x: x/sum(x), axis=1)
cm_stats.to_csv('data/confusion_matrix_stats.csv')


# In[ ]:

# Use a wider default figure for the normalized confusion-matrix plot.
mpl.rcParams['figure.figsize'] = (10.0, 5.0)
cm.plot(normalized=True)

Esempio n. 29
0
  from pandas_confusion import ConfusionMatrix

  # Confusion matrix of the true test labels against the predictions.
  cm=ConfusionMatrix(y_test,pred)

  # Plot with the 'PuBu' colormap; ``c`` is presumably a matplotlib colormap
  # module imported elsewhere -- TODO confirm.
  cm.plot(cmap=c.get_cmap('PuBu'))
Esempio n. 30
0
    # collect perceptron votes to build confusion matrix
    # collect_votes runs perceptron for instances of letters in the testing data set
    # returns the winning letter by vote to store into predicted
    predicted = collect_votes(letter)
    #print letter.value[0], predicted

    # append to confusion matrix using pandas
    y_pred = y_pred.append(pd.Series(predicted, index=[letter_increment]))
    y_actu = y_actu.append(pd.Series(letter.value[0],
                                     index=[letter_increment]))

    # append pandas_confusion
    y_pred_stats.append(predicted)
    y_actu_stats.append(letter.value[0])

    # increment counter for next letter
    letter_increment += 1

# Build the confusion matrix with plain pandas: a cross-tabulation of the
# actual against the predicted letters; margins=True adds the totals.
df_confusion = pd.crosstab(y_actu,
                           y_pred,
                           rownames=['Actual'],
                           colnames=['Predicted'],
                           margins=True)
# Python 2 print statement.
print df_confusion

# Build the same matrix with pandas_confusion from the plain lists collected
# alongside the Series, and print its statistics.
cm = ConfusionMatrix(y_actu_stats, y_pred_stats)
# print("Confusion matrix:\n%s" % cm)
cm.print_stats()
Esempio n. 31
0
    votes = dict.fromkeys(alpha, 0) # reset dictionary for next test case
    y.append(count) # add classfication to array for confusion matrix
    if count == tcf.index[i]: #if the vote matches the known value of the target incremement correct
        t_correct += 1

# ***************************************************************************************************
# **************************** Creates and Displays Confusion Matrix ********************************
# ***************************************************************************************************

# Uses the pandas_confusion library to generate the confusion matrix.
# (Python 2 print statements throughout.)
print '\n\nConfusion Matrix:\n\n'

# NOTE(review): the divisor 10000 is hard-coded; presumably the number of
# test cases -- confirm.  The percentage is rounded up with ceil.
print '\tAccuracy is: ', m.ceil(float(t_correct) / 10000 * 100), '\n\n'


# Named Series of the true values and of the classifications collected in y.
y_actul = pd.Series(y_true, name='Actual')
y_pred = pd.Series(y, name='Predicted')

# Statistics from pandas_confusion.
confusion1 = ConfusionMatrix(y_actul, y_pred)
#
confusion1.print_stats()

# The same table via a plain pandas cross-tabulation with totals.
confusion2 = pd.crosstab(y_actul, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

print confusion2

# A third rendering through the ``confusion_matrix`` function.
print confusion_matrix(y_actul, y_pred)



Esempio n. 32
0
from sklearn.metrics import f1_score
import numpy as np
from pandas_confusion import ConfusionMatrix

# Read the tab-separated predictions file. The first line is a header; in
# every following row, column 1 is the true label and column 2 the predicted.
y_true = []
y_pred = []
with open('C_baseline.predictions') as predictions_file:
    # Skip the header; the default keeps an empty file from raising
    # StopIteration.
    next(predictions_file, None)
    for line in predictions_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        row = line.rstrip('\n').split('\t')
        y_true.append(row[1])
        y_pred.append(row[2])

# The file is closed by now; everything below works on the in-memory lists.
# average=None yields one F1 score per class. Their unweighted mean is the
# *macro*-averaged F1 (micro-averaging would pool the counts over all classes
# before computing a single score), so the label says "Macro".
f1 = f1_score(y_true, y_pred, average=None)
f1_mean = np.mean(f1)
print("Macro-averaged F1: {}".format(f1_mean))
print("Individual scores: {}".format(f1))
print("Confusion matrix:")
print(ConfusionMatrix(y_true, y_pred))
def main(save, show):
    """Walk through the ConfusionMatrix API on small animal-label samples.

    Prints the matrix and its DataFrame form, plots the raw and the
    normalized matrix, prints the statistics next to scikit-learn's own
    confusion matrix and classification report, and finally binarizes a
    second matrix.

    save -- when true, each plot is written below ``../screenshots``
            relative to this file.
    show -- when true, each plot is displayed with ``plt.show()``.
    """
    here = os.path.dirname(__file__)

    def _emit(name):
        # Persist and/or display the current figure, honouring the two flags.
        if save:
            plt.savefig(os.path.join(here, '..', 'screenshots', name))
        if show:
            plt.show()

    # String-labelled sample: actual classes first, predicted classes second.
    actual = ['rabbit', 'cat', 'rabbit', 'rabbit', 'cat', 'dog', 'dog', 'rabbit', 'rabbit', 'cat', 'dog', 'rabbit']
    predicted = ['cat', 'cat', 'rabbit', 'dog', 'cat', 'rabbit', 'dog', 'cat', 'rabbit', 'cat', 'rabbit', 'rabbit']
    cm = ConfusionMatrix(actual, predicted)

    # Textual views: the matrix itself, then its DataFrame and column dtypes.
    print("Confusion matrix:\n%s" % cm)
    frame = cm.to_dataframe()
    print(frame)
    print(frame.dtypes)

    # Raw counts plot.
    cm.plot()
    _emit('cm.png')

    # Normalized plot.
    cm.plot(normalized=True)
    _emit('cm_norm.png')

    # pandas_confusion statistics and report ...
    cm.print_stats()
    print(cm.classification_report)

    # ... followed by scikit-learn's output for the same labels.
    print("sklearn confusion_matrix:\n%s" % confusion_matrix(actual, predicted))
    print(classification_report(actual, predicted))

    # Second sample: build a three-class matrix and binarize it on
    # 'ant' and 'cat' (leaving 'bird' out of the selected group).
    print("Binarize a confusion matrix")
    actual = ["cat", "ant", "cat", "cat", "ant", "bird"]
    predicted = ["ant", "ant", "cat", "cat", "ant", "cat"]
    cm = ConfusionMatrix(actual, predicted)
    print(cm)
    binary_cm = cm.binarize(['ant', 'cat'])
    print(binary_cm)
Esempio n. 34
0
def _roi(cmatrix):
    """Return the return-on-investment score for a 2x2 confusion matrix.

    Cell weights: [1, 1] earns 100, [0, 0] earns 15, [0, 1] costs 15 and
    [1, 0] costs 30.
    """
    return (cmatrix[1, 1] * 100 + cmatrix[0, 0] * 15
            + cmatrix[0, 1] * (-15) + cmatrix[1, 0] * (-30))


neigh = neigh.fit(X, y)
y_predicted_train = neigh.predict_proba(X)  # class probabilities for the training set

# Search thresholds 0.01 .. 0.99 for the one that maximises ROI on the
# training set. The 0.5 default is kept when no threshold yields a positive
# ROI, because maxrev starts at 0.
maxrev = 0
final_threshold = 0.5
# range() works on both Python 2 and 3 (xrange is Python 2 only). The loop
# variable no longer shares a name with a comprehension variable, which in
# Python 2 leaked into and overwrote the enclosing scope.
for step in range(1, 100):
    thresh = 0.01 * step
    # Vectorised equivalent of "1 if p > thresh else 0" on the class-1 column.
    predicted_y_train = (y_predicted_train[:, 1] > thresh).astype(int)
    cmatrix = confusion_matrix(y, predicted_y_train)
    newROI = _roi(cmatrix)
    if newROI > maxrev:
        maxrev = newROI
        final_threshold = thresh

# Apply the chosen threshold to the predicted probabilities of the test set.
y_predicted_test = neigh.predict_proba(X_test)
predicted_y_test = (y_predicted_test[:, 1] > final_threshold).astype(int)

# Obtain the relevant statistics on the test set.
cm = ConfusionMatrix(y_test, predicted_y_test)
cm.print_stats()
acc = accuracy_score(y_test, predicted_y_test)
cmatrix = confusion_matrix(y_test, predicted_y_test)
ROI = _roi(cmatrix)
Esempio n. 35
0
# For every test letter, run it through every perceptron and record which
# letter wins the vote, next to the letter's known value.
#
# The new entries are gathered in plain lists and attached to the pandas
# Series once, after the loop. Series.append copied the whole Series on every
# iteration (quadratic in the number of letters) and was removed in pandas 2.0.
new_index = []
new_predicted = []
new_actual = []
for letter in letters_list_testing:
    # text = "\rTesting instance "+str((letter_increment)+1)+"/"+str(len(letters_list_testing))
    # sys.stdout.write(text)

    # collect_votes runs the perceptrons on this testing instance and returns
    # the winning letter by vote
    predicted = collect_votes(letter)
    actual = letter.value[0]
    #print letter.value[0], predicted

    # remember the entry for the pandas Series, keyed by the running counter
    new_index.append(letter_increment)
    new_predicted.append(predicted)
    new_actual.append(actual)

    # plain-list copies consumed by pandas_confusion below
    y_pred_stats.append(predicted)
    y_actu_stats.append(actual)

    # increment counter for next letter
    letter_increment += 1

# Extend the Series in one step; skipped when nothing was tested so the
# existing Series are left exactly as they were.
if new_index:
    y_pred = pd.concat([y_pred, pd.Series(new_predicted, index=new_index)])
    y_actu = pd.concat([y_actu, pd.Series(new_actual, index=new_index)])

# make confusion matrix using pandas
df_confusion = pd.crosstab(y_actu, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
# parenthesised single-argument print is valid on both Python 2 and Python 3
print(df_confusion)

# make confusion matrix and print stats using pandas_confusion
cm = ConfusionMatrix(y_actu_stats, y_pred_stats)
# print("Confusion matrix:\n%s" % cm)
cm.print_stats()