Example #1
def run():
    start_time = time()
    data_cand, data_part, full_data = load_data()
    # numeric_parties  = full_data.party.map(party_map)
    train_c, test_c = train_test_split(data_cand, test_size=0.2)
    train_p, test_p = train_test_split(data_part, test_size=0.2)
    candidatos_clf = Classifier(train_c.drop('candidatoId', axis=1), train_c.candidatoId)
    partidos_clf = Classifier(train_p.drop('idPartido', axis=1), train_p.idPartido)

    cand_solver = candidatos_clf._predict()
    n_cand, pca_cand_solver = candidatos_clf._pca()
    part_solver = partidos_clf._predict()
    n_part, pca_part_solver = partidos_clf._pca()

    cand_pred = candidatos_clf.classify(test_c.drop('candidatoId', axis=1), test_c.candidatoId, cand_solver)
    pca_cand_pred = candidatos_clf.classify(test_c.drop('candidatoId', axis=1), test_c.candidatoId, pca_cand_solver, n_cand)
    part_pred = partidos_clf.classify(test_p.drop('idPartido', axis=1), test_p.idPartido, part_solver)
    pca_part_pred = partidos_clf.classify(test_p.drop('idPartido', axis=1), test_p.idPartido, pca_part_solver, n_part)

    output_results(f'CANDIDATOS | {cand_solver}', test_c.candidatoId, cand_pred)
    output_results(f'CANDIDATOS_PCA | {pca_cand_solver}, {n_cand}', test_c.candidatoId, pca_cand_pred)
    output_results(f'PARTIDOS | {part_solver}', test_p.idPartido, part_pred)
    output_results(f'PARTIDOS_PCA | {pca_part_solver}, {n_part}', test_p.idPartido, pca_part_pred)
    cand_part_target, cand_part_pred = candidato_mapper(test_c.candidatoId, cand_pred)
    output_results(f'PARTIDOS CON CANDIDATO | {cand_solver}', cand_part_target, cand_part_pred)

    cm_cand = ConfusionMatrix(test_c.candidatoId, cand_pred)
    cm_pca_cand = ConfusionMatrix(test_c.candidatoId, pca_cand_pred)
    cm_part = ConfusionMatrix(test_p.idPartido, part_pred)
    cm_pca_part = ConfusionMatrix(test_p.idPartido, pca_part_pred)
    cm_cand_part = ConfusionMatrix(cand_part_target, cand_part_pred)

    elapsed_time = time() - start_time
    print('----------------------------------------')
    print(f'TOTAL TIME: {datetime.timedelta(seconds=elapsed_time)}')

    result = {
        'data': {
            'candidatos': (train_c, test_c),
            'partidos': (train_p, test_p),
        },
        'results': {
            'candidatos': (test_c.candidatoId, cand_pred),
            'candidatos_pca': (test_c.candidatoId, pca_cand_pred),
            'partidos': (test_p.idPartido, part_pred),
            'partidos_pca': (test_p.idPartido, pca_part_pred),
            'partidos_candidatos': (cand_part_target, cand_part_pred)
        },
        'matrices': {
            'candidatos': cm_cand,
            'candidatos_pca': cm_pca_cand,
            'partidos': cm_part,
            'partidos_pca': cm_pca_part,
            'partidos_candidatos': cm_cand_part
        }
    }
    return result
Example #2
def test_pandas_confusion_cm_empty_row():
    y_true = [2, 0, 2, 2, 0, 0]
    y_pred = [0, 0, 2, 2, 1, 2]
    # cm = LabeledConfusionMatrix(y_true, y_pred)
    cm = ConfusionMatrix(y_true, y_pred, labels=["ant", "bird", "cat"])
    assert isinstance(cm, pdml.confusion_matrix.LabeledConfusionMatrix)

    cm = ConfusionMatrix(y_true, y_pred)
    assert isinstance(cm, pdml.confusion_matrix.LabeledConfusionMatrix)

    print("Confusion matrix:\n%s" % cm)
    asserts(y_true, y_pred, cm)
Example #3
def random_forest():
    print("------------------------RANDOM FOREST-----------------------")
    df = pd.read_csv(var.get(), low_memory=False)
    df = df.sample(frac=1).reset_index(drop=True)
    frauds = df.loc[df['Class'] == 1]
    non_frauds = df.loc[df['Class'] == 0]
    print("\nWe have", len(frauds), "fraud data points and", len(non_frauds), "nonfraudulent data points.")
    X = df.iloc[:, :-1]
    y = df['Class']

    print("X and y sizes, respectively:", len(X), len(y))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)
    print("Train and test sizes, respectively:", len(X_train), len(y_train), "|", len(X_test), len(y_test))
    print("Total number of frauds:", len(y.loc[df['Class'] == 1]))
    # Mask the subsets by their own values; indexing y_test/y_train with a
    # boolean mask built on the full df raises an alignment error in pandas.
    print("Number of frauds on y_test:", len(y_test[y_test == 1]))
    print("Number of frauds on y_train:", len(y_train[y_train == 1]))
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    y_predicted1 = np.array(clf.predict(X_test))
    y_right1 = np.array(y_test)
    confusion_matrix1 = ConfusionMatrix(y_right1, y_predicted1)
    print("\n\nConfusion matrix:\n%s" % confusion_matrix1)
    # confusion_matrix1.plot(normalized=True)
    T = Text(root, height=60, width=60)
    T.pack(pady=20, side=BOTTOM, fill=Y)
    for stat in confusion_matrix1.stats():
        T.insert(END, [stat, confusion_matrix1.stats()[stat]])
        T.insert(END, "\n")
    d['ACC'].append(confusion_matrix1.stats()['ACC'] * 100)
    d['TPR'].append(confusion_matrix1.stats()['TPR'] * 100)
    fpr, tpr, thresholds = roc_curve(y_right1, y_predicted1)
    aucarr['auc'].append(auc(fpr, tpr))
Example #4
    def test(self, test, test_targets, pdconf=False, filename="", legend=None):

        if self.net_type == "classification":
            pred = self.forward_classification(test)
            acc = self.cal_acc(pred, test_targets)
            conf = self.confusion_table(pred, test_targets)
            if pdconf:
                temp_pred = self.predict(test)
                if legend is not None:
                    predict = np.empty(len(temp_pred))
                    targets = np.empty(len(test_targets))
                    for i in range(len(targets)):
                        predict[i] = legend[np.argmax(temp_pred[i])]
                        targets[i] = legend[np.argmax(test_targets[i])]
                else:
                    # Without a legend, fall back to the raw argmax class ids
                    # (otherwise `targets`/`predict` would be undefined below).
                    predict = np.argmax(temp_pred, axis=1)
                    targets = np.argmax(test_targets, axis=1)
                confus = ConfusionMatrix(targets, predict, display_sum=True)
        elif self.net_type == "regression":
            pred = self.forward_regression(test)
            r2 = self.cal_r2(pred, test_targets)

        err = self.cal_err(pred, test_targets, self.cost_function)

        print("The test error is: ", err)

        if self.net_type == "classification":
            print("The test accuracy is: ", acc)
            print("Confusion matrix:")
            print(conf)
            if pdconf:
                confus.plot(backend="seaborn")
                plt.savefig(filename)
                plt.clf()
            return err, acc, conf
        elif self.net_type == "regression":
            print("The test R2-score is: ", r2)
            return err, r2
Example #5
    def test_pandas_confusion_binary_cm_inverse(self):
        y_true = [True, True, False, False, False, True, False, True, True,
                  False, True, False, False, False, False, False, True, False,
                  True, True, True, True, False, False, False, True, False,
                  True, False, False, False, False, True, True, False, False,
                  False, True, True, True, True, False, False, False, False,
                  True, False, False, False, False, False, False, False, False,
                  False, True, True, False, True, False, True, True, True,
                  False, False, True, False, True, False, False, True, False,
                  False, False, False, False, False, False, False, True, False,
                  True, True, True, True, False, False, True, False, True,
                  True, False, True, False, True, False, False, True, True,
                  False, False, True, True, False, False, False, False, False,
                  False, True, True, False]

        y_pred = [False, False, False, False, False, True, False, False, True,
                  False, True, False, False, False, False, False, False, False,
                  True, True, True, True, False, False, False, False, False,
                  False, False, False, False, False, True, False, False, False,
                  False, True, False, False, False, False, False, False, False,
                  True, False, False, False, False, False, False, False, False,
                  False, True, False, False, False, False, False, False, False,
                  False, False, True, False, False, False, False, True, False,
                  False, False, False, False, False, False, False, True, False,
                  False, True, False, False, False, False, True, False, True,
                  True, False, False, False, True, False, False, True, True,
                  False, False, True, True, False, False, False, False, False,
                  False, True, False, False]

        binary_cm = ConfusionMatrix(y_true, y_pred)
        assert isinstance(binary_cm, pdml.confusion_matrix.BinaryConfusionMatrix)
        bcm_sum = binary_cm.sum()

        binary_cm_r = binary_cm.inverse()  # reverse not in place
        assert bcm_sum == binary_cm_r.sum()
Example #6
def one_vs_one():
    X_train0, X_train1, X_train2, X_train3, X_train4, X_train5, X_train6, X_train7, X_train8, X_train9 = data_clustering(X_train, y_train)
    prediction = []
    numpy_list = []
    numpy_predict = [X_train0, X_train1, X_train2, X_train3, X_train4, X_train5, X_train6, X_train7, X_train8, X_train9]
    combination = list(itertools.combinations([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 2))

    for pair in combination:
        y1, y2 = generate_data(numpy_predict[pair[0]], numpy_predict[pair[1]])
        training_data = np.vstack((numpy_predict[pair[0]] , numpy_predict[pair[1]]))
        test_data = np.hstack((y1, y2))
        clf = SVM(C=0.1)
        clf.train(training_data, test_data)
        y_predict = clf.predict(X_test)
        numpy_list.append(transform_data(y_predict, pair[0], pair[1]))

    numpy_list = np.array(numpy_list).astype(int)
    transpose = np.transpose(numpy_list)

    for row in range(transpose.shape[0]):
        counts = np.bincount(transpose[row])
        prediction.append(np.argmax(counts))


    prediction = np.array(prediction)
    correct = np.sum(prediction == y_test)
    confusion_matrix = ConfusionMatrix(y_test, prediction)
    print("Confusion matrix:\n%s" % confusion_matrix)
    size = len(y_predict)
    accuracy = (correct/float(size)) * 100
    print "%d out of %d predictions correct" % (correct, len(y_predict))
    print "The accuracy in percentage is  "
    print(accuracy)
Example #7
def generate_metrics(y_true, y_pred, scores, class_labels):
    # One-hot encode the truth (for multiclass metrics, if needed)
    y_true_onehot = label_binarize(y_true, classes=class_labels)
    m = {}
    m["sklearn"] = {}
    m["pandas-ml"] = {}

    # Calculate accuracy
    m["sklearn"]["acc"] = metrics.accuracy_score(y_true, y_pred)

    # Confusion matrix
    m["sklearn"]["confmat"] = metrics.confusion_matrix(y_true,
                                                       y_pred,
                                                       labels=class_labels)

    # Generate classification report
    m["sklearn"]["report"] = metrics.classification_report(
        y_true, y_pred, target_names=class_labels)

    # Get AUCs
    auc_indiv = metrics.roc_auc_score(y_true_onehot, scores, average=None)
    m["sklearn"]["auc_indiv"] = auc_indiv
    m["sklearn"]["auc_avg"] = np.mean(auc_indiv)

    # Get pandas-ml metrics
    m["pandas-ml"]["cm"] = ConfusionMatrix(y_true, y_pred, labels=class_labels)

    return m
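A quick usage sketch; the shape of scores is an assumption inferred from the
calls above (one probability column per entry of class_labels, in that order),
and clf is a hypothetical fitted sklearn classifier:

# scores = clf.predict_proba(X_test)   # shape (n_samples, n_classes)
# m = generate_metrics(y_test, clf.predict(X_test), scores, list(clf.classes_))
# print(m["sklearn"]["report"], m["sklearn"]["auc_avg"])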
Example #8
def GBoosting(kf, tfidf, data, verbose=False, normalized=False):
    GBC = GradientBoostingClassifier(n_estimators=100,
                                     learning_rate=1.0,
                                     max_depth=1,
                                     random_state=0)
    scores = cross_val_score(GBC,
                             tfidf,
                             data['condition_predict'],
                             cv=kf,
                             scoring='accuracy')
    print("Accuracy: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))
    y_pred = cross_val_predict(GBC, tfidf, data['condition_predict'], cv=kf)
    df = pd.DataFrame({
        'prediction': y_pred,
        'observed': data['condition_predict']
    })
    confusion_matrix = ConfusionMatrix(df.observed, df.prediction)
    if verbose:
        print("Confusion matrix:\n%s" % confusion_matrix)
        confusion_matrix.print_stats()
    confusion_matrix.plot(normalized=normalized,
                          backend='seaborn',
                          cmap='Blues',
                          annot=True)
    print(scores)
    return scores
Example #9
def metrics(df):
    """generate a standard set of metrics
    Columns required in the dataframe:
    - animal_reference
    - animal_predicted
    - visit
    """

    out = {}

    # counts
    out["images"] = len(df.index)
    out["animals_ref"] = len(df[df.animal_reference == True].index)
    out["blank_ref"] = len(df[df.animal_reference == False].index)
    out["animals_pred"] = len(df[df.animal_predicted == True].index)
    out["blank_pred"] = len(df[df.animal_predicted == False].index)

    # accuracy and precision
    cm = ConfusionMatrix(df.animal_reference.values,
                         df.animal_predicted.values)
    out["true_positive"] = cm.TP
    out["false_positive"] = cm.FP
    out["true_negative"] = cm.TN
    out["false_negative"] = cm.FN

    #missed sessions
    visits = list(df.visit.unique())
    visits.remove(np.nan)
    out["visits"] = visits
    out["visits_recognised"] = {}
    for visit in visits:
        out["visits_recognised"][visit] = (visit_animals(df, visit) > 0)

    return out
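visit_animals is an external helper not shown here; a minimal sketch consistent
with its use above (an assumption, not the original implementation):

def visit_animals(df, visit):
    # Count images in this visit that were predicted to contain an animal.
    return len(df[(df.visit == visit) & (df.animal_predicted == True)].index)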
Example #10
def logistic_regression():
    print("------------------------LOGISTIC REGRESSION-----------------------")
    df = pd.read_csv(var.get(), low_memory=False)
    df = df.sample(frac=1).reset_index(drop=True)
    frauds = df.loc[df['Class'] == 1]
    non_frauds = df.loc[df['Class'] == 0]
    print("\n")
    print("We have", len(frauds), "fraud data points and", len(non_frauds), "nonfraudulent data points.\n")
    X = df.iloc[:,:-1]
    y = df['Class']
    print("X and y sizes, respectively:", len(X), len(y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)
    '''print("\nTrain and test sizes, respectively:", len(X_train), len(y_train), "|", len(X_test), len(y_test))
    print("Total number of frauds:", len(y.loc[df['Class'] == 1]))
    print("Number of frauds on y_test:", len(y_test.loc[df['Class'] == 1]))
    print("Number of frauds on y_train:", len(y_train.loc[df['Class'] == 1]))'''
    logistic = linear_model.LogisticRegression(C=1e5)
    logistic.fit(X_train, y_train)
    print("\nScore: ", logistic.score(X_test, y_test))
    y_predicted = np.array(logistic.predict(X_test))
    y_right = np.array(y_test)
    confusion_matrix = ConfusionMatrix(y_right, y_predicted)
    print("\n\nConfusion matrix:\n%s" % confusion_matrix)
    #confusion_matrix.plot(normalized=True)
    T = Text(root, height=60, width=60)
    T.pack(pady=20, side=BOTTOM, fill=Y)
    for stat in confusion_matrix.stats():
        T.insert(END, [stat, confusion_matrix.stats()[stat]])
        T.insert(END, "\n")
    d['ACC'].append(confusion_matrix.stats()['ACC'] * 100)
    d['TPR'].append(confusion_matrix.stats()['TPR'] * 100)
    fpr, tpr, thresholds = roc_curve(y_right, y_predicted)
    aucarr['auc'].append(auc(fpr, tpr))
Example #11
    def save_confusion_matrix(self, truth_res, pred_res):

        #truth_res = [self.label_map[i+1] for i in truth_res]
        #pred_res = [self.label_map[i+1] for i in pred_res]
        '''
        print(len(truth_res))
        print(len(pred_res))
        confusion_matrix = ConfusionMatrix(truth_res, pred_res)
        plt.figure(dpi=200, figsize=(10, 7))
        confusion_matrix.plot()
        plt.savefig(self.confusion_matrix_file_path)
        '''
        s = sklearn.metrics.confusion_matrix(truth_res, pred_res)
        list_label = self.label_map[1:]
        df_cm = pd.DataFrame(data=s, columns=list_label, index=list_label)
        plt.figure(dpi=100)

        heatmap = sns.heatmap(df_cm, annot=True, fmt='d')
        heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(),
                                     rotation=70,
                                     ha='right',
                                     fontsize=5)
        heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(),
                                     rotation=20,
                                     ha='right',
                                     fontsize=5)

        plt.savefig(self.confusion_matrix_file_path)

        confusion_matrix = ConfusionMatrix(truth_res, pred_res)
        confusion_matrix.print_stats()
Example #12
def get_pd_ml_cf_matrix(y_actual, y_predicted):

    data = {'y_Actual': y_actual, 'y_Predicted': y_predicted}
    df = pd.DataFrame(data, columns=['y_Actual', 'y_Predicted'])
    pd_ml_cf_matrix = ConfusionMatrix(df['y_Actual'], df['y_Predicted'])

    return pd_ml_cf_matrix
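A quick usage sketch with hypothetical labels:

y_actual = [1, 0, 1, 1, 0, 1, 0, 0]
y_predicted = [1, 0, 0, 1, 0, 1, 1, 0]
cm = get_pd_ml_cf_matrix(y_actual, y_predicted)
print(cm)          # 2x2 confusion matrix (Actual vs Predicted)
cm.print_stats()   # derived statistics (ACC, TPR, PPV, ...)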
Example #13
        def eval_test_set(step, conf_matrix=False):
            final_test_accuracy, test_loss = 0, 0
            y_pred, y_true = [], []
            for i in range(test_batch_num):
                images_batch, labels_batch = test_data_buffer.next_batch(
                    shuffle_data=False)
                feed_dict_test = {
                    images_pl: images_batch,
                    labels_pl: labels_batch,
                    keep_prob: 1,
                    phase_train: False
                }
                batch_loss, batch_correct_predictions, batch_predictions = \
                    sess.run([loss, tf.reduce_sum(correct_predictions), predictions], feed_dict=feed_dict_test)
                test_loss += batch_loss
                final_test_accuracy += batch_correct_predictions

                if conf_matrix:
                    y_pred.extend(batch_predictions)
                    y_true.extend(labels_batch)

            final_test_accuracy /= test_size
            test_loss /= test_batch_num
            summ_test = sess.run(merged_summ_test, {
                xent_var: test_loss,
                acc_var: final_test_accuracy
            })
            test_writer.add_summary(summ_test, step)

            if conf_matrix:
                cm = ConfusionMatrix(y_true, y_pred)
                return final_test_accuracy, cm
            else:
                return final_test_accuracy
Example #14
    def validate_epoch(self, val_model, epoch_cm):
        """
        Computes the batch validation confusion matrix
        and then updates the epoch confusion matrix.
        """
        # Loop through validation set
        for n in range(self.validation_steps):

            # Grab next batch
            X, y_true, _ = next(self.validation_data)

            # Make prediction with model
            y_pred = val_model([X])[0]

            # Find highest classes prediction
            y_true = np.argmax(y_true, axis=-1)
            y_pred = np.argmax(y_pred, axis=-1)

            # Flatten batch into single array
            y_true = np.ndarray.flatten(y_true)
            y_pred = np.ndarray.flatten(y_pred)

            # Create batch CM
            batch_cm = ConfusionMatrix(y_true, y_pred)

            # Get all classes in batch
            all_classes = list(batch_cm.classes)

            batch_cm = batch_cm.to_array()

            # Update epoch CM
            for i in all_classes:
                for j in all_classes:
                    epoch_cm[i, j] += batch_cm[all_classes.index(i), all_classes.index(j)]
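epoch_cm is supplied by the caller and indexed by raw class id; a minimal
initialization sketch (num_classes is a hypothetical name, assuming the total
class count is known up front):

import numpy as np

num_classes = 10  # hypothetical
epoch_cm = np.zeros((num_classes, num_classes), dtype=np.int64)
# trainer.validate_epoch(val_model, epoch_cm)  # accumulates counts in place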
Example #15
def main():
    args = get_args()

    # prepare data
    X, Y = prepare_data(args.corpus, args.features)

    # train_test protocol: 8:2
    x_train, x_test, y_train, y_test = \
        train_test_split(X, Y, test_size=0.2, random_state=12)

    # build pipeline
    parameters = CLF_PARAM[args.classifier]
    clf = CLF[args.classifier](**parameters)
    vectorizer = TfidfVectorizer(
        max_features=int(args.feature_size) if args.feature_size else FEATURE_SIZE)
    pipeline = Pipeline([('tfidf', vectorizer), ('clf', clf)])

    # experiment
    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test)

    # evaluation
    print(metrics.classification_report(y_test, y_pred))
    print(ConfusionMatrix(y_test, y_pred))

    # save model
    if args.output:
        joblib.dump(pipeline, args.output)
Example #16
def test_pandas_confusion_matrix_auto_binary():
    y_true = [
        True, True, False, False, False, True, False, True, True, False, True,
        False, False, False, False, False, True, False, True, True, True, True,
        False, False, False, True, False, True, False, False, False, False,
        True, True, False, False, False, True, True, True, True, False, False,
        False, False, True, False, False, False, False, False, False, False,
        False, False, True, True, False, True, False, True, True, True, False,
        False, True, False, True, False, False, True, False, False, False,
        False, False, False, False, False, True, False, True, True, True, True,
        False, False, True, False, True, True, False, True, False, True, False,
        False, True, True, False, False, True, True, False, False, False,
        False, False, False, True, True, False
    ]

    y_pred = [
        False, False, False, False, False, True, False, False, True, False,
        True, False, False, False, False, False, False, False, True, True,
        True, True, False, False, False, False, False, False, False, False,
        False, False, True, False, False, False, False, True, False, False,
        False, False, False, False, False, True, False, False, False, False,
        False, False, False, False, False, True, False, False, False, False,
        False, False, False, False, False, True, False, False, False, False,
        True, False, False, False, False, False, False, False, False, True,
        False, False, True, False, False, False, False, True, False, True,
        True, False, False, False, True, False, False, True, True, False,
        False, True, True, False, False, False, False, False, False, True,
        False, False
    ]

    cm = ConfusionMatrix(y_true, y_pred)
    assert (isinstance(cm, pdml.confusion_matrix.BinaryConfusionMatrix))
Example #17
def compute_metrics(task_name, preds, labels):
    assert len(preds) == len(labels)
    if task_name == "cola":
        return {"mcc": matthews_corrcoef(labels, preds)}
    elif task_name == "sst-2":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "mrpc":
        return acc_and_f1(preds, labels)
    elif task_name == "sts-b":
        return pearson_and_spearman(preds, labels)
    elif task_name == "qqp":
        return acc_and_f1(preds, labels)
    elif task_name == "mnli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "mnli-mm":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "qnli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "rte":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "wnli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "sa" or task_name == 'sa_csv':
        from pandas_ml import ConfusionMatrix
        pcm = ConfusionMatrix(labels, preds)
        pcm.print_stats()
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average='weighted')
        #return {"acc": simple_accuracy(preds, labels)}
        return {"acc": pcm.stats_overall['Accuracy']}
    elif task_name == "arg_mining":
        return {}
    else:
        raise KeyError(task_name)
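simple_accuracy is referenced above but not defined in this snippet; a minimal
sketch consistent with its use (an assumption, matching the usual definition):

def simple_accuracy(preds, labels):
    # Fraction of predictions that exactly match the labels.
    return (preds == labels).mean()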
Example #18
def calculate_accuracy(csv_filename):

    # Loading csv information into a data frame
    data = pd.read_csv(csv_filename)
    # assigning actual sentiment data to y_test
    y_test = data['Actual_Statement']
    # assigning predicted sentiment data to y_pred
    y_pred = data['Prediction']

    score = accuracy_score(y_test, y_pred)
    # calling accuracy_score method to get the accuracy_score
    print('Accuracy Score : ', score)

    # calling confusion_matrix method from pandas_ml to show the output
    confusion_matrix = ConfusionMatrix(y_test, y_pred)
    output = confusion_matrix.to_dataframe()

    writer = pd.ExcelWriter("azure_text_confusion_matrix_output.xlsx")
    output.to_excel(writer, startrow=4, startcol=0)
    Acuracy_Score = 'Accuracy Score : ' + str(score)
    worksheet = writer.sheets['Sheet1']
    worksheet.write(1, 0, Acuracy_Score)

    writer.save()

    print("Confusion matrix:\n%s" % confusion_matrix)
Example #19
def one_vs_all():
    X_train0, X_train1, X_train2, X_train3, X_train4, X_train5, X_train6, X_train7, X_train8, X_train9 = data_clustering(
        X_train, y_train)
    numpy_predict = []

    for number in range(10):
        train_number, train_rest, test_number, test_rest = join_cluster(
            X_train0, X_train1, X_train2, X_train3, X_train4, X_train5,
            X_train6, X_train7, X_train8, X_train9, number)
        training_data = np.vstack((train_number, train_rest))
        test_data = np.hstack((test_number, test_rest))
        clf = SVM(C=0.1)
        clf.train(training_data, test_data)
        y_predict = clf.compute(X_test)
        numpy_predict.append(y_predict)

    prediction = np.argmax(np.array(numpy_predict), axis=0)
    correct = np.sum(prediction == y_test)
    confusion_matrix = ConfusionMatrix(y_test, prediction)
    print("Confusion matrix:\n%s" % confusion_matrix)
    size = len(y_predict)
    accuracy = (correct / float(size)) * 100
    print "%d out of %d predictions correct" % (correct, len(y_predict))
    print "The accuracy in percentage is  "
    print(accuracy)
Example #20
def process_results(mode, file, thrshld):
    threshold = thrshld
    with open(file) as json_file:  
        data = json.load(json_file)
        accuracy = 0.0
        actual = []
        predicted = []
        for p in data['results']:
            labellingScore = int(p['labellingScore']) 
            score = float(p['score'])
            if labellingScore == 1 and score > threshold:
                accuracy = accuracy + 1
            elif labellingScore == 0 and score < threshold:
                accuracy = accuracy + 1
            if labellingScore == 1:
                actual.append(1)
            else:
                actual.append(0)
            if score > threshold:
                predicted.append(1)
            else:
                predicted.append(0)
    if mode == 1:
        cm = ConfusionMatrix(actual, predicted)
        cm.print_stats()
    return accuracy/len(data['results'])
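The expected JSON layout is inferred from the loop above; a hypothetical
compatible file and call:

# results.json (hypothetical):
# {"results": [{"labellingScore": "1", "score": "0.87"},
#              {"labellingScore": "0", "score": "0.12"}]}
#
# acc = process_results(1, "results.json", 0.5)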
Example #21
def test_pandas_confusion_binary_cm():
    y_true = [
        True, True, False, False, False, True, False, True, True, False, True,
        False, False, False, False, False, True, False, True, True, True, True,
        False, False, False, True, False, True, False, False, False, False,
        True, True, False, False, False, True, True, True, True, False, False,
        False, False, True, False, False, False, False, False, False, False,
        False, False, True, True, False, True, False, True, True, True, False,
        False, True, False, True, False, False, True, False, False, False,
        False, False, False, False, False, True, False, True, True, True, True,
        False, False, True, False, True, True, False, True, False, True, False,
        False, True, True, False, False, True, True, False, False, False,
        False, False, False, True, True, False
    ]

    y_pred = [
        False, False, False, False, False, True, False, False, True, False,
        True, False, False, False, False, False, False, False, True, True,
        True, True, False, False, False, False, False, False, False, False,
        False, False, True, False, False, False, False, True, False, False,
        False, False, False, False, False, True, False, False, False, False,
        False, False, False, False, False, True, False, False, False, False,
        False, False, False, False, False, True, False, False, False, False,
        True, False, False, False, False, False, False, False, False, True,
        False, False, True, False, False, False, False, True, False, True,
        True, False, False, False, True, False, False, True, True, False,
        False, True, True, False, False, False, False, False, False, True,
        False, False
    ]

    binary_cm = ConfusionMatrix(y_true, y_pred)
    assert isinstance(binary_cm, pdml.confusion_matrix.BinaryConfusionMatrix)

    print("Binary confusion matrix:\n%s" % binary_cm)
    asserts(y_true, y_pred, binary_cm)
Example #22
def trainingHMM(training_set):
	# Count of words from training data
	freqOfWords = countFreqOfWords(training_set)
	# Extract unique tags from training data 
	uniqTags = countUniqTags(training_set)
	# Add a value of 0 for key '<UNK>'
	freqOfWords['<UNK>'] = 0
	training_set_mod, freqOfWords_mod = handlingUNK(training_set, freqOfWords)
	# Count tag frequency
	tagFrequencyList = countTagFrequency(training_set_mod)
	# Calculate bigram list
	tagtagBigram, tagWordsBigram = calcBigram(training_set_mod)
	# Calculate transition and emission probability
	transitionList,emissionList = hmm_train_tagger(freqOfWords_mod, tagtagBigram, tagWordsBigram, uniqTags, tagFrequencyList, len(training_set)-1)
	# Decoding and Apply viterbi
	applyViterbi(uniqTags, testing_set, transitionList, emissionList, freqOfWords_mod)
	# Evaluation Script
	ourPredict = [line.rstrip('\n') for line in open('predict_out.txt')]
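	# NOTE: this reads the same file as ourPredict above; a separate
	# gold-standard file is presumably intended here.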
	samplePredict = [line.rstrip('\n') for line in open('predict_out.txt')]
	# Our Predictions
	predictSet = []
	for eachPair in ourPredict:
		if eachPair:
			predictSet.append(eachPair.split()[1])
	# Sample Set
	sampleSet = []
	for eachPair in samplePredict:
		if eachPair:
			sampleSet.append(eachPair.split()[1])
	#confusion matrix
	cm = ConfusionMatrix(sampleSet, predictSet)
	print(cm)
Example #23
def test_plot():

    try:
        import matplotlib.pyplot  # noqa
    except ImportError:
        import nose
        raise nose.SkipTest()

    y_true = [
        'rabbit', 'cat', 'rabbit', 'rabbit', 'cat', 'dog', 'dog', 'rabbit',
        'rabbit', 'cat', 'dog', 'rabbit'
    ]
    y_pred = [
        'cat', 'cat', 'rabbit', 'dog', 'cat', 'rabbit', 'dog', 'cat', 'rabbit',
        'cat', 'rabbit', 'rabbit'
    ]

    cm = ConfusionMatrix(y_true, y_pred)

    # check plot works
    cm.plot()
    cm.plot(backend='seaborn')

    with tm.assertRaises(ValueError):
        cm.plot(backend='xxx')
Example #24
def execute_test(sm, test_folder, skip_unknown=False):
    sr = Schema_Reader()
    accuracies = []
    total_input_headers = []
    total_output_headers = []
    for filename in sorted(os.listdir(test_folder)):
        print(filename)
        path = test_folder + filename
        if (isfile(path)):
            try:
                headers, columns = sr.get_duplicate_columns(path, skip_unknown)
                result_headers = None
                if skip_unknown:
                    result_headers = sm.test_schema_matcher(columns, 0, False)
                else:
                    result_headers = sm.test_schema_matcher(columns, 0.4, True)
                total_output_headers += result_headers
                total_input_headers += headers
                print(list(zip(headers, result_headers)))
                accuracy = accuracy_score(headers, result_headers)
                print(accuracy)
                accuracies.append(accuracy)
            except Exception:
                print("fail")
        break  # NOTE: stops after the first file; likely a debugging leftover
    print(accuracies)
    print(accuracy_score(total_input_headers, total_output_headers))
    print(len(total_input_headers))
    print(ConfusionMatrix(total_input_headers, total_output_headers))
Example #25
def test_value_counts():
    df = pd.DataFrame({
        'Height': [
            150, 150, 151, 151, 152, 155, 155, 157, 157, 157, 157, 158, 158,
            159, 159, 159, 160, 160, 162, 162, 163, 164, 165, 168, 169, 169,
            169, 170, 171, 171, 173, 173, 174, 176, 177, 177, 179, 179, 179,
            179, 179, 181, 181, 182, 183, 184, 186, 190, 190
        ],
        'Weight': [
            54, 55, 55, 47, 58, 53, 59, 60, 56, 55, 62, 56, 55, 55, 64, 61, 59,
            59, 63, 66, 64, 62, 66, 66, 72, 65, 75, 71, 70, 70, 75, 65, 79, 78,
            83, 75, 84, 78, 74, 75, 74, 90, 80, 81, 90, 81, 91, 87, 100
        ],
        'Size': [
            'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S',
            'S', 'S', 'S', 'S', 'S', 'S', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
            'M', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
            'L', 'L', 'XL', 'XL', 'XL', 'XL', 'XL', 'XL', 'XL', 'XL'
        ],
        'SizePred': [
            'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S',
            'S', 'S', 'S', 'S', 'S', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
            'M', 'M', 'M', 'M', 'L', 'M', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
            'L', 'L', 'XL', 'L', 'L', 'XL', 'L', 'XL', 'XL', 'XL'
        ],
    })
    cm = ConfusionMatrix(df["Size"], df["SizePred"])
    assert isinstance(cm, pdml.confusion_matrix.LabeledConfusionMatrix)

    assert (cm.true - df.Size.value_counts()).sum() == 0
    assert (cm.pred - df.SizePred.value_counts()).sum() == 0
    cm.print_stats()
Example #26
def confmtx(y_true, y_pred):
    from pandas_ml import ConfusionMatrix
    confusion_matrix = ConfusionMatrix(list(y_true), list(y_pred))
    classification_report = confusion_matrix.classification_report
    print('-' * 75 + '\nConfusion Matrix\n')
    print(confusion_matrix)
    print('-' * 75 + '\nClassification Report\n')
    print(classification_report)
Example #27
    def confusion_matrix(self, ground_truth, predictions, display=True):
        matrix = ConfusionMatrix(ground_truth, predictions)
        if display:
            print("Confusion matrix:\n%s" % matrix)

        if self.save_plots:
            matrix.plot()
            plt.savefig(self.evaluation_path)
Example #28
def one_vs_one():

    X_train0, X_train1, X_train2, X_train3, X_train4, X_train5, X_train6, X_train7, X_train8, X_train9 = data_clustering(
        X_train, y_train)

    prediction = []
    numpy_list = []

    numpy_predict = [
        X_train0, X_train1, X_train2, X_train3, X_train4, X_train5, X_train6,
        X_train7, X_train8, X_train9
    ]

    combination = list(
        itertools.combinations([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 2))

    for pair in combination:

        y1, y2 = generate_data(numpy_predict[pair[0]], numpy_predict[pair[1]])

        training_data = np.vstack(
            (numpy_predict[pair[0]], numpy_predict[pair[1]]))
        test_data = np.hstack((y1, y2))

        clf = SVM(C=0.1)
        clf.train(training_data, test_data)

        y_predict = clf.compute(X_test)
        numpy_list.append(y_predict)

    numpy_list = np.array(numpy_list)

    transpose = np.transpose(numpy_list)

    mix = list(itertools.combinations([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 2))

    for row in transpose:

        newdict = {}
        for i in range(len(mix)):
            newdict[mix[i]] = row[i]

        result = decision_tree([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], newdict)
        prediction.append(result)

    prediction = np.array(prediction)

    correct = np.sum(prediction == y_test)

    confusion_matrix = ConfusionMatrix(y_test, prediction)
    print("Confusion matrix:\n%s" % confusion_matrix)

    size = len(y_predict)
    accuracy = (correct / float(size)) * 100

    print "%d out of %d predictions correct" % (correct, len(y_predict))
    print "The accuracy in percentage is  "
    print(accuracy)
Example #29
def test_pandas_confusion_cm_int():
    y_true = [2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2]
    y_pred = [0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2]
    labels = ["ant", "bird", "cat"]
    cm = ConfusionMatrix(y_true, y_pred, labels=labels)
    assert isinstance(cm, pdml.confusion_matrix.LabeledConfusionMatrix)
    print("Confusion matrix:\n%s" % cm)
    asserts(y_true, y_pred, cm)
    assert cm.len() == len(labels)
Example #30
def test_pandas_confusion_normalized():
    y_true = [2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2]
    y_pred = [0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2]
    cm = ConfusionMatrix(y_true, y_pred)
    assert isinstance(cm, pdml.confusion_matrix.LabeledConfusionMatrix)

    df = cm.to_dataframe()
    df_norm = cm.to_dataframe(normalized=True)
    assert (df_norm.sum(axis=1).sum() == len(df))
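With normalized=True, pandas_ml scales each true-class row to sum to 1, which
is why the row sums above add up to the number of classes; a check one could
append to the test above:

# Every row of the normalized matrix sums to 1.0 (one row per true class).
assert all(abs(s - 1.0) < 1e-9 for s in df_norm.sum(axis=1))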