def test_recall(self):
    """recall() must give 1/3 from raw links and from a confusion matrix."""
    expected = 1 / 3
    matrix = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED)
    assert rl.recall(LINKS_TRUE, LINKS_PRED) == expected
    assert rl.recall(matrix) == expected
def test_recall(self):
    """recall() accepts either (true, pred) link sets or a confusion matrix."""
    expected = 1 / 3
    matrix = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED)
    self.assertEqual(rl.recall(LINKS_TRUE, LINKS_PRED), expected)
    self.assertEqual(rl.recall(matrix), expected)
def metrics(links_true, links_pred, pairs):
    """Summarize linkage quality as a dict of counts and scores.

    Returns an all-zero dict when nothing was predicted, because
    precision/recall/F-score are undefined for an empty prediction set.
    """
    if not len(links_pred):
        return {
            'pairs': 0,
            '#duplicates': 0,
            'precision': 0,
            'recall': 0,
            'fscore': 0
        }
    return {
        'pairs': len(pairs),
        '#duplicates': len(links_pred),
        'precision': rl.precision(links_true, links_pred),
        'recall': rl.recall(links_true, links_pred),
        # F-score computed from the same link sets as precision/recall.
        'fscore': rl.fscore(links_true, links_pred)
    }
def test_fscore(self):
    """fscore() equals the harmonic mean of precision and recall, for both
    link-based and confusion-matrix-based invocation."""
    matrix = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
    p = rl.precision(LINKS_TRUE, LINKS_PRED)
    r = rl.recall(LINKS_TRUE, LINKS_PRED)
    harmonic_mean = float(2 * p * r / (p + r))
    self.assertEqual(rl.fscore(LINKS_TRUE, LINKS_PRED), harmonic_mean)
    self.assertEqual(rl.fscore(matrix), harmonic_mean)
def _compute_performance(test_index, predictions, test_vectors_size):
    """Evaluate predictions against the gold standard and log the outcome.

    Returns a (precision, recall, f_score, confusion_matrix) tuple.
    """
    LOGGER.info('Running performance evaluation ...')

    matrix = rl.confusion_matrix(test_index, predictions,
                                 total=test_vectors_size)
    p = rl.precision(test_index, predictions)
    r = rl.recall(test_index, predictions)
    # F-score is derived from the confusion matrix rather than the raw links.
    f1 = rl.fscore(matrix)

    LOGGER.info('Precision: %f - Recall: %f - F-score: %f', p, r, f1)
    LOGGER.info('Confusion matrix: %s', matrix)
    return p, r, f1, matrix
def cross_val_score(classifier, comparison_vector, link_true, cv=5, method='fscore'):
    """Stratified k-fold cross-validation scores for a linkage classifier.

    Parameters
    ----------
    classifier : recordlinkage classifier exposing fit()/predict()
    comparison_vector : pandas.DataFrame of comparison vectors, indexed by
        candidate pairs (MultiIndex)
    link_true : pandas object whose index contains the true matching pairs
    cv : int, number of stratified folds
    method : 'fscore' | 'precision' | 'recall' | 'accuracy' | 'specificity'

    Returns
    -------
    numpy.ndarray with one score per fold.

    Raises
    ------
    ValueError
        If `method` is not one of the supported metric names.  (Previously
        an unknown method crashed with UnboundLocalError.)
    """
    skfolds = StratifiedKFold(n_splits=cv)

    # Binary stratification target: 1 for pairs present in the true links.
    y = pandas.Series(0, index=comparison_vector.index)
    y.loc[link_true.index.intersection(comparison_vector.index)] = 1

    X_train = comparison_vector.values
    y_train = y.values

    scores = []
    for train_index, test_index in skfolds.split(X_train, y_train):
        # Deep-copy so folds never share fitted state.
        classifier_copy = copy.deepcopy(classifier)

        X_train_folds = comparison_vector.iloc[train_index]
        X_test_folds = comparison_vector.iloc[test_index]

        # True-match MultiIndex restricted to each fold's candidate pairs.
        y_train_folds = X_train_folds.index.intersection(link_true.index)
        y_test_folds = X_test_folds.index.intersection(link_true.index)

        # Train on the fold, then predict links for the held-out pairs.
        classifier_copy.fit(X_train_folds, y_train_folds)
        y_pred = classifier_copy.predict(X_test_folds)

        if method == 'fscore':
            score = recordlinkage.fscore(y_test_folds, y_pred)
        elif method == 'precision':
            score = recordlinkage.precision(y_test_folds, y_pred)
        elif method == 'recall':
            score = recordlinkage.recall(y_test_folds, y_pred)
        elif method == 'accuracy':
            score = recordlinkage.accuracy(y_test_folds, y_pred,
                                           len(comparison_vector))
        elif method == 'specificity':
            score = recordlinkage.specificity(y_test_folds, y_pred,
                                              len(comparison_vector))
        else:
            raise ValueError('unknown scoring method: {!r}'.format(method))
        scores.append(score)

    return numpy.array(scores)
def log_quality_results(logger, result, true_links, total_pairs, params=None):
    """Log evaluation metrics for a linkage result and return its F-score.

    Parameters
    ----------
    logger : logging.Logger used for all output
    result : predicted links
    true_links : gold-standard links
    total_pairs : total number of candidate pairs (for accuracy/confusion)
    params : optional parameter description included in the log

    Returns
    -------
    The F-score, or None when the metrics are undefined (ZeroDivisionError,
    e.g. an empty result set).  Previously that path crashed with
    UnboundLocalError because `fscore` was never assigned.
    """
    logger.info("Number of Results %d", len(result))
    logger.info("Confusion Matrix %s", str(
        recordlinkage.confusion_matrix(true_links, result, total_pairs)))

    fscore = None  # stays None when the metrics cannot be computed
    try:
        fscore = recordlinkage.fscore(true_links, result)
        accuracy = recordlinkage.accuracy(true_links, result, total_pairs)
        precision = recordlinkage.precision(true_links, result)
        recall = recordlinkage.recall(true_links, result)
        logger.info("FScore: %.2f Accuracy : %.2f", fscore, accuracy)
        logger.info("Precision: %.2f Recall %.2f", precision, recall)
        logger.info("For params : %s", str(params))
        write_results(logger.name, fscore, accuracy, precision, recall, params)
    except ZeroDivisionError:
        logger.error("ZeroDivisionError!!")
    return fscore
def print_experiment_evaluation(matches, description):
    """Print precision/recall/F-score for a match set and return the triple.

    NOTE(review): compares against the module-level `links_test` ground
    truth, not a parameter — confirm that is intended.
    """
    precision = recall = fscore = 0
    if len(matches) > 0:
        precision = recordlinkage.precision(links_test, matches)
        recall = recordlinkage.recall(links_test, matches)
        if recall + precision > 0:
            fscore = recordlinkage.fscore(links_test, matches)

    print(f"Configuration: {description}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F-score: {fscore}")
    print(recordlinkage.confusion_matrix(links_test, matches))

    return precision, recall, fscore
def evalution(X_data, links_true):
    """Fit a logistic-regression linkage classifier and print its
    confusion matrix, F-score, recall and precision.

    NOTE(review): the model is evaluated on its own training data.
    """
    # Classify with a logistic-regression classifier.
    cl = recordlinkage.LogisticRegressionClassifier()
    cl.fit(X_data, links_true)

    # Predict with the fitted model.
    links_pred = cl.predict(X_data)
    print("links_pred:{}".format(links_pred.shape))

    # Confusion matrix over all candidate pairs.
    cm = recordlinkage.confusion_matrix(links_true, links_pred,
                                        total=len(X_data))
    print("Confusion matrix:\n", cm)

    # F-score from the confusion matrix; recall/precision from the links.
    print('fscore', recordlinkage.fscore(cm))
    print('recall', recordlinkage.recall(links_true, links_pred))
    print('precision', recordlinkage.precision(links_true, links_pred))
def metrics(links_true, links_pred, comparison_vector):
    """Return (confusion_matrix, precision, recall, fscore) for predictions.

    Returns (0, 0, 0, 0) when nothing was predicted, since the measures
    are undefined for an empty prediction set.
    """
    if not len(links_pred):
        return 0, 0, 0, 0

    matrix = recordlinkage.confusion_matrix(links_true, links_pred,
                                            len(comparison_vector))
    precision = recordlinkage.precision(links_true, links_pred)
    recall = recordlinkage.recall(links_true, links_pred)
    fscore = recordlinkage.fscore(links_true, links_pred)
    return matrix, precision, recall, fscore
print('feature shape', features.shape) # use the Logistic Regression Classifier # this classifier is equivalent to the deterministic record linkage approach intercept = -9.5 coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5] print('Deterministic classifier') print('intercept', intercept) print('coefficients', coefficients) logreg = rl.LogisticRegressionClassifier( coefficients=coefficients, intercept=intercept) links = logreg.predict(features) print(len(links), 'links/matches') # return the confusion matrix conf_logreg = rl.confusion_matrix(true_links, links, len(candidate_links)) print('confusion matrix') print(conf_logreg) # compute the F-score for this classification fscore = rl.fscore(conf_logreg) print('fscore', fscore) recall = rl.recall(true_links, links) print('recall', recall) precision = rl.precision(true_links, links) print('precision', precision)
print('Num. of many-to-many predicted links: {}'.format(len(links_pred))) # Take the match with highest probability for each Twitter user links_prob = classifier.prob(comparison_vectors) links_prob = links_prob[links_prob.index.isin(links_pred.values)] links_prob = links_prob.to_frame() links_prob.index.names = ['index_twitter', 'index_voter'] links_prob.columns = ['match_prob'] links_prob.reset_index(inplace=True) links_prob = links_prob.sort_values( 'match_prob', ascending=False).drop_duplicates('index_twitter') links_prob.set_index(['index_twitter', 'index_voter'], inplace=True) links_pred = links_prob.index print('Num. of many-to-one predicted links: {}'.format(len(links_pred))) cm = recordlinkage.confusion_matrix(links_true, links_pred, total=len(df_twitter) * len(df_voter)) print('TP: {}\nFN: {}\nFP: {}\nTN: {}\n'.format(cm[0][0], cm[0][1], cm[1][0], cm[1][1])) # compute the F-score for this classification fscore = recordlinkage.fscore(cm) print('F-score: {:.2f}'.format(fscore)) recall = recordlinkage.recall(links_true, links_pred) print('Recall: {:.2f}'.format(recall)) precision = recordlinkage.precision(links_true, links_pred) print('Precision: {:.2f}'.format(precision)) print(classifier.log_weights)
def test_recall(self):
    """Recall is 1.0 for a perfect matrix and 0.0 when nothing is found."""
    for matrix, expected in ((CONF_M1, 1.0), (CONF_M5, 0.0)):
        self.assertEqual(recordlinkage.recall(matrix), expected)
# Train a Naive Bayes linkage classifier on the comparison vectors, using
# the known true links as supervision.
cl = rl.NaiveBayesClassifier()
cl.fit(X_data, links_true)

# Print the parameters that are trained (m, u and p). Note that the estimates
# are very good.
print("p probability P(Match):", cl.p)
print("m probabilities P(x_i=1|Match):", cl.m_probs)
print("u probabilities P(x_i=1|Non-Match):", cl.u_probs)
print("log m probabilities P(x_i=1|Match):", cl.log_m_probs)
print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs)
print("log weights of features:", cl.log_weights)
print("weights of features:", cl.weights)

# evaluate the model
# NOTE(review): evaluation runs on the training data itself.
links_pred = cl.predict(X_data)
print("Predicted number of links:", len(links_pred))

cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data))
print("Confusion matrix:\n", cm)

# compute the F-score for this classification
fscore = rl.fscore(cm)
print('fscore', fscore)
recall = rl.recall(links_true, links_pred)
print('recall', recall)
precision = rl.precision(links_true, links_pred)
print('precision', precision)

# Predict the match probability for each pair in the dataset.
probs = cl.prob(X_data)