def test_fscore(self):
    """Check ``rl.fscore`` against the harmonic mean of precision and recall.

    Both call forms are exercised: (true links, predicted links) and a
    pre-computed confusion matrix.
    """
    conf_matrix = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))

    p = rl.precision(LINKS_TRUE, LINKS_PRED)
    r = rl.recall(LINKS_TRUE, LINKS_PRED)
    harmonic_mean = float(2 * p * r / (p + r))

    # Same expectation for both supported argument shapes.
    for call_args in ((LINKS_TRUE, LINKS_PRED), (conf_matrix,)):
        self.assertEqual(rl.fscore(*call_args), harmonic_mean)
def metrics(links_true, links_pred, pairs):
    """Summarise the quality of a set of predicted links.

    Args:
        links_true: The known true links.
        links_pred: The predicted links.
        pairs: The candidate pairs that were compared; only its length
            is reported.

    Returns:
        A dict with the keys ``'pairs'``, ``'#duplicates'``,
        ``'precision'``, ``'recall'`` and ``'fscore'``. When
        ``links_pred`` is empty every value is 0.
    """
    # ``len`` rather than truthiness: index-like objects do not define
    # an unambiguous truth value.
    if len(links_pred) == 0:
        return {
            'pairs': 0,
            '#duplicates': 0,
            'precision': 0,
            'recall': 0,
            'fscore': 0,
        }

    precision = rl.precision(links_true, links_pred)
    recall = rl.recall(links_true, links_pred)

    # The F-score is the harmonic mean of precision and recall and
    # raises ZeroDivisionError when both are zero (predictions exist
    # but none is correct). Report 0 in that case, like the empty case.
    if precision + recall > 0:
        fscore = rl.fscore(links_true, links_pred)
    else:
        fscore = 0

    return {
        'pairs': len(pairs),
        '#duplicates': len(links_pred),
        'precision': precision,
        'recall': recall,
        'fscore': fscore,
    }
def _compute_performance(test_index, predictions, test_vectors_size):
    """Evaluate predictions against the test links and log the results.

    Returns:
        A ``(precision, recall, f_score, confusion_matrix)`` tuple.
    """
    LOGGER.info('Running performance evaluation ...')

    cm = rl.confusion_matrix(test_index, predictions, total=test_vectors_size)
    prec = rl.precision(test_index, predictions)
    rec = rl.recall(test_index, predictions)
    # The F-score is derived from the confusion matrix computed above.
    f1 = rl.fscore(cm)

    LOGGER.info('Precision: %f - Recall: %f - F-score: %f', prec, rec, f1)
    LOGGER.info('Confusion matrix: %s', cm)

    return prec, rec, f1, cm
def log_quality_results(logger, result, true_links, total_pairs, params=None):
    """Log and persist quality metrics for a set of predicted links.

    Args:
        logger: Logger used for output; its ``name`` is also passed to
            ``write_results``.
        result: The predicted links.
        true_links: The known true links.
        total_pairs: Total number of pairs, used for the confusion
            matrix and the accuracy.
        params: Optional parameter description that is logged and
            passed to ``write_results``.

    Returns:
        The F-score, or ``0.0`` when a metric raised
        ``ZeroDivisionError`` before the F-score was computed.
    """
    logger.info("Number of Results %d", len(result))
    logger.info(
        "Confusion Matrix %s",
        recordlinkage.confusion_matrix(true_links, result, total_pairs))

    # Pre-bind the return value: without this, a ZeroDivisionError raised
    # by the F-score computation would leave ``fscore`` unbound and the
    # final ``return`` would fail with UnboundLocalError.
    fscore = 0.0
    try:
        fscore = recordlinkage.fscore(true_links, result)
        accuracy = recordlinkage.accuracy(true_links, result, total_pairs)
        precision = recordlinkage.precision(true_links, result)
        recall = recordlinkage.recall(true_links, result)

        logger.info("FScore: %.2f Accuracy : %.2f", fscore, accuracy)
        logger.info("Precision: %.2f Recall %.2f", precision, recall)
        logger.info("For params : %s", str(params))

        write_results(logger.name, fscore, accuracy, precision, recall,
                      params)
    except ZeroDivisionError:
        logger.error("ZeroDivisionError!!")

    return fscore
def cross_val_score(classifier, comparison_vector, link_true, cv=5,
                    method='fscore'):
    """Score a record-linkage classifier with stratified k-fold CV.

    Args:
        classifier: Classifier exposing ``fit(vectors, match_index)`` and
            ``predict(vectors)``. A deep copy is trained for every fold,
            so the object passed in is never fitted.
        comparison_vector: DataFrame of comparison vectors, indexed by
            record pair.
        link_true: Object whose ``index`` holds the true links.
        cv: Number of folds.
        method: One of ``'fscore'``, ``'precision'``, ``'recall'``,
            ``'accuracy'`` or ``'specificity'``.

    Returns:
        A numpy array with one score per fold.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Metrics that only need the true and the predicted links ...
    link_metrics = {
        'fscore': recordlinkage.fscore,
        'precision': recordlinkage.precision,
        'recall': recordlinkage.recall,
    }
    # ... and metrics that also need the total number of pairs.
    total_metrics = {
        'accuracy': recordlinkage.accuracy,
        'specificity': recordlinkage.specificity,
    }
    if method not in link_metrics and method not in total_metrics:
        # Fail before any training; previously an unknown method surfaced
        # as an unbound ``score`` after the first fold had been fitted.
        raise ValueError("unknown scoring method: {!r}".format(method))

    skfolds = StratifiedKFold(n_splits=cv)
    true_index = link_true.index

    # Binary target used only to stratify the folds. ``intersection`` is
    # used instead of ``&``, which no longer means set intersection for
    # indexes in current pandas.
    y = pandas.Series(0, index=comparison_vector.index)
    y.loc[true_index.intersection(comparison_vector.index)] = 1

    X_train = comparison_vector.values
    y_train = y.values

    scores = []
    for train_index, test_index in skfolds.split(X_train, y_train):
        classifier_copy = copy.deepcopy(classifier)

        X_train_folds = comparison_vector.iloc[train_index]
        X_test_folds = comparison_vector.iloc[test_index]
        y_train_folds = X_train_folds.index.intersection(true_index)
        y_test_folds = X_test_folds.index.intersection(true_index)

        # Train on this fold's training pairs, predict its test pairs.
        classifier_copy.fit(X_train_folds, y_train_folds)
        y_pred = classifier_copy.predict(X_test_folds)

        if method in link_metrics:
            score = link_metrics[method](y_test_folds, y_pred)
        else:
            # Only the test fold was classified, so the total number of
            # pairs is the fold size; using the full dataset size would
            # inflate the true-negative count.
            score = total_metrics[method](y_test_folds, y_pred,
                                          len(X_test_folds))
        scores.append(score)

    return numpy.array(scores)
def diagnose_links(true_links, pred_links, total_n_links, similarity_df=None):
    """Print a diagnostic report comparing predicted links to true links.

    The report contains the confusion matrix (absolute and as percentages
    of ``total_n_links``), recall-oriented counts, precision-oriented
    counts, the F1 score and, optionally, the ROC AUC of the similarity
    scores.

    Args:
        true_links: Index of the known true links.
        pred_links: Index of the predicted links.
        total_n_links: Total number of pairs, used for the confusion
            matrix and its percentages.
        similarity_df: Optional frame indexed by pair with a
            ``"similarity_score"`` column; when given, the ROC AUC of
            that column against the true links is printed.
    """
    confusion_mat = rl.confusion_matrix(true_links, pred_links,
                                        total=total_n_links)
    print("Confusion Matrix")
    print(confusion_mat)
    # Heading typo ("Comfusion") fixed.
    print("Confusion Matrix (percentages)")
    print(confusion_mat * 100 / total_n_links)
    print("")

    # Boolean mask over the true links: True where the link was predicted.
    matched_true_links = true_links.isin(pred_links)
    print("Recall Metrics:")
    print("Num of True Links: {:,}".format(len(true_links)))
    print("Num of True Links Matched: {:,}".format(matched_true_links.sum()))
    print("Num of True Links Unmatched: {:,}".format(
        len(matched_true_links) - matched_true_links.sum()))
    print("Percent of True Links Matched: {:.2f}%".format(
        matched_true_links.mean() * 100))
    print("Percent of True Links Unmatched: {:.2f}%".format(
        (1 - matched_true_links.mean()) * 100))
    print("")

    # Boolean mask over the predictions: True where the prediction is a
    # true link.
    correct_predictions = pred_links.isin(true_links)
    print("Precision Metrics:")
    print("Num of Predicted Matches: {:,}".format(len(pred_links)))
    print("Num of Correct Predicted Matches: {:,}".format(
        correct_predictions.sum()))
    print("Num of Incorrect Predicted Matches: {:,}".format(
        len(correct_predictions) - correct_predictions.sum()))
    print("Percent of Predictions which are Correct: {:.2f}%".format(
        correct_predictions.mean() * 100))
    print("Percent of Predictions which are Incorrect: {:.2f}%".format(
        (1 - correct_predictions.mean()) * 100))
    print("")

    f1_score = rl.fscore(true_links, pred_links)
    print("F1 Score is {:.2f}%".format(f1_score * 100))

    if similarity_df is not None:
        is_true_link = similarity_df.index.isin(true_links)
        auc = roc_auc_score(y_true=is_true_link,
                            y_score=similarity_df["similarity_score"])
        print("AUC of ROC of Similarity Scores is {:.2f}%".format(auc * 100))
def print_experiment_evaluation(matches, description):
    """Print precision, recall, F-score and the confusion matrix.

    The metrics are measured against the module-level ``links_test`` and
    default to 0 when ``matches`` is empty.

    Returns:
        A ``(precision, recall, fscore)`` tuple.
    """
    precision = recall = fscore = 0

    if len(matches) > 0:
        precision = recordlinkage.precision(links_test, matches)
        recall = recordlinkage.recall(links_test, matches)
        # The F-score is undefined when precision and recall are both 0.
        if recall + precision > 0:
            fscore = recordlinkage.fscore(links_test, matches)
        else:
            fscore = 0

    report_lines = (
        f"Configuration: {description}",
        f"Precision: {precision}",
        f"Recall: {recall}",
        f"F-score: {fscore}",
    )
    for report_line in report_lines:
        print(report_line)
    print(recordlinkage.confusion_matrix(links_test, matches))

    return precision, recall, fscore
def evalution(X_data, links_true):
    """Fit a logistic regression classifier and print its quality metrics.

    The classifier is trained and evaluated on the same ``X_data``; the
    confusion matrix, F-score, recall and precision are printed.
    """
    # Classify with the logistic regression classifier.
    classifier = recordlinkage.LogisticRegressionClassifier()
    classifier.fit(X_data, links_true)

    # Predict with the fitted model.
    links_pred = classifier.predict(X_data)
    print("links_pred:{}".format(links_pred.shape))

    # Output the confusion matrix.
    conf_matrix = recordlinkage.confusion_matrix(
        links_true, links_pred, total=len(X_data))
    print("Confusion matrix:\n", conf_matrix)

    # F-score, computed from the confusion matrix.
    print('fscore', recordlinkage.fscore(conf_matrix))

    # Recall of this classification.
    print('recall', recordlinkage.recall(links_true, links_pred))

    # Precision of this classification.
    print('precision', recordlinkage.precision(links_true, links_pred))
def metrics(links_true, links_pred, comparison_vector):
    """Compute the confusion matrix, precision, recall and F-score.

    Args:
        links_true: The known true links.
        links_pred: The predicted links.
        comparison_vector: The compared pairs; its length is used as the
            total for the confusion matrix.

    Returns:
        A ``(matrix, precision, recall, fscore)`` tuple, or
        ``(0, 0, 0, 0)`` when ``links_pred`` is empty.
    """
    # ``len`` rather than truthiness: index-like objects do not define
    # an unambiguous truth value.
    if len(links_pred) == 0:
        return 0, 0, 0, 0

    # confusion matrix
    matrix = recordlinkage.confusion_matrix(links_true, links_pred,
                                            len(comparison_vector))
    # precision
    precision = recordlinkage.precision(links_true, links_pred)
    # recall
    recall = recordlinkage.recall(links_true, links_pred)

    # The F-score raises ZeroDivisionError when precision and recall are
    # both zero (predictions exist but none is correct); report 0 then.
    if precision + recall > 0:
        fscore = recordlinkage.fscore(links_true, links_pred)
    else:
        fscore = 0

    return matrix, precision, recall, fscore
def get_optimal_threshold(result_prob, true_pairs, min_threshold=0.1,
                          max_threshold=1.0, step=0.05):
    """Scan thresholds and return the one with the highest F-score.

    Thresholds are scanned on a 0.01 grid from ``min_threshold``
    (inclusive) to ``max_threshold`` (exclusive) in increments of
    ``step``. For each threshold, the triples of ``result_prob`` whose
    third element is ``<= threshold`` are taken as predicted links.

    Args:
        result_prob: Re-iterable collection of ``(e1, e2, d)`` triples.
            It is iterated once per threshold, so it must not be a
            one-shot iterator.
        true_pairs: The true links, as tuples or as a ``MultiIndex``.
        min_threshold: First threshold to try.
        max_threshold: Upper bound of the scan (not itself tried).
        step: Distance between thresholds; rounded to the 0.01 grid.

    Returns:
        ``(optimal_threshold, max_fscore)``. On ties the larger threshold
        wins. ``(0, 0.0)`` is returned when no threshold gave a score.

    Raises:
        ValueError: If ``step`` rounds to zero on the 0.01 grid.
    """
    logger = get_logger('RL.OPTIMAL_THRESHOLD')
    max_fscore = 0.0
    optimal_threshold = 0

    # ``round`` instead of bare ``int``: e.g. 0.29 * 100 is
    # 28.999999999999996, which truncates to 28.
    start = int(round(min_threshold * 100))
    stop = int(round(max_threshold * 100))
    stride = int(round(step * 100))
    if stride == 0:
        raise ValueError(
            "step must be at least 0.01, got {!r}".format(step))

    for percent in range(start, stop, stride):
        threshold = percent / 100.0
        result = [(e1, e2) for (e1, e2, d) in result_prob if d <= threshold]
        if not result:
            logger.info("No results for threshold: %.2f", threshold)
            continue

        result = pd.MultiIndex.from_tuples(result)
        # Convert the true pairs once, and only when first needed,
        # instead of rebuilding the index on every iteration.
        if not isinstance(true_pairs, pd.MultiIndex):
            true_pairs = pd.MultiIndex.from_tuples(true_pairs)

        try:
            fscore = recordlinkage.fscore(true_pairs, result)
        except ZeroDivisionError:
            logger.info("ZeroDivisionError in recordlinkage.fscore")
            continue

        logger.debug("For threshold: %f fscore: %f", threshold, fscore)
        if fscore >= max_fscore:
            max_fscore = fscore
            optimal_threshold = threshold

    logger.info("Found optimal threshold %f with max_fscore: %f",
                optimal_threshold, max_fscore)
    return (optimal_threshold, max_fscore)
# Train a logistic regression classifier on a random sample of the
# comparison vectors, evaluate it on the full set, then build a second
# classifier from hand-picked coefficients.
COMPARISON_INDEX_CSV = (
    '/home/jake/Documents/matching/functions/comparison_index.csv')

data = pandas.read_csv(COMPARISON_INDEX_CSV, index_col=['sku_1', 'sku_2'])

# Shuffle all rows and keep the first 5000 as the training ("golden") set.
golden_pairs = data.sample(frac=1)
golden_pairs = golden_pairs[0:5000]

# Known matches that fall inside the golden set. ``intersection`` is used
# instead of ``&``, which no longer means set intersection for indexes in
# current pandas.
golden_matches_index = golden_pairs.index.intersection(matches)
print(golden_matches_index)

# Full, unshuffled set of comparison vectors used for prediction.
data_2 = pandas.read_csv(COMPARISON_INDEX_CSV, index_col=['sku_1', 'sku_2'])

logreg = recordlinkage.LogisticRegressionClassifier()
logreg.fit(golden_pairs, golden_matches_index)
print ("Intercept: ", logreg.intercept)
print ("Coefficients: ", logreg.coefficients)

result_logreg = logreg.predict(data_2)
print(len(result_logreg))
print(result_logreg)
print(recordlinkage.confusion_matrix(matches, result_logreg, len(data_2)))
print(recordlinkage.fscore(matches, result_logreg))

# Classifier with fixed, manually chosen parameters.
coefficients = [2, -0.08400654, -0.41432631, -0.12138752, -0.31617086,
                -0.42389099, -0.33185166, 0.02173983, 0]
predicter = recordlinkage.LogisticRegressionClassifier(
    coefficients=coefficients, intercept=-5.379865263857996)
y = predicter.predict(data_2)
print('feature shape', features.shape)

# Logistic regression with fixed parameters: this classifier is
# equivalent to the deterministic record linkage approach.
intercept = -9.5
coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5]

print('Deterministic classifier')
print('intercept', intercept)
print('coefficients', coefficients)

logreg = rl.LogisticRegressionClassifier(
    coefficients=coefficients,
    intercept=intercept,
)
links = logreg.predict(features)
print(len(links), 'links/matches')

# Confusion matrix over all candidate links.
conf_logreg = rl.confusion_matrix(
    true_links,
    links,
    len(candidate_links),
)
print('confusion matrix')
print(conf_logreg)

# F-score (from the confusion matrix), then recall and precision.
fscore = rl.fscore(conf_logreg)
print('fscore', fscore)

recall = rl.recall(true_links, links)
print('recall', recall)

precision = rl.precision(true_links, links)
print('precision', precision)
print('Num. of many-to-many predicted links: {}'.format(len(links_pred)))

# Keep, for each Twitter user, only the predicted link with the highest
# match probability.
links_prob = classifier.prob(comparison_vectors)
links_prob = links_prob[links_prob.index.isin(links_pred.values)]
links_prob = links_prob.to_frame()
links_prob.index.names = ['index_twitter', 'index_voter']
links_prob.columns = ['match_prob']

# Sort by probability so that drop_duplicates retains the best row per
# Twitter index, then restore the pair index.
links_prob = (
    links_prob
    .reset_index()
    .sort_values('match_prob', ascending=False)
    .drop_duplicates('index_twitter')
    .set_index(['index_twitter', 'index_voter'])
)
links_pred = links_prob.index
print('Num. of many-to-one predicted links: {}'.format(len(links_pred)))

cm = recordlinkage.confusion_matrix(
    links_true, links_pred, total=len(df_twitter) * len(df_voter))
print('TP: {}\nFN: {}\nFP: {}\nTN: {}\n'.format(*cm[0], *cm[1]))

# F-score (from the confusion matrix), then recall and precision.
fscore = recordlinkage.fscore(cm)
print('F-score: {:.2f}'.format(fscore))

recall = recordlinkage.recall(links_true, links_pred)
print('Recall: {:.2f}'.format(recall))

precision = recordlinkage.precision(links_true, links_pred)
print('Precision: {:.2f}'.format(precision))

print(classifier.log_weights)
cl = rl.NaiveBayesClassifier()
cl.fit(X_data, links_true)

# Show the trained parameters (m, u and p) and the derived weights.
# Attributes are looked up one at a time, right before being printed.
for label, attr in (
        ("p probability P(Match):", "p"),
        ("m probabilities P(x_i=1|Match):", "m_probs"),
        ("u probabilities P(x_i=1|Non-Match):", "u_probs"),
        ("log m probabilities P(x_i=1|Match):", "log_m_probs"),
        ("log u probabilities P(x_i=1|Non-Match):", "log_u_probs"),
        ("log weights of features:", "log_weights"),
        ("weights of features:", "weights"),
):
    print(label, getattr(cl, attr))

# Evaluate the model on the data it was trained on.
links_pred = cl.predict(X_data)
print("Predicted number of links:", len(links_pred))

cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data))
print("Confusion matrix:\n", cm)

# F-score (from the confusion matrix), then recall and precision.
fscore = rl.fscore(cm)
print('fscore', fscore)

recall = rl.recall(links_true, links_pred)
print('recall', recall)

precision = rl.precision(links_true, links_pred)
print('precision', precision)

# Match probability for each pair in the dataset.
probs = cl.prob(X_data)
def test_fscore(self):
    """F-score is 1.0 for CONF_M1 and undefined (raises) for CONF_M5."""
    perfect_score = recordlinkage.fscore(CONF_M1)
    self.assertEqual(perfect_score, 1.0)

    # CONF_M5 must make the F-score computation divide by zero.
    with self.assertRaises(ZeroDivisionError):
        recordlinkage.fscore(CONF_M5)
import recordlinkage as rl
from recordlinkage.datasets import load_krebsregister

# Load the comparison vectors and the true links, with missing values
# replaced by 0.
krebs_X, krebs_true_links = load_krebsregister(missing_values=0)
print(krebs_true_links)

# Fit the ECM classifier (binarize setting 0.8) and predict in one step.
ecm = rl.ECMClassifier(binarize=0.8)
result_ecm = ecm.fit_predict(krebs_X)

# Bare expression kept from the original; its value is discarded when
# run as a script.
len(result_ecm)

total_pairs = len(krebs_X)
print(rl.confusion_matrix(krebs_true_links, result_ecm, total_pairs))

# F-score of this classification.
print(rl.fscore(krebs_true_links, result_ecm))

print(ecm.log_weights)
# Index of the known matches.
match_index = matches.index

# Training set: the first 2,000,000 rows of the sample.
golden_pairs = data_sample[0:2000000]

# Known matches inside the training set. ``intersection`` is used instead
# of ``&``, which no longer means set intersection for indexes in current
# pandas.
golden_matches_index = golden_pairs.index.intersection(match_index)

# Train the classifier.
# NOTE(review): ``learn`` appears to be deprecated in newer recordlinkage
# releases in favour of ``fit``/``fit_predict`` - confirm against the
# installed version before changing.
svm = rl.SVMClassifier()
svm.learn(golden_pairs, golden_matches_index)

# Predict the match status for all record pairs.
result_svm = svm.predict(data)

# The bare expressions below are kept from the original; they only
# display a value in an interactive session.
len(result_svm)

# Confusion matrix of the prediction against the known matches.
conf_svm = rl.confusion_matrix(match_index, result_svm, len(data))
conf_svm

# The F-score for this classification.
rl.fscore(conf_svm)

m_last = pd.DataFrame(result_svm)

# Load the source data for manual review.
dfB = pd.read_csv('for_linkage_data1.csv', sep=',', encoding='utf-8')

# Frame of predicted matches to review.
m_last

# Look up individual records of the reviewed matches.
# NOTE(review): dfB is read without ``index_col``, so these label lookups
# presumably need the ID column set as index first - verify.
dfB.loc[['LID000000000', 'LID000157274', 'LID000217044', 'LID000491999',
         'LID000558481', 'LID000589541']]