def test_accuracy(self):
    """Accuracy from (links, predictions, total) equals accuracy from the confusion matrix."""
    expected = 4 / 9
    matrix = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
    # Both calling conventions of rl.accuracy must agree on the same data.
    assert rl.accuracy(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) == expected
    assert rl.accuracy(matrix) == expected
def test_accuracy(self):
    """Accuracy computed from raw links matches accuracy computed from the confusion matrix."""
    expected = 4 / 9
    matrix = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
    # Same metric, two entry points: (true, pred, total) and a precomputed matrix.
    self.assertEqual(rl.accuracy(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)), expected)
    self.assertEqual(rl.accuracy(matrix), expected)
def cross_val_score(classifier, comparison_vector, link_true, cv=5, method='fscore'):
    """Stratified k-fold cross-validation of a record-linkage classifier.

    Each fold trains a deep copy of *classifier* on the fold's comparison
    vectors (true links are the rows of the fold whose index also appears in
    ``link_true.index``) and scores its predictions on the held-out fold.

    Parameters
    ----------
    classifier : object
        A recordlinkage-style classifier exposing ``fit`` and ``predict``.
        It is deep-copied per fold, so the caller's instance is untouched.
    comparison_vector : pandas.DataFrame
        Comparison vectors; the (Multi)Index identifies candidate pairs.
    link_true : pandas.DataFrame or pandas.MultiIndex holder
        Object whose ``.index`` contains the true link pairs.
    cv : int, default 5
        Number of stratified folds.
    method : {'fscore', 'precision', 'recall', 'accuracy', 'specificity'}
        Metric computed on each held-out fold.

    Returns
    -------
    numpy.ndarray
        One score per fold.

    Raises
    ------
    ValueError
        If *method* is not one of the supported metric names (the original
        code silently left ``score`` unbound and crashed with NameError).
    """
    skfolds = StratifiedKFold(n_splits=cv)

    # Binary stratification target: 1 where the pair is a true link.
    # Index.intersection() replaces the deprecated `idx1 & idx2` set-op.
    y = pandas.Series(0, index=comparison_vector.index)
    y.loc[link_true.index.intersection(comparison_vector.index)] = 1

    X_train = comparison_vector.values
    y_train = y.values

    scores = []
    for train_index, test_index in skfolds.split(X_train, y_train):
        # Fresh copy so folds do not leak fitted state into each other.
        classifier_copy = copy.deepcopy(classifier)

        X_train_folds = comparison_vector.iloc[train_index]
        X_test_folds = comparison_vector.iloc[test_index]

        # True-link MultiIndexes restricted to each fold's candidate pairs.
        y_train_folds = X_train_folds.index.intersection(link_true.index)
        y_test_folds = X_test_folds.index.intersection(link_true.index)

        classifier_copy.fit(X_train_folds, y_train_folds)
        y_pred = classifier_copy.predict(X_test_folds)

        if method == 'fscore':
            score = recordlinkage.fscore(y_test_folds, y_pred)
        elif method == 'precision':
            score = recordlinkage.precision(y_test_folds, y_pred)
        elif method == 'recall':
            score = recordlinkage.recall(y_test_folds, y_pred)
        elif method == 'accuracy':
            score = recordlinkage.accuracy(y_test_folds, y_pred,
                                           len(comparison_vector))
        elif method == 'specificity':
            score = recordlinkage.specificity(y_test_folds, y_pred,
                                              len(comparison_vector))
        else:
            raise ValueError(
                "unknown method %r; expected one of 'fscore', 'precision', "
                "'recall', 'accuracy', 'specificity'" % (method,))
        scores.append(score)

    return numpy.array(scores)
def log_quality_results(logger, result, true_links, total_pairs, params=None):
    """Log linkage-quality metrics for *result* against *true_links*.

    Logs the confusion matrix, f-score, accuracy, precision and recall, and
    persists them via ``write_results``. Metric computation is best-effort:
    a ZeroDivisionError (e.g. no predicted links) is logged, not raised.

    Parameters
    ----------
    logger : logging.Logger
        Destination logger; ``logger.name`` is also passed to write_results.
    result : pandas.MultiIndex
        Predicted links.
    true_links : pandas.MultiIndex
        Ground-truth links.
    total_pairs : int
        Total number of candidate pairs (needed by confusion matrix /
        accuracy).
    params : optional
        Free-form parameter description, logged and persisted alongside the
        metrics.

    Returns
    -------
    float or None
        The f-score, or None if the metrics could not be computed.
        (Previously the error path raised UnboundLocalError on ``return``.)
    """
    # Initialize so the return statement is safe even when the metric
    # computation below fails before assigning fscore.
    fscore = None
    logger.info("Number of Results %d", len(result))
    logger.info("Confusion Matrix %s", str(
        recordlinkage.confusion_matrix(true_links, result, total_pairs)))
    try:
        fscore = recordlinkage.fscore(true_links, result)
        accuracy = recordlinkage.accuracy(true_links, result, total_pairs)
        precision = recordlinkage.precision(true_links, result)
        recall = recordlinkage.recall(true_links, result)
        logger.info("FScore: %.2f Accuracy : %.2f", fscore, accuracy)
        logger.info("Precision: %.2f Recall %.2f", precision, recall)
        logger.info("For params : %s", str(params))
        write_results(logger.name, fscore, accuracy, precision, recall, params)
    except ZeroDivisionError:
        logger.error("ZeroDivisionError!!")
    return fscore
def test_accuracy(self):
    """A perfect confusion matrix scores 1.0; an all-wrong one scores 0.0."""
    for matrix, expected in ((CONF_M1, 1.0), (CONF_M5, 0.0)):
        self.assertEqual(recordlinkage.accuracy(matrix), expected)
def linkDB(df1, df2, type, classifier):
    """Link two restaurant DataFrames and return the deduplicated union.

    Pipeline: (1) build candidate pairs with the chosen indexing strategy,
    (2) compare restaurant name / neighborhood / Google address,
    (3) classify pairs with ECM or KMeans and merge matched rows,
    (4) evaluate the classifier on the first 100 candidate pairs against
    hand-made annotations read from 'result.json'.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Left/right tables; columns are prefixed '0_' and '1_' respectively
        (e.g. '0_restaurant', '1_addressGoogle').
    type : str
        Indexing strategy: "sortedneighbourhood", "full" or "block".
        NOTE(review): shadows the builtin `type`.
    classifier : str
        "ecm" or "kmeans". Any other value leaves `cm`/`acc` (and the
        classifier itself) undefined — presumably callers only pass these
        two values; verify at call sites.

    Returns
    -------
    pandas.DataFrame
        Unmatched rows of df1 and df2 plus one merged row per match,
        reindexed 0..n-1.
    """
    # 1 - INDEXING: build candidate record pairs with the requested strategy.
    indexer = recordlinkage.Index()
    if type == "sortedneighbourhood":
        indexer.sortedneighbourhood(left_on="0_restaurant", right_on="1_restaurant")
    elif type == "full":
        indexer.full()
    elif type == "block":
        indexer.block(left_on="0_addressGoogle", right_on="1_addressGoogle")
    candidate_links = indexer.index(df1, df2)
    # First 100 candidate pairs are held out for evaluation against the
    # manual annotations loaded below.
    test_pairs = candidate_links[0:100]
    #https://recordlinkage.readthedocs.io/en/latest/annotation.html
    # The string below is disabled code that once generated the annotation
    # file consumed by read_annotation_file.
    """
    df1.columns = df1.columns.str.replace(r'0_', '')
    df2.columns = df2.columns.str.replace(r'1_', '')
    recordlinkage.write_annotation_file(
        "check_matches.json",
        candidate_links[0:100],
        df1,
        df2,
        dataset_a_name="firstDF",
        dataset_b_name="secondDF")
    df1 = df1.add_prefix('0_')
    df2 = df2.add_prefix('1_')
    """
    annotations = recordlinkage.read_annotation_file('result.json')
    # 2 - COMPARISON: similarity features per candidate pair.
    comp = recordlinkage.Compare()
    comp.string('0_restaurant', '1_restaurant', threshold=0.95,
                method='jarowinkler', label='ristorante')
    comp.string('0_neighborhood', '1_neighborhood', method='jarowinkler',
                threshold=0.85, label='quartiere')
    comp.exact('0_addressGoogle', '1_addressGoogle', label='indirizzoGoogle')
    features = comp.compute(candidate_links, df1, df2)
    test_features = comp.compute(test_pairs, df1, df2)
    # 3 - CLASSIFICATION (unsupervised)
    # https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html#unsupervised
    matches = []   # merged (row-of-df1 + row-of-df2) tuples for matched pairs
    drop1 = []     # df1 indices consumed by a match
    drop2 = []     # df2 indices consumed by a match
    if classifier == "ecm":
        ecm = recordlinkage.ECMClassifier(init='jaro', binarize=None,
                                          max_iter=100, atol=0.0001,
                                          use_col_names=True)
        ecm.fit_predict(features, match_index=None)  # Train the classifier
        # predict matches for all candidate pairs
        e_matches = ecm.predict(features)
        for i, j in e_matches:
            if i not in drop1:
                drop1.append(i)
            if j not in drop2:
                drop2.append(j)
            # Concatenate the two matched records into a single wide row.
            record_1 = df1.loc[i]
            record_2 = df2.loc[j]
            record = tuple(record_1) + tuple(record_2)
            matches.append(record)
    elif classifier == "kmeans":
        kmeans = recordlinkage.KMeansClassifier()
        kmeans.fit_predict(features)
        k_matches = kmeans.predict(features)
        for i, j in k_matches:
            if i not in drop1:
                drop1.append(i)
            if j not in drop2:
                drop2.append(j)
            record_1 = df1.loc[i]
            record_2 = df2.loc[j]
            record = tuple(record_1) + tuple(record_2)
            matches.append(record)
    # Iterating a DataFrame yields its column labels, so this builds the
    # combined header for the merged-match rows.
    head = tuple(df1.head()) + tuple(df2.head())
    matches_result = pd.DataFrame(matches)
    matches_result.columns = head
    # Remove matched rows from each side, then stack: leftovers + merged rows.
    df1t = df1.drop(drop1, axis=0)
    df2t = df2.drop(drop2, axis=0)
    # NOTE(review): DataFrame.append is removed in pandas >= 2.0 — this
    # presumably targets an older pandas; confirm the pinned version.
    result = df1t.append([df2t, matches_result])
    new_index = []
    for n in range(result.shape[0]):
        new_index.append(n)
    result.index = new_index
    # 4 - EVALUATION on the 100 held-out pairs vs. the manual annotations.
    if classifier == "ecm":
        test_matches = ecm.predict(test_features)
        cm = recordlinkage.confusion_matrix(annotations.links, test_matches,
                                            total=100)
        acc = recordlinkage.accuracy(annotations.links, test_matches,
                                     total=100)
    elif classifier == "kmeans":
        test_matches = kmeans.fit_predict(test_features)
        cm = recordlinkage.confusion_matrix(annotations.links, test_matches,
                                            total=100)
        acc = recordlinkage.accuracy(annotations.links, test_matches,
                                     total=100)
    print(cm, acc)
    return result