def test_fast_strong_cluster_large(self):
    database = Database('test_annotations_10000_cleaned.csv', max_records=1000,
                        header_path='test_annotations_10000_cleaned_header.csv')
    database_train = database.sample_and_remove(800)
    database_test = database
    labels_train = fast_strong_cluster(database_train)
    labels_test = fast_strong_cluster(database_test)
    self.assertEqual(len(labels_train), len(database_train.records))
    self.assertEqual(len(labels_test), len(database_test.records))
def test_test(self):
    database = Database('test_annotations_10000_cleaned.csv',
                        header_path='test_annotations_10000_cleaned_header.csv')
    database_train = database.sample_and_remove(5000)
    database_test = database
    labels_train = fast_strong_cluster(database_train)
    labels_test = fast_strong_cluster(database_test)
    train_seed = generate_pair_seed(database_train, labels_train, 0.5)
    match_function = LogisticMatchFunction(database_train, labels_train, train_seed, 0.7)
    roc = match_function.test(database_test, labels_test, 0.5)
    roc.make_plot()
def test_merge_duped_records(self):
    """
    Merges all entities containing the same record identifier
    """
    strong_clusters = fast_strong_cluster(self._database)
    database_copy = deepcopy(self._database)
    database_copy.merge(strong_clusters)
    self._er._match_function = self._match_function
    records = set()
    for _, record in database_copy.records.iteritems():
        records.add(record)
    swooshed = self._er.rswoosh(records)
    # Compare to manually constructed clusters with duplicates
    r0 = self._database.records[0]
    r1 = self._database.records[1]
    r2 = self._database.records[2]
    r3 = self._database.records[3]
    r0.merge(r1)
    r1.merge(r3)
    premerge = {0: r0, 1: r1, 2: r2, 3: r3}
    merged = merge_duped_records(premerge)
    self.assertEqual(len(merged), len(swooshed))
    self.assertTrue(test_object_set(merged, swooshed))
def test_completeness(self):
    database = Database('test_annotations_10000_cleaned.csv', max_records=1000,
                        header_path='test_annotations_10000_cleaned_header.csv')
    database_train = database.sample_and_remove(800)
    database_test = database
    labels_train = fast_strong_cluster(database_train)
    labels_test = fast_strong_cluster(database_test)
    er = EntityResolution()
    pair_seed = generate_pair_seed(database_train, labels_train, 0.5)
    match_function = LogisticMatchFunction(database_train, labels_train, pair_seed, 0.99)
    blocking_scheme = BlockingScheme(database_test)
    labels_pred = er.run(database_test, match_function, blocking_scheme, cores=2)
    number_fast_strong_records = len(labels_train) + len(labels_test)
    self.assertEqual(number_fast_strong_records, 1000)
    self.assertEqual(sorted(labels_train.keys() + labels_test.keys()), range(0, 1000))
    number_swoosh_records = len(get_ids(database_test.records))
    self.assertEqual(number_swoosh_records, len(database_test.records))
    self.assertEqual(get_ids(database_test.records), sorted(labels_test.keys()))
    self.assertEqual(get_ids(database_test.records), sorted(labels_pred.keys()))
def setUp(self):
    self._test_path = 'test_annotations_cleaned.csv'
    self._database = Database(self._test_path)
    self._labels = fast_strong_cluster(self._database)
    self._blocking = BlockingScheme(self._database, single_block=True)
    self._er = EntityResolution()
    decision_threshold = 1.0
    pair_seed = generate_pair_seed(self._database, self._labels, 0.5)
    self._match_function = LogisticMatchFunction(self._database, self._labels, pair_seed,
                                                 decision_threshold)
def test_fast_strong_cluster(self):
    labels_pred = fast_strong_cluster(self._database)
    labels_true = {
        0: 0,
        1: 0,
        2: 1,
        3: 0
    }
    self.assertEqual(labels_pred, labels_true)
def test_pairs(self):
    database = Database('test_annotations_10000_cleaned.csv',
                        header_path='test_annotations_10000_cleaned_header.csv')
    labels = fast_strong_cluster(database)
    pair_seed = generate_pair_seed(database, labels, 0.5)
    # x1_a, x2_a, m_a = _get_pairs(database, labels, 10, balancing=True)
    # x1_b, x2_b, m_b = _get_pairs(database, labels, 10, balancing=True)
    # self.assertNotEqual(x1_a, x1_b)
    # self.assertNotEqual(x2_a, x2_b)
    # self.assertNotEqual(m_a, m_b)
    # Reusing the same pair seed should give identical pairwise features
    x1_a, x2_a, m_a = get_pairwise_features(database, labels, pair_seed)
    x1_b, x2_b, m_b = get_pairwise_features(database, labels, pair_seed)
    np.testing.assert_array_equal(x1_a, x1_b)
    np.testing.assert_array_equal(x2_a, x2_b)
    np.testing.assert_array_equal(m_a, m_b)
def test_run(self):
    strong_clusters = fast_strong_cluster(self._database)
    database_copy = deepcopy(self._database)
    database_copy.merge(strong_clusters)
    blocking = BlockingScheme(database_copy, single_block=True)
    labels = self._er.run(database_copy, self._match_function, blocking, cores=2)
    database_copy.merge(labels)
    entities = set()
    for _, entity in database_copy.records.iteritems():
        entities.add(entity)
    # Compare to manually merged records
    r0 = self._database.records[0]
    r1 = self._database.records[1]
    r2 = self._database.records[2]
    r3 = self._database.records[3]
    r0.merge(r1)
    r0.merge(r3)
    manual = {r0, r2}
    self.assertTrue(test_object_set(manual, entities))
def test_rswoosh(self):
    strong_clusters = fast_strong_cluster(self._database)
    database_copy = deepcopy(self._database)
    database_copy.merge(strong_clusters)
    records = set()
    for _, record in database_copy.records.iteritems():
        records.add(record)
    self._er._match_function = self._match_function
    swooshed = self._er.rswoosh(records)
    # Compare to manually merged records
    r0 = self._database.records[0]
    r1 = self._database.records[1]
    r2 = self._database.records[2]
    r3 = self._database.records[3]
    r1.merge(r3)
    r0.merge(r1)
    merged = {r0, r2}
    self.assertEqual(len(swooshed), len(merged))
    self.assertTrue(test_object_set(merged, swooshed))
def setUp(self):
    self._database = Database('test_annotations_cleaned.csv')
    labels = fast_strong_cluster(self._database)
    pair_seed = generate_pair_seed(self._database, labels, 0.5)
    self._blocking = BlockingScheme(self._database)
    self._match_function = LogisticMatchFunction(self._database, labels, pair_seed, 0.5)