def test_fast_strong_cluster_large(self):
     """Every record in both the train and test split receives a cluster label."""
     full_db = Database('test_annotations_10000_cleaned.csv', max_records=1000, header_path='test_annotations_10000_cleaned_header.csv')
     train_db = full_db.sample_and_remove(800)
     test_db = full_db
     # Same check for each split: one label per record
     for db in (train_db, test_db):
         labels = fast_strong_cluster(db)
         self.assertEqual(len(labels), len(db.records))
 def test_test(self):
     database = Database('test_annotations_10000_cleaned.csv', header_path='test_annotations_10000_cleaned_header.csv')
     database_train = database.sample_and_remove(5000)
     database_test = database
     labels_train = fast_strong_cluster(database_train)
     labels_test = fast_strong_cluster(database_test)
     train_seed = generate_pair_seed(database_train, labels_train, 0.5)
     match_function = LogisticMatchFunction(database_train, labels_train, train_seed, 0.7)
     roc = match_function.test(database_test, labels_test, 0.5)
     roc.make_plot()
 def test_merge_duped_records(self):
     """
     Merges all entities containing the same record identifier
     """
     strong_clusters = fast_strong_cluster(self._database)
     database_copy = deepcopy(self._database)
     database_copy.merge(strong_clusters)
     self._er._match_function = self._match_function
     records = set()
     for _, record in database_copy.records.iteritems():
         records.add(record)
     swooshed = self._er.rswoosh(records)
     # Compare to manually constructed clusters with duplicates
     r0 = self._database.records[0]
     r1 = self._database.records[1]
     r2 = self._database.records[2]
     r3 = self._database.records[3]
     r0.merge(r1)
     r1.merge(r3)
     premerge = {0: r0,
                 1: r1,
                 2: r2,
                 3: r3}
     merged = merge_duped_records(premerge)
     self.assertEqual(len(merged), len(swooshed))
     self.assertTrue(test_object_set(merged, swooshed))
 def test_completeness(self):
     database = Database('test_annotations_10000_cleaned.csv', max_records=1000, header_path='test_annotations_10000_cleaned_header.csv')
     database_train = database.sample_and_remove(800)
     database_test = database
     labels_train = fast_strong_cluster(database_train)
     labels_test = fast_strong_cluster(database_test)
     er = EntityResolution()
     pair_seed = generate_pair_seed(database_train, labels_train, 0.5)
     match_function = LogisticMatchFunction(database_train, labels_train, pair_seed, 0.99)
     blocking_scheme = BlockingScheme(database_test)
     labels_pred = er.run(database_test, match_function, blocking_scheme, cores=2)
     number_fast_strong_records = len(labels_train) + len(labels_test)
     self.assertEqual(number_fast_strong_records, 1000)
     self.assertEqual(sorted((labels_train.keys() + labels_test.keys())), range(0, 1000))
     number_swoosh_records = len(get_ids(database_test.records))
     self.assertEqual(number_swoosh_records, len(database_test.records))
     self.assertEqual(get_ids(database_test.records), sorted(labels_test.keys()))
     self.assertEqual(get_ids(database_test.records), sorted(labels_pred.keys()))
 def setUp(self):
     self._test_path = 'test_annotations_cleaned.csv'
     self._database = Database(self._test_path)
     self._labels = fast_strong_cluster(self._database)
     self._blocking = BlockingScheme(self._database, single_block=True)
     self._er = EntityResolution()
     decision_threshold = 1.0
     pair_seed = generate_pair_seed(self._database, self._labels, 0.5)
     self._match_function = LogisticMatchFunction(self._database, self._labels, pair_seed, decision_threshold)
 def test_fast_strong_cluster(self):
     labels_pred = fast_strong_cluster(self._database)
     labels_true = {
         0: 0,
         1: 0,
         2: 1,
         3: 0
     }
     self.assertEqual(labels_pred, labels_true)
 def test_pairs(self):
     database = Database('test_annotations_10000_cleaned.csv', header_path='test_annotations_10000_cleaned_header.csv')
     labels = fast_strong_cluster(database)
     pair_seed = generate_pair_seed(database, labels, 0.5)
     # x1_a, x2_a, m_a = _get_pairs(database, labels, 10, balancing=True)
     # x1_b, x2_b, m_b = _get_pairs(database, labels, 10, balancing=True)
     # self.assertNotEqual(x1_a, x1_b)
     # self.assertNotEqual(x2_a, x2_b)
     # self.assertNotEqual(m_a, m_b)
     x1_a, x2_a, m_a = get_pairwise_features(database, labels, pair_seed)
     x1_b, x2_b, m_b = get_pairwise_features(database, labels, pair_seed)
     np.testing.assert_array_equal(x1_a, x1_b)
     np.testing.assert_array_equal(x2_a, x2_b)
     np.testing.assert_array_equal(m_a, m_b)
 def test_run(self):
     strong_clusters = fast_strong_cluster(self._database)
     database_copy = deepcopy(self._database)
     database_copy.merge(strong_clusters)
     blocking = BlockingScheme(database_copy, single_block=True)
     labels = self._er.run(database_copy, self._match_function, blocking, cores=2)
     database_copy.merge(labels)
     entities = set()
     for _, entity in database_copy.records.iteritems():
         entities.add(entity)
     r0 = self._database.records[0]
     r1 = self._database.records[1]
     r2 = self._database.records[2]
     r3 = self._database.records[3]
     r0.merge(r1)
     r0.merge(r3)
     manual = {r0, r2}
     self.assertTrue(test_object_set(manual, entities))
 def test_rswoosh(self):
     strong_clusters = fast_strong_cluster(self._database)
     database_copy = deepcopy(self._database)
     database_copy.merge(strong_clusters)
     records = set()
     for _, record in database_copy.records.iteritems():
         records.add(record)
     self._er._match_function = self._match_function
     swooshed = self._er.rswoosh(records)
     # Compare to manually merged records
     r0 = self._database.records[0]
     r1 = self._database.records[1]
     r2 = self._database.records[2]
     r3 = self._database.records[3]
     r1.merge(r3)
     r0.merge(r1)
     merged = {r0, r2}
     self.assertEqual(len(swooshed), len(merged))
     self.assertTrue(test_object_set(merged, swooshed))
 def setUp(self):
     self._database = Database('test_annotations_cleaned.csv')
     labels = fast_strong_cluster(self._database)
     pair_seed = generate_pair_seed(self._database, labels, 0.5)
     self._blocking = BlockingScheme(self._database)
     self._match_function = LogisticMatchFunction(self._database, labels, pair_seed, 0.5)