Ejemplo n.º 1
0
 def rswoosh(self, I):
     """
     RSwoosh - Benjelloun et al. 2009
     Performs entity resolution on any set of records using merge and match functions.

     Records are popped from I one at a time and compared against the resolved set. On the
     first match, merge() is called on the popped record with the matched record, the popped
     record is put back into I and the matched record is removed from the resolved set.
     A record with no match joins the resolved set.
     :param I: Set of input records. Consumed in place: it is empty when this method returns
     :return Inew: Set of resolved entities (records)
     """
     Inew = set()  # resolved entities found so far
     while I:  # until entity resolution is complete
         currentrecord = I.pop()  # an arbitrary record
         # None (not False + truthiness) marks "no match", so a record whose own truth
         # value happens to be falsy can never be mistaken for a missing match.
         buddy = None
         prob = None
         for rnew in Inew:
             match, prob = self._match_function.match(currentrecord, rnew)
             if match:
                 buddy = rnew
                 break  # Found a match!
         if buddy is None:
             Inew.add(currentrecord)
             continue
         # Single-argument print calls emit the same text under Python 2 and Python 3.
         print('Merging records with P(match) =  %s' % (prob,))
         print('   x2:  %s' % (get_weak_pairwise_features(currentrecord, buddy),))
         currentrecord.display(indent='  ')
         print('   ----')
         buddy.display(indent='  ')
         currentrecord.merge(buddy)
         I.add(currentrecord)  # re-queued so it is compared against the resolved set again
         Inew.discard(buddy)
     return Inew
Ejemplo n.º 2
0
 def rswoosh(self, I):
     """
     RSwoosh - Benjelloun et al. 2009
     Performs entity resolution on any set of records using merge and match functions.

     Each iteration pops one record from I and looks for the first record in the resolved
     set that the match function accepts. If one is found, merge() is called on the popped
     record with it, the popped record goes back into I and the matched record leaves the
     resolved set. If none is found, the popped record is added to the resolved set.
     :param I: Set of input records. Consumed in place: it is empty when this method returns
     :return Inew: Set of resolved entities (records)
     """
     Inew = set()  # resolved entities found so far
     while I:  # until entity resolution is complete
         currentrecord = I.pop()  # an arbitrary record
         # Use None as the "no match" marker instead of relying on the truth value of
         # the matched record itself.
         buddy = None
         prob = None
         for rnew in Inew:
             match, prob = self._match_function.match(currentrecord, rnew)
             if match:
                 buddy = rnew
                 break  # Found a match!
         if buddy is None:
             Inew.add(currentrecord)
             continue
         # Single-argument print calls emit the same text under Python 2 and Python 3.
         print('Merging records with P(match) =  %s' % (prob,))
         print('   x2:  %s' % (get_weak_pairwise_features(currentrecord, buddy),))
         currentrecord.display(indent='  ')
         print('   ----')
         buddy.display(indent='  ')
         currentrecord.merge(buddy)
         I.add(currentrecord)  # re-queued so it is compared against the resolved set again
         Inew.discard(buddy)
     return Inew
 def match(self, r1, r2):
     """
     Determines if two records match
     :param r1: Record object
     :param r2: Record object
     :return match: False or True, whether r1 and r2 match
     :return prob: Probability of weak match
     """
     # The result of predict_proba is indexed as [row, column], i.e. the classifier is
     # treated as taking a 2-D (samples x features) input. Promote the feature vector to
     # a single row so that a 1-D vector is accepted as well (assumes a scikit-learn
     # style classifier - confirm against where self._classifier is built).
     x = np.atleast_2d(get_weak_pairwise_features(r1, r2))
     np.copyto(x, self._x_mean, where=np.isnan(x))  # mean imputation
     prob = self._classifier.predict_proba(x)[0, 1]
     # bool() keeps the returned flag a plain Python bool on both code paths
     match = bool(prob >= self._decision_threshold)
     if r1 == r2:
         match = True  # if records are the same, to satisfy Idempotence property
     return match, prob
Ejemplo n.º 4
0
 def match(self, r1, r2):
     """
     Determines if two records match
     :param r1: Record object
     :param r2: Record object
     :return x1_hat: False or True, whether r1 and r2 match
     :return p_x1: Probability of weak match
     """
     # predict_proba's result is indexed as [row, column], so the model is treated as
     # taking a 2-D (samples x features) input. Promote the feature vector to one row so
     # a 1-D vector is accepted too (assumes a scikit-learn style model - confirm against
     # where self._logreg is built).
     x2 = np.atleast_2d(get_weak_pairwise_features(r1, r2))
     np.copyto(x2, self._x_mean, where=np.isnan(x2))  # mean imputation
     p_x1 = self._logreg.predict_proba(x2)[0, 1]
     # Strictly greater than the threshold; bool() keeps the flag a plain Python bool
     x1_hat = bool(p_x1 > self._decision_threshold)
     if r1 == r2:
         x1_hat = True  # if records are the same, to satisfy Idempotence property
     return x1_hat, p_x1
 def batch_match(self, records):
     """
     Batch mode of match
     :param records: List of pairs of records
     :return match: List of booleans, whether the corresponding record tuple matches.
                    A pair whose two records compare equal is always reported as a match
     :return prob: Probability of weak match, one entry per pair (empty list if no pairs)
     """
     n = len(records)
     if n == 0:
         return [], []
     m = records[0][0].feature_descriptor.number_weak
     X = np.empty([n, m])
     idempotence = list()
     for i, pair in enumerate(records):
         r1 = pair[0]
         r2 = pair[1]
         idempotence.append(r1 == r2)
         X[i, :] = get_weak_pairwise_features(r1, r2)
     np.copyto(X, self._x_mean, where=np.isnan(X))  # mean imputation
     prob = self._classifier.predict_proba(X)[:, 1]
     # Element-wise OR. A plain `list_a or list_b` just returns list_a whenever it is
     # non-empty, which would silently drop the identical-record override.
     above_threshold = prob >= self._decision_threshold
     match = list(np.logical_or(above_threshold, np.asarray(idempotence, dtype=bool)))
     return match, prob
 def test_get_x2(self):
     # Weak pairwise features of the first database record compared with itself.
     # Labels carried over from the original per-index notes (the bracketed numbers are
     # presumably feature ids - confirm against the feature definitions):
     #   0: [1] binary match   1: [2] date diff    2: [3] bin        3: [4] bin
     #   4: [7] bin            5: [8] num diff     6: [9] bin        7: [10] num diff
     #   8: [11] num diff      9: [12] bin        10: [13] num diff  11: [14] num diff
     #  12: [15] num diff     13: [16] bin        14: [17] bin       15: [18] bin
     #  16: [19] bin          17: [24] bin        18: [25] bin       19: [26] number matches
     record = self._database.records[0]
     features = get_weak_pairwise_features(record, record)
     # Entries 0-5 are expected to be exactly zero
     for index in range(6):
         self.assertEqual(features[index], 0)
     # Entries 6-18 are expected to be NaN
     for index in range(6, 19):
         self.assertTrue(isnan(features[index]))
     # Entry 19: number of matches
     self.assertEqual(features[19], np.exp(-3))