Beispiel #1
0
    def __init__(self,
                 data_model,
                 data_1,
                 data_2,
                 blocked_proportion,
                 sample_size,
                 original_length_1,
                 original_length_2):
        '''
        Set up the learner for a pair of datasets.

        Both datasets are passed through ``core.index``, candidate
        pairs are drawn with the parent class's ``sample`` and a
        ``RecordLinkBlockLearner`` is built on those candidates.

        Arguments:

        data_model         -- stored on the instance and handed to the
                              block learner
        data_1, data_2     -- records of the first and second dataset
                              (presumably dicts keyed by record id --
                              confirm against callers)
        blocked_proportion -- forwarded unchanged to the parent ``sample``
        sample_size        -- forwarded unchanged to the parent ``sample``
        original_length_1,
        original_length_2  -- forwarded unchanged to the block learner
        '''
        self.data_model = data_model

        indexed_1 = core.index(data_1)
        # The second dataset is indexed with the size of the indexed
        # first dataset as its offset.
        indexed_2 = core.index(data_2, len(indexed_1))

        self.candidates = super().sample(
            indexed_1, indexed_2, blocked_proportion, sample_size)

        self.blocker = RecordLinkBlockLearner(
            data_model,
            self.candidates,
            indexed_1,
            indexed_2,
            original_length_1,
            original_length_2,
        )

        self._common_init()
Beispiel #2
0
    def __init__(self, data_model, data_1, data_2, blocked_proportion,
                 sample_size, original_length_1, original_length_2,
                 index_include):
        '''
        Set up the learner for a pair of datasets and seed it with labels.

        The datasets are indexed, candidate pairs are drawn with
        ``self._sample``, a block learner and an ``RLRLearner`` are
        created, and five initial labels are recorded through
        ``self.mark``.

        Arguments:

        data_model         -- stored on the instance; given to the block
                              learner and the classifier
        data_1, data_2     -- records of the first and second dataset
        blocked_proportion -- forwarded unchanged to ``self._sample``
        sample_size        -- forwarded unchanged to ``self._sample``
        original_length_1,
        original_length_2  -- forwarded unchanged to the block learner
        index_include      -- list of pairs; a copy extended with one
                              extra pair is given to the block learner
        '''
        self.data_model = data_model

        indexed_1 = core.index(data_1)
        # The second dataset is indexed with the size of the indexed
        # first dataset as its offset.
        indexed_2 = core.index(data_2, len(indexed_1))

        self.candidates = self._sample(
            indexed_1, indexed_2, blocked_proportion, sample_size)

        # Pick one candidate pair at random and pair its first record
        # with itself.
        seed_pair = random.choice(self.candidates)
        first_record = seed_pair[0]
        self_pair = (first_record, first_record)

        # Extend a copy so the caller's list is left untouched.
        pairs_to_include = index_include.copy()
        pairs_to_include.append(self_pair)

        self.blocker = RecordLinkBlockLearner(
            data_model, self.candidates, indexed_1, indexed_2,
            original_length_1, original_length_2, pairs_to_include)
        self.classifier = RLRLearner(self.data_model)
        self.classifier.candidates = self.candidates

        self._common_init()

        # Four copies of the self pair labelled 1, the random pair
        # labelled 0.
        initial_pairs = [self_pair] * 4 + [seed_pair]
        initial_labels = [1] * 4 + [0]
        self.mark(initial_pairs, initial_labels)
Beispiel #3
0
    def sample(self, data_1, data_2, sample_size=15000,
               blocked_proportion=.5, original_length_1=None,
               original_length_2=None):
        '''
        Prepare active learning from records of two datasets.

        The inputs are checked with ``self._checkData``, indexed, and a
        ``Sample`` of each dataset is stored on the instance. A new
        active learner is then created and asked to sample record
        pairs via ``sample_product``.

        Arguments:

        data_1             -- Dictionary of records from first dataset,
                              where the keys are record_ids and the
                              values are dictionaries with the keys
                              being field names
        data_2             -- Dictionary of records from second dataset,
                              same form as data_1
        sample_size        -- Size of the sample to draw
        blocked_proportion -- forwarded unchanged to ``sample_product``
        original_length_1  -- passed to ``Sample`` for the first
                              dataset; defaults to the number of indexed
                              records in data_1
        original_length_2  -- passed to ``Sample`` for the second
                              dataset; defaults to the number of indexed
                              records in data_2
        '''
        self._checkData(data_1, data_2)

        data_1 = core.index(data_1)
        size_1 = len(data_1)
        length_1 = size_1 if original_length_1 is None else original_length_1
        self.sampled_records_1 = Sample(data_1, 600, length_1)

        # The second dataset is indexed with the size of the indexed
        # first dataset as its offset.
        data_2 = core.index(data_2, size_1)
        length_2 = (len(data_2) if original_length_2 is None
                    else original_length_2)
        self.sampled_records_2 = Sample(data_2, 600, length_2)

        self.active_learner = self.ActiveLearner(self.data_model)
        self.active_learner.sample_product(
            data_1, data_2, blocked_proportion, sample_size)
Beispiel #4
0
    def sample_product(self, data_1, data_2, blocked_proportion,
                       sample_size, original_length_1=None,
                       original_length_2=None):
        '''
        Draw candidate pairs across two datasets and initialise the
        classifier and the blocker with them.

        Arguments:

        data_1, data_2     -- records of the first and second dataset
        blocked_proportion -- forwarded unchanged to the parent
                              ``sample_product``
        sample_size        -- forwarded unchanged to the parent
                              ``sample_product``
        original_length_1,
        original_length_2  -- passed as given (including None) to the
                              ``Sample`` of the respective dataset

        Returns a tuple with the ``Sample`` of the first dataset and
        the ``Sample`` of the second dataset.
        '''
        data_1 = core.index(data_1)
        # The second dataset is indexed with the size of the indexed
        # first dataset as its offset.
        data_2 = core.index(data_2, len(data_1))

        self.candidates = super().sample_product(
            data_1, data_2, blocked_proportion, sample_size)

        self.classifier._init(self.candidates)

        first_sample = Sample(data_1, 600, original_length_1)
        second_sample = Sample(data_2, 600, original_length_2)

        # Besides the two samples, only the full second dataset is
        # handed to the blocker.
        self.blocker._init_product(
            self.candidates, first_sample, second_sample, data_2)

        return first_sample, second_sample
Beispiel #5
0
    def sample(self, data_1, data_2, sample_size=15000,
               blocked_proportion=.5, original_length_1=None,
               original_length_2=None):
        '''
        Prepare active learning from records of two datasets.

        After ``self._checkData`` has accepted the inputs, both
        datasets are indexed, a ``Sample`` of each is kept on the
        instance, and a freshly created active learner samples record
        pairs through ``sample_product``.

        Arguments:

        data_1             -- Dictionary of records from first dataset,
                              where the keys are record_ids and the
                              values are dictionaries with the keys
                              being field names
        data_2             -- Dictionary of records from second dataset,
                              same form as data_1
        sample_size        -- Size of the sample to draw
        blocked_proportion -- forwarded unchanged to ``sample_product``
        original_length_1  -- passed to ``Sample`` for the first
                              dataset; when None, the number of indexed
                              records in data_1 is used
        original_length_2  -- passed to ``Sample`` for the second
                              dataset; when None, the number of indexed
                              records in data_2 is used
        '''
        self._checkData(data_1, data_2)

        data_1 = core.index(data_1)
        count_1 = len(data_1)
        if original_length_1 is None:
            original_length_1 = count_1
        self.sampled_records_1 = Sample(data_1, 600, original_length_1)

        # The size of the indexed first dataset is the offset used
        # when indexing the second one.
        data_2 = core.index(data_2, count_1)
        if original_length_2 is None:
            original_length_2 = len(data_2)
        self.sampled_records_2 = Sample(data_2, 600, original_length_2)

        learner = self.ActiveLearner(self.data_model)
        self.active_learner = learner
        self.active_learner.sample_product(
            data_1, data_2, blocked_proportion, sample_size)
Beispiel #6
0
    def sample(self, data_1, data_2, sample_size=150000,
               blocked_proportion=.5):
        '''
        Draws a sample of record pairs across the first and second
        datasets (blocked pairs plus random pairs) and loads it with
        ``self._loadSample`` to initialise active learning.

        Arguments:

        data_1             -- Dictionary of records from first dataset,
                              where the keys are record_ids and the
                              values are dictionaries with the keys
                              being field names
        data_2             -- Dictionary of records from second dataset,
                              same form as data_1
        sample_size        -- Size of the sample to draw
        blocked_proportion -- Fraction of sample_size requested from
                              the blocked sampler

        Raises:

        ValueError         -- if data_1 or data_2 is empty
        '''
        if len(data_1) == 0:
            raise ValueError(
                'Dictionary of records from first dataset is empty.')
        if len(data_2) == 0:
            raise ValueError(
                'Dictionary of records from second dataset is empty.')

        # From here on data_1 is whichever dataset has fewer records.
        if len(data_1) > len(data_2):
            data_1, data_2 = data_2, data_1

        data_1 = core.index(data_1)
        self.sampled_records_1 = Sample(data_1, 500)

        # The second dataset is indexed with the size of the indexed
        # first dataset as its offset.
        offset = len(data_1)
        data_2 = core.index(data_2, offset)
        self.sampled_records_2 = Sample(data_2, 500)

        n_blocked = int(blocked_proportion * sample_size)
        predicates = list(self.data_model.predicates(index_predicates=False))

        deque_1 = sampling.randomDeque(data_1)
        deque_2 = sampling.randomDeque(data_2)

        blocked_keys = sampling.linkBlockedSample(
            n_blocked, predicates, deque_1, deque_2)

        # Whatever the blocked sampler did not deliver is requested as
        # random pairs.
        n_random = sample_size - len(blocked_keys)
        random_pairs = core.randomPairsMatch(
            len(deque_1), len(deque_2), n_random)

        # Shift the second element by the offset used for data_2.
        random_keys = {(left, right + offset) for left, right in random_pairs}

        sampled_keys = blocked_keys | random_keys
        record_pairs = ((data_1[left_key], data_2[right_key])
                        for left_key, right_key in sampled_keys)

        self._loadSample(core.freezeData(record_pairs))
Beispiel #7
0
    def sample(self,
               data_1,
               data_2,
               sample_size=150000,
               blocked_proportion=.5):
        '''
        Draws a sample of record pairs across the first and second
        datasets (blocked pairs plus random pairs) and loads it with
        ``self._loadSample`` to initialise active learning.

        Arguments:

        data_1             -- Dictionary of records from first dataset,
                              where the keys are record_ids and the
                              values are dictionaries with the keys
                              being field names
        data_2             -- Dictionary of records from second dataset,
                              same form as data_1
        sample_size        -- Size of the sample to draw
        blocked_proportion -- Fraction of sample_size requested from
                              the blocked sampler

        Raises:

        ValueError         -- if data_1 or data_2 is empty
        '''
        # Fail early with a clear message instead of letting an empty
        # dataset reach the sampling helpers.
        if len(data_1) == 0:
            raise ValueError(
                'Dictionary of records from first dataset is empty.')
        if len(data_2) == 0:
            raise ValueError(
                'Dictionary of records from second dataset is empty.')

        # From here on data_1 is whichever dataset has fewer records.
        if len(data_1) > len(data_2):
            data_1, data_2 = data_2, data_1

        data_1 = core.index(data_1)

        # The second dataset is indexed with the size of the indexed
        # first dataset as its offset.
        offset = len(data_1)
        data_2 = core.index(data_2, offset)

        blocked_sample_size = int(blocked_proportion * sample_size)
        predicates = [
            pred for pred in predicateGenerator(self.data_model)
            if pred.type == 'SimplePredicate'
        ]

        data_1 = sampling.randomDeque(data_1)
        data_2 = sampling.randomDeque(data_2)

        blocked_sample_keys = sampling.linkBlockedSample(
            blocked_sample_size, predicates, data_1, data_2)

        # Whatever the blocked sampler did not deliver is requested as
        # random pairs.
        random_sample_size = sample_size - len(blocked_sample_keys)
        random_sample_keys = dedupe.core.randomPairsMatch(
            len(data_1), len(data_2), random_sample_size)

        # Shift the second element by the offset used for data_2.
        random_sample_keys = {(a, b + offset) for a, b in random_sample_keys}

        # Back to dicts so the sampled keys can be looked up.
        data_1 = dict(data_1)
        data_2 = dict(data_2)

        data_sample = ((data_1[k1], data_2[k2])
                       for k1, k2 in blocked_sample_keys | random_sample_keys)

        data_sample = core.freezeData(data_sample)

        self._loadSample(data_sample)
Beispiel #8
0
    def sample(self, data_1, data_2, sample_size=150000, 
               blocked_proportion=.5) :
        '''
        Draws a sample of record pairs across the first and second
        datasets (blocked pairs plus random pairs) and loads it with
        ``self._loadSample`` to initialise active learning.

        Arguments:

        data_1             -- Dictionary of records from first dataset,
                              where the keys are record_ids and the
                              values are dictionaries with the keys
                              being field names
        data_2             -- Dictionary of records from second dataset,
                              same form as data_1
        sample_size        -- Size of the sample to draw
        blocked_proportion -- Fraction of sample_size requested from
                              the blocked sampler

        Raises:

        ValueError         -- if data_1 or data_2 is empty
        '''
        # Reject empty datasets up front with an explicit error rather
        # than passing them on to the sampling helpers.
        if len(data_1) == 0:
            raise ValueError(
                'Dictionary of records from first dataset is empty.')
        if len(data_2) == 0:
            raise ValueError(
                'Dictionary of records from second dataset is empty.')

        # From here on data_1 is whichever dataset has fewer records.
        if len(data_1) > len(data_2):
            data_1, data_2 = data_2, data_1

        data_1 = core.index(data_1)

        # The second dataset is indexed with the size of the indexed
        # first dataset as its offset.
        offset = len(data_1)
        data_2 = core.index(data_2, offset)

        blocked_sample_size = int(blocked_proportion * sample_size)
        predicates = [pred for pred in predicateGenerator(self.data_model)
                      if pred.type == 'SimplePredicate']

        data_1 = sampling.randomDeque(data_1)
        data_2 = sampling.randomDeque(data_2)

        blocked_sample_keys = sampling.linkBlockedSample(blocked_sample_size,
                                                         predicates,
                                                         data_1,
                                                         data_2)

        # Whatever the blocked sampler did not deliver is requested as
        # random pairs.
        random_sample_size = sample_size - len(blocked_sample_keys)
        random_sample_keys = dedupe.core.randomPairsMatch(len(data_1),
                                                          len(data_2),
                                                          random_sample_size)

        # Shift the second element by the offset used for data_2.
        random_sample_keys = {(a, b + offset)
                              for a, b in random_sample_keys}

        # Back to dicts so the sampled keys can be looked up.
        data_1 = dict(data_1)
        data_2 = dict(data_2)

        data_sample = ((data_1[k1], data_2[k2])
                       for k1, k2
                       in blocked_sample_keys | random_sample_keys)

        data_sample = core.freezeData(data_sample)

        self._loadSample(data_sample)
Beispiel #9
0
    def __init__(self, distances, data, blocked_proportion, sample_size,
                 original_length, index_include):
        '''
        Set up the learner for a single dataset and seed it with labels.

        The data is indexed, candidate pairs are drawn with the parent
        class's ``sample``, a ``DedupeBlockLearner`` is built and five
        initial labels are recorded through ``self.mark``.

        Arguments:

        distances          -- stored on the instance and given to the
                              block learner
        data               -- records to deduplicate
        blocked_proportion -- forwarded unchanged to the parent ``sample``
        sample_size        -- forwarded unchanged to the parent ``sample``
        original_length    -- forwarded unchanged to the block learner
        index_include      -- list of pairs; a copy extended with one
                              extra pair is given to the block learner
        '''
        logger.debug("Initializing DedupeDisagreementLearner class")
        self.distances = distances
        logger.debug(
            f"labeler.DedupeDisagreementLearner distances type: {type(distances)}"
        )
        logger.debug(
            f"labeler.DedupeDisagreementLearner self.distances type: {type(self.distances)}"
        )
        indexed = core.index(data)

        self.candidates = super().sample(
            indexed, blocked_proportion, sample_size)

        # Pick one candidate pair at random and pair its first record
        # with itself.
        seed_pair = random.choice(self.candidates)
        first_record = seed_pair[0]
        self_pair = (first_record, first_record)

        # Extend a copy so the caller's list is left untouched.
        pairs_to_include = index_include.copy()
        pairs_to_include.append(self_pair)

        self.blocker = DedupeBlockLearner(
            distances, self.candidates, indexed, original_length,
            pairs_to_include)

        self._common_init()
        logger.debug("Initializing with 5 random values")

        # Four copies of the self pair labelled 1, the random pair
        # labelled 0.
        initial_pairs = [self_pair] * 4 + [seed_pair]
        initial_labels = [1] * 4 + [0]
        self.mark(initial_pairs, initial_labels)
Beispiel #10
0
    def __init__(self,
                 data_model,
                 data,
                 blocked_proportion,
                 sample_size,
                 index_include):
        '''
        Set up the learner for a single dataset and seed it with labels.

        The data is indexed, candidate pairs are drawn with
        ``self._sample``, a ``DedupeBlockLearner`` and an ``RLRLearner``
        are created, and five initial labels are recorded through
        ``self.mark``.

        Arguments:

        data_model         -- stored on the instance; given to the block
                              learner and the classifier
        data               -- records to deduplicate
        blocked_proportion -- forwarded unchanged to ``self._sample``
        sample_size        -- forwarded unchanged to ``self._sample``
        index_include      -- list of pairs; a copy extended with one
                              extra pair is given to the block learner
        '''
        self.data_model = data_model

        indexed = core.index(data)

        self.candidates = self._sample(
            indexed, blocked_proportion, sample_size)

        # Pick one candidate pair at random and pair its first record
        # with itself.
        seed_pair = random.choice(self.candidates)
        first_record = seed_pair[0]
        self_pair = (first_record, first_record)

        # Extend a copy so the caller's list is left untouched.
        pairs_to_include = index_include.copy()
        pairs_to_include.append(self_pair)

        self.blocker = DedupeBlockLearner(
            data_model, self.candidates, indexed, pairs_to_include)
        self.classifier = RLRLearner(self.data_model)
        self.classifier.candidates = self.candidates

        self._common_init()

        # Four copies of the self pair labelled 1, the random pair
        # labelled 0.
        initial_pairs = [self_pair] * 4 + [seed_pair]
        initial_labels = [1] * 4 + [0]
        self.mark(initial_pairs, initial_labels)
Beispiel #11
0
    def sample(self, data, sample_size=15000,
               blocked_proportion=0.5, original_length=None):
        '''Prepare active learning from a single dataset.

        The data is checked with ``self._checkData`` and indexed, a
        ``Sample`` of it is stored on the instance, and a freshly
        created active learner draws record pairs via ``sample_combo``.

        Arguments:

        data                -- Dictionary of records, where the keys are
                               record_ids and the values are dictionaries
                               with the keys being field names
        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        original_length     -- Length of original data, should be set if
                               `data` is a sample of full data; defaults
                               to the number of indexed records
        '''
        self._checkData(data)

        data = core.index(data)

        length = len(data) if original_length is None else original_length
        self.sampled_records = Sample(data, 2000, length)

        self.active_learner = self.ActiveLearner(self.data_model)
        self.active_learner.sample_combo(
            data, blocked_proportion, sample_size)
Beispiel #12
0
    def sample(self,
               data,
               sample_size=15000,
               blocked_proportion=0.5,
               original_length=None):
        '''Prepare active learning from a single dataset.

        Once ``self._checkData`` has accepted the data it is indexed, a
        ``Sample`` of it is kept on the instance, and a new active
        learner draws record pairs through ``sample_combo``.

        Arguments:

        data                -- Dictionary of records, where the keys are
                               record_ids and the values are dictionaries
                               with the keys being field names
        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        original_length     -- Length of original data, should be set if
                               `data` is a sample of full data; when None
                               the number of indexed records is used
        '''
        self._checkData(data)

        data = core.index(data)

        # Fall back to the indexed record count when the caller did
        # not supply the original length.
        if original_length is None:
            original_length = len(data)
        self.sampled_records = Sample(data, 2000, original_length)

        learner = self.ActiveLearner(self.data_model)
        self.active_learner = learner
        self.active_learner.sample_combo(
            data, blocked_proportion, sample_size)
Beispiel #13
0
    def sample(self, data, sample_size=15000,
               blocked_proportion=0.5):
        '''Draw record pairs from the dataset (blocked pairs plus
        random pairs) and load them with ``self._loadSample`` to
        initialise active learning.

        Arguments:

        data                -- Dictionary of records, where the keys are
                               record_ids and the values are dictionaries
                               with the keys being field names
        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        '''
        data = core.index(data)
        self.sampled_records = Sample(data, 900)

        n_blocked = int(blocked_proportion * sample_size)
        predicates = list(self.data_model.predicates(index_predicates=False))

        # `data` is rebound at each step so the previous representation
        # is not kept referenced by this method.
        data = sampling.randomDeque(data)
        blocked_keys = sampling.dedupeBlockedSample(
            n_blocked, predicates, data)

        # Whatever the blocked sampler did not deliver is requested as
        # random pairs.
        n_random = sample_size - len(blocked_keys)
        random_keys = set(core.randomPairs(len(data), n_random))

        # Back to a dict so the sampled keys can be looked up.
        data = dict(data)

        sampled_keys = blocked_keys | random_keys
        record_pairs = [(data[first_key], data[second_key])
                        for first_key, second_key in sampled_keys]

        self._loadSample(record_pairs)
Beispiel #14
0
    def sample(self, data, sample_size=15000, blocked_proportion=0.5):
        '''Draw record pairs from the dataset (blocked pairs plus
        random pairs), freeze them with ``core.freezeData`` and load
        them with ``self._loadSample`` to initialise active learning.

        Arguments:

        data                -- Dictionary of records, where the keys are
                               record_ids and the values are dictionaries
                               with the keys being field names
        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        '''
        data = core.index(data)
        self.sampled_records = Sample(data, 900)

        n_blocked = int(blocked_proportion * sample_size)
        predicates = list(self.data_model.predicates(index_predicates=False))

        # `data` is rebound at each step so the previous representation
        # is not kept referenced by this method.
        data = sampling.randomDeque(data)
        blocked_keys = sampling.dedupeBlockedSample(
            n_blocked, predicates, data)

        # Whatever the blocked sampler did not deliver is requested as
        # random pairs.
        n_random = sample_size - len(blocked_keys)
        random_keys = set(core.randomPairs(len(data), n_random))

        # Back to a dict so the sampled keys can be looked up.
        data = dict(data)

        sampled_keys = blocked_keys | random_keys
        record_pairs = [(data[first_key], data[second_key])
                        for first_key, second_key in sampled_keys]

        self._loadSample(core.freezeData(record_pairs))
Beispiel #15
0
    def __init__(self,
                 distances,
                 data,
                 blocked_proportion,
                 sample_size,
                 original_length,
                 index_include):
        '''
        Set up the learner for a single dataset and seed it with labels.

        A ``DedupeSampler`` draws candidate pairs from the indexed
        data, a ``BlockLearner`` is built on them and five initial
        labels are recorded through ``self.mark``.

        Arguments:

        distances          -- stored on the instance; given to the
                              sampler and the block learner
        data               -- records to deduplicate
        blocked_proportion -- forwarded unchanged to the sampler
        sample_size        -- forwarded unchanged to the sampler
        original_length    -- forwarded unchanged to the block learner
        index_include      -- list of pairs; a copy extended with one
                              extra pair is given to the block learner
        '''
        self.distances = distances
        self.sampler = DedupeSampler(distances)
        indexed = core.index(data)

        self.candidates = self.sampler.sample(
            indexed, blocked_proportion, sample_size)
        logger.info(f"self.candidates: {len(self.candidates)}")

        # Pick one candidate pair at random and pair its first record
        # with itself.
        seed_pair = random.choice(self.candidates)
        first_record = seed_pair[0]
        self_pair = (first_record, first_record)

        # Extend a copy so the caller's list is left untouched.
        pairs_to_include = index_include.copy()
        pairs_to_include.append(self_pair)

        self.block_learner = BlockLearner(
            distances, self.candidates, indexed, original_length,
            pairs_to_include)

        self._common_init()
        logger.debug("Initializing with 5 random values")

        # Four copies of the self pair labelled 1, the random pair
        # labelled 0.
        initial_pairs = [self_pair] * 4 + [seed_pair]
        initial_labels = [1] * 4 + [0]
        self.mark(initial_pairs, initial_labels)
Beispiel #16
0
    def __init__(self, data_model, data_1, data_2, blocked_proportion,
                 sample_size, original_length_1, original_length_2):
        '''
        Set up the learner for a pair of datasets.

        The datasets are indexed, the parent class's ``sample`` draws
        the candidate pairs and a ``RecordLinkBlockLearner`` is created
        from them.

        Arguments:

        data_model         -- stored on the instance and handed to the
                              block learner
        data_1, data_2     -- records of the first and second dataset
        blocked_proportion -- forwarded unchanged to the parent ``sample``
        sample_size        -- forwarded unchanged to the parent ``sample``
        original_length_1,
        original_length_2  -- forwarded unchanged to the block learner
        '''
        self.data_model = data_model

        first = core.index(data_1)
        # The size of the indexed first dataset is the offset used
        # when indexing the second one.
        second = core.index(data_2, len(first))

        self.candidates = super().sample(
            first, second, blocked_proportion, sample_size)

        self.blocker = RecordLinkBlockLearner(
            data_model,
            self.candidates,
            first,
            second,
            original_length_1,
            original_length_2,
        )

        self._common_init()
Beispiel #17
0
    def __init__(self, data_model, data, blocked_proportion, sample_size,
                 original_length):
        '''
        Set up the learner for a single dataset.

        The data is indexed, the parent class's ``sample`` draws the
        candidate pairs and a ``DedupeBlockLearner`` is created from
        them.

        Arguments:

        data_model         -- stored on the instance and handed to the
                              block learner
        data               -- records to deduplicate
        blocked_proportion -- forwarded unchanged to the parent ``sample``
        sample_size        -- forwarded unchanged to the parent ``sample``
        original_length    -- forwarded unchanged to the block learner
        '''
        self.data_model = data_model

        indexed = core.index(data)

        self.candidates = super().sample(
            indexed, blocked_proportion, sample_size)

        self.blocker = DedupeBlockLearner(
            data_model, self.candidates, indexed, original_length)

        self._common_init()
Beispiel #18
0
    def sample_combo(self, data, blocked_proportion,
                     sample_size, original_length=None):
        '''
        Draw candidate pairs from a single dataset and initialise the
        classifier and the blocker with them.

        Arguments:

        data               -- records to deduplicate
        blocked_proportion -- forwarded unchanged to the parent
                              ``sample_combo``
        sample_size        -- forwarded unchanged to the parent
                              ``sample_combo``
        original_length    -- passed as given (including None) to
                              ``Sample``

        Returns the ``Sample`` built from the indexed data.
        '''
        indexed = core.index(data)

        self.candidates = super().sample_combo(
            indexed, blocked_proportion, sample_size)

        self.classifier._init(self.candidates)

        record_sample = Sample(indexed, 2000, original_length)

        # The blocker gets both the sample and the full indexed data.
        self.blocker._init_combo(self.candidates, record_sample, indexed)

        return record_sample
Beispiel #19
0
    def __init__(self,
                 data_model,
                 data,
                 blocked_proportion,
                 sample_size,
                 original_length):
        '''
        Set up the learner for a single dataset.

        Indexes the data, lets the parent class's ``sample`` draw the
        candidate pairs and builds a ``DedupeBlockLearner`` on them.

        Arguments:

        data_model         -- stored on the instance and handed to the
                              block learner
        data               -- records to deduplicate
        blocked_proportion -- forwarded unchanged to the parent ``sample``
        sample_size        -- forwarded unchanged to the parent ``sample``
        original_length    -- forwarded unchanged to the block learner
        '''
        self.data_model = data_model

        records = core.index(data)

        self.candidates = super().sample(
            records, blocked_proportion, sample_size)

        self.blocker = DedupeBlockLearner(
            data_model, self.candidates, records, original_length)

        self._common_init()
Beispiel #20
0
    def sample(self, data, sample_size=15000, 
               blocked_proportion=0.5) :
        '''Draw record pairs from the dataset (blocked pairs plus
        random pairs), freeze them with ``core.freezeData`` and load
        them with ``self._loadSample`` to initialise active learning.

        Arguments:

        data                -- Dictionary of records, where the keys are
                               record_ids and the values are dictionaries
                               with the keys being field names
        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        '''
        data = core.index(data)

        n_blocked = int(blocked_proportion * sample_size)
        predicates = list(predicateGenerator(self.data_model,
                                             index_predicates=False,
                                             canopies=self.canopies))

        # `data` is rebound at each step so the previous representation
        # is not kept referenced by this method.
        data = sampling.randomDeque(data)
        blocked_keys = sampling.dedupeBlockedSample(
            n_blocked, predicates, data)

        # Whatever the blocked sampler did not deliver is requested as
        # random pairs.
        n_random = sample_size - len(blocked_keys)
        random_keys = set(dedupe.core.randomPairs(len(data), n_random))

        # Back to a dict so the sampled keys can be looked up.
        data = dict(data)

        sampled_keys = blocked_keys | random_keys
        record_pairs = [(data[first_key], data[second_key])
                        for first_key, second_key in sampled_keys]

        record_pairs = core.freezeData(record_pairs)

        # data can be a very large object, so drop this reference
        # before the sample is loaded
        del data

        self._loadSample(record_pairs)
Beispiel #21
0
    def sample(self, data, sample_size=15000, 
               blocked_proportion=0.5) :
        '''Draw record pairs from the dataset (blocked pairs plus
        random pairs), freeze them with ``core.freezeData`` and load
        them with ``self._loadSample`` to initialise active learning.

        Only predicates whose ``type`` is 'SimplePredicate' are used
        for the blocked part of the sample.

        Arguments:

        data                -- Dictionary of records, where the keys are
                               record_ids and the values are dictionaries
                               with the keys being field names
        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        '''
        data = core.index(data)

        n_blocked = int(blocked_proportion * sample_size)
        predicates = [predicate
                      for predicate in predicateGenerator(self.data_model)
                      if predicate.type == 'SimplePredicate']

        # `data` is rebound at each step so the previous representation
        # is not kept referenced by this method.
        data = sampling.randomDeque(data)
        blocked_keys = sampling.dedupeBlockedSample(
            n_blocked, predicates, data)

        # Whatever the blocked sampler did not deliver is requested as
        # random pairs.
        n_random = sample_size - len(blocked_keys)
        random_keys = set(dedupe.core.randomPairs(len(data), n_random))

        # Back to a dict so the sampled keys can be looked up.
        data = dict(data)

        sampled_keys = blocked_keys | random_keys
        record_pairs = ((data[first_key], data[second_key])
                        for first_key, second_key in sampled_keys)

        self._loadSample(core.freezeData(record_pairs))