Example #1
0
    def covered_pairs(fingerprinter, records):
        """
        Tally, per predicate, the record pairs that land in a common block.

        Each predicate in ``fingerprinter.predicates`` is called on every
        record and returns the block keys for that record. Records sharing a
        block key are grouped, and every 2-combination of the (sorted) record
        ids in a group is looked up in a shared ``core.Enumerator`` before
        being handed to ``Counter``.

        Two kinds of predicate are left out of the result:

            * predicates that produce no block keys for any record, and
            * predicates for which some single block holds every record.

        Args:
            fingerprinter: (blocking.Fingerprinter) object exposing a
                ``predicates`` iterable of callables.
            records: (dict)[dict] Records keyed by record id.

        Returns:
            cover: (dict) {
                key: predicate (per the original notes, a class from
                    dedupe.predicates)
                value: Counter built from the enumerated pairs (per the
                    original notes, dedupe.training.Counter)
            }
        """
        cover = {}

        # One enumerator for all predicates, so the same pair of record ids
        # is looked up under the same key no matter which predicate found it.
        pair_ids = core.Enumerator()
        total_records = len(records)
        logger.info(f"fingerprint predicates: {len(fingerprinter.predicates)}")

        for predicate in fingerprinter.predicates:
            members_by_block = collections.defaultdict(set)
            for record_id, record in records.items():
                for block_key in predicate(record):
                    members_by_block[block_key].add(record_id)

            # The predicate never fired: nothing to count.
            if not members_by_block:
                continue

            # A block containing every record: skip the whole predicate.
            largest_block = max(map(len, members_by_block.values()))
            if largest_block == total_records:
                continue

            cover[predicate] = Counter(
                pair_ids[pair]
                for members in members_by_block.values()
                for pair in itertools.combinations(sorted(members), 2)
            )

        return cover
Example #2
0
    def _loadIndices(self, settings_file):
        canopies = pickle.load(settings_file)
        indices = pickle.load(settings_file)
        doc_to_ids = pickle.load(settings_file)

        for full_predicate in self.predicates:
            for predicate in full_predicate:
                if hasattr(predicate, "index") and predicate.index is None:
                    predicate.index = predicate.initIndex()
                    max_id = max(doc_to_ids[predicate].values())
                    predicate.index._doc_to_id = core.Enumerator(
                        max_id + 1, doc_to_ids[predicate])

                    if hasattr(predicate, "canopy"):
                        predicate.canopy = canopies[predicate]
                    else:
                        try:
                            predicate.index._index = indices[predicate]
                        except KeyError:
                            pass

        self.loaded_indices = True
Example #3
0
    def _blockData(self, data_d):
        """
        Yield the blocks that contain more than one record.

        ``self.blocker`` is run over the items of ``data_d`` and is consumed
        as a stream of ``(block_key, record_id)`` pairs. Each distinct block
        key is looked up in a ``core.Enumerator`` to get a block id. A record
        is appended to every block it belongs to as a tuple
        ``(record_id, record, smaller_ids)``, where ``smaller_ids`` is the
        set of that record's other block ids that sort before the current
        one (presumably so a consumer can tell when a pair was already seen
        in an earlier block -- confirm against the caller).

        If ``self.loaded_indices`` is false, ``self.blocker.indexAll`` is
        called on ``data_d`` first.

        NOTE: ``itertools.groupby`` only merges adjacent items, so this
        relies on the blocker emitting all pairs for a record consecutively.

        Args:
            data_d: mapping of record id to record.

        Yields:
            list of ``(record_id, record, smaller_ids)`` tuples, one list per
            block holding at least two entries.
        """
        grouped = defaultdict(list)

        if not self.loaded_indices:
            self.blocker.indexAll(data_d)

        block_numbers = core.Enumerator(start=0)
        keyed_records = self.blocker(viewitems(data_d))

        for record_id, keyed in itertools.groupby(keyed_records,
                                                  lambda pair: pair[1]):
            record = data_d[record_id]
            ids = sorted(block_numbers[block_key] for block_key, _ in keyed)

            # Walk from the largest block id down; everything to the left of
            # the current position is the set of smaller ids for this record.
            for position in range(len(ids) - 1, -1, -1):
                grouped[ids[position]].append(
                    (record_id, record, set(ids[:position])))

        for members in viewvalues(grouped):
            if len(members) > 1:
                yield members