Python Enumerator Examples

Programming Language: Python

Namespace/Package Name: dedupe.core

Method/Function: Enumerator

Examples at hotexamples.com: 3

Python Enumerator - 3 examples found. These are the top-rated real-world Python examples of dedupe.core.Enumerator extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: learner.py Project: ahill187/dedupe

    def covered_pairs(fingerprinter, records):
        """
        Tally, per predicate, the record pairs that end up in a shared block.

        A predicate (see dedupe.predicates.py) is tied to one field and one
        predicate type, and is callable on a record; calling it yields the
        block keys that record falls into.

        Outline:

            For every predicate, call it on every record and group the
            record ids by the block keys it returns. Every 2-combination
            of ids inside a block is a covered pair.

        Predicates are left out of the result when they produce no blocks
        at all, or when their largest block holds as many ids as there are
        records.

        Args:
            fingerprinter: (blocking.Fingerprinter) supplies ``predicates``
            records: (dict)[dict] Records dictionary, keyed by record id

        Returns:
            cover: (dict) {
                key: (dedupe.predicates class)
                value: (dedupe.training.Counter) built from the enumerated
                    pairs covered by that predicate
            }
        """
        # One enumerator is shared by all predicates, so a given pair is
        # looked up through the same object regardless of the predicate.
        pair_ids = core.Enumerator()
        total_records = len(records)
        logger.info(f"fingerprint predicates: {len(fingerprinter.predicates)}")

        coverage = {}
        for predicate in fingerprinter.predicates:
            # block key -> set of record ids that produced that key
            members_by_block = collections.defaultdict(set)
            for record_id, record in records.items():
                for block_key in predicate(record):
                    members_by_block[block_key].add(record_id)

            if not members_by_block:
                continue

            # Skip predicates whose biggest block contains every record.
            largest_block = max(map(len, members_by_block.values()))
            if largest_block == total_records:
                continue

            # Ids are sorted before pairing so each pair is always
            # presented to the enumerator in the same order.
            coverage[predicate] = Counter(
                pair_ids[pair]
                for members in members_by_block.values()
                for pair in itertools.combinations(sorted(members), 2)
            )

        return coverage

Example #2

Show file

    def _loadIndices(self, settings_file):
        """
        Restore predicate index state from an open settings file.

        Three objects are unpickled from ``settings_file`` in this order:
        canopies, indices and doc-to-id mappings, each looked up by
        predicate. Every predicate found inside ``self.predicates`` that has
        an ``index`` attribute currently set to ``None`` gets a fresh index
        from ``initIndex()`` and has its stored state reattached. When the
        loop finishes, ``self.loaded_indices`` is set to ``True``.

        Args:
            settings_file: file-like object positioned at the first of the
                three pickled objects.
        """
        # NOTE(review): pickle.load can execute arbitrary code embedded in
        # the stream; only use settings files from a trusted source.
        canopies = pickle.load(settings_file)
        indices = pickle.load(settings_file)
        doc_to_ids = pickle.load(settings_file)

        for compound_predicate in self.predicates:
            for predicate in compound_predicate:
                # Only index-bearing predicates that have not been
                # initialised yet need restoring.
                if not hasattr(predicate, "index") or predicate.index is not None:
                    continue

                predicate.index = predicate.initIndex()

                # Rebuild the doc -> id enumerator so that it starts
                # counting just past the highest id that was stored.
                # NOTE(review): max() raises ValueError if the stored
                # mapping is empty - confirm that cannot happen.
                stored_ids = doc_to_ids[predicate]
                next_id = max(stored_ids.values()) + 1
                predicate.index._doc_to_id = core.Enumerator(next_id,
                                                             stored_ids)

                if hasattr(predicate, "canopy"):
                    predicate.canopy = canopies[predicate]
                    continue

                # Predicates without a canopy take the stored index when
                # one exists; a missing entry is silently ignored.
                try:
                    predicate.index._index = indices[predicate]
                except KeyError:
                    pass

        self.loaded_indices = True

Example #3

Show file

    def _blockData(self, data_d):
        """
        Yield every block that holds more than one record.

        The blocker is run over the items of ``data_d`` and its output is
        grouped by the second element of each emitted entry, which is used
        as the record id. The first element of each entry is the block key,
        which is turned into a block number through a ``core.Enumerator``.

        Each yielded block is a list of ``(record_id, record, earlier_ids)``
        tuples, where ``earlier_ids`` is the set of this record's block
        numbers that sort before the number of the block being filled.

        If ``self.loaded_indices`` is falsy, ``self.blocker.indexAll`` is
        called on ``data_d`` before blocking.

        Args:
            data_d: mapping of record id -> record.
        """
        def entry_record_id(entry):
            return entry[1]

        if not self.loaded_indices:
            self.blocker.indexAll(data_d)

        block_numbers = core.Enumerator(start=0)
        # NOTE(review): groupby only merges consecutive entries, so this
        # assumes the blocker emits all entries of one record back to back
        # - TODO confirm.
        grouped = itertools.groupby(self.blocker(viewitems(data_d)),
                                    entry_record_id)

        members = defaultdict(list)
        for record_id, entries in grouped:
            record = data_d[record_id]
            block_ids = sorted(block_numbers[block_key]
                               for block_key, _ in entries)
            # Walk from the highest block number down; each entry carries
            # the set of this record's block numbers that sort before it.
            for position in reversed(range(len(block_ids))):
                members[block_ids[position]].append(
                    (record_id, record, set(block_ids[:position])))

        for group in viewvalues(members):
            if len(group) <= 1:
                continue
            yield group