def covered_pairs(fingerprinter, records):
    """
    Count, for every usable predicate, the record pairs it blocks together.

    Each predicate in ``fingerprinter.predicates`` is called on every
    record and returns zero or more block keys. Records that share a
    block key under the same predicate are covered together by it.

    A predicate is left out of the result when it produces no block keys
    at all, or when its largest block contains every record.

    Args:
        fingerprinter: (blocking.Fingerprinter) exposes ``predicates``
        records: (dict)[dict] Records dictionary, record id -> record

    Returns:
        cover: (dict) {
            key: predicate
            value: Counter built from the enumerated covered pairs
        }
    """
    cover = {}
    # Shared across predicates; presumably hands out one stable key per
    # distinct pair of record ids -- confirm against core.Enumerator.
    pair_ids = core.Enumerator()
    total_records = len(records)
    logger.info(f"fingerprint predicates: {len(fingerprinter.predicates)}")

    def enumerated_pairs(groups):
        # Every unordered pair inside each group; sorting the members
        # makes (a, b) and (b, a) resolve to the same tuple.
        for members in groups:
            for pair in itertools.combinations(sorted(members), 2):
                yield pair_ids[pair]

    for predicate in fingerprinter.predicates:
        members_by_block = collections.defaultdict(set)
        for record_id, record in records.items():
            for block_key in predicate(record):
                members_by_block[block_key].add(record_id)

        # The predicate never fired on any record.
        if not members_by_block:
            continue

        # One block holds every record, so the predicate is skipped.
        if max(map(len, members_by_block.values())) == total_records:
            continue

        cover[predicate] = Counter(enumerated_pairs(members_by_block.values()))

    return cover
def _loadIndices(self, settings_file):
    """
    Restore predicate indices from an open settings file.

    Three objects are unpickled from ``settings_file`` in this order:
    canopies, indices and doc-to-id maps. Each is looked up by predicate.
    Only predicates that have an ``index`` attribute currently set to
    ``None`` are touched. ``self.loaded_indices`` is set to ``True`` at
    the end.

    Args:
        settings_file: binary file object positioned at the first of the
            three pickled objects

    NOTE(review): ``pickle.load`` can execute arbitrary code, so the
    settings file must come from a trusted source -- confirm with callers.
    """
    # Generator evaluation keeps the three loads in file order.
    canopies, indices, doc_to_ids = (
        pickle.load(settings_file) for _ in range(3)
    )

    for compound in self.predicates:
        for predicate in compound:
            # Skip predicates without an index, or with one already built.
            if not hasattr(predicate, "index") or predicate.index is not None:
                continue

            predicate.index = predicate.initIndex()

            # Start the enumerator one past the largest stored id.
            id_map = doc_to_ids[predicate]
            next_id = max(id_map.values()) + 1
            predicate.index._doc_to_id = core.Enumerator(next_id, id_map)

            if hasattr(predicate, "canopy"):
                predicate.canopy = canopies[predicate]
                continue

            # Best effort: a predicate with no stored index keeps the
            # freshly initialised one.
            try:
                predicate.index._index = indices[predicate]
            except KeyError:
                pass

    self.loaded_indices = True
def _blockData(self, data_d):
    """
    Lazily yield every block that holds more than one record.

    ``self.blocker`` is called on the items of ``data_d`` and is read as
    a stream of ``(block_key, record_id)`` pairs. Block keys are replaced
    by numbers from a ``core.Enumerator``. Each record is appended to all
    of its blocks as ``(record_id, record, smaller_ids)``, where
    ``smaller_ids`` is the set of that record's block numbers that sort
    below the block being filled.

    If ``self.loaded_indices`` is false, ``self.blocker.indexAll(data_d)``
    runs first. This is a generator, so nothing happens until iteration
    starts.

    NOTE(review): ``itertools.groupby`` only merges consecutive items, so
    this presumably relies on the blocker emitting all keys of a record
    back to back -- confirm.

    Args:
        data_d: mapping of record id to record

    Yields:
        list of ``(record_id, record, smaller_ids)`` tuples, one list per
        block with at least two entries
    """
    if not self.loaded_indices:
        self.blocker.indexAll(data_d)

    def owning_record(blocked_item):
        # Second element of each blocker item is the record id.
        return blocked_item[1]

    numbering = core.Enumerator(start=0)
    members_by_block = defaultdict(list)
    blocked_items = self.blocker(viewitems(data_d))

    for record_id, group in itertools.groupby(blocked_items, owning_record):
        record = data_d[record_id]
        ordered_ids = sorted(numbering[key] for key, _ in group)

        # Walk from the largest block number down; the slice before the
        # current position is exactly the set of smaller block numbers.
        for position in reversed(range(len(ordered_ids))):
            entry = (record_id, record, set(ordered_ids[:position]))
            members_by_block[ordered_ids[position]].append(entry)

    for members in viewvalues(members_by_block):
        if len(members) <= 1:
            continue
        yield members