def _blockData(self, data_d):
    """Group the records of ``data_d`` into blocks and yield each block.

    Every ``(block_key, record_id)`` pair produced by ``self.blocker`` is
    accumulated in a temporary shelve keyed by block key.  Only blocks
    holding more than one record are yielded.

    Args:
        data_d: mapping of record id to record.

    Yields:
        A list of ``(record_id, record, smaller_keys)`` tuples for one
        block, where ``smaller_keys`` is the set of the record's other
        block keys that sort before this block's key.
    """
    blocks = core.TempShelve('blocks')

    # try/finally so the temporary shelve is closed even when the consumer
    # abandons the generator early or blocking raises part-way through.
    try:
        if not self.loaded_indices:
            self.blocker.indexAll(data_d)

        # NOTE(review): groupby only merges *consecutive* items, so this
        # assumes the blocker emits all keys of one record contiguously.
        block_groups = itertools.groupby(self.blocker(viewitems(data_d)),
                                         lambda x: x[1])

        for record_id, block in block_groups:
            record = data_d[record_id]
            block_ids = sorted(block_key for block_key, _ in block)

            # Pop keys from largest to smallest; what remains in block_ids
            # after each pop is exactly the keys smaller than the popped one.
            while block_ids:
                block_id = block_ids.pop()
                entry = (record_id, record, set(block_ids))
                if block_id in blocks:
                    blocks[block_id] += [entry]
                else:
                    blocks[block_id] = [entry]

        if not self.loaded_indices:
            self.blocker.resetIndices()

        for block in viewvalues(blocks):
            # A block with a single record has nothing to compare against.
            if len(block) > 1:
                yield block
    finally:
        blocks.close()
def _blockData(self, data_1, data_2):
    """Block ``data_2`` by key, then yield the blocks built against ``data_1``.

    The records of ``data_2`` are grouped into a temporary shelve that maps
    each block key to a ``{record_id: record}`` dict.  That mapping and
    ``data_1`` are then handed to ``self._blockGenerator``, whose items are
    yielded unchanged.

    Args:
        data_1: first mapping of record id to record; passed through to
            ``self._blockGenerator``.
        data_2: second mapping of record id to record; indexed (unless
            indices are already loaded) and grouped by block key.

    Yields:
        Each item produced by ``self._blockGenerator(data_1, blocked_records)``.
    """
    blocked_records = core.TempShelve('blocked_records')

    # try/finally so the temporary shelve is closed even when the consumer
    # abandons the generator early or blocking raises part-way through.
    try:
        if not self.loaded_indices:
            self.blocker.indexAll(data_2)

        for block_key, record_id in self.blocker(data_2.items(), target=True):
            # Read, modify, then assign back: presumably the shelve-backed
            # store does not persist in-place mutation of a fetched value.
            block = blocked_records.get(block_key, {})
            block[record_id] = data_2[record_id]
            blocked_records[block_key] = block

        for each in self._blockGenerator(data_1, blocked_records):
            yield each
    finally:
        blocked_records.close()