def __init__(self, data_model, candidates, data, original_length):
    """Build the dedupe block learner and index the candidate examples.

    Args:
        data_model: Forwarded to the parent initializer. ``self.data_model``
            is then asked for its ``predicates()`` (presumably the parent
            stores it under that name -- confirm).
        candidates: Forwarded to the parent initializer.
        data: Records handed to ``Sample`` to build the indexing pool.
        original_length: Passed unchanged as the third argument of both
            ``Sample`` calls.
    """
    super().__init__(data_model, candidates)

    # Two-stage sampling: a pool built from ``data`` (size argument 50000),
    # then a second sample built from that pool (size argument 2000).
    index_pool = Sample(data, 50000, original_length)
    training_sample = Sample(index_pool, 2000, original_length)

    predicate_set = self.data_model.predicates()
    self.block_learner = training.DedupeBlockLearner(
        predicate_set,
        training_sample,
        index_pool,
    )

    # NOTE(review): reads ``self.candidates`` rather than the argument;
    # presumably set by the parent initializer -- confirm.
    self._index_predicates(self.candidates)
def __init__(self, data_model, candidates, data, original_length, index_include):
    """Build the dedupe block learner and index candidates plus extras.

    Args:
        data_model: Forwarded to the parent initializer. ``self.data_model``
            is then asked for its ``predicates()`` (presumably the parent
            stores it under that name -- confirm).
        candidates: Forwarded to the parent initializer; a copy of it seeds
            the collection of examples to index.
        data: Records handed to ``Sample`` to build the indexing pool.
        original_length: Passed unchanged as the third argument of both
            ``Sample`` calls.
        index_include: Optional extra examples; when truthy they are added
            to the copied candidates before indexing.
    """
    super().__init__(data_model, candidates)

    # Two-stage sampling: a pool built from ``data`` (size argument 50000),
    # then a second sample built from that pool (size argument 5000).
    index_pool = Sample(data, 50000, original_length)
    training_sample = Sample(index_pool, 5000, original_length)

    predicate_set = self.data_model.predicates()
    self.block_learner = training.DedupeBlockLearner(
        predicate_set,
        training_sample,
        index_pool,
    )

    # Work on a copy so the in-place ``+=`` never touches the caller's
    # ``candidates`` object.
    to_index = candidates.copy()
    if index_include:
        to_index += index_include

    self._index_predicates(to_index)
def __init__(self, distances, candidates, data, original_length, index_include):
    """Build the dedupe block learner and index candidates plus extras.

    Args:
        distances: Forwarded to the parent initializer. ``self.distances``
            is then asked for its ``predicates()`` (presumably the parent
            stores it under that name -- confirm).
        candidates: Forwarded to the parent initializer; a copy of it seeds
            the collection of examples to index.
        data: Records handed to ``Sample`` to build the indexing pool.
        original_length: Passed unchanged as the third argument of both
            ``Sample`` calls.
        index_include: Optional extra examples; when truthy they are added
            to the copied candidates before indexing.
    """
    logger.debug("Initializing labeler.DedupeBlockLearner")
    super().__init__(distances, candidates)

    # Two-stage sampling: a pool built from ``data`` (size argument 10000),
    # then a second sample built from that pool (size argument 2000).
    index_pool = Sample(data, 10000, original_length)
    training_sample = Sample(index_pool, 2000, original_length)

    predicate_set = self.distances.predicates()
    self.block_learner = training.DedupeBlockLearner(
        predicate_set,
        training_sample,
        index_pool,
    )

    # Work on a copy so the in-place ``+=`` never touches the caller's
    # ``candidates`` object.
    to_index = candidates.copy()
    if index_include:
        to_index += index_include

    self._index_predicates(to_index)
def _blockLearner(self, predicates):
    """Create a ``training.DedupeBlockLearner`` for the given predicates.

    Args:
        predicates: Passed as the first argument to the learner.

    Returns:
        The new ``training.DedupeBlockLearner``, constructed with
        ``predicates`` and ``self.sampled_records``.
    """
    learner = training.DedupeBlockLearner(
        predicates,
        self.sampled_records,
    )
    return learner
def _init_combo(self, candidates, *args):
    """Initialize the block learner and keep a private copy of candidates.

    Args:
        candidates: Sliceable collection; a full-slice copy is stored on
            ``self.candidates``.
        *args: Extra positional arguments forwarded to
            ``training.DedupeBlockLearner`` after the predicates.
    """
    predicate_set = self.data_model.predicates()
    self.block_learner = training.DedupeBlockLearner(predicate_set, *args)

    # Full slice gives a shallow copy, so rebinding or resizing
    # ``self.candidates`` later does not affect the caller's object.
    candidates_copy = candidates[:]
    self.candidates = candidates_copy