Ejemplo n.º 1
0
    def __init__(self, data_model, candidates, data, original_length):
        """Build the underlying block learner and index the candidates.

        Two nested ``Sample`` objects are created from ``data`` (the second
        is drawn from the first) and passed, along with the predicates of
        ``self.data_model``, to ``training.DedupeBlockLearner``.

        Args:
            data_model: forwarded to the parent initializer (presumably
                stored there as ``self.data_model`` -- confirm in parent).
            candidates: forwarded to the parent initializer.
            data: records handed to the first ``Sample``.
            original_length: passed unchanged to both ``Sample`` calls.
        """
        super().__init__(data_model, candidates)

        # NOTE(review): the numeric Sample argument looks like a size cap
        # -- confirm against the Sample implementation.
        index_size, training_size = 50000, 2000
        indexed_pool = Sample(data, index_size, original_length)
        training_subset = Sample(indexed_pool, training_size, original_length)

        predicate_set = self.data_model.predicates()
        self.block_learner = training.DedupeBlockLearner(
            predicate_set, training_subset, indexed_pool
        )

        # Index against the candidates as held on the instance.
        self._index_predicates(self.candidates)
Ejemplo n.º 2
0
    def __init__(self, data_model, candidates, data, original_length,
                 index_include):
        """Build the underlying block learner and index example pairs.

        Two nested ``Sample`` objects are created from ``data`` (the second
        is drawn from the first) and passed, along with the predicates of
        ``self.data_model``, to ``training.DedupeBlockLearner``. The
        examples that get indexed are a copy of ``candidates`` extended
        with ``index_include`` when the latter is truthy.

        Args:
            data_model: forwarded to the parent initializer (presumably
                stored there as ``self.data_model`` -- confirm in parent).
            candidates: forwarded to the parent initializer; must support
                ``.copy()``.
            data: records handed to the first ``Sample``.
            original_length: passed unchanged to both ``Sample`` calls.
            index_include: extra examples appended to the indexed set;
                ignored when falsy.
        """
        super().__init__(data_model, candidates)

        # NOTE(review): the numeric Sample argument looks like a size cap
        # -- confirm against the Sample implementation.
        index_size, training_size = 50000, 5000
        indexed_pool = Sample(data, index_size, original_length)
        training_subset = Sample(indexed_pool, training_size, original_length)

        predicate_set = self.data_model.predicates()
        self.block_learner = training.DedupeBlockLearner(
            predicate_set, training_subset, indexed_pool
        )

        # Work on a copy so the caller's ``candidates`` is not extended.
        to_index = candidates.copy()
        if index_include:
            to_index += index_include
        self._index_predicates(to_index)
Ejemplo n.º 3
0
    def __init__(self, distances, candidates, data, original_length,
                 index_include):
        """Build the underlying block learner and index example pairs.

        Emits a debug log line, then creates two nested ``Sample`` objects
        from ``data`` (the second is drawn from the first) and passes them,
        along with the predicates of ``self.distances``, to
        ``training.DedupeBlockLearner``. The examples that get indexed are
        a copy of ``candidates`` extended with ``index_include`` when the
        latter is truthy.

        Args:
            distances: forwarded to the parent initializer (presumably
                stored there as ``self.distances`` -- confirm in parent).
            candidates: forwarded to the parent initializer; must support
                ``.copy()``.
            data: records handed to the first ``Sample``.
            original_length: passed unchanged to both ``Sample`` calls.
            index_include: extra examples appended to the indexed set;
                ignored when falsy.
        """
        logger.debug("Initializing labeler.DedupeBlockLearner")
        super().__init__(distances, candidates)

        # NOTE(review): the numeric Sample argument looks like a size cap
        # -- confirm against the Sample implementation.
        index_size, training_size = 10000, 2000
        indexed_pool = Sample(data, index_size, original_length)
        training_subset = Sample(indexed_pool, training_size, original_length)

        predicate_set = self.distances.predicates()
        self.block_learner = training.DedupeBlockLearner(
            predicate_set, training_subset, indexed_pool
        )

        # Work on a copy so the caller's ``candidates`` is not extended.
        to_index = candidates.copy()
        if index_include:
            to_index += index_include
        self._index_predicates(to_index)
Ejemplo n.º 4
0
 def _blockLearner(self, predicates):
     """Return a ``training.DedupeBlockLearner`` for ``predicates``.

     The learner is constructed from the given predicates and this
     instance's ``sampled_records``.
     """
     learner_factory = training.DedupeBlockLearner
     learner = learner_factory(predicates, self.sampled_records)
     return learner
Ejemplo n.º 5
0
    def _init_combo(self, candidates, *args):
        """Create the block learner and store a copy of ``candidates``.

        Args:
            candidates: sliceable collection; a full-slice copy is kept on
                ``self.candidates``.
            *args: extra positional arguments forwarded, after the data
                model's predicates, to ``training.DedupeBlockLearner``.
        """
        predicate_set = self.data_model.predicates()
        learner = training.DedupeBlockLearner(predicate_set, *args)
        self.block_learner = learner

        # Full slice gives a shallow copy independent of the caller's object.
        self.candidates = candidates[:]