Code example #1
0
File: api.py  Project: cojito/dedupe
    def _trainBlocker(self, ppc=1, uncovered_dupes=1):
        """Learn blocking predicates from the labeled training pairs.

        Sets ``self.predicates``, ``self.stop_words`` and ``self.blocker``.
        """
        # Work on a copy so the stored training pairs are not mutated.
        pairs = copy.deepcopy(self.training_pairs)

        # Augment the 'distinct' examples with pairs the current model is
        # already confident are non-duplicates.
        nondupes = training.semiSupervisedNonDuplicates(self.data_sample,
                                                        self.data_model,
                                                        sample_size=32000)
        pairs['distinct'].extend(nondupes)

        candidate_predicates = blocking.predicateGenerator(
            self._blockerTypes(), self.data_model)

        learned = dedupe.blocking.blockTraining(pairs,
                                                candidate_predicates,
                                                ppc,
                                                uncovered_dupes,
                                                self.pool,
                                                self._linkage_type)
        self.predicates, self.stop_words = learned

        self.blocker = self._Blocker(self.predicates,
                                     self.pool,
                                     self.stop_words)
Code example #2
0
File: api.py  Project: nilesh-c/dedupe
    def _learnBlocking(self, eta, epsilon):
        """Learn a good blocking of the data"""
        # Fold confident non-duplicate pairs into the labeled training set.
        nondupes = training.semiSupervisedNonDuplicates(self.data_sample,
                                                        self.data_model)
        self.training_pairs[0].extend(nondupes)

        predicate_functions = (
            predicates.wholeFieldPredicate,
            predicates.tokenFieldPredicate,
            predicates.commonIntegerPredicate,
            predicates.sameThreeCharStartPredicate,
            predicates.sameFiveCharStartPredicate,
            predicates.sameSevenCharStartPredicate,
            predicates.nearIntegersPredicate,
            predicates.commonFourGram,
            predicates.commonSixGram,
        )
        tfidf_thresholds = [0.2, 0.4, 0.6, 0.8]

        # Fields that actually carry data (skip "Missing Data" fields).
        fields = [name
                  for name, definition in self.data_model["fields"].items()
                  if definition["type"] != "Missing Data"]

        # Whole-record strings for the first 2000 sample pairs; later
        # occurrences of a record id overwrite earlier ones, as before.
        full_string_records = {
            record_id: " ".join(record[field] for field in fields)
            for record_pair in self.data_sample[0:2000]
            for record_id, record in record_pair
        }

        df_index = tfidf.documentFrequency(full_string_records)

        return dedupe.blocking.blockTraining(self.training_pairs,
                                             predicate_functions,
                                             fields,
                                             tfidf_thresholds,
                                             df_index,
                                             eta,
                                             epsilon)
Code example #3
0
File: api.py  Project: JoeGermuska/dedupe
    def _learnBlocking(self, eta, epsilon):
        """Learn a good blocking of the data"""
        # Extend the labeled pairs with confident non-duplicates drawn
        # from the data sample.
        nondupes = training.semiSupervisedNonDuplicates(self.data_sample,
                                                        self.data_model,
                                                        sample_size=32000)
        self.training_pairs[0].extend(nondupes)

        candidates = predicateGenerator(self.blocker_types, self.data_model)

        return dedupe.blocking.blockTraining(self.training_pairs,
                                             candidates,
                                             eta,
                                             epsilon)
Code example #4
0
    def _learnBlocking(self, eta, epsilon):
        """Learn a good blocking of the data"""
        # Confident non-duplicates from the sample join the labeled
        # non-duplicate pairs before predicate training.
        self.training_pairs[0].extend(
            training.semiSupervisedNonDuplicates(self.data_sample,
                                                 self.data_model,
                                                 sample_size=32000))

        predicate_set = predicateGenerator(self.blocker_types,
                                           self.data_model)

        learned_predicates = dedupe.blocking.blockTraining(
            self.training_pairs, predicate_set, eta, epsilon)
        return learned_predicates
Code example #5
0
File: api.py  Project: rkiddy/dedupe
    def _trainBlocker(self, ppc=1, uncovered_dupes=1):  # pragma : no cover
        """Learn blocking predicates; sets self.predicates,
        self.stop_words and self.blocker."""
        # Copy so the stored training pairs stay untouched.
        pairs = copy.deepcopy(self.training_pairs)

        nondupes = training.semiSupervisedNonDuplicates(self.data_sample,
                                                        self.data_model,
                                                        sample_size=32000)
        pairs[u'distinct'].extend(nondupes)

        candidate_predicates = predicateGenerator(self.data_model)

        self.predicates, self.stop_words = dedupe.training.blockTraining(
            pairs,
            candidate_predicates,
            ppc,
            uncovered_dupes,
            self._linkage_type)

        self.blocker = self._Blocker(self.predicates, self.stop_words)
Code example #6
0
    def _trainBlocker(self, ppc, uncovered_dupes,
                      index_predicates):  # pragma : no cover
        """Learn blocking predicates; sets self.predicates and
        self.blocker."""
        # Copy so the stored training pairs stay untouched.
        pairs = copy.deepcopy(self.training_pairs)

        # Augment the 'distinct' labels with pairs the classifier is
        # confident about.
        nondupes = training.semiSupervisedNonDuplicates(self.data_sample,
                                                        self.data_model,
                                                        self.classifier,
                                                        sample_size=32000)
        pairs[u'distinct'].extend(nondupes)

        candidates = self.data_model.predicates(index_predicates,
                                                self.canopies)

        self.predicates = dedupe.training.blockTraining(pairs,
                                                        candidates,
                                                        ppc,
                                                        uncovered_dupes,
                                                        self._linkage_type)

        self.blocker = blocking.Blocker(self.predicates)
Code example #7
0
File: api.py  Project: anukat2015/dedupe
    def _trainBlocker(self, ppc, uncovered_dupes, index_predicates):  # pragma : no cover
        """Learn blocking predicates; sets self.predicates and
        self.blocker."""
        # Don't mutate the stored training pairs.
        pairs = copy.deepcopy(self.training_pairs)

        # Classifier-confident non-duplicates join the 'distinct' set.
        pairs[u'distinct'].extend(
            training.semiSupervisedNonDuplicates(self.data_sample,
                                                 self.data_model,
                                                 self.classifier,
                                                 sample_size=32000))

        candidates = self.data_model.predicates(index_predicates,
                                                self.canopies)

        self.predicates = dedupe.training.blockTraining(
            pairs, candidates, ppc, uncovered_dupes, self._linkage_type)

        self.blocker = blocking.Blocker(self.predicates)