Example #1
    def _addTrainingData(self, labeled_pairs):
        """
        Appends training data to the training data collection.
        """

        for label, examples in labeled_pairs.items():
            n_examples = len(examples)
            labels = [label] * n_examples

            # Allocate a structured array with the same fields
            # ('label', 'distances') as the existing training data
            new_data = numpy.empty(n_examples, dtype=self.training_data.dtype)

            new_data["label"] = labels
            new_data["distances"] = core.fieldDistances(examples, self.data_model)

            # numpy.append copies both arrays into a new one, so the
            # training set grows by reallocation on each call
            self.training_data = numpy.append(self.training_data, new_data)
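For context, a minimal sketch of the input this method takes, assuming labeled_pairs maps a label to a sequence of record pairs; the 0/1 labels, the record fields, and the deduper object are made up for illustration, not taken from the library:

    labeled_pairs = {
        1: [({"name": "Jon Doe"}, {"name": "John Doe"})],   # duplicate pair
        0: [({"name": "Jon Doe"}, {"name": "Jane Roe"})],   # distinct pair
    }
    deduper._addTrainingData(labeled_pairs)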
Example #2
    def goodThreshold(self,
                      blocks,
                      constrained_matching=False,
                      recall_weight=1.5):
        """
        Returns the threshold that maximizes the expected F score,
        a weighted average of precision and recall for a sample of
        blocked data. 

        Keyword arguments:
        blocks --        Sequence of tuples of records, where each
                         tuple is a set of records covered by a blocking
                         predicate

        recall_weight -- Sets the tradeoff between precision and
                         recall. I.e. if you care twice as much about
                         recall as you do precision, set recall_weight
                         to 2.
        """

        blocked_records = (block.values() for block in blocks)

        candidates = core.blockedPairs(blocked_records, constrained_matching)

        field_distances = core.fieldDistances(candidates, self.data_model)
        probability = core.scorePairs(field_distances, self.data_model)

        # Sort the match probabilities in descending order
        probability.sort()
        probability = probability[::-1]

        # Reading each probability as the chance that a pair is a true
        # duplicate, the cumulative sum is the expected number of true
        # duplicates among the top-k scoring pairs
        expected_dupes = numpy.cumsum(probability)

        recall = expected_dupes / expected_dupes[-1]
        precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

        # Proportional to the F score with beta = recall_weight; the
        # constant (1 + recall_weight**2) factor does not move the argmax
        score = recall * precision / (recall + recall_weight**2 * precision)

        i = numpy.argmax(score)

        logging.info('Maximum expected recall and precision')
        logging.info('recall: %2.3f', recall[i])
        logging.info('precision: %2.3f', precision[i])
        logging.info('With threshold: %2.3f', probability[i])

        return probability[i]
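The threshold search itself is plain numpy and can be tried in isolation; a self-contained sketch with made-up pair probabilities (the dedupe-specific scoring is replaced by a hard-coded array):

    import numpy

    # Made-up match probabilities for five candidate pairs,
    # already sorted in descending order
    probability = numpy.array([0.95, 0.80, 0.60, 0.30, 0.10])

    recall_weight = 1.5  # favor recall 1.5x over precision

    # Expected number of true duplicates among the top-k pairs
    expected_dupes = numpy.cumsum(probability)

    recall = expected_dupes / expected_dupes[-1]
    precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

    # Proportional to the F score with beta = recall_weight
    score = recall * precision / (recall + recall_weight ** 2 * precision)

    print(probability[numpy.argmax(score)])  # prints 0.3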
Example #3
    def goodThreshold(self, blocks, recall_weight=1.5):
        """
        Returns the threshold that maximizes the expected F score,
        a weighted average of precision and recall for a sample of
        blocked data. 

        Keyword arguments:
        blocks --        Sequence of tuples of records, where each
                         tuple is a set of records covered by a blocking
                         predicate

        recall_weight -- Sets the tradeoff between precision and
                         recall. I.e. if you care twice as much about
                         recall as you do precision, set recall_weight
                         to 2.
        """

        blocked_records = (block.values() for block in blocks)

        candidates = core.blockedPairs(blocked_records)

        field_distances = core.fieldDistances(candidates, self.data_model)
        probability = core.scorePairs(field_distances, self.data_model)

        probability.sort()
        probability = probability[::-1]

        expected_dupes = numpy.cumsum(probability)

        recall = expected_dupes / expected_dupes[-1]
        precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

        score = recall * precision / (recall + recall_weight ** 2 * precision)

        i = numpy.argmax(score)

        logging.info("Maximum expected recall and precision")
        logging.info("recall: %2.3f", recall[i])
        logging.info("precision: %2.3f", precision[i])
        logging.info("With threshold: %2.3f", probability[i])

        return probability[i]
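A hedged usage sketch of the signature above, assuming each block is a dict mapping record ids to records (the records, ids, and the deduper object are hypothetical):

    blocks = [
        {1: {"name": "Jon Doe"}, 2: {"name": "John Doe"}},
        {2: {"name": "John Doe"}, 3: {"name": "Jane Roe"}},
    ]

    # Care twice as much about recall as precision
    threshold = deduper.goodThreshold(blocks, recall_weight=2)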