Esempio n. 1
0
    def __init__(self, *args, **kwargs):
        """Initialise the parent class and attach a fresh ECM estimator.

        All positional and keyword arguments are passed on unchanged to the
        parent initialiser.
        """
        # NOTE(review): the parent is resolved through ``self.__class__``, so
        # a subclass inheriting this method would re-enter it endlessly;
        # naming the enclosing class explicitly would avoid that.
        own_class = self.__class__
        super(own_class, self).__init__(*args, **kwargs)

        # Estimator object kept on the instance as ``self.algorithm``.
        self.algorithm = ECMEstimate()
Esempio n. 2
0
    def __init__(self, *args, **kwargs):
        """Set up the base class, then create the ECM estimator.

        Every positional and keyword argument is forwarded untouched to the
        parent initialiser.
        """
        # NOTE(review): looking the parent up via ``self.__class__`` makes
        # this method call itself without end when it is inherited by a
        # subclass; passing the enclosing class by name would prevent that.
        runtime_class = self.__class__
        super(runtime_class, self).__init__(*args, **kwargs)

        # The estimator instance is stored as ``self.algorithm``.
        self.algorithm = ECMEstimate()
Esempio n. 3
0
class ECMClassifier(FellegiSunter):
    """Expectation/Conditional Maximisation classifier.

    [EXPERIMENTAL] Expectation/Conditional Maximisation algorithm used as
    classifier. This probabilistic record linkage algorithm is used in
    combination with Fellegi and Sunter model.

    """

    def __init__(self, *args, **kwargs):
        """Initialise the parent class and create the ECM estimator.

        All positional and keyword arguments are forwarded to the parent
        initialiser.
        """
        # Name the class explicitly: ``super(self.__class__, self)`` would
        # recurse forever as soon as this class is subclassed.
        super(ECMClassifier, self).__init__(*args, **kwargs)

        self.algorithm = ECMEstimate()

    def learn(self, comparison_vectors, init='jaro', return_type='index'):
        """Train the algorithm.

        Train the Expectation-Maximisation classifier. This method is well-
        known as the ECM-algorithm implementation in the context of record
        linkage.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The dataframe with comparison vectors.
        init : str
            Not used by this method; kept so existing callers keep working.
        return_type : 'index' (default), 'series', 'array'
            The format to return the classification result. The argument value
            'index' will return the pandas.MultiIndex of the matches. The
            argument value 'series' will return a pandas.Series with zeros
            (distinct) and ones (matches). The argument value 'array' will
            return a numpy.ndarray with zeros and ones.

        Returns
        -------
        pandas.MultiIndex, pandas.Series or numpy.ndarray
            The classification of the training pairs, in the format selected
            with ``return_type``.

        """

        logging.info("Classifying - start learning {}".format(
            self.__class__.__name__))

        # start timing
        start_time = time.time()

        # ``.values`` yields the same array ``as_matrix()`` produced, but it
        # still exists in pandas >= 1.0, where ``as_matrix`` was removed.
        probs = self.algorithm.train(comparison_vectors.values)

        # The threshold is the probability of the n-th most likely pair,
        # where n is the number of matches implied by the estimated match
        # proportion ``self.algorithm.p``.
        n_matches = int(self.algorithm.p * len(probs))
        if n_matches > 0:
            self.p_threshold = numpy.sort(probs)[len(probs) - n_matches]
        else:
            # Zero expected matches (or no pairs at all): indexing at
            # ``len(probs)`` would raise IndexError. Use a threshold no
            # probability can reach instead.
            # NOTE(review): assumes ``_decision_rule`` (defined in the parent
            # class, not visible here) compares probabilities against the
            # threshold - confirm.
            self.p_threshold = numpy.inf

        prediction = self._decision_rule(probs, self.p_threshold)

        result = self._return_result(prediction, return_type,
                                     comparison_vectors)

        # log timing
        logf_time = "Classifying - learning computation time: ~{:.2f}s"
        logging.info(logf_time.format(time.time() - start_time))

        return result

    def predict(self,
                comparison_vectors,
                return_type='index',
                *args,
                **kwargs):
        """Predict the class of record pairs.

        Classify a set of record pairs based on their comparison vectors into
        matches, non-matches and possible matches. The classifier has to be
        trained to call this method.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The dataframe with comparison vectors.
        return_type : 'index' (default), 'series', 'array'
            The format to return the classification result. The argument value
            'index' will return the pandas.MultiIndex of the matches. The
            argument value 'series' will return a pandas.Series with zeros
            (distinct) and ones (matches). The argument value 'array' will
            return a numpy.ndarray with zeros and ones.
        *args, **kwargs
            Accepted but not used by this method.

        Returns
        -------
        pandas.MultiIndex, pandas.Series or numpy.ndarray
            The classification result, in the format selected with
            ``return_type``.

        Note
        ----
        Prediction is risky for this unsupervised learning method. Be aware
        that the sample from the population is valid.

        """

        logging.info("Classifying - predict matches and non-matches")

        # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
        enc_vectors = self.algorithm._transform_vectors(
            comparison_vectors.values)

        probs = self.algorithm._expectation(enc_vectors)

        # ``self.p_threshold`` is set by ``learn``; calling ``predict`` on an
        # untrained classifier fails on this attribute lookup.
        prediction = self._decision_rule(probs, self.p_threshold)

        return self._return_result(prediction, return_type, comparison_vectors)

    def prob(self, comparison_vectors):
        """Compute the probabilities for each record pair.

        For each pair of records, estimate the probability of being a match.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The dataframe with comparison vectors.

        Returns
        -------
        pandas.Series
            The probability of being a match for each record pair, indexed
            like ``comparison_vectors``.

        """

        logging.info("Classifying - compute probabilities")

        # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
        enc_vectors = self.algorithm._transform_vectors(
            comparison_vectors.values)

        return pandas.Series(self.algorithm._expectation(enc_vectors),
                             index=comparison_vectors.index)
Esempio n. 4
0
class ECMClassifier(FellegiSunter):
    """Expectation/Conditional Maximisation classifier.

    [EXPERIMENTAL] Expectation/Conditional Maximisation algorithm used as
    classifier. This probabilistic record linkage algorithm is used in
    combination with Fellegi and Sunter model.

    """

    def __init__(self, *args, **kwargs):
        """Initialise the parent class and create the ECM estimator.

        All positional and keyword arguments are forwarded to the parent
        initialiser.
        """
        # Name the class explicitly: ``super(self.__class__, self)`` would
        # recurse forever as soon as this class is subclassed.
        super(ECMClassifier, self).__init__(*args, **kwargs)

        self.algorithm = ECMEstimate()

    def learn(self, comparison_vectors, init='jaro', return_type='index'):
        """Train the algorithm.

        Train the Expectation-Maximisation classifier. This method is well-
        known as the ECM-algorithm implementation in the context of record
        linkage.

        :param comparison_vectors: The dataframe with comparison vectors.
        :param init: Not used by this method; kept so existing callers keep
                working.
        :param return_type: The format to return the classification result.
                The argument value 'index' will return the pandas.MultiIndex
                of the matches. The argument value 'series' will return a
                pandas.Series with zeros (distinct) and ones (matches). The
                argument value 'array' will return a numpy.ndarray with zeros
                and ones.
        :type comparison_vectors: pandas.DataFrame
        :type init: str
        :type return_type: 'index' (default), 'series', 'array'

        :return: The classification of the training pairs, in the format
                selected with ``return_type``.
        :rtype: pandas.MultiIndex, pandas.Series or numpy.ndarray

        """

        # ``.values`` yields the same array ``as_matrix()`` produced, but it
        # still exists in pandas >= 1.0, where ``as_matrix`` was removed.
        probs = self.algorithm.train(comparison_vectors.values)

        # The threshold is the probability of the n-th most likely pair,
        # where n is the number of matches implied by the estimated match
        # proportion ``self.algorithm.p``.
        n_matches = int(self.algorithm.p * len(probs))
        if n_matches > 0:
            self.p_threshold = numpy.sort(probs)[len(probs) - n_matches]
        else:
            # Zero expected matches (or no pairs at all): indexing at
            # ``len(probs)`` would raise IndexError. Use a threshold no
            # probability can reach instead.
            # NOTE(review): assumes ``_decision_rule`` (defined in the parent
            # class, not visible here) compares probabilities against the
            # threshold - confirm.
            self.p_threshold = numpy.inf

        prediction = self._decision_rule(probs, self.p_threshold)

        return self._return_result(prediction, return_type, comparison_vectors)

    def predict(self, comparison_vectors, return_type='index', *args, **kwargs):
        """Predict the class of record pairs.

        Classify a set of record pairs based on their comparison vectors into
        matches, non-matches and possible matches. The classifier has to be
        trained to call this method.

        :param comparison_vectors: The dataframe with comparison vectors.
        :param return_type: The format to return the classification result.
                The argument value 'index' will return the pandas.MultiIndex
                of the matches. The argument value 'series' will return a
                pandas.Series with zeros (distinct) and ones (matches). The
                argument value 'array' will return a numpy.ndarray with zeros
                and ones.
        :param args: Accepted but not used by this method.
        :param kwargs: Accepted but not used by this method.
        :type comparison_vectors: pandas.DataFrame
        :type return_type: 'index' (default), 'series', 'array'

        :return: The classification result, in the format selected with
                ``return_type``.
        :rtype: pandas.MultiIndex, pandas.Series or numpy.ndarray

        .. note::

                Prediction is risky for this unsupervised learning method. Be
                aware that the sample from the population is valid.


        """

        # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
        enc_vectors = self.algorithm._transform_vectors(
            comparison_vectors.values)

        probs = self.algorithm._expectation(enc_vectors)

        # ``self.p_threshold`` is set by ``learn``; calling ``predict`` on an
        # untrained classifier fails on this attribute lookup.
        prediction = self._decision_rule(probs, self.p_threshold)

        return self._return_result(prediction, return_type, comparison_vectors)

    def prob(self, comparison_vectors):
        """Estimate the probability for each record pair of being a match.

        The method computes the probability for each given record pair of
        being a match. The probability of a non-match is 1 minus the result.
        This method is not implemented for all classifiers (for example
        K-means clustering).

        :param comparison_vectors: The dataframe with comparison vectors.
        :type comparison_vectors: pandas.DataFrame

        :return: A pandas Series, indexed like ``comparison_vectors``, with
                the probability of being a match.
        :rtype: pandas.Series
        """

        # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
        enc_vectors = self.algorithm._transform_vectors(
            comparison_vectors.values)

        return pandas.Series(
            self.algorithm._expectation(enc_vectors),
            index=comparison_vectors.index
        )