def __init__(self, *args, **kwargs):
    """Initialise the parent class and attach the ECM estimator.

    All positional and keyword arguments are passed on unchanged to the
    parent initialiser. Afterwards a fresh ``ECMEstimate`` instance is
    stored on ``self.algorithm``.
    """
    # NOTE(review): resolving the parent through ``self.__class__`` recurses
    # without end once this class is subclassed; naming the defining class
    # explicitly would avoid that. Kept as-is because the enclosing class
    # is not visible from here.
    parent = super(self.__class__, self)
    parent.__init__(*args, **kwargs)

    estimator = ECMEstimate()
    self.algorithm = estimator
class ECMClassifier(FellegiSunter):
    """Expectation/Conditional Maximisation classifier.

    [EXPERIMENTAL] Expectation/Conditional Maximisation algorithm used as
    classifier. This probabilistic record linkage algorithm is used in
    combination with Fellegi and Sunter model.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the parent class and create the ECM estimator.

        All arguments are forwarded unchanged to the parent initialiser.
        """
        # Name the class explicitly: super(self.__class__, self) recurses
        # without end as soon as this class is subclassed.
        super(ECMClassifier, self).__init__(*args, **kwargs)

        self.algorithm = ECMEstimate()

    def learn(self, comparison_vectors, init='jaro', return_type='index'):
        """Train the algorithm.

        Train the Expectation-Maximisation classifier. This method is well-
        known as the ECM-algorithm implementation in the context of record
        linkage. After training, the probability threshold used to separate
        matches from non-matches is stored in ``self.p_threshold``.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The dataframe with comparison vectors.
        init : str
            Currently ignored by this method. Default 'jaro'.
        return_type : 'index' (default), 'series', 'array'
            The format to return the classification result. The argument
            value 'index' will return the pandas.MultiIndex of the matches.
            The argument value 'series' will return a pandas.Series with
            zeros (distinct) and ones (matches). The argument value 'array'
            will return a numpy.ndarray with zeros and ones.

        Returns
        -------
        pandas.MultiIndex, pandas.Series or numpy.ndarray
            The classification of the training record pairs, in the format
            selected with ``return_type``.

        """
        logging.info("Classifying - start learning {}".format(
            self.__class__.__name__))

        # start timing
        start_time = time.time()

        # ``.values`` is the portable replacement for ``as_matrix()``, which
        # was deprecated in pandas 0.23 and removed in pandas 1.0.
        probs = self.algorithm.train(comparison_vectors.values)

        # Expected number of matches: estimated match proportion times the
        # number of record pairs, rounded down.
        n_matches = int(self.algorithm.p * len(probs))

        if n_matches > 0:
            # The threshold is the lowest probability among the n_matches
            # most probable record pairs.
            self.p_threshold = numpy.sort(probs)[len(probs) - n_matches]
        else:
            # No matches expected (or no record pairs at all). Indexing at
            # len(probs) would raise an IndexError, so use a threshold that
            # no probability can reach instead.
            # NOTE(review): assumes the decision rule compares each
            # probability against the threshold - confirm in FellegiSunter.
            self.p_threshold = numpy.inf

        prediction = self._decision_rule(probs, self.p_threshold)

        result = self._return_result(prediction, return_type,
                                     comparison_vectors)

        # log timing
        logf_time = "Classifying - learning computation time: ~{:.2f}s"
        logging.info(logf_time.format(time.time() - start_time))

        return result

    def predict(self, comparison_vectors, return_type='index',
                *args, **kwargs):
        """Predict the class of record pairs.

        Classify a set of record pairs based on their comparison vectors
        into matches, non-matches and possible matches. The classifier has
        to be trained to call this method: the threshold stored by
        ``learn`` in ``self.p_threshold`` is used for the decision.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The dataframe with comparison vectors.
        return_type : 'index' (default), 'series', 'array'
            The format to return the classification result. The argument
            value 'index' will return the pandas.MultiIndex of the matches.
            The argument value 'series' will return a pandas.Series with
            zeros (distinct) and ones (matches). The argument value 'array'
            will return a numpy.ndarray with zeros and ones.
        *args, **kwargs
            Accepted but not used by this method.

        Returns
        -------
        pandas.MultiIndex, pandas.Series or numpy.ndarray
            The classification of the record pairs, in the format selected
            with ``return_type``.

        Note
        ----
        Prediction is risky for this unsupervised learning method. Make
        sure that the sample drawn from the population is valid.

        """
        logging.info("Classifying - predict matches and non-matches")

        # ``.values`` replaces ``as_matrix()`` (removed in pandas 1.0).
        enc_vectors = self.algorithm._transform_vectors(
            comparison_vectors.values)

        probs = self.algorithm._expectation(enc_vectors)

        prediction = self._decision_rule(probs, self.p_threshold)

        return self._return_result(prediction, return_type,
                                   comparison_vectors)

    def prob(self, comparison_vectors):
        """Compute the probabilities for each record pair.

        For each pair of records, estimate the probability of being a
        match.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The dataframe with comparison vectors.

        Returns
        -------
        pandas.Series
            The probability of being a match for each record pair, indexed
            like ``comparison_vectors``.

        """
        logging.info("Classifying - compute probabilities")

        # ``.values`` replaces ``as_matrix()`` (removed in pandas 1.0).
        enc_vectors = self.algorithm._transform_vectors(
            comparison_vectors.values)

        return pandas.Series(self.algorithm._expectation(enc_vectors),
                             index=comparison_vectors.index)
class ECMClassifier(FellegiSunter):
    """
    [EXPERIMENTAL] Expectation/Conditional Maximisation algorithm used as
    classifier. This probabilistic record linkage algorithm is used in
    combination with Fellegi and Sunter model.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialise the parent class and create the ECM estimator. All
        arguments are forwarded unchanged to the parent initialiser.
        """
        # Name the class explicitly: super(self.__class__, self) recurses
        # without end as soon as this class is subclassed.
        super(ECMClassifier, self).__init__(*args, **kwargs)

        self.algorithm = ECMEstimate()

    def learn(self, comparison_vectors, init='jaro', return_type='index'):
        """
        Train the Expectation-Maximisation classifier. This method is well-
        known as the ECM-algorithm implementation in the context of record
        linkage. After training, the probability threshold used to separate
        matches from non-matches is stored in ``self.p_threshold``.

        :param comparison_vectors: The dataframe with comparison vectors.
        :param init: Currently ignored by this method.
        :param return_type: The format to return the classification result.
                The argument value 'index' will return the pandas.MultiIndex
                of the matches. The argument value 'series' will return a
                pandas.Series with zeros (distinct) and ones (matches). The
                argument value 'array' will return a numpy.ndarray with
                zeros and ones.
        :type comparison_vectors: pandas.DataFrame
        :type init: str
        :type return_type: 'index' (default), 'series', 'array'

        :return: The classification of the training record pairs, in the
                format selected with ``return_type``.
        :rtype: pandas.MultiIndex, pandas.Series or numpy.ndarray

        """
        # ``.values`` is the portable replacement for ``as_matrix()``, which
        # was deprecated in pandas 0.23 and removed in pandas 1.0.
        probs = self.algorithm.train(comparison_vectors.values)

        # Expected number of matches: estimated match proportion times the
        # number of record pairs, rounded down.
        n_matches = int(self.algorithm.p * len(probs))

        if n_matches > 0:
            # The threshold is the lowest probability among the n_matches
            # most probable record pairs.
            self.p_threshold = numpy.sort(probs)[len(probs) - n_matches]
        else:
            # No matches expected (or no record pairs at all). Indexing at
            # len(probs) would raise an IndexError, so use a threshold that
            # no probability can reach instead.
            # NOTE(review): assumes the decision rule compares each
            # probability against the threshold - confirm in FellegiSunter.
            self.p_threshold = numpy.inf

        prediction = self._decision_rule(probs, self.p_threshold)

        return self._return_result(prediction, return_type,
                                   comparison_vectors)

    def predict(self, comparison_vectors, return_type='index',
                *args, **kwargs):
        """
        Classify a set of record pairs based on their comparison vectors
        into matches, non-matches and possible matches. The classifier has
        to be trained to call this method: the threshold stored by
        ``learn`` in ``self.p_threshold`` is used for the decision. Extra
        positional and keyword arguments are accepted but not used.

        :param comparison_vectors: The dataframe with comparison vectors.
        :param return_type: The format to return the classification result.
                The argument value 'index' will return the pandas.MultiIndex
                of the matches. The argument value 'series' will return a
                pandas.Series with zeros (distinct) and ones (matches). The
                argument value 'array' will return a numpy.ndarray with
                zeros and ones.
        :type comparison_vectors: pandas.DataFrame
        :type return_type: 'index' (default), 'series', 'array'

        :return: The classification of the record pairs, in the format
                selected with ``return_type``.
        :rtype: pandas.MultiIndex, pandas.Series or numpy.ndarray

        .. note::

            Prediction is risky for this unsupervised learning method. Make
            sure that the sample drawn from the population is valid.

        """
        # ``.values`` replaces ``as_matrix()`` (removed in pandas 1.0).
        enc_vectors = self.algorithm._transform_vectors(
            comparison_vectors.values)

        probs = self.algorithm._expectation(enc_vectors)

        prediction = self._decision_rule(probs, self.p_threshold)

        return self._return_result(prediction, return_type,
                                   comparison_vectors)

    def prob(self, comparison_vectors):
        """
        Estimate the probability for each record pair of being a match.

        The method computes the probability for each given record pair of
        being a match. The probability of a non-match is 1 minus the result.
        This method is not implemented for all classifiers (for example
        K-means clustering).

        :param comparison_vectors: The dataframe with comparison vectors.
        :type comparison_vectors: pandas.DataFrame

        :return: A pandas Series, indexed like ``comparison_vectors``, with
                the probability of being a match.
        :rtype: pandas.Series
        """
        # ``.values`` replaces ``as_matrix()`` (removed in pandas 1.0).
        enc_vectors = self.algorithm._transform_vectors(
            comparison_vectors.values)

        return pandas.Series(
            self.algorithm._expectation(enc_vectors),
            index=comparison_vectors.index
        )