コード例 #1
0
ファイル: compare.py プロジェクト: wfranus/recordlinkage
    def _compute_vectorized(self, *data):

        result = []

        if isinstance(data, tuple):
            for col in data:
                result_i = _fillna(col, self.missing_value)
                result.append(result_i)
        else:
            result_0 = _fillna(data, self.missing_value)
            result.append(result_0)

        return tuple(result)
コード例 #2
0
ファイル: compare.py プロジェクト: wfranus/recordlinkage
    def _compute_vectorized(self, lat1, lng1, lat2, lng2):
        """Turn the haversine distance between two points into a similarity.

        The distance from ``_haversine_distance`` is fed to the similarity
        function selected by ``self.method``. The outcome is then passed
        through ``_fillna`` with ``self.missing_value``.

        Parameters
        ----------
        lat1, lng1
            Latitude and longitude of the first location(s).
        lat2, lng2
            Latitude and longitude of the second location(s).

        Returns
        -------
        The value returned by ``_fillna`` for the computed similarity.

        Raises
        ------
        ValueError
            If ``self.method`` is not one of 'step', 'linear'/'lin',
            'squared', 'exp'/'exponential' or 'gauss'/'gaussian'.
        """
        distance = _haversine_distance(lat1, lng1, lat2, lng2)
        method = self.method

        # 'step' is the only algorithm that does not take ``self.scale``.
        if method == 'step':
            similarity = _step_sim(distance, self.offset, self.origin)
        elif method in ('linear', 'lin'):
            similarity = _linear_sim(
                distance, self.scale, self.offset, self.origin)
        elif method == 'squared':
            similarity = _squared_sim(
                distance, self.scale, self.offset, self.origin)
        elif method in ('exp', 'exponential'):
            similarity = _exp_sim(
                distance, self.scale, self.offset, self.origin)
        elif method in ('gauss', 'gaussian'):
            similarity = _gauss_sim(
                distance, self.scale, self.offset, self.origin)
        else:
            raise ValueError(
                "The algorithm '{}' is not known.".format(method))

        return _fillna(similarity, self.missing_value)
コード例 #3
0
ファイル: compare.py プロジェクト: wfranus/recordlinkage
    def _compute_vectorized(self, s_left, s_right):

        if self.method == 'jaro':
            str_sim_alg = jaro_similarity
        elif self.method in ['jarowinkler', 'jaro_winkler', 'jw']:
            str_sim_alg = jarowinkler_similarity
        elif self.method == 'levenshtein':
            str_sim_alg = levenshtein_similarity
        elif self.method in [
                'dameraulevenshtein', 'damerau_levenshtein', 'dl'
        ]:
            str_sim_alg = damerau_levenshtein_similarity
        elif self.method in ['q_gram', 'qgram']:
            str_sim_alg = qgram_similarity
        elif self.method == 'cosine':
            str_sim_alg = cosine_similarity
        elif self.method in ['smith_waterman', 'smithwaterman', 'sw']:
            str_sim_alg = smith_waterman_similarity
        elif self.method in ['longest_common_substring', 'lcs']:
            str_sim_alg = longest_common_substring_similarity
        else:
            raise ValueError("The algorithm '{}' is not known.".format(
                self.method))

        c = str_sim_alg(s_left, s_right)

        if self.threshold is not None:
            c = c.where((c < self.threshold) | (pandas.isnull(c)), other=1.0)
            c = c.where((c >= self.threshold) | (pandas.isnull(c)), other=0.0)

        c = _fillna(c, self.missing_value)

        return c