Example #1
    def _compute_vectorized(self, source_column: pd.Series,
                            target_column: pd.Series):

        # add the superclasses and subclasses of each occupation to
        # the target column
        target_column = target_column.apply(self._expand_occupations)

        concatenated = pd.Series(list(zip(source_column, target_column)))

        # Given two sets, return the fraction of the smaller set's
        # items that also appear in the larger one
        def check_occupation_equality(pair: Tuple[Set[str], Set[str]]):
            if _pair_has_any_null(pair):
                LOGGER.debug(
                    "Can't compare occupations, "
                    "the pair contains null values: %s",
                    pair,
                )
                return np.nan

            s_item, t_item = pair

            min_length = min(len(s_item), len(t_item))
            n_shared_items = len(s_item & t_item)

            # Guard against empty sets to avoid ZeroDivisionError
            if min_length == 0:
                return np.nan

            return n_shared_items / min_length

        return fillna(concatenated.apply(check_occupation_equality),
                      self.missing_value)
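
A minimal standalone sketch of the overlap rule above, with illustrative occupation sets; the `_expand_occupations`, `fillna`, and `missing_value` plumbing from the class is assumed and omitted here:

    import numpy as np
    import pandas as pd

    def occupation_overlap(source: set, target: set) -> float:
        # Fraction of the smaller set shared with the larger one
        if not source or not target:
            return np.nan
        return len(source & target) / min(len(source), len(target))

    source = pd.Series([{'politician', 'writer'}])
    target = pd.Series([{'politician', 'poet'}])
    pairs = pd.Series(list(zip(source, target)))
    print(pairs.apply(lambda pair: occupation_overlap(*pair)).tolist())  # [0.5]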
Example #2
    def _compute_vectorized(self, source_column, target_column):
        concatenated = pd.Series(list(zip(source_column, target_column)))

        def intersection_percentage_size(pair):
            if _pair_has_any_null(pair):
                LOGGER.debug(
                    "Can't compare tokens, the pair contains null values: %s",
                    pair,
                )
                return np.nan

            source_list, target_list = pair
            # Source values are used as-is; the target set is built below
            # by splitting each value on whitespace
            source_set, target_set = set(source_list), set()

            for value in target_list:
                if value:
                    target_set.update(filter(None, value.split()))

            intersection = source_set.intersection(target_set)
            count_intersect = len(intersection)
            count_total = len(source_set.union(target_set))

            # Penalize matches made via low-score band name words
            count_low_score_words = len(
                text_utils.BAND_NAME_LOW_SCORE_WORDS.intersection(
                    intersection))

            return ((count_intersect - (count_low_score_words * 0.9)) /
                    count_total if count_total > 0 else np.nan)

        return fillna(concatenated.apply(intersection_percentage_size),
                      self.missing_value)
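
The same scoring rule in isolation, assuming a small stand-in for `text_utils.BAND_NAME_LOW_SCORE_WORDS` (its real contents are not shown here):

    import numpy as np

    LOW_SCORE_WORDS = {'band', 'the'}  # hypothetical stand-in

    def intersection_percentage_size(source_tokens, target_values):
        source_set = set(source_tokens)
        target_set = set()
        for value in target_values:
            if value:
                target_set.update(filter(None, value.split()))
        intersection = source_set & target_set
        count_total = len(source_set | target_set)
        low_score = len(LOW_SCORE_WORDS & intersection)
        return ((len(intersection) - low_score * 0.9) / count_total
                if count_total > 0 else np.nan)

    print(intersection_percentage_size(['the', 'rolling', 'stones'],
                                       ['the rolling stones']))  # ~0.7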
Example #3
    def _compute_vectorized(self, source_column, target_column):
        concatenated = pd.Series(list(zip(source_column, target_column)))

        def check_date_equality(pair: Tuple[List[pd.Period], List[pd.Period]]):
            if _pair_has_any_null(pair):
                LOGGER.debug(
                    "Can't compare dates, the pair contains null values: %s",
                    pair,
                )
                return np.nan

            source_list, target_list = pair
            # Keep track of the best score
            best = 0

            for source, target in itertools.product(source_list, target_list):
                # Get precision number for both dates
                s_precision = constants.PD_PERIOD_PRECISIONS.index(
                    source.freq.name)
                t_precision = constants.PD_PERIOD_PRECISIONS.index(
                    target.freq.name)

                # Minimum pair precision = maximum shared precision
                lowest_prec = min(s_precision, t_precision)
                # Result for the current `source`
                current_result = 0

                # Loop through the `pandas.Period` attributes we can compare,
                # from lowest to highest precision, paired with the minimum
                # precision required to compare each attribute
                for min_required_prec, d_attr in enumerate(
                    ['year', 'month', 'day', 'hour', 'minute', 'second']):
                    # Compare the current attribute only if both dates are
                    # precise enough to have it. On a match, add 1 to
                    # `current_result`; on the first mismatch, stop: if a
                    # low-precision attribute (e.g., the year) already
                    # differs, the dates don't match at all, so there is no
                    # point in checking higher-precision attributes
                    if lowest_prec >= min_required_prec and getattr(
                            source, d_attr) == getattr(target, d_attr):
                        current_result += 1
                    else:
                        break

                # Normalize the score to [0, 1]: 0 means no match at all,
                # 1 a perfect match. Divide `current_result` by the number
                # of attributes actually comparable, i.e., `lowest_prec + 1`:
                # the +1 accounts for the minimum possible shared precision
                # being 0 (the year)
                best = max(best, (current_result / (lowest_prec + 1)))

            return best

        return fillna(concatenated.apply(check_date_equality),
                      self.missing_value)
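
A condensed sketch of the precision-aware date match; the precision list below is an assumption about what `constants.PD_PERIOD_PRECISIONS` contains:

    import pandas as pd

    # Assumed ordering, lowest to highest precision
    PD_PERIOD_PRECISIONS = ['A-DEC', 'M', 'D', 'H', 'T', 'S']

    def date_similarity(source: pd.Period, target: pd.Period) -> float:
        lowest_prec = min(PD_PERIOD_PRECISIONS.index(source.freq.name),
                          PD_PERIOD_PRECISIONS.index(target.freq.name))
        current_result = 0
        for min_required_prec, d_attr in enumerate(
                ['year', 'month', 'day', 'hour', 'minute', 'second']):
            if lowest_prec >= min_required_prec and getattr(
                    source, d_attr) == getattr(target, d_attr):
                current_result += 1
            else:
                break
        return current_result / (lowest_prec + 1)

    # A month-precision date can fully match a day-precision one
    # on year and month alone
    print(date_similarity(pd.Period('1969-07', freq='M'),
                          pd.Period('1969-07-20', freq='D')))  # 1.0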
Example #4
    def _compute_vectorized(
        self, source_column: pd.Series, target_column: pd.Series
    ) -> pd.Series:

        # Zip the columns into a single Series of pairs for easier
        # processing; each element is a list of token strings
        concatenated = pd.Series(list(zip(source_column, target_column)))

        # Compute shared tokens after filtering stop words
        def compare_apply(pair: Tuple[List[str], List[str]]) -> float:
            if _pair_has_any_null(pair):
                LOGGER.debug(
                    "Can't compare, the pair contains null values: %s", pair
                )
                return np.nan

            # Normalize the pair: lowercase each element, split on
            # whitespace, and flatten the result into a single list
            pair = [
                self._flatten([el.lower().split() for el in p]) for p in pair
            ]

            s_item, t_item = pair

            # finally convert to sets
            s_item = set(s_item)
            t_item = set(t_item)

            if self.stop_words:
                s_item -= self.stop_words
                t_item -= self.stop_words

            min_length = min(len(s_item), len(t_item))
            n_shared_items = len(s_item & t_item)

            # Prevent division by 0
            if min_length != 0:
                return n_shared_items / min_length
            else:
                return np.nan

        return fillna(concatenated.apply(compare_apply), self.missing_value)
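
The comparison above, distilled to a single function; `stop_words` and the set comprehensions stand in for `self.stop_words` and `self._flatten`:

    import numpy as np

    def shared_token_ratio(source, target, stop_words=frozenset()):
        # Lowercase, split on whitespace, flatten, de-duplicate
        s_item = {tok for el in source for tok in el.lower().split()} - stop_words
        t_item = {tok for el in target for tok in el.lower().split()} - stop_words
        min_length = min(len(s_item), len(t_item))
        return len(s_item & t_item) / min_length if min_length else np.nan

    print(shared_token_ratio(['The Lord of the Rings'],
                             ['lord', 'rings', 'hobbit'],
                             stop_words=frozenset({'the', 'of'})))  # 1.0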
Example #5
    def _compute_vectorized(self, source_column, target_column):
        if self.algorithm == 'levenshtein':
            algorithm = self.levenshtein_similarity

        elif self.algorithm == 'cosine':
            algorithm = self.cosine_similarity

        else:
            err_msg = (f'Bad string similarity algorithm: {self.algorithm}. '
                       "Please use one of ('levenshtein', 'cosine')")
            LOGGER.critical(err_msg)
            raise ValueError(err_msg)

        compared = algorithm(source_column, target_column)
        compared_filled = fillna(compared, self.missing_value)

        if self.threshold is None:
            return compared_filled

        return (compared_filled >= self.threshold).astype(np.float64)
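
A sketch of the dispatch-plus-threshold pattern with toy similarities in place of the class's `levenshtein_similarity` and `cosine_similarity` (not shown here):

    import numpy as np
    import pandas as pd

    def string_similarity(source, target, algorithm='levenshtein', threshold=None):
        algorithms = {
            # Toy stand-ins for the real implementations
            'levenshtein': lambda s, t: (s == t).astype(np.float64),
            'cosine': lambda s, t: (s.str[0] == t.str[0]).astype(np.float64),
        }
        if algorithm not in algorithms:
            raise ValueError(f'Bad string similarity algorithm: {algorithm}. '
                             "Please use one of ('levenshtein', 'cosine')")
        compared = algorithms[algorithm](source, target)
        if threshold is None:
            return compared
        # Binarize: scores at or above the threshold become 1.0
        return (compared >= threshold).astype(np.float64)

    source = pd.Series(['alice', 'bob'])
    target = pd.Series(['alice', 'carol'])
    print(string_similarity(source, target, threshold=0.5).tolist())  # [1.0, 0.0]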
Example #6
    def _compute_vectorized(self, source_column, target_column):
        concatenated = pd.Series(list(zip(source_column, target_column)))

        def exact_apply(pair):
            if _pair_has_any_null(pair):
                LOGGER.debug(
                    "Can't compare, the pair contains null values: %s", pair)
                return np.nan

            scores = []
            for source in pair[0]:
                for target in pair[1]:
                    if pd.isna(source) or pd.isna(target):
                        scores.append(self.missing_value)
                        continue
                    if source == target:
                        scores.append(self.match_value)
                    else:
                        scores.append(self.non_match_value)
            # `scores` is empty when either list is: guard the `max` call
            return max(scores) if scores else np.nan

        return fillna(concatenated.apply(exact_apply), self.missing_value)
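
Finally, a self-contained version of the best-pairwise-match rule, with illustrative stand-ins for the class's `match_value`, `non_match_value`, and `missing_value`:

    import numpy as np
    import pandas as pd

    MATCH, NON_MATCH, MISSING = 1.0, 0.0, np.nan  # hypothetical class attributes

    def best_exact_match(sources, targets):
        scores = []
        for source in sources:
            for target in targets:
                if pd.isna(source) or pd.isna(target):
                    scores.append(MISSING)
                elif source == target:
                    scores.append(MATCH)
                else:
                    scores.append(NON_MATCH)
        # The best score across all source-target pairs wins
        return max(scores) if scores else np.nan

    print(best_exact_match(['John Doe', 'J. Doe'], ['John Doe']))  # 1.0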