def _compute_vectorized(self, source_column: pd.Series, target_column: pd.Series):
    # Add the superclasses and subclasses of each occupation to
    # the target column
    target_column = target_column.apply(self._expand_occupations)

    concatenated = pd.Series(list(zip(source_column, target_column)))

    # Given 2 sets, return the percentage of items that the
    # smaller set shares with the larger set
    def check_occupation_equality(pair: Tuple[Set[str], Set[str]]):
        if _pair_has_any_null(pair):
            LOGGER.debug(
                "Can't compare occupations, "
                "the pair contains null values: %s",
                pair,
            )
            return np.nan

        s_item, t_item = pair
        min_length = min(len(s_item), len(t_item))
        n_shared_items = len(s_item & t_item)

        # Prevent division by 0 when one of the sets is empty
        if min_length == 0:
            return np.nan

        return n_shared_items / min_length

    return fillna(
        concatenated.apply(check_occupation_equality), self.missing_value
    )
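# A minimal standalone sketch of the score above (illustration only, the
# occupation QIDs are made up): the overlap is normalised by the size of the
# smaller set, so a target whose occupations are a subset of the source's
# occupations gets a full score.
source_occupations = {'Q177220', 'Q639669', 'Q36834'}            # hypothetical QIDs
target_occupations = {'Q177220', 'Q639669'}
shared = len(source_occupations & target_occupations)             # 2
smaller = min(len(source_occupations), len(target_occupations))   # 2
print(shared / smaller)  # -> 1.0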
def _compute_vectorized(self, source_column, target_column):
    concatenated = pd.Series(list(zip(source_column, target_column)))

    def intersection_percentage_size(pair):
        if _pair_has_any_null(pair):
            LOGGER.debug(
                "Can't compare tokens, the pair contains null values: %s",
                pair,
            )
            return np.nan

        source_list, target_list = pair
        source_set, target_set = set(source_list), set()

        for value in target_list:
            if value:
                target_set.update(filter(None, value.split()))

        intersection = source_set.intersection(target_set)
        count_intersect = len(intersection)
        count_total = len(source_set.union(target_set))

        # Penalize band stopwords
        count_low_score_words = len(
            text_utils.BAND_NAME_LOW_SCORE_WORDS.intersection(intersection)
        )

        return (
            (count_intersect - (count_low_score_words * 0.9)) / count_total
            if count_total > 0
            else np.nan
        )

    return fillna(
        concatenated.apply(intersection_percentage_size), self.missing_value
    )
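# A hypothetical worked example of the scoring above (a sketch, assuming that
# 'band' is among text_utils.BAND_NAME_LOW_SCORE_WORDS): a shared low-score
# token contributes only 0.1 instead of 1 to the numerator.
source_tokens = {'nirvana', 'band'}
target_tokens = {'nirvana', 'band'}
intersection = source_tokens & target_tokens     # {'nirvana', 'band'}
union = source_tokens | target_tokens            # {'nirvana', 'band'}
low_score = {'band'} & intersection              # the penalized token
print((len(intersection) - len(low_score) * 0.9) / len(union))  # -> 0.55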
def _compute_vectorized(self, source_column, target_column):
    concatenated = pd.Series(list(zip(source_column, target_column)))

    def check_date_equality(pair: Tuple[List[pd.Period], List[pd.Period]]):
        if _pair_has_any_null(pair):
            LOGGER.debug(
                "Can't compare dates, the pair contains null values: %s",
                pair,
            )
            return np.nan

        source_list, target_list = pair

        # Keep track of the best score
        best = 0

        for source, target in itertools.product(source_list, target_list):
            # Get the precision number of both dates
            s_precision = constants.PD_PERIOD_PRECISIONS.index(
                source.freq.name
            )
            t_precision = constants.PD_PERIOD_PRECISIONS.index(
                target.freq.name
            )

            # Minimum pair precision = maximum shared precision
            lowest_prec = min(s_precision, t_precision)

            # Result for the current pair
            current_result = 0

            # Loop through the `pandas.Period` attributes we can compare,
            # together with the minimum precision required to compare them.
            # We go from the lowest to the highest precision: if a lower
            # precision attribute (e.g., the year) doesn't match, then the
            # dates don't match at all, so we don't check the higher
            # precision attributes and break out of the loop.
            for min_required_prec, d_attr in enumerate(
                ['year', 'month', 'day', 'hour', 'minute', 'second']
            ):
                # Compare the current attribute only if both `source` and
                # `target` have a precision that allows it. If it matches,
                # add 1 to `current_result`; otherwise, stop.
                if lowest_prec >= min_required_prec and getattr(
                    source, d_attr
                ) == getattr(target, d_attr):
                    current_result += 1
                else:
                    break

            # We want a score between 0 and 1, where 0 means no match at all
            # and 1 means a perfect match. Divide `current_result` by the
            # number of attributes we actually compared, so that we get the
            # fraction of matching attributes regardless of the (variable)
            # date precision. We add 1 to `lowest_prec` because the lowest
            # possible shared precision is 0 (the year).
            best = max(best, current_result / (lowest_prec + 1))

        return best

    return fillna(concatenated.apply(check_date_equality), self.missing_value)
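# A hypothetical example of the precision-aware comparison above (a sketch, not
# part of the module): a month-precision date and a day-precision date only
# share year and month, so matching on both already yields a perfect score.
import pandas as pd

source = pd.Period('1969-07', freq='M')     # month precision
target = pd.Period('1969-07-20', freq='D')  # day precision
# the shared precision is the coarser one (month), so only `year` and `month`
# are compared: 2 matching attributes out of 2 compared -> score 1.0
matches = sum(
    getattr(source, attr) == getattr(target, attr) for attr in ('year', 'month')
)
print(matches / 2)  # -> 1.0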
def _compute_vectorized(
    self, source_column: pd.Series, target_column: pd.Series
) -> pd.Series:
    # Concatenate the columns for easier processing. Each element
    # in the columns is a set of tokens
    concatenated = pd.Series(list(zip(source_column, target_column)))

    # Compute shared tokens after filtering stop words
    def compare_apply(pair: Tuple[List[str], List[str]]) -> float:
        if _pair_has_any_null(pair):
            LOGGER.debug(
                "Can't compare, the pair contains null values: %s", pair
            )
            return np.nan

        # First clean the pair a bit: lowercase everything, split on
        # whitespace, and flatten the result into a single list
        pair = [
            self._flatten([el.lower().split() for el in p]) for p in pair
        ]

        s_item, t_item = pair

        # Finally, convert to sets
        s_item = set(s_item)
        t_item = set(t_item)

        if self.stop_words:
            s_item -= self.stop_words
            t_item -= self.stop_words

        min_length = min(len(s_item), len(t_item))
        n_shared_items = len(s_item & t_item)

        # Prevent division by 0
        if min_length != 0:
            return n_shared_items / min_length
        else:
            return np.nan

    return fillna(concatenated.apply(compare_apply), self.missing_value)
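# A minimal sketch of the token comparison above (illustration only, the stop
# word set is hypothetical): stop words are removed before measuring the
# overlap against the smaller token set.
stop_words = {'the', 'of'}
source_tokens = {'the', 'rolling', 'stones'} - stop_words   # {'rolling', 'stones'}
target_tokens = {'rolling', 'stones', 'band'} - stop_words
shared = len(source_tokens & target_tokens)                  # 2
smaller = min(len(source_tokens), len(target_tokens))        # 2
print(shared / smaller)  # -> 1.0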
def _compute_vectorized(self, source_column, target_column):
    if self.algorithm == 'levenshtein':
        algorithm = self.levenshtein_similarity
    elif self.algorithm == 'cosine':
        algorithm = self.cosine_similarity
    else:
        err_msg = (
            f'Bad string similarity algorithm: {self.algorithm}. '
            f"Please use one of ('levenshtein', 'cosine')"
        )
        LOGGER.critical(err_msg)
        raise ValueError(err_msg)

    compared = algorithm(source_column, target_column)
    compared_filled = fillna(compared, self.missing_value)

    if self.threshold is None:
        return compared_filled

    return (compared_filled >= self.threshold).astype(np.float64)
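# A minimal sketch of the thresholding step above (the scores are made up):
# similarities at or above the threshold collapse to 1.0, the rest to 0.0.
import numpy as np
import pandas as pd

scores = pd.Series([0.2, 0.85, 0.7])
threshold = 0.7
print((scores >= threshold).astype(np.float64))  # -> 0.0, 1.0, 1.0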
def _compute_vectorized(self, source_column, target_column):
    concatenated = pd.Series(list(zip(source_column, target_column)))

    def exact_apply(pair):
        if _pair_has_any_null(pair):
            LOGGER.debug(
                "Can't compare, the pair contains null values: %s", pair
            )
            return np.nan

        scores = []
        for source in pair[0]:
            for target in pair[1]:
                if pd.isna(source) or pd.isna(target):
                    scores.append(self.missing_value)
                    continue
                if source == target:
                    scores.append(self.match_value)
                else:
                    scores.append(self.non_match_value)

        # Guard against empty value lists: nothing was compared
        if not scores:
            return np.nan

        return max(scores)

    return fillna(concatenated.apply(exact_apply), self.missing_value)
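# A hypothetical illustration of the cross-product comparison above (a sketch,
# with made-up match/non-match values): a single exact match between any
# source and target value is enough to return the match value.
match_value, non_match_value = 1.0, 0.0
sources, targets = ['douglas adams', 'd. adams'], ['douglas adams']
scores = [
    match_value if s == t else non_match_value for s in sources for t in targets
]
print(max(scores))  # -> 1.0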