def _add_name_tokens_features(feature_extractor):
    name_tokens_column = keys.NAME_TOKENS

    # Levenshtein distance on name tokens
    feature_extractor.add(
        features.SimilarStrings(
            name_tokens_column,
            name_tokens_column,
            label=f'{name_tokens_column}_levenshtein',
        ))

    # String kernel similarity on name tokens
    feature_extractor.add(
        features.SimilarStrings(
            name_tokens_column,
            name_tokens_column,
            algorithm='cosine',
            analyzer='char_wb',
            label=f'{name_tokens_column}_string_kernel_cosine',
        ))

    # Weighted intersection of name tokens
    feature_extractor.add(
        features.SharedTokens(
            name_tokens_column,
            name_tokens_column,
            label=f'{name_tokens_column}_shared',
        ))
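
# A rough illustration of how the three comparisons above differ, assuming
# two hypothetical token sequences. This is a sketch for intuition only, not
# pipeline code:
#
#   Wikidata name tokens: ['john', 'ronald', 'reuel', 'tolkien']
#   target name tokens:   ['j', 'r', 'r', 'tolkien']
#
# - the Levenshtein feature scores the edit distance between the token
#   strings, so abbreviated forms are penalized;
# - the cosine similarity over 'char_wb' character n-grams (presumably
#   scikit-learn's word-boundary-aware analyzer) acts as a string kernel
#   and is more forgiving with initials and partial tokens;
# - SharedTokens scores the weighted intersection of the two token sets,
#   which here is just {'tolkien'}.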
def extract_features(
    candidate_pairs: pd.MultiIndex,
    wikidata: pd.DataFrame,
    target: pd.DataFrame,
    path_io: str,
) -> pd.DataFrame:
    """Extract feature vectors by comparing pairs of
    *(Wikidata, target catalog)* records.

    **Main features:**

    - exact match on full names and URLs
    - match on tokenized names, URLs, and genres
    - `Levenshtein distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_
      on name tokens
    - `string kernel <https://en.wikipedia.org/wiki/String_kernel>`_ similarity
      on name tokens
    - weighted intersection on name tokens
    - match on dates by maximum shared precision
    - `cosine similarity <https://en.wikipedia.org/wiki/Cosine_similarity>`_
      on textual descriptions
    - match on occupation QIDs

    See :mod:`features` for more details.

    This function uses multithreaded parallel processing.

    :param candidate_pairs: an index of *(QID, target ID)* pairs
      that should undergo comparison
    :param wikidata: a preprocessed Wikidata dataset (typically a chunk)
    :param target: a preprocessed target catalog dataset (typically a chunk)
    :param path_io: input/output path to an extracted feature file
    :return: the feature vectors dataset
    """
    LOGGER.info('Extracting features ...')

    # Early return cached features, for development purposes
    if os.path.isfile(path_io):
        LOGGER.info("Will reuse existing features: '%s'", path_io)
        return pd.read_pickle(path_io)

    def in_both_datasets(col: str) -> bool:
        return (col in wikidata.columns) and (col in target.columns)

    feature_extractor = rl.Compare(n_jobs=cpu_count())

    # Exact match on full name
    name_column = keys.NAME
    if in_both_datasets(name_column):
        feature_extractor.add(
            features.ExactMatch(
                name_column, name_column, label=f'{name_column}_exact'))

    # URL features
    if in_both_datasets(keys.URL):
        _add_url_features(feature_extractor)

    # Date features
    _add_date_features(feature_extractor, in_both_datasets)

    # Name tokens features
    if in_both_datasets(keys.NAME_TOKENS):
        _add_name_tokens_features(feature_extractor)

    # Cosine similarity on description
    description_column = keys.DESCRIPTION
    if in_both_datasets(description_column):
        feature_extractor.add(
            features.SimilarStrings(
                description_column,
                description_column,
                algorithm='cosine',
                analyzer='soweego',
                label=f'{description_column}_cosine',
            ))

    # Match on occupation QIDs
    occupations_column = keys.OCCUPATIONS
    if in_both_datasets(occupations_column):
        feature_extractor.add(
            features.SharedOccupations(
                occupations_column,
                occupations_column,
                label=f'{occupations_column}_shared',
            ))

    # Match on tokenized genres
    genres_column = keys.GENRES
    if in_both_datasets(genres_column):
        feature_extractor.add(
            features.SharedTokens(
                genres_column,
                genres_column,
                label=f'{genres_column}_tokens_shared',
            ))

    feature_vectors = feature_extractor.compute(
        candidate_pairs, wikidata, target)
    # Drop duplicate pairs
    feature_vectors = feature_vectors[~feature_vectors.index.duplicated()]

    os.makedirs(os.path.dirname(path_io), exist_ok=True)
    pd.to_pickle(feature_vectors, path_io)
    LOGGER.info("Features dumped to '%s'", path_io)

    LOGGER.info('Feature extraction done')

    return feature_vectors
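
# A minimal usage sketch, assuming preprocessed Wikidata and target chunks
# plus a candidate pairs index produced by a prior blocking step. The
# concrete variable names and the output path below are hypothetical:
#
#     candidate_pairs = pd.MultiIndex.from_tuples(
#         [('Q38082', 'nm0866058'), ('Q905', 'nm0000040')],
#     )
#     vectors = extract_features(
#         candidate_pairs,
#         wikidata_chunk,   # pd.DataFrame indexed by QID
#         target_chunk,     # pd.DataFrame indexed by target ID
#         '/tmp/features.pkl',
#     )
#
# `vectors` holds one row per candidate pair and one column per feature
# label added above (e.g., f'{keys.NAME}_exact'). Re-running with the same
# `path_io` short-circuits and reloads the pickled vectors.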