Example #1
def _add_url_features(feature_extractor):
    url_column, url_tokens_column = keys.URL, keys.URL_TOKENS

    # Exact match on URLs
    feature_extractor.add(
        features.ExactMatch(url_column,
                            url_column,
                            label=f'{url_column}_exact'))

    # Match on URL tokens
    feature_extractor.add(
        features.SharedTokensPlus(
            url_tokens_column,
            url_tokens_column,
            label=f'{url_tokens_column}_shared',
            stop_words=text_utils.STOPWORDS_URL_TOKENS,
        ))
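For context, this helper is meant to be registered on a recordlinkage Compare object, as Example #2 does when both datasets carry a URL column. A minimal sketch of that wiring, assuming the usual import aliases (the imports themselves are not shown in the snippet):

from multiprocessing import cpu_count

import recordlinkage as rl

# Hypothetical wiring, mirroring Example #2: build the shared extractor,
# then let the helper register the URL comparison features on it.
feature_extractor = rl.Compare(n_jobs=cpu_count())
_add_url_features(feature_extractor)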
Example #2
def extract_features(
    candidate_pairs: pd.MultiIndex,
    wikidata: pd.DataFrame,
    target: pd.DataFrame,
    path_io: str,
) -> pd.DataFrame:
    """Extract feature vectors by comparing pairs of
    *(Wikidata, target catalog)* records.

    **Main features:**

    - exact match on full names and URLs
    - match on tokenized names, URLs, and genres
    - `Levenshtein distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_
      on name tokens
    - `string kernel <https://en.wikipedia.org/wiki/String_kernel>`_
      similarity on name tokens
    - weighted intersection on name tokens
    - match on dates by maximum shared precision
    - `cosine similarity <https://en.wikipedia.org/wiki/Cosine_similarity>`_
      on textual descriptions
    - match on occupation QIDs

    See :mod:`features` for more details.

    This function uses multithreaded parallel processing.

    :param candidate_pairs: an index of *(QID, target ID)* pairs
      that should undergo comparison
    :param wikidata: a preprocessed Wikidata dataset (typically a chunk)
    :param target: a preprocessed target catalog dataset (typically a chunk)
    :param path_io: input/output path to an extracted feature file
    :return: the feature vectors dataset
    """
    LOGGER.info('Extracting features ...')

    # Return cached features early, for development purposes
    if os.path.isfile(path_io):
        LOGGER.info("Will reuse existing features: '%s'", path_io)
        return pd.read_pickle(path_io)

    def in_both_datasets(col: str) -> bool:
        return (col in wikidata.columns) and (col in target.columns)

    feature_extractor = rl.Compare(n_jobs=cpu_count())

    # Exact match on full name
    name_column = keys.NAME
    if in_both_datasets(name_column):
        feature_extractor.add(
            features.ExactMatch(name_column,
                                name_column,
                                label=f'{name_column}_exact'))

    # URL features
    if in_both_datasets(keys.URL):
        _add_url_features(feature_extractor)

    # Date features
    _add_date_features(feature_extractor, in_both_datasets)

    # Name tokens features
    if in_both_datasets(keys.NAME_TOKENS):
        _add_name_tokens_features(feature_extractor)

    # Cosine similarity on description
    description_column = keys.DESCRIPTION
    if in_both_datasets(description_column):
        feature_extractor.add(
            features.SimilarStrings(
                description_column,
                description_column,
                algorithm='cosine',
                analyzer='soweego',
                label=f'{description_column}_cosine',
            ))

    # Match on occupation QIDs
    occupations_column = keys.OCCUPATIONS
    if in_both_datasets(occupations_column):
        feature_extractor.add(
            features.SharedOccupations(
                occupations_column,
                occupations_column,
                label=f'{occupations_column}_shared',
            ))

    # Match on tokenized genres
    genres_column = keys.GENRES
    if in_both_datasets(genres_column):
        feature_extractor.add(
            features.SharedTokens(
                genres_column,
                genres_column,
                label=f'{genres_column}_tokens_shared',
            ))

    feature_vectors = feature_extractor.compute(candidate_pairs, wikidata,
                                                target)
    # Drop duplicate pairs
    feature_vectors = feature_vectors[~feature_vectors.index.duplicated()]

    os.makedirs(os.path.dirname(path_io), exist_ok=True)
    pd.to_pickle(feature_vectors, path_io)
    LOGGER.info("Features dumped to '%s'", path_io)

    LOGGER.info('Feature extraction done')

    return feature_vectors
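A minimal invocation sketch, assuming preprocessed Wikidata and target chunks are already available as DataFrames; the variable names, index labels, and output path below are illustrative assumptions, not part of the snippet:

import pandas as pd

# Hypothetical candidate pairs: a (QID, target ID) MultiIndex
candidate_pairs = pd.MultiIndex.from_tuples(
    [('Q42', 'target_001'), ('Q42', 'target_002')],
    names=['qid', 'tid'],
)

# wikidata_chunk and target_chunk are assumed preprocessed DataFrames
# sharing columns such as names, URLs, and descriptions
feature_vectors = extract_features(
    candidate_pairs,
    wikidata_chunk,
    target_chunk,
    '/tmp/features.pkl',
)

Because the function returns early when the file at path_io already exists, rerunning it with the same path reuses the pickled feature vectors instead of recomputing them.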