Example #1
def cli(catalog, entity, rule, upload, sandbox, dir_io, dates):
    """Run a rule-based linker.

    Available rules:

    'perfect' = perfect match on names

    'links' = similar match on link tokens

    'names' = similar match on name tokens

    Run all of them by default.
    """
    LOGGER.info("Running baseline '%s' rule over %s %s ...", rule, catalog,
                entity)

    # No need for the return value: only the output file will be consumed
    build_wikidata('classification', catalog, entity, dir_io)

    _run(catalog, entity, rule, dates, upload, sandbox, dir_io)

    LOGGER.info("Baseline '%s' rule over %s %s completed", rule, catalog,
                entity)
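
The snippet shows only the command body; the decorators that bind its parameters are not part of the example. Below is a minimal sketch of how such a command could be declared, assuming the CLI is built with Click (as the `cli` signature and docstring suggest); the argument choices, option names, and defaults are illustrative assumptions, not the project's actual declaration.

import logging

import click

LOGGER = logging.getLogger(__name__)


@click.command()
@click.argument('catalog', type=click.Choice(('discogs', 'imdb', 'musicbrainz')))
@click.argument('entity')
@click.option('--rule', default='all',
              type=click.Choice(('perfect', 'links', 'names', 'all')),
              help="Rule to run; 'all' runs every rule (hypothetical default)")
@click.option('--upload/--no-upload', default=False,
              help='Whether to upload the resulting links (hypothetical)')
@click.option('--sandbox/--no-sandbox', default=False,
              help='Whether to edit a sandbox item instead (hypothetical)')
@click.option('--dates/--no-dates', default=True,
              help='Whether to check dates in the rules (hypothetical)')
@click.option('--dir-io', type=click.Path(file_okay=False), default='.',
              help='Input/output directory (hypothetical default)')
def cli(catalog, entity, rule, upload, sandbox, dir_io, dates):
    # Body as in Example #1 above
    ...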
Example #2
def _classification_set_generator(
    catalog, entity, dir_io
) -> Iterator[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
    goal = 'classification'

    # Wikidata side
    wd_reader = workflow.build_wikidata(goal, catalog, entity, dir_io)
    wd_generator = workflow.preprocess_wikidata(goal, wd_reader)

    for i, wd_chunk in enumerate(wd_generator, 1):
        # Collect samples via queries to the target DB
        samples = blocking.find_samples(
            goal,
            catalog,
            wd_chunk[keys.NAME_TOKENS],
            i,
            target_database.get_main_entity(catalog, entity),
            dir_io,
        )

        # Build target chunk from samples
        target_reader = workflow.build_target(
            goal, catalog, entity, set(samples.get_level_values(keys.TID)))

        # Preprocess target chunk
        target_chunk = workflow.preprocess_target(goal, target_reader)

        # Extract features
        features_path = os.path.join(
            dir_io, constants.FEATURES.format(catalog, entity, goal, i)
        )
        feature_vectors = workflow.extract_features(
            samples, wd_chunk, target_chunk, features_path
        )

        yield wd_chunk, target_chunk, feature_vectors

        LOGGER.info('Chunk %d classified', i)
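
Because the generator yields one (Wikidata chunk, target chunk, feature vectors) triple at a time, a caller can classify chunk by chunk without ever materializing the full classification set. A minimal sketch of such a consumer, assuming a recordlinkage-style classifier persisted with joblib (both assumptions, mirroring Example #4 below):

import joblib  # assumption: the trained model was saved with joblib


def _classify_in_chunks(model_path, catalog, entity, dir_io, threshold=0.5):
    # Hypothetical consumer: load the model once, then score each chunk
    # as soon as its feature vectors become available
    classifier = joblib.load(model_path)
    for _wd_chunk, _target_chunk, feature_vectors in _classification_set_generator(
        catalog, entity, dir_io
    ):
        # recordlinkage-style probability scores, one per candidate pair
        predictions = classifier.prob(feature_vectors)
        # Keep only candidate links above the confidence threshold
        yield predictions[predictions >= threshold]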
Example #3
def build_training_set(
    catalog: str, entity: str, dir_io: str
) -> Tuple[pd.DataFrame, pd.MultiIndex]:
    """Build a training set.

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param dir_io: input/output directory where working files
      will be read/written
    :return: the feature vectors and positive samples pair.
      Features are computed by comparing *(QID, catalog ID)* pairs.
      Positive samples are catalog IDs available in Wikidata
    """
    goal = 'training'

    # Wikidata side
    wd_reader = workflow.build_wikidata(goal, catalog, entity, dir_io)
    wd_generator = workflow.preprocess_wikidata(goal, wd_reader)

    positive_samples, feature_vectors = None, None

    for i, wd_chunk in enumerate(wd_generator, 1):
        # Positive samples come from Wikidata
        if positive_samples is None:
            positive_samples = wd_chunk[keys.TID]
        else:
            # We concatenate the current chunk and rebind
            # `positive_samples` at each iteration,
            # instead of appending each chunk to a list
            # and concatenating once after the loop.
            # Reason: keeping many small pandas objects around
            # is less memory-efficient
            positive_samples = pd.concat([positive_samples, wd_chunk[keys.TID]])

        # All samples come from queries to the target DB
        # and include negative ones
        all_samples = blocking.find_samples(
            goal,
            catalog,
            wd_chunk[keys.NAME_TOKENS],
            i,
            target_database.get_main_entity(catalog, entity),
            dir_io,
        )

        # Build target chunk from all samples
        target_reader = workflow.build_target(
            goal, catalog, entity, set(all_samples.get_level_values(keys.TID))
        )
        # Preprocess target chunk
        target_chunk = workflow.preprocess_target(goal, target_reader)

        features_path = os.path.join(
            dir_io, constants.FEATURES.format(catalog, entity, goal, i)
        )

        # Extract features from all samples
        chunk_fv = workflow.extract_features(
            all_samples, wd_chunk, target_chunk, features_path
        )

        if feature_vectors is None:
            feature_vectors = chunk_fv
        else:
            feature_vectors = pd.concat([feature_vectors, chunk_fv], sort=False)

    # Final positive samples index
    positive_samples_index = pd.MultiIndex.from_tuples(
        zip(positive_samples.index, positive_samples),
        names=[keys.QID, keys.TID],
    )

    LOGGER.info('Built positive samples index from Wikidata')

    feature_vectors = feature_vectors.fillna(constants.FEATURE_MISSING_VALUE)

    return feature_vectors, positive_samples_index
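
The returned pair feeds directly into a supervised classifier: the feature vectors are the comparison vectors, and the positive samples index is the set of known matches. A minimal training sketch, assuming recordlinkage's Naive Bayes classifier and an illustrative output path (neither is necessarily what the project uses):

import joblib
import recordlinkage as rl

feature_vectors, positive_index = build_training_set(
    'discogs', 'musician', '/tmp/soweego'  # illustrative arguments
)

classifier = rl.NaiveBayesClassifier()
# Supervised fit: comparison vectors plus the index of known (QID, catalog ID) matches
classifier.fit(feature_vectors, positive_index)

# Persist the model so it can be loaded later, e.g. by execute() in Example #4
joblib.dump(classifier, '/tmp/soweego/discogs_musician_model.pkl')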
Example #4
def execute(
    model_path: str,
    catalog: str,
    entity: str,
    threshold: float,
    name_rule: bool,
    dir_io: str,
) -> Iterator[pd.Series]:
    """Run a supervised linker.

    1. Build the classification set relevant to the given catalog and entity
    2. Generate links between Wikidata items and catalog identifiers

    :param model_path: path to a trained model file
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param threshold: minimum confidence score for generated links.
      Those below this value are discarded.
      Must be a float between 0 and 1
    :param name_rule: whether to enable the rule on full names or not:
      if *True*, links with different full names
      are discarded after classification
    :param dir_io: input/output directory where working files
      will be read/written
    :return: the generator yielding chunks of links
    """
    goal = 'classification'
    classifier = joblib.load(model_path)

    # Wikidata side
    wd_reader = workflow.build_wikidata(goal, catalog, entity, dir_io)
    wd_generator = workflow.preprocess_wikidata(goal, wd_reader)

    for i, wd_chunk in enumerate(wd_generator, 1):
        # Collect samples via queries to the target DB
        samples = blocking.find_samples(
            goal,
            catalog,
            wd_chunk[keys.NAME_TOKENS],
            i,
            target_database.get_main_entity(catalog, entity),
            dir_io,
        )

        # Build target chunk from samples
        target_reader = workflow.build_target(
            goal, catalog, entity, set(samples.get_level_values(keys.TID))
        )

        # Preprocess target chunk
        target_chunk = workflow.preprocess_target(goal, target_reader)

        # Extract features
        features_path = os.path.join(
            dir_io, constants.FEATURES.format(catalog, entity, goal, i)
        )
        feature_vectors = workflow.extract_features(
            samples, wd_chunk, target_chunk, features_path
        )

        # The classification set must have the same feature space
        # as the training one
        _add_missing_feature_columns(classifier, feature_vectors)

        predictions = (
            # LSVM doesn't support probability scores
            classifier.predict(feature_vectors)
            if isinstance(classifier, rl.SVMClassifier)
            else classifier.prob(feature_vectors)
        )

        # Full name rule: if names differ, it's not a link
        if name_rule:
            LOGGER.info('Applying full names rule ...')
            predictions = pd.DataFrame(predictions).apply(
                _zero_when_different_names,
                axis=1,
                args=(wd_chunk, target_chunk),
            )

        # Wikidata URL rule: if the target ID has a Wikidata URL, it's a link
        if target_chunk.get(keys.URL) is not None:
            predictions = pd.DataFrame(predictions).apply(
                _one_when_wikidata_link_correct, axis=1, args=(target_chunk,)
            )

        LOGGER.info('Chunk %d classified', i)

        # Filter by threshold
        above_threshold = predictions[predictions >= threshold]

        # Remove duplicates
        yield above_threshold[~above_threshold.index.duplicated()]
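
A caller only needs to drain the generator and persist whatever survived the confidence threshold and deduplication. A minimal usage sketch, with illustrative paths and threshold (both assumptions):

import pandas as pd

link_chunks = execute(
    '/tmp/soweego/discogs_musician_model.pkl',  # illustrative model path
    'discogs',
    'musician',
    threshold=0.7,
    name_rule=True,
    dir_io='/tmp/soweego',
)

# Each yielded pandas Series maps a (QID, catalog ID) pair to a confidence score
links = pd.concat(list(link_chunks))
links.to_csv('/tmp/soweego/discogs_musician_links.csv', header=False)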