Example no. 1
def _run_for_one(classifier, catalog, entity, threshold, name_rule, upload,
                 sandbox, dir_io):
    """
    Run the `linking` procedure with a single classifier.
    """

    actual_classifier = constants.CLASSIFIERS[classifier]

    model_path, result_path = _handle_io(actual_classifier, catalog, entity,
                                         dir_io)
    # Exit if the model file doesn't exist
    if model_path is None:
        sys.exit(1)

    rl.set_option(*constants.CLASSIFICATION_RETURN_SERIES)

    for i, chunk in enumerate(
            execute(model_path, catalog, entity, threshold, name_rule,
                    dir_io)):
        chunk.to_csv(result_path, mode='a', header=False)

        if upload:
            _upload(chunk, i, catalog, entity, sandbox)

    # Free memory in case of neural networks:
    # can be done only after classification
    if actual_classifier in (
            keys.SINGLE_LAYER_PERCEPTRON,
            keys.MULTI_LAYER_PERCEPTRON,
    ):
        K.clear_session()  # Clear the TensorFlow graph
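
The two perceptron classifiers are Keras models, so the TensorFlow graph they live in must be released once classification is over. A minimal sketch of that cleanup, assuming the `K` alias comes from TensorFlow's Keras backend (the toy model below is invented for illustration):

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential

# Stand-in for a loaded perceptron classifier
model = Sequential([Input(shape=(10,)), Dense(1, activation='sigmoid')])
model.compile(optimizer='adam', loss='binary_crossentropy')

# ... classification would happen here ...

# Release the TensorFlow graph and its memory once classification is done;
# clearing the session earlier would invalidate the loaded model
K.clear_session()
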
Example no. 2
def cli(ctx, classifier, catalog, entity, k_folds, single, nested, metric,
        dir_io):
    """Evaluate the performance of a supervised linker.

    By default, run 5-fold cross-validation and
    return averaged performance scores.
    """
    kwargs = utils.handle_extra_cli_args(ctx.args)
    if kwargs is None:
        sys.exit(1)

    rl.set_option(*constants.CLASSIFICATION_RETURN_INDEX)

    performance_out, predictions_out = _build_output_paths(
        catalog, entity, classifier, dir_io)

    # -n, --nested
    if nested:
        _run_nested(
            classifier,
            catalog,
            entity,
            k_folds,
            metric,
            kwargs,
            performance_out,
            dir_io,
        )

    # -s, --single
    elif single:
        _run_single(
            classifier,
            catalog,
            entity,
            k_folds,
            kwargs,
            performance_out,
            predictions_out,
            dir_io,
        )

    else:
        # Default: average evaluation over k-fold
        _run_average(
            classifier,
            catalog,
            entity,
            k_folds,
            kwargs,
            performance_out,
            predictions_out,
            dir_io,
        )
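
When neither `-n, --nested` nor `-s, --single` is passed, performance is averaged over k folds. `_run_average` is project-internal and not shown here; purely as an illustration of that default behaviour, an averaged k-fold evaluation can be sketched with scikit-learn (classifier, data, and metric are placeholders):

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# Toy feature vectors and labels standing in for the real classification set
X, y = make_classification(n_samples=200, n_features=10, random_state=0)

# 5-fold cross-validation with scores averaged over the folds,
# mirroring the default described in the docstring above
scores = cross_val_score(GaussianNB(), X, y, cv=5, scoring='f1')
print('Mean F1 over 5 folds:', scores.mean())
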
Example no. 3
def cli(classifier, catalog, entity, threshold, name_rule, upload, sandbox,
        dir_io):
    """Run a supervised linker.

    Build the classification set relevant to the given catalog and entity,
    then generate links between Wikidata items and catalog identifiers.

    Output a gzipped CSV file, format: QID,catalog_ID,confidence_score

    You can pass the '-u' flag to upload the output to Wikidata.

    A trained model must exist for the given classifier, catalog, and entity.
    To train one, use:

    $ python -m soweego linker train
    """
    actual_classifier = constants.CLASSIFIERS[classifier]

    model_path, result_path = _handle_io(actual_classifier, catalog, entity,
                                         dir_io)
    # Exit if the model file doesn't exist
    if model_path is None:
        sys.exit(1)

    rl.set_option(*constants.CLASSIFICATION_RETURN_SERIES)

    for i, chunk in enumerate(
            execute(model_path, catalog, entity, threshold, name_rule,
                    dir_io)):
        chunk.to_csv(result_path, mode='a', header=False)

        if upload:
            _upload(chunk, i, catalog, entity, sandbox)

    # Free memory in case of neural networks:
    # can be done only after classification
    if actual_classifier in (
            keys.SINGLE_LAYER_PERCEPTRON,
            keys.MULTI_LAYER_PERCEPTRON,
            keys.VOTING_CLASSIFIER,
            keys.GATED_CLASSIFIER,
            keys.STACKED_CLASSIFIER,
    ):
        K.clear_session()  # Clear the TensorFlow graph

    LOGGER.info('Linking completed')
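
`execute` yields predictions in chunks, and each chunk is appended to the same CSV with `mode='a'` and no header, so the full result set never has to fit in memory. A minimal, self-contained sketch of that pattern with pandas (file name and data are invented):

import os

import pandas as pd

OUTPUT = 'predictions.csv'  # hypothetical output path

# Remove a stale file first, otherwise new chunks would be appended to old results
if os.path.isfile(OUTPUT):
    os.remove(OUTPUT)

def prediction_chunks():
    # Stand-in for the real chunk generator (`execute` above)
    for start in (0, 3):
        yield pd.Series(
            [0.9, 0.5, 0.7],
            index=pd.MultiIndex.from_tuples(
                [(f'Q{start + i}', f'T{start + i}') for i in range(3)],
                names=['qid', 'tid'],
            ),
            name='prediction',
        )

for chunk in prediction_chunks():
    # Append each chunk; header=False keeps the file a plain QID,catalog_ID,score dump
    chunk.to_csv(OUTPUT, mode='a', header=False)
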
Example no. 4
def _run_for_all(catalog, entity, threshold, name_rule, upload, sandbox,
                 dir_io, join_method):
    """
    Run the `linking` procedure with all available classifiers,
    then join the results according to `join_method`.
    """
    assert join_method[0] in constants.SC_AVAILABLE_JOIN, (
        'The provided join method needs to be one of: ' +
        str(constants.SC_AVAILABLE_JOIN))

    assert join_method[1] in constants.SC_AVAILABLE_COMBINE, (
        'The provided combine method needs to be one of: ' +
        str(constants.SC_AVAILABLE_COMBINE))

    # Ensure that a model exists for every classifier,
    # and gather the model and result paths
    available_classifiers = []
    for classifier_name in list(set(constants.CLASSIFIERS.values())):
        model_path, result_path = _handle_io(classifier_name, catalog, entity,
                                             dir_io)
        # Exit if the model file doesn't exist
        if model_path is None:
            sys.exit(1)

        LOGGER.debug('Loading %s classifier ...', classifier_name)

        available_classifiers.append(
            (classifier_name, joblib.load(model_path), result_path))

    rl.set_option(*constants.CLASSIFICATION_RETURN_SERIES)

    for (
            wd_chunk,
            target_chunk,
            feature_vectors,
    ) in _classification_set_generator(catalog, entity, dir_io):
        # predict the current chunk with all classifiers
        for classifier_name, classifier, result_path in available_classifiers:
            LOGGER.info('Classifying chunk with classifier: %s',
                        classifier_name)

            # The classification set must have the same feature space
            # as the training one
            _add_missing_feature_columns(classifier, feature_vectors)

            predictions = (
                # LSVM doesn't support probability scores
                classifier.predict(feature_vectors) if isinstance(
                    classifier, rl.SVMClassifier) else
                classifier.prob(feature_vectors))

            predictions = _apply_linking_rules(name_rule, predictions,
                                               target_chunk, wd_chunk)

            # Threshold will be applied later, after joining
            (_get_unique_predictions_above_threshold(predictions,
                                                     0.0).to_csv(result_path,
                                                                 mode='a',
                                                                 header=False))

    # Once we have all classifiers' predictions,
    # we can join them as requested
    all_results = []
    for _, _, result_path in available_classifiers:
        all_results.append(
            (pd.read_csv(result_path,
                         header=None,
                         names=['qid', 'tid',
                                'prediction']).set_index(['qid', 'tid'])))

    LOGGER.info(
        "Joining the results of the classifications using the '%s' method",
        join_method,
    )

    how_to_join, how_to_rem_duplicates = join_method

    # Join the DataFrames with the requested method
    merged_results: pd.DataFrame
    if how_to_join == constants.SC_UNION:
        merged_results = ensembles.join_dataframes_by_union(all_results)

    elif how_to_join == constants.SC_INTERSECTION:
        merged_results = ensembles.join_dataframes_by_intersection(all_results)

    # Then resolve duplicates. This step also drops entries below the
    # specified threshold
    if how_to_rem_duplicates == constants.SC_AVERAGE:
        merged_results = ensembles.remove_duplicates_by_averaging(
            merged_results, threshold)

    elif how_to_rem_duplicates == constants.SC_VOTING:
        merged_results = ensembles.remove_duplicates_by_majority_vote(
            merged_results, threshold)

    merged_results = merged_results['prediction']  # get a pd.Series

    result_path = os.path.join(
        dir_io,
        constants.LINKER_RESULT_JOINED.format(catalog, entity, how_to_join,
                                              how_to_rem_duplicates),
    )

    # Delete existing result file,
    # otherwise the current output would be appended to it
    if os.path.isfile(result_path):
        LOGGER.warning("Will delete old output file found at '%s' ...",
                       result_path)
        os.remove(result_path)

    merged_results.to_csv(result_path, mode='a', header=False)

    if upload:
        _upload(merged_results, 0, catalog, entity, sandbox)

    K.clear_session()  # Clear the TensorFlow graph
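
`join_dataframes_by_union`, `join_dataframes_by_intersection`, and the duplicate-removal helpers live in the project's `ensembles` module and are not shown here. As a rough sketch, under the assumption that they operate on the (qid, tid)-indexed frames built above, intersection followed by averaging could look roughly like this (scores and threshold are invented):

import pandas as pd

def toy_predictions(rows):
    # Build a (qid, tid)-indexed frame like the per-classifier results read back above
    index = pd.MultiIndex.from_tuples([pair for pair, _ in rows],
                                      names=['qid', 'tid'])
    return pd.DataFrame({'prediction': [score for _, score in rows]},
                        index=index)

svm_results = toy_predictions([(('Q1', 'T1'), 0.9), (('Q2', 'T2'), 0.4)])
mlp_results = toy_predictions([(('Q1', 'T1'), 0.7), (('Q3', 'T3'), 0.8)])

# Union: stack every classifier's predictions, duplicates included
union = pd.concat([svm_results, mlp_results])

# Intersection: keep only the (qid, tid) pairs predicted by every classifier
shared = svm_results.index.intersection(mlp_results.index)
intersection = union[union.index.isin(shared)]

# Resolve duplicates by averaging, then apply the confidence threshold
averaged = intersection.groupby(level=['qid', 'tid'])['prediction'].mean()
final = averaged[averaged >= 0.5]
print(final)
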