def _run_for_one(classifier, catalog, entity, threshold, name_rule,
                 upload, sandbox, dir_io):
    """Run the `linking` procedure for a single classifier."""
    actual_classifier = constants.CLASSIFIERS[classifier]

    model_path, result_path = _handle_io(
        actual_classifier, catalog, entity, dir_io)
    # Exit if the model file doesn't exist
    if model_path is None:
        sys.exit(1)

    rl.set_option(*constants.CLASSIFICATION_RETURN_SERIES)

    for i, chunk in enumerate(
            execute(model_path, catalog, entity, threshold, name_rule,
                    dir_io)):
        chunk.to_csv(result_path, mode='a', header=False)

        if upload:
            _upload(chunk, i, catalog, entity, sandbox)

    # Free memory in case of neural networks:
    # can be done only after classification
    if actual_classifier in (
        keys.SINGLE_LAYER_PERCEPTRON,
        keys.MULTI_LAYER_PERCEPTRON,
    ):
        K.clear_session()  # Clear the TensorFlow graph
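
# A minimal sketch of the memory-freeing pattern above, assuming a toy
# Keras model instead of soweego's trained linkers. `K.clear_session()`
# destroys the current TensorFlow graph, so it must run only after all
# predictions have been collected (e.g. as plain NumPy arrays).
import numpy as np
from tensorflow.keras import Input
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential


def _sketch_predict_then_clear(feature_vectors: np.ndarray) -> np.ndarray:
    # Hypothetical single-layer perceptron, for illustration only
    model = Sequential([
        Input(shape=(feature_vectors.shape[1],)),
        Dense(1, activation='sigmoid'),
    ])
    predictions = model.predict(feature_vectors)
    K.clear_session()  # Safe now: `model` is no longer needed
    return predictions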
def cli(ctx, classifier, catalog, entity, k_folds, single, nested, metric,
        dir_io):
    """Evaluate the performance of a supervised linker.

    By default, run 5-fold cross-validation and
    return averaged performance scores.
    """
    kwargs = utils.handle_extra_cli_args(ctx.args)
    if kwargs is None:
        sys.exit(1)

    rl.set_option(*constants.CLASSIFICATION_RETURN_INDEX)

    performance_out, predictions_out = _build_output_paths(
        catalog, entity, classifier, dir_io)

    # -n, --nested
    if nested:
        _run_nested(
            classifier,
            catalog,
            entity,
            k_folds,
            metric,
            kwargs,
            performance_out,
            dir_io,
        )
    # -s, --single
    elif single:
        _run_single(
            classifier,
            catalog,
            entity,
            k_folds,
            kwargs,
            performance_out,
            predictions_out,
            dir_io,
        )
    # Default: averaged evaluation over k folds
    else:
        _run_average(
            classifier,
            catalog,
            entity,
            k_folds,
            kwargs,
            performance_out,
            predictions_out,
            dir_io,
        )
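
# A minimal sketch of the default behaviour above (averaged k-fold
# cross-validation), using scikit-learn with a toy classifier rather
# than soweego's own linkers; all names below are illustrative.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate


def _sketch_average_evaluation(features, labels, k_folds=5):
    scores = cross_validate(
        LogisticRegression(),
        features,
        labels,
        cv=k_folds,
        scoring=('precision', 'recall', 'f1'),
    )
    # Average each test score over the k folds
    return {
        metric: float(np.mean(values))
        for metric, values in scores.items()
        if metric.startswith('test_')
    }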
def cli(classifier, catalog, entity, threshold, name_rule, upload, sandbox,
        dir_io):
    """Run a supervised linker.

    Build the classification set relevant to the given catalog and entity,
    then generate links between Wikidata items and catalog identifiers.

    Output a gzipped CSV file, format: QID,catalog_ID,confidence_score

    You can pass the '-u' flag to upload the output to Wikidata.

    A trained model must exist for the given classifier, catalog, entity.
    To train one, use:

    $ python -m soweego linker train
    """
    actual_classifier = constants.CLASSIFIERS[classifier]

    model_path, result_path = _handle_io(
        actual_classifier, catalog, entity, dir_io)
    # Exit if the model file doesn't exist
    if model_path is None:
        sys.exit(1)

    rl.set_option(*constants.CLASSIFICATION_RETURN_SERIES)

    for i, chunk in enumerate(
            execute(model_path, catalog, entity, threshold, name_rule,
                    dir_io)):
        chunk.to_csv(result_path, mode='a', header=False)

        if upload:
            _upload(chunk, i, catalog, entity, sandbox)

    # Free memory in case of neural networks:
    # can be done only after classification
    if actual_classifier in (
        keys.SINGLE_LAYER_PERCEPTRON,
        keys.MULTI_LAYER_PERCEPTRON,
        keys.VOTING_CLASSIFIER,
        keys.GATED_CLASSIFIER,
        keys.STACKED_CLASSIFIER,
    ):
        K.clear_session()  # Clear the TensorFlow graph

    LOGGER.info('Linking completed')
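
# A minimal sketch of the chunked output pattern used above: every
# predictions chunk gets appended to the same CSV file, and pandas
# infers gzip compression from a '.csv.gz' extension, matching the
# gzipped output promised in the docstring. The chunk source is an
# assumed generator of pd.Series indexed by (QID, catalog_ID) pairs.
def _sketch_append_chunks(chunks, result_path='results.csv.gz'):
    for chunk in chunks:
        # mode='a' appends, so earlier chunks are never overwritten
        chunk.to_csv(result_path, mode='a', header=False)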
def _run_for_all(catalog, entity, threshold, name_rule, upload, sandbox,
                 dir_io, join_method):
    """Run the `linking` procedure with all available classifiers,
    then join the results using `join_method`.
    """
    assert join_method[0] in constants.SC_AVAILABLE_JOIN, (
        'The provided join method needs to be one of: '
        + str(constants.SC_AVAILABLE_JOIN))
    assert join_method[1] in constants.SC_AVAILABLE_COMBINE, (
        'The provided combine method needs to be one of: '
        + str(constants.SC_AVAILABLE_COMBINE))

    # Ensure that models for all classifiers exist,
    # and directly get the model and result paths
    available_classifiers = []
    for classifier_name in set(constants.CLASSIFIERS.values()):
        model_path, result_path = _handle_io(
            classifier_name, catalog, entity, dir_io)
        # Exit if the model file doesn't exist
        if model_path is None:
            sys.exit(1)

        LOGGER.debug('Loading %s classifier ...', classifier_name)
        available_classifiers.append(
            (classifier_name, joblib.load(model_path), result_path))

    rl.set_option(*constants.CLASSIFICATION_RETURN_SERIES)

    for (
        wd_chunk,
        target_chunk,
        feature_vectors,
    ) in _classification_set_generator(catalog, entity, dir_io):
        # Predict the current chunk with all classifiers
        for classifier_name, classifier, result_path in available_classifiers:
            LOGGER.info('Classifying chunk with classifier: %s',
                        classifier_name)

            # The classification set must have the same feature space
            # as the training one
            _add_missing_feature_columns(classifier, feature_vectors)

            predictions = (
                # LSVM doesn't support probability scores
                classifier.predict(feature_vectors)
                if isinstance(classifier, rl.SVMClassifier)
                else classifier.prob(feature_vectors))

            predictions = _apply_linking_rules(
                name_rule, predictions, target_chunk, wd_chunk)

            # The threshold will be applied later, after joining
            _get_unique_predictions_above_threshold(
                predictions, 0.0).to_csv(result_path, mode='a', header=False)

    # Once we have all the classification sets,
    # we can proceed to mix them as desired
    all_results = []
    for _, _, result_path in available_classifiers:
        all_results.append(
            pd.read_csv(result_path, header=None,
                        names=['qid', 'tid', 'prediction'])
            .set_index(['qid', 'tid']))

    LOGGER.info(
        "Joining the results of the classifications using the '%s' method",
        join_method,
    )

    how_to_join, how_to_rem_duplicates = join_method

    # Join the dataframes using the requested method ...
    merged_results: pd.DataFrame
    if how_to_join == constants.SC_UNION:
        merged_results = ensembles.join_dataframes_by_union(all_results)
    elif how_to_join == constants.SC_INTERSECTION:
        merged_results = ensembles.join_dataframes_by_intersection(
            all_results)

    # ... then deal with duplicates. This step also removes
    # entries under the specified threshold
    if how_to_rem_duplicates == constants.SC_AVERAGE:
        merged_results = ensembles.remove_duplicates_by_averaging(
            merged_results, threshold)
    elif how_to_rem_duplicates == constants.SC_VOTING:
        merged_results = ensembles.remove_duplicates_by_majority_vote(
            merged_results, threshold)

    merged_results = merged_results['prediction']  # Get a pd.Series

    result_path = os.path.join(
        dir_io,
        constants.LINKER_RESULT_JOINED.format(
            catalog, entity, how_to_join, how_to_rem_duplicates),
    )

    # Delete an existing result file, otherwise
    # the current output would be appended to it
    if os.path.isfile(result_path):
        LOGGER.warning(
            "Will delete old output file found at '%s' ...", result_path)
        os.remove(result_path)

    merged_results.to_csv(result_path, mode='a', header=False)

    if upload:
        _upload(merged_results, 0, catalog, entity, sandbox)

    K.clear_session()  # Clear the TensorFlow graph
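
# A minimal sketch of what the `ensembles` helpers above plausibly do,
# written with plain pandas under assumed data shapes (DataFrames with
# a 'prediction' column, indexed by ('qid', 'tid')); this is an
# illustration, not the actual soweego implementation.
import functools

import pandas as pd


def _sketch_join_by_union(results):
    # Union: stack all predictions, keeping every (qid, tid) pair
    return pd.concat(results)


def _sketch_join_by_intersection(results):
    # Intersection: keep (qid, tid) pairs present in every result set
    common = functools.reduce(
        lambda left, right: left.intersection(right),
        (result.index for result in results))
    return pd.concat(result.loc[common] for result in results)


def _sketch_remove_duplicates_by_averaging(merged, threshold):
    # Average the confidence scores of duplicate (qid, tid) pairs,
    # then drop everything below the threshold
    averaged = merged.groupby(level=['qid', 'tid']).mean()
    return averaged[averaged['prediction'] >= threshold]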