def _process_file(filename, base_dir=None):
  """Preprocesses supplied data file."""
  if not base_dir:
    base_dir = FLAGS.sigtyp_dir
  full_path = os.path.join(base_dir, filename + ".csv")
  _, df, data_info = sigtyp.read(
      full_path, categorical_as_ints=FLAGS.categorical_as_ints)
  _write_dict(data_info, filename, const.DATA_INFO_FILENAME)

  # Save preprocessed data frame to a csv.
  output_file = os.path.join(FLAGS.output_dir, filename + ".csv")
  logging.info("Saving preprocessed data to \"%s\" ...", output_file)
  df.to_csv(output_file, sep="|", index=False, float_format="%g")
  return data_info
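
# A minimal usage sketch for `_process_file` (the split names below are
# illustrative assumptions; the actual callers elsewhere in this script decide
# which SIGTYP CSVs get preprocessed):
#
#   for split in ("train", "dev"):
#     data_info = _process_file(split)  # Reads "<FLAGS.sigtyp_dir>/<split>.csv".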
def main(unused_argv):
  # Check flags.
  if not FLAGS.sigtyp_dir:
    raise ValueError("Specify --sigtyp_dir for input data!")
  if not FLAGS.training_data_dir:
    raise ValueError("Specify --training_data_dir!")
  if FLAGS.prediction_mode and not FLAGS.output_sigtyp_predictions_file:
    raise ValueError("In prediction mode specify "
                     "--output_sigtyp_predictions_file!")

  # Load the test data (note: we read the vanilla test data in SIGTYP format).
  filename = os.path.join(FLAGS.sigtyp_dir, FLAGS.test_set_name + ".csv")
  vanilla_test_df, _, vanilla_data_info = sigtyp.read(
      filename, categorical_as_ints=False)
  if not len(vanilla_test_df):  # pylint: disable=g-explicit-length-test
    raise ValueError("Test dataset is empty!")
  if len(vanilla_test_df.columns) != 8:
    raise ValueError("Wrong number of columns: %d" %
                     len(vanilla_test_df.columns))

  # Run evaluation/prediction.
  features_to_predict = _features_to_predict(vanilla_test_df)
  _evaluate(vanilla_test_df.copy(), vanilla_data_info,
            _make_model(features_to_predict))
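
# Example invocation (a sketch only: the script name, paths and the test set
# name are assumptions; the flags are the ones checked in main() above):
#
#   python evaluate.py \
#     --sigtyp_dir=/path/to/sigtyp/data \
#     --training_data_dir=/path/to/preprocessed/training_data \
#     --test_set_name=test_blinded \
#     --prediction_mode \
#     --output_sigtyp_predictions_file=/tmp/predictions.tsv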
def _evaluate(test_df, test_data_info, model):
  """Evaluates the model on the supplied dataframe."""
  # Prepare the test languages. This lets the model precompute some of the
  # information that is based solely on the language context, not the features.
  logging.info("Preparing test languages ...")
  test_languages = test_df.to_dict(orient="records")  # One dict per language.
  for test_language in test_languages:
    model.prepare_target(test_language)

  # Run actual evaluation.
  mode_info = "Prediction" if FLAGS.prediction_mode else "Evaluation"
  logging.info("[%s] Running over %d languages ...", mode_info, len(test_df))
  total_num_evals = 0
  total_num_correct = 0.0
  feature_counts = {}
  all_languages_predictions = []
  for _, test_language_df in test_df.iterrows():
    lang_num_evals, lang_num_correct, predictions = _evaluate_language(
        test_language_df, model, feature_counts)
    total_num_evals += lang_num_evals
    total_num_correct += lang_num_correct
    all_languages_predictions.append(predictions)
  logging.info("Total number of evals: %d", total_num_evals)
  if total_num_evals == 0:
    logging.warning(
        "No features to predict found. You are probably not using "
        "the blind test set and should either use it or switch the "
        "prediction mode off with '--noprediction_mode'.")
  if not FLAGS.prediction_mode and total_num_evals:
    logging.info("Global Accuracy: %f%%",
                 total_num_correct / total_num_evals * 100.0)
  for feature in sorted(feature_counts):
    stats = feature_counts[feature]
    if not FLAGS.prediction_mode:
      logging.info("%s: [n=%d] Accuracy: %f%%", feature, stats["total"],
                   (stats["correct"] / stats["total"] * 100.0
                    if stats["total"] != 0.0 else 0.0))
    else:
      logging.info("%s: [%d predictions]", feature, stats["total"])

  # Save the test dataset with all the features filled in.
  if not FLAGS.prediction_mode:
    return
  logging.info("Saving predictions to \"%s\" ...",
               FLAGS.output_sigtyp_predictions_file)
  columns = [
      "wals_code", "name", "latitude", "longitude", "genus", "family",
      "countrycodes", "features"
  ]
  result_test_df = pd.DataFrame(all_languages_predictions, columns=columns)
  result_test_df.to_csv(FLAGS.output_sigtyp_predictions_file, index=False,
                        sep="\t", encoding=const.ENCODING, quotechar='"')

  # Sanity check. Read the data back in and make sure the values are sane.
  logging.info("Sanity check ...")
  read_result_test_df, _, read_result_data_info = sigtyp.read(
      FLAGS.output_sigtyp_predictions_file, categorical_as_ints=False,
      verbose=False)
  if len(test_df) != len(read_result_test_df):
    raise ValueError("Expected %d languages in the resulting file!" %
                     len(test_df))
  vanilla_num_feats = len(test_data_info[const.DATA_KEY_FEATURES])
  read_num_feats = len(read_result_data_info[const.DATA_KEY_FEATURES])
  if vanilla_num_feats > read_num_feats:
    raise ValueError(
        "Expected same or larger number of features to be present: "
        "Original %d, read %d" % (vanilla_num_feats, read_num_feats))
  test_df_nonzero_values = test_df.count(axis=1).sum()
  result_df_non_zero_values = read_result_test_df.count(axis=1).sum()
  if test_df_nonzero_values != result_df_non_zero_values:
    raise ValueError(
        "Expected same number of non-zero values in predictions!")