def _process_file(filename, base_dir=None):
    """Preprocesses supplied data file."""
    if not base_dir:
        base_dir = FLAGS.sigtyp_dir
    full_path = os.path.join(base_dir, filename + ".csv")
    _, df, data_info = sigtyp.read(
        full_path, categorical_as_ints=FLAGS.categorical_as_ints)
    _write_dict(data_info, filename, const.DATA_INFO_FILENAME)

    # Save the preprocessed data frame to a CSV file.
    output_file = os.path.join(FLAGS.output_dir, filename + ".csv")
    logging.info("Saving preprocessed data to \"%s\" ...", output_file)
    df.to_csv(output_file, sep="|", index=False, float_format="%g")
    return data_info
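
# Illustrative sketch, not part of the original module: drives _process_file()
# over several SIGTYP splits in one go. The split names below are assumptions;
# adjust them to the CSV files actually present under --sigtyp_dir.
def _process_all_splits(split_names=("train", "dev", "test_blinded")):
    """Preprocesses each named split and returns its data info keyed by split."""
    return {split: _process_file(split) for split in split_names}
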
def main(unused_argv):
    # Check flags.
    if not FLAGS.sigtyp_dir:
        raise ValueError("Specify --sigtyp_dir for input data!")
    if not FLAGS.training_data_dir:
        raise ValueError("Specify --training_data_dir!")
    if FLAGS.prediction_mode and not FLAGS.output_sigtyp_predictions_file:
        raise ValueError("In prediction mode specify "
                         "--output_sigtyp_predictions_file!")

    # Load the test data (note: we read the vanilla test data in SIGTYP format).
    filename = os.path.join(FLAGS.sigtyp_dir, FLAGS.test_set_name + ".csv")
    vanilla_test_df, _, vanilla_data_info = sigtyp.read(
        filename, categorical_as_ints=False)
    if not len(vanilla_test_df):  # pylint: disable=g-explicit-length-test
        raise ValueError("Test dataset is empty!")
    if len(vanilla_test_df.columns) != 8:
        raise ValueError("Wrong number of columns: %d" %
                         len(vanilla_test_df.columns))

    # Run evaluation/prediction.
    features_to_predict = _features_to_predict(vanilla_test_df)
    _evaluate(vanilla_test_df.copy(), vanilla_data_info,
              _make_model(features_to_predict))
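
# Standard absl entry point (a sketch: the excerpt omits the module header,
# so the `app` import is shown locally here for completeness).
if __name__ == "__main__":
    from absl import app
    app.run(main)
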
def _evaluate(test_df, test_data_info, model):
    """Evaluates the model on the supplied dataframe."""
    # Prepare the test languages. This lets the model precompute some of the
    # information that is based solely on the language context, not the features.
    logging.info("Preparing test languages ...")
    # `to_dict(orient="records")` yields one plain dict per language row.
    test_languages = test_df.to_dict(orient="records")
    for test_language in test_languages:
        model.prepare_target(test_language)

    # Run actual evaluation.
    mode_info = "Prediction" if FLAGS.prediction_mode else "Evaluation"
    logging.info("[%s] Running over %d languages ...", mode_info, len(test_df))
    total_num_evals = 0
    total_num_correct = 0.0
    feature_counts = {}
    all_languages_predictions = []
    for _, test_language_df in test_df.iterrows():
        lang_num_evals, lang_num_correct, predictions = _evaluate_language(
            test_language_df, model, feature_counts)
        total_num_evals += lang_num_evals
        total_num_correct += lang_num_correct
        all_languages_predictions.append(predictions)

    logging.info("Total number of evals: %d", total_num_evals)
    if total_num_evals == 0:
        logging.warning(
            "No features to predict found. You are probably not using "
            "the blind test set and should either use it or switch the "
            "prediction mode off with '--noprediction_mode'.")
    if not FLAGS.prediction_mode:
        logging.info("Global Accuracy: %f%%",
                     (total_num_correct / total_num_evals * 100.0
                      if total_num_evals else 0.0))
    for feature in sorted(feature_counts):
        stats = feature_counts[feature]
        if not FLAGS.prediction_mode:
            logging.info("%s: [n=%d] Accuracy: %f%%", feature, stats["total"],
                         (stats["correct"] / stats["total"] *
                          100.0 if stats["total"] != 0.0 else 0.0))
        else:
            logging.info("%s: [%d predictions]", feature, stats["total"])

    # Save the test dataset with all the features filled in.
    if not FLAGS.prediction_mode:
        return
    logging.info("Saving predictions to \"%s\" ...",
                 FLAGS.output_sigtyp_predictions_file)
    columns = [
        "wals_code", "name", "latitude", "longitude", "genus", "family",
        "countrycodes", "features"
    ]
    result_test_df = pd.DataFrame(all_languages_predictions, columns=columns)
    result_test_df.to_csv(FLAGS.output_sigtyp_predictions_file,
                          index=False,
                          sep="\t",
                          encoding=const.ENCODING,
                          quotechar='"')

    # Sanity check. Read the data back in and make sure the values are sane.
    logging.info("Sanity check ...")
    read_result_test_df, _, read_result_data_info = sigtyp.read(
        FLAGS.output_sigtyp_predictions_file,
        categorical_as_ints=False,
        verbose=False)
    if len(test_df) != len(read_result_test_df):
        raise ValueError("Expected %d languages in the resulting file, got %d!" %
                         (len(test_df), len(read_result_test_df)))
    vanilla_num_feats = len(test_data_info[const.DATA_KEY_FEATURES])
    read_num_feats = len(read_result_data_info[const.DATA_KEY_FEATURES])
    if vanilla_num_feats > read_num_feats:
        raise ValueError(
            "Expected same or larger number of features to be present: "
            "Original %d, read %d" % (vanilla_num_feats, read_num_feats))
    # `count()` tallies non-empty (non-NA) cells; predictions must not drop any.
    test_df_num_values = test_df.count(axis=1).sum()
    result_df_num_values = read_result_test_df.count(axis=1).sum()
    if test_df_num_values != result_df_num_values:
        raise ValueError(
            "Expected same number of non-empty values in predictions: "
            "original %d, read %d" % (test_df_num_values, result_df_num_values))