Example #1
def make_feature_maker(algo_name, training_data_dir,
                       train_set_name, dev_set_name):
  logging.info("[%s] Initializing feature maker ...", algo_name)
  train_path = os.path.join(training_data_dir, train_set_name + ".csv")
  dev_path = os.path.join(training_data_dir, dev_set_name + ".csv")
  data_info_path = data_info_lib.data_info_path_for_testing(training_data_dir)
  return feature_lib.FeatureMaker(train_path, dev_path, data_info_path)
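A minimal sketch of how make_feature_maker might be called; the directory, split names, and algorithm name below are placeholders, not values from the original code:

# Hypothetical call site; the paths and names are placeholders.
feature_maker = make_feature_maker(
    algo_name="logistic_regression",
    training_data_dir="/path/to/training_data",
    train_set_name="train",
    dev_set_name="dev")
# The returned FeatureMaker can then build per-feature data frames, e.g.:
# training_df, dev_df = feature_maker.process_data("Hand_and_Arm")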
Example #2
def main(unused_argv):
    if not FLAGS.dev_data_file:
        raise ValueError("Specify --dev_data_file")
    if not FLAGS.training_data_file:
        raise ValueError("Specify --training_data_file")
    if not FLAGS.data_info_file:
        raise ValueError("Specify --data_info_file")
    if not (FLAGS.target_feature or FLAGS.target_feature_file):
        raise ValueError("Specify --target_feature or --target_feature_file")
    if FLAGS.cross_validate and not FLAGS.best_configurations_file:
        raise ValueError(
            "Specify --best_configurations_file in cross-validation "
            "mode")

    features = []
    if FLAGS.target_feature_file:
        with open(FLAGS.target_feature_file) as s:
            for line in s:
                features.append(line.strip())
    else:
        features = [FLAGS.target_feature]

    # Process features.
    feature_maker = feature_lib.FeatureMaker(FLAGS.training_data_file,
                                             FLAGS.dev_data_file,
                                             FLAGS.data_info_file)

    # Perform cross-validation to establish the best configurations of models
    # and features or simply train and evaluate.
    if FLAGS.cross_validate:
        best_configs = _cross_validation_training(feature_maker, features)
        logging.info("Saving best configs to \"%s\" ...",
                     FLAGS.best_configurations_file)
        with open(FLAGS.best_configurations_file, "w",
                  encoding=const.ENCODING) as f:
            json.dump(best_configs, f)
    else:
        _train_and_evaluate(feature_maker, features)
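The FLAGS referenced in this example are defined elsewhere in the original module; a sketch of how they could be declared with absl.flags follows (the flag names mirror the FLAGS.* accesses above, while the types, defaults, and help strings are assumptions):

from absl import app
from absl import flags

flags.DEFINE_string("training_data_file", None, "Training data CSV file.")
flags.DEFINE_string("dev_data_file", None, "Development data CSV file.")
flags.DEFINE_string("data_info_file", None, "Data info file.")
flags.DEFINE_string("target_feature", None, "Single target feature name.")
flags.DEFINE_string("target_feature_file", None,
                    "File listing one target feature name per line.")
flags.DEFINE_bool("cross_validate", False,
                  "Run cross-validation to find the best configurations.")
flags.DEFINE_string("best_configurations_file", None,
                    "Output JSON file for the best configurations.")

FLAGS = flags.FLAGS

if __name__ == "__main__":
  app.run(main)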
Example #3
def main(unused_argv):
  feature_maker = lib.FeatureMaker(
      FLAGS.training_data,
      FLAGS.dev_data,
      FLAGS.data_info)
  training_df, dev_df = feature_maker.process_data(
      "Order_of_Subject,_Object_and_Verb")
  long_implicational = (
      "The_Position_of_Negative_Morphemes_in_SOV_Languages"
      "@18 SV&OV&NegV@Order_of_Subject,_Object_and_Verb_majval")
  assert "family_majval" in dev_df.columns
  assert "family_count" in dev_df.columns
  assert long_implicational in dev_df.columns
  assert long_implicational in training_df.columns
  non_zeroes = []
  for fname in training_df.columns:
    if ("majval" in fname and
        "genus" not in fname and
        "family" not in fname and
        "neighborhood" not in fname):
      for i in training_df[fname]:
        if i:
          non_zeroes.append(i)
  # Show that there are non-zero (non-NA) entries for implicationals:
  assert non_zeroes
  non_zeroes = []
  for fname in dev_df.columns:
    if ("majval" in fname and
        "genus" not in fname and
        "family" not in fname and
        "neighborhood" not in fname):
      for i in dev_df[fname]:
        if i:
          non_zeroes.append(i)
  # Show that there are non-zero (non-NA) entries for implicationals:
  assert non_zeroes

  # Remove some of the columns.
  #
  # If you use this, be sure to apply the same column selection to both the
  # training and the dev frames.
  smaller_dev_df = feature_maker.select_columns(
      dev_df, discard_counts=True, discard_implicationals=True)
  assert "wals_code" in smaller_dev_df.columns
  assert "target_value" in smaller_dev_df.columns
  assert long_implicational not in smaller_dev_df.columns
  assert "family_count" not in smaller_dev_df.columns
  # Remove a different set of columns.
  smaller_dev_df = feature_maker.select_columns(
      dev_df, discard_counts=True, discard_implicationals=False)
  assert "wals_code" in smaller_dev_df.columns
  assert "target_value" in smaller_dev_df.columns
  assert long_implicational in smaller_dev_df.columns
  assert "family_count" not in smaller_dev_df.columns

  # Try another feature
  training_df, dev_df = feature_maker.process_data("Hand_and_Arm")
  assert "family_majval" in dev_df.columns
  assert "family_count" in dev_df.columns
  long_implicational = (
      "Number_of_Cases@9 Exclusively borderline case-marking"
      "@Hand_and_Arm_majval")
  assert long_implicational in dev_df.columns
  for c in dev_df.columns:
    assert c in training_df.columns
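The inline non-zero scan above is repeated verbatim for the training and dev frames; a small helper, sketched here under the assumption that the frames are pandas DataFrames, would factor that out:

def has_nonzero_implicationals(df):
  # Mirrors the inline scan: consider only implicational "majval" columns,
  # skipping the genus/family/neighborhood aggregates.
  cols = [c for c in df.columns
          if "majval" in c and not any(
              k in c for k in ("genus", "family", "neighborhood"))]
  return any(bool(v) for c in cols for v in df[c])

# Usage (equivalent to the two assert blocks above):
# assert has_nonzero_implicationals(training_df)
# assert has_nonzero_implicationals(dev_df)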