Exemple #1
0
def write_filtered_data(args, train_dy_metrics, num_train_examples=5276, min_guid=104743):
  """
  Filter data based on the given metric, and write it in TSV format to train GLUE-style classifier.

  For each fraction in a fixed list, the rows of `train_dy_metrics` are ranked by
  `args.metric`, the top slice (or, with `args.both_ends`, a 70/30 mix of both ends
  of the ranking) is selected, and the matching training records are written to
  `<filtering_output_dir>/cartography_<metric>_<fraction>/<task_name>/train.tsv`.
  Dev and test files are copied next to each subsampled train file via `copy_dev_test`.

  Args:
    args: Parsed command-line namespace. This function reads `filtering_output_dir`,
      `metric`, `worst`, `data_dir`, `task_name` and `both_ends`. The whole namespace
      is also dumped to `filtering_configs.json` for bookkeeping.
    train_dy_metrics: pandas DataFrame holding a "guid" column and a column named
      after `args.metric`.
    num_train_examples: Size of the pool that each fraction is taken from.
      NOTE(review): the default is a hard-coded, dataset-specific value - confirm it
      matches the training file in use.
    min_guid: Only rows whose guid is greater than or equal to this value are written.
      NOTE(review): the default is a hard-coded, dataset-specific value - confirm.

  Returns:
    None. All results are written to disk.
  """
  # First save the args for filtering, to keep track of which model was used for filtering.
  config_path = os.path.join(args.filtering_output_dir, "filtering_configs.json")
  with open(config_path, "w") as outfile:
    outfile.write(json.dumps(vars(args), indent=4, sort_keys=True) + "\n")

  # Determine whether to sort data in ascending order or not, based on the metric.
  is_ascending = consider_ascending_order(args.metric)
  if args.worst:
    is_ascending = not is_ascending

  # Sort by selection.
  sorted_scores = train_dy_metrics.sort_values(by=[args.metric],
                                               ascending=is_ascending)

  source_dir = os.path.join(args.data_dir, '')
  original_train_file = os.path.join(source_dir, "train.tsv")
  # train_numeric is a dict: each key is an index from the original training set and
  # the value is the row (line) that belongs to that index.
  train_numeric, header = read_data(original_train_file, task_name=args.task_name, guid_as_int=True)

  for fraction in [0.01, 0.05, 0.10, 0.1667, 0.25, 0.3319, 0.50, 0.75]:
    outdir = os.path.join(args.filtering_output_dir,
            f"cartography_{args.metric}_{fraction:.2f}/{args.task_name}")
    os.makedirs(outdir, exist_ok=True)

    # Dev and test need not be subsampled.
    copy_dev_test(args.task_name,
                  from_dir=source_dir,
                  to_dir=outdir)

    num_samples = int(fraction * num_train_examples)
    num_written = 0
    with open(os.path.join(outdir, "train.tsv"), "w") as outfile:
      outfile.write(header + "\n")
      selected = sorted_scores.head(n=num_samples+1)
      if args.both_ends:
        hardest = sorted_scores.head(n=int(num_samples * 0.7))
        easiest = sorted_scores.tail(n=num_samples - hardest.shape[0])
        selected = pd.concat([hardest, easiest])
        fm = args.metric
        logger.info(f"Selecting both ends: {fm} = "
                    f"({hardest.head(1)[fm].values[0]:3f}: {hardest.tail(1)[fm].values[0]:3f}) "
                    f"& ({easiest.head(1)[fm].values[0]:3f}: {easiest.tail(1)[fm].values[0]:3f})")

      # Pull the two needed columns out once instead of materialising a row per lookup.
      guids = selected["guid"].tolist()
      metric_values = selected[args.metric].tolist()

      selection_iterator = tqdm.tqdm(zip(guids, metric_values), total=len(guids))
      for raw_guid, metric_value in selection_iterator:
        selection_iterator.set_description(f"{args.metric} = {metric_value:.4f}")

        # Normalise the guid once; the lookup key is its decimal string form
        # regardless of task.
        guid = int(raw_guid)
        if guid < min_guid:
          # Nothing to write for this row. Skipping avoids emitting a stale record
          # from an earlier iteration (or failing when no record exists yet).
          continue

        outfile.write(train_numeric[str(guid)] + "\n")
        num_written += 1

    # Report what actually landed in the file, which can be fewer than num_samples.
    logger.info(f"Wrote {num_written} samples to {outdir}.")
Exemple #2
0
 def get_test_examples(self, data_dir):
     """Load ``test.tsv`` from `data_dir` and turn its contents into examples.

     The file is parsed by `read_data` with ``task_name="WINOGRANDE"`` and the
     parsed result is handed to `self._create_examples`, whose return value is
     returned unchanged.
     """
     test_path = os.path.join(data_dir, "test.tsv")
     parsed = read_data(test_path, task_name="WINOGRANDE")
     return self._create_examples(parsed)
Exemple #3
0
 def get_examples(self, data_file, set_type):
     """Parse `data_file` and turn its contents into examples.

     The file is read by `read_data` with ``task_name="WINOGRANDE"`` and the
     result is passed to `self._create_examples`. `set_type` is accepted but
     not used by this method.
     """
     parsed = read_data(data_file, task_name="WINOGRANDE")
     return self._create_examples(parsed)
    args = parser.parse_args()

    if args.fraction and 0 < args.fraction < 1:
        fractions = [args.fraction]
    else:
        fractions = [0.01, 0.05, 0.10, 0.1667, 0.25, 0.33, 0.50, 0.75]

    # Read the input train file.
    input_train_file = os.path.join(args.input_dir, "train.tsv")
    try:
        train = pd.read_csv(input_train_file, sep="\t")
    except pd.errors.ParserError:
        logger.info(
            f"Could not parse {input_train_file}. "
            "Will read it as TSV and then convert into a Pandas dataframe.")
        train_dict, train_header = read_data(input_train_file,
                                             task_name=args.task_name)
        train = convert_tsv_entries_to_dataframe(train_dict,
                                                 header=train_header)

    logger.info(f"Read {len(train)} examples from {input_train_file}. "
                f"Creating {fractions} subsamples...")
    outdir_base = f"{args.output_dir}_{args.seed}"
    for fraction in fractions:
        outdir = os.path.join(
            outdir_base, f"{args.task_name}_{fraction:.2f}/{args.task_name}")
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        out_file_name = os.path.join(outdir, "train.tsv")

        # Dev and test need not be subsampled.
        copy_dev_test(args.task_name, from_dir=args.input_dir, to_dir=outdir)