def write_filtered_data(args, train_dy_metrics):
    """
    Filter data based on the given metric, and write it in TSV format to
    train GLUE-style classifier.

    Args:
        args: parsed command-line namespace; fields read here include
            ``filtering_output_dir``, ``metric``, ``worst``, ``data_dir``,
            ``task_name`` and ``both_ends``.
        train_dy_metrics: pandas DataFrame of per-example training-dynamics
            metrics; must contain a ``guid`` column and a column named
            ``args.metric``.

    Side effects: writes ``filtering_configs.json`` plus one
    ``train.tsv`` (and copied dev/test files) per fraction under
    ``args.filtering_output_dir``.
    """
    # First save the args for filtering, to keep track of which model was
    # used for filtering.
    argparse_dict = vars(args)
    with open(os.path.join(args.filtering_output_dir, f"filtering_configs.json"), "w") as outfile:
        outfile.write(json.dumps(argparse_dict, indent=4, sort_keys=True) + "\n")

    # Determine whether to sort data in ascending order or not, based on the metric.
    is_ascending = consider_ascending_order(args.metric)
    if args.worst:
        # "--worst" selects from the opposite end of the ranking.
        is_ascending = not is_ascending

    # Sort by selection.
    sorted_scores = train_dy_metrics.sort_values(by=[args.metric],
                                                 ascending=is_ascending)

    original_train_file = os.path.join(os.path.join(args.data_dir, ''), f"train.tsv")
    # train_numeric is a dict: keys are the indices (guids) of the original
    # training set, values are the corresponding raw TSV rows.
    train_numeric, header = read_data(original_train_file,
                                      task_name=args.task_name,
                                      guid_as_int=True)

    for fraction in [0.01, 0.05, 0.10, 0.1667, 0.25, 0.3319, 0.50, 0.75]:
        outdir = os.path.join(args.filtering_output_dir,
                              f"cartography_{args.metric}_{fraction:.2f}/{args.task_name}")
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        # Dev and test need not be subsampled.
        copy_dev_test(args.task_name,
                      from_dir=os.path.join(args.data_dir, ''),
                      to_dir=outdir)

        # NOTE(review): 5276 is a hard-coded training-set size specific to
        # this fork; presumably len(train_numeric) — confirm against the data.
        num_samples = int(fraction * 5276)
        with open(os.path.join(outdir, f"train.tsv"), "w") as outfile:
            outfile.write(header + "\n")
            # head(n=num_samples+1) takes one extra row — appears deliberate
            # in this fork (see the trailing "5267" note); verify off-by-one.
            selected = sorted_scores.head(n=num_samples+1)  # 5267
            if args.both_ends:
                # Take 70% from the hardest end and fill the remainder from
                # the easiest end of the ranking.
                hardest = sorted_scores.head(n=int(num_samples * 0.7))
                easiest = sorted_scores.tail(n=num_samples - hardest.shape[0])
                selected = pd.concat([hardest, easiest])
                fm = args.metric
                logger.info(f"Selecting both ends: {fm} = "
                            f"({hardest.head(1)[fm].values[0]:3f}: {hardest.tail(1)[fm].values[0]:3f}) "
                            f"& ({easiest.head(1)[fm].values[0]:3f}: {easiest.tail(1)[fm].values[0]:3f})")

            selection_iterator = tqdm.tqdm(range(len(selected)))
            for idx in selection_iterator:
                selection_iterator.set_description(
                    f"{args.metric} = {selected.iloc[idx][args.metric]:.4f}")

                selected_id = selected.iloc[idx]["guid"]
                # Normalize the guid per task: ints for SNLI/MNLI, strings
                # for WINOGRANDE.
                if args.task_name in ["SNLI", "MNLI"]:
                    selected_id = int(selected_id)
                elif args.task_name == "WINOGRANDE":
                    selected_id = str(int(selected_id))
                # NOTE(review): 104743 is an unexplained guid cutoff specific
                # to this fork — rows with smaller guids are silently skipped.
                # Confirm this is intended and not leftover debugging code.
                if int(selected_id) >= 104743:
                    record = train_numeric[str(int(selected_id))]
                    outfile.write(record + "\n")

        logger.info(f"Wrote {num_samples} samples to {outdir}.")
def get_test_examples(self, data_dir):
    """Load the WINOGRANDE test split (``test.tsv``) under *data_dir*.

    Parses the file via ``read_data`` and converts the parsed rows to
    examples with ``self._create_examples``.
    """
    test_file = os.path.join(data_dir, "test.tsv")
    parsed_rows = read_data(test_file, task_name="WINOGRANDE")
    return self._create_examples(parsed_rows)
def get_examples(self, data_file, set_type):
    """Load WINOGRANDE examples from an explicit *data_file* path.

    NOTE(review): *set_type* is accepted for interface parity with the other
    ``get_*_examples`` methods but is not used in this body — confirm
    ``_create_examples`` does not need it.
    """
    parsed_rows = read_data(data_file, task_name="WINOGRANDE")
    return self._create_examples(parsed_rows)
args = parser.parse_args()

# A single user-supplied fraction in (0, 1) overrides the default grid.
if args.fraction and 0 < args.fraction < 1:
    fractions = [args.fraction]
else:
    fractions = [0.01, 0.05, 0.10, 0.1667, 0.25, 0.33, 0.50, 0.75]

# Read the input train file.
input_train_file = os.path.join(args.input_dir, "train.tsv")
try:
    train = pd.read_csv(input_train_file, sep="\t")
except pd.errors.ParserError:
    # Some task files are not clean TSV for pandas (e.g. stray quotes /
    # ragged rows); fall back to the project's own TSV reader.
    logger.info(
        f"Could not parse {input_train_file}. "
        "Will read it as TSV and then convert into a Pandas dataframe.")
    train_dict, train_header = read_data(input_train_file,
                                         task_name=args.task_name)
    train = convert_tsv_entries_to_dataframe(train_dict, header=train_header)

logger.info(f"Read {len(train)} examples from {input_train_file}. "
            f"Creating {fractions} subsamples...")

# One output tree per seed, containing one subdirectory per fraction.
outdir_base = f"{args.output_dir}_{args.seed}"
for fraction in fractions:
    outdir = os.path.join(
        outdir_base, f"{args.task_name}_{fraction:.2f}/{args.task_name}")
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # Subsampled train split destination (written further below).
    out_file_name = os.path.join(outdir, "train.tsv")

    # Dev and test need not be subsampled.
    copy_dev_test(args.task_name, from_dir=args.input_dir, to_dir=outdir)