def evaluate(prediction_folder, gold_files, output, metric, split_reasoning, split_num_modifier):
    eval_metrics = metric
    result = dict()
    for gold_file in gold_files:
        click.echo(f"Evaluating predictions on {click.style(gold_file, fg='blue')}")
        gold = list(sample_iter(load_json(gold_file)))
        gold_descriptor, prediction_files = match_prediction_to_gold(gold_file, prediction_folder)
        result[gold_file] = {'n': len(gold)}
        logger.debug(prediction_files)
        for prediction_file in sorted(prediction_files):
            model_name = extract_model_name(gold_descriptor, prediction_file)
            result[gold_file][model_name] = dict()
            click.echo(f"Evaluating predictions of model {click.style(model_name, fg='green')}")
            predictions = load_json(prediction_file)
            result[gold_file][model_name]['full'] = _get_score(gold, predictions, eval_metrics)
            click.echo()
            if split_reasoning:
                for reasoning, gold_split in groupby(sorted(gold, key=reasoning_key), key=reasoning_key):
                    result[gold_file][model_name][reasoning] = _get_score(list(gold_split), predictions,
                                                                          eval_metrics, reasoning)
                click.echo()
            if split_num_modifier:
                for num_mod, gold_split in groupby(sorted(gold, key=num_modifier_key), key=num_modifier_key):
                    result[gold_file][model_name][num_mod] = _get_score(list(gold_split), predictions, eval_metrics,
                                                                        f"Num modifications: {num_mod}")
                click.echo()
    write_json(result, output)

def count(action, n, output, domain: Bundle, mod):
    result = {}
    templates = domain[f"templates_{mod}" if mod else "templates"]
    actions, sentences = get_templates(templates=templates, action=action, n=n, command="Counting")
    for action in actions:
        click.echo(f"For action '{click.style(action, fg='blue')}':")
        r = SizeEstimator(processor=Processor(accessor=Accessor(**templates), chooser=RandomChooser()))
        upper_bound = r.estimate_size(r.processor.accessor.sentences[action])
        r = SizeEstimator(processor=Processor(accessor=Accessor(**templates), chooser=RandomChooser()))
        lower_bound = r.estimate_size(r.processor.accessor.sentences[action], pessimistic=True)
        click.secho(f"Pessimistically speaking, you can generate "
                    f"{click.style(str(lower_bound), fg='red', bold=True)} distinct sentences!")
        click.secho(f"Optimistically speaking, you can generate "
                    f"{click.style(str(upper_bound), fg='green', bold=True)} distinct sentences!")
        result[action] = {"lower": lower_bound, "upper": upper_bound}
    if output:
        write_json(result, output)

def diversity(input, reference, output, attr, random_seed, subsample, metric: List[Type[Distance]]):
    if random_seed:
        random.seed(random_seed)
    # TODO: make metrics appendable
    sample = load_json(input)
    reference = load_json(reference)
    getter: Callable[[Any], str] = itemgetter(attr)
    # samples
    corpus: List[str] = [s['paragraphs'][0]['context'] for s in sample['data']]
    corpus_reference: List[str] = [getter(s) for s in reference]
    if subsample:
        corpus = random.sample(corpus, subsample)
    n = len(corpus)
    n_reference = len(corpus_reference)
    logger.debug(f"Evaluating sample with n={n} paragraphs.")
    results = dict()
    for m in metric:
        result = np.array(pointwise_average_distance(corpus, m()))
        result_reference = np.array(pointwise_average_distance(corpus_reference, m()))
        mean, var, ci95 = get_mean_var_ci(result, alpha=0.025)
        mean_ref, var_ref, ci95_ref = get_mean_var_ci(result_reference, alpha=0.025)
        printable_result = f'{mean:.4f} +/- {ci95:.4f}'
        printable_result_reference = f'{mean_ref:.4f} +/- {ci95_ref:.4f}'
        click.echo(f"Point-wise average distance under the {click.style(str(m.__name__), fg='green')} "
                   f"metric (n={n}): {click.style(printable_result, fg='green', bold=True)}")
        click.echo(f"Reference point-wise average distance under the {click.style(str(m.__name__), fg='green')} "
                   f"metric (n={len(corpus_reference)}): "
                   f"{click.style(printable_result_reference, fg='green', bold=True)}")
        results[str(m.__name__)] = {
            'ours': {
                "n": n,
                'human_readable': printable_result,
                'mean': mean,
                'variance': var,
                '95ci': ci95,
            },
            "reference": {
                'n': n_reference,
                'human_readable': printable_result_reference,
                'mean': mean_ref,
                'variance': var_ref,
                '95ci': ci95_ref
            },
            "difference": mean - mean_ref
        }
    if output:
        write_json(results, output)

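# `get_mean_var_ci` is used above (and in quality() further down) but defined elsewhere in the
# package. The sketch below is an assumption of what it returns -- mean, variance and the
# half-width of the (1 - 2 * alpha) confidence interval -- inferred from the commented-out
# t-interval code in quality(); the project's actual helper may differ.
def _get_mean_var_ci_sketch(values, alpha: float = 0.025):
    import numpy as np
    from scipy.stats import t

    values = np.asarray(values, dtype=float)
    n = len(values)
    t_crit = t.ppf(1 - alpha, df=n - 1)  # e.g. the 97.5th percentile for alpha = 0.025
    ci = t_crit * values.std() / np.sqrt(n)  # half-width of the ~95% interval
    return values.mean(), values.var(), ci
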
def predictions(in_files, out_folder, model_paths, model_types, no_cuda, per_gpu_eval_batch_size,
                do_not_lower_case, lang_id, v2, n_best_size, max_answer_length, verbose_logging,
                null_score_diff_threshold, do_evaluate, **kwargs):
    assert len(model_paths) == len(model_types)
    for model_path, model_type in zip(model_paths, model_types):
        model = get_model(model_path)
        args = Args(model_path=model_path, model_type=model_type, predictions_folder=out_folder,
                    no_cuda=no_cuda, do_not_lower_case=do_not_lower_case,
                    per_gpu_eval_batch_size=per_gpu_eval_batch_size, lang_id=lang_id, v2=v2,
                    n_best_size=n_best_size, max_answer_length=max_answer_length,
                    verbose_logging=verbose_logging, null_score_diff_threshold=null_score_diff_threshold,
                    **kwargs)
        tokenizer = get_tokenizer(model_path, args.do_lower_case)
        for in_file in in_files:
            args.eval_file = in_file
            logger.debug(args)
            dataset, examples, features = load_or_convert(args, tokenizer, evaluate=True)
            if do_evaluate:
                out_path = args.predictions_folder
                args.predictions_folder = None
                suffix = os.path.basename(os.path.normpath(model_path))
                score = evaluate(args, model, tokenizer, dataset, examples, features,
                                 suffix=suffix, return_raw=False)
                file_name = get_output_predictions_file_name(args.eval_file, out_path, suffix)
                write_json(score, file_name)
                args.predictions_folder = out_path
            else:
                evaluate(args, model, tokenizer, dataset, examples, features,
                         suffix=os.path.basename(os.path.normpath(model_path)))

def predict(in_files, output_folder, models, model_classes, gpu, batch_size):
    # There is a chance I'll need to scrap all of this and do the convert-to-features stuff
    if gpu is None:
        gpu = _is_gpu_available()
    logger.debug(fmt_dict(locals()))
    if not len(models) == len(model_classes):
        click.echo(f"Num models supplied ({len(models)}) != num model classes supplied ({len(model_classes)})!")
        sys.exit(1)
    for cls, weights_path in zip(model_classes, models):
        model_cls: Model = do_import(cls, relative_import='stresstest.model')
        # TODO: Bidaf should also respect max answer length
        model = model_cls.make(weights_path, gpu=gpu)
        click.echo(f"Evaluating model '{click.style(model_cls.__name__, fg='green', bold=True)}' "
                   f"from weights file: {click.style(weights_path, fg='blue')}.")
        click.echo(f"Running on {click.style('gpu' if gpu else 'cpu', fg='green', bold=True)}.")
        for in_file in in_files:
            sample = load_json(in_file)
            num_q = num_questions(sample)
            click.echo(f"Evaluating on sample (n={num_q}, |{{C}}|={len(sample)}): "
                       f"{click.style(in_file, fg='blue')}")
            predictions = dict()
            for sample_batch in batch(tqdm(sample_iter(sample), position=1, total=num_q), batch_size=batch_size):
                sample_batch: List[Entry]
                batch_predictions = model.predict_batch(sample_batch)
                for entry, answer in zip(sample_batch, batch_predictions):
                    logger.debug(f"Passage: {entry.passage}")
                    logger.debug(f"Question: {entry.question}")
                    logger.debug(f"Prediction: {answer}")
                    predictions[entry.qa_id] = str(answer)
            output_file_name = get_output_predictions_file_name(in_file, output_folder, weights_path)
            click.echo(f"Saving predictions to {click.style(output_file_name, fg='blue')}")
            write_json(predictions, output_file_name, pretty=False)

def update_zone_names(output_path):
    # update all the zone names and set the right ids to be written in the poly_zone_ids.bin
    global poly_zone_ids
    global list_of_pointers
    global poly_boundaries
    global polygons
    global polygon_lengths
    global polynrs_of_holes
    global nr_of_zones
    global nr_of_polygons
    file_path = abspath(join(output_path, TIMEZONE_NAMES_FILE))
    print(f"updating the zone names in {file_path} now.")
    # write the zone names (python list) as JSON
    write_json(all_tz_names, file_path)
    print("...Done.\n\nComputing where zones start and end...")
    last_id = -1
    zone_id = 0
    poly_nr = 0
    for poly_nr, zone_id in enumerate(poly_zone_ids):
        if zone_id != last_id:
            poly_nr2zone_id.append(poly_nr)
            assert zone_id >= last_id
            last_id = zone_id
    assert nr_of_polygons == len(poly_zone_ids)
    # TODO
    # assert (
    #     zone_id == nr_of_zones - 1
    # ), f"not pointing to the last zone with id {nr_of_zones - 1}"
    # assert (
    #     poly_nr == nr_of_polygons - 1
    # ), f"not pointing to the last polygon with id {nr_of_polygons - 1}"
    # ATTENTION: add one more entry for knowing where the last zone ends!
    # ATTENTION: the last entry is one higher than the last polygon id (to be consistent with the other entries)
    poly_nr2zone_id.append(nr_of_polygons)
    # assert len(poly_nr2zone_id) == nr_of_zones + 1
    print("...Done.\n")

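# Illustrative-only mini-example of the index built in update_zone_names() above: entry z holds
# the id of the first polygon belonging to zone z, plus one closing sentinel entry, so zone z
# spans polygons index[z] .. index[z + 1] - 1. The demo values below are made up.
def _poly_nr2zone_id_demo():
    poly_zone_ids_demo = [0, 0, 1, 1, 1, 2]  # zone id of every polygon, sorted by zone
    index = []
    last_id = -1
    for poly_nr, zone_id in enumerate(poly_zone_ids_demo):
        if zone_id != last_id:
            index.append(poly_nr)
            last_id = zone_id
    index.append(len(poly_zone_ids_demo))  # sentinel: one past the last polygon id
    assert index == [0, 2, 5, 6]  # e.g. zone 1 spans polygons 2..4
    return index
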
def generate_modifier(config, out_path, seed, subsample, do_print, do_save, domain, num_workers,
                      split_templates, modifier_type):
    if seed:
        random.seed(seed)
    uuid4 = lambda: uuid.UUID(int=random.getrandbits(128)).hex
    cfg = Config(config)
    max_sents = cfg["world.num_sentences"]
    modify_event_types = cfg['modify_event_types']
    # number of modifications: f(max_sents) = |modify_event_types| * 1/3 * max_sents * (max_sents - 1) * (max_sents - 2)
    n = int(len(modify_event_types) * 1 / 3 * max_sents * (max_sents - 1) * (max_sents - 2))
    if split_templates:
        first, second = split(domain.templates_modifier, event_types_to_split=modify_event_types,
                              split_ratio=split_templates)
        template_splits = [first, second]
        click.echo(f"Splitting templates with a {split_templates} ratio.")
        for event_type, templates in domain.templates_modifier['sentences'].items():
            click.echo(f"For event type '{event_type}'")
            click.echo(f"First split: {[templates.index(t) for t in first['sentences'][event_type]]}")
            click.echo(f"Second split: {[templates.index(t) for t in second['sentences'][event_type]]}")
    else:
        template_splits = [domain.templates_modifier]
    for i, templates in enumerate(template_splits):
        answer_types = cfg.get('answer_types')
        question_types = cfg.get('reasoning')
        subsample_str = subsample if subsample else 'full'
        subsample_str = f"{subsample_str}-{i}" if split_templates else subsample_str
        file_name = f"{{}}-{'-'.join(answer_types)}-{'-'.join(question_types)}-{n}-{subsample_str}.json"
        click.echo(f"Generating from '{click.style(config, fg='green')}': "
                   f"{click.style(str(n), fg='green', bold=True)} passages, "
                   f"{click.style(str(subsample_str), fg='green', bold=True)} realisations per passage.")
        click.echo(f"Saving baseline in "
                   f"{click.style(os.path.join(out_path, file_name.format(BASELINE)), fg='blue', bold=True)}.")
        click.echo(f"Saving modified in "
                   f"{click.style(os.path.join(out_path, file_name.format(INTERVENTION)), fg='blue', bold=True)}.")
        click.echo(f"Saving control in "
                   f"{click.style(os.path.join(out_path, file_name.format(CONTROL)), fg='blue', bold=True)}.")
        baseline, modified, control = generate(cfg, domain, num_workers, subsample, templates, uuid4, modifier_type)
        baseline = sorted(baseline, key=lambda d: d['title'])
        modified = sorted(modified, key=lambda d: d['title'])
        control = sorted(control, key=lambda d: d['title'])
        if do_print:
            _do_print(baseline, modified, control)
        click.echo(f"Total Passages: {len(baseline)}")
        click.echo(f"Total Questions over baseline passages: "
                   f"{sum(len(b['paragraphs'][0]['qas']) for b in baseline)}")
        click.echo(f"Total Questions over modified passages: "
                   f"{sum(len(b['paragraphs'][0]['qas']) for b in modified)}")
        if do_save:
            write_json({"version": 0.1, "data": baseline},
                       os.path.join(out_path, file_name.format(BASELINE)), pretty=False)
            write_json({"version": 0.1, "data": modified},
                       os.path.join(out_path, file_name.format(INTERVENTION)), pretty=False)
            write_json({"version": 0.1, "data": control},
                       os.path.join(out_path, file_name.format(CONTROL)), pretty=False)

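# Sanity check (illustrative only, not used by generate_modifier) for the modification-count
# formula above: m * (m - 1) * (m - 2) / 3 equals 2 * C(m, 3), i.e. twice the number of ways to
# choose 3 of the m sentences, so n grows cubically with the passage length.
def _modification_count_demo(max_sents: int = 5) -> int:
    from math import comb

    n_per_event_type = max_sents * (max_sents - 1) * (max_sents - 2) // 3
    assert n_per_event_type == 2 * comb(max_sents, 3)  # 20 == 20 for max_sents = 5
    return n_per_event_type
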
def train(**kwargs):
    # doc_stride = kwargs.pop("doc_stride")
    # max_query_length = kwargs.pop('max_query_length')
    # max_seq_length = kwargs.pop("max_seq_length")
    # num_workers = kwargs.pop('num_workers')
    # debug_features = kwargs.pop('debug_features')
    # do_lower_case = not kwargs.pop('do_not_lower_case')
    # kwargs['logging_steps'] = [int(i) for i in kwargs['logging_steps'].split(',')] if kwargs['logging_steps'] else []
    args = Args(**kwargs)
    args.local_rank = int(os.environ.get('LOCAL_RANK', -1))
    logger.debug(args)
    if (os.path.exists(args.save_model_folder) and os.listdir(args.save_model_folder)
            and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.save_model_folder))
    # os.makedirs(args.predictions_folder, exist_ok=True)
    os.makedirs(args.save_model_folder, exist_ok=True)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    if args.local_rank not in [-1, 0]:
        logger.remove()
        logger.add(sys.stdout, level="WARNING")
    logger.warning(
        f"Process rank: {args.local_rank}, device: {device}, n_gpu: {args.n_gpu}, "
        f"distributed training: {bool(args.local_rank != -1)}, 16-bits training: {args.fp16}")

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()
    args.model_type = args.model_type.lower()
    tokenizer = get_tokenizer(args.model_path, args.do_lower_case)
    model = get_model(args.model_path)
    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()
    model.to(args.device)
    # logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum
    # if args.fp16 is set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations.
    # Note that running `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    train_dataset = load_or_convert(args, tokenizer, dataset_only=True)
    # train_dataset, e, f = load_examples(args.train_file)
    logger.info("loaded dataset")
    global_step, tr_loss = do_train(args, train_dataset, model, tokenizer)
    logger.info(f"global_step = {global_step}, average loss = {tr_loss}")

    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logger.info(f"Saving model checkpoint to {args.save_model_folder}")
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`.
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.save_model_folder)
        tokenizer.save_pretrained(args.save_model_folder)
        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.save_model_folder, "training_args.bin"))
        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForQuestionAnswering.from_pretrained(args.save_model_folder)  # , force_download=True)
        tokenizer = AutoTokenizer.from_pretrained(args.save_model_folder, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval_after_training and args.local_rank in [-1, 0]:
        logger.info("Loading checkpoints saved during training for evaluation")
        checkpoints = [args.save_model_folder]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c)
                for c in sorted(glob.glob(args.save_model_folder + "/**/" + WEIGHTS_NAME, recursive=True)))
            # logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        logger.info(f"Evaluate the following checkpoints: {checkpoints}")
        dataset, examples, features = load_or_convert(args, tokenizer, evaluate=True)
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)  # , force_download=True)
            model.to(args.device)

            # Evaluate
            result = evaluate(args, model, tokenizer, dataset, examples, features, suffix=global_step)
            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)
    logger.info("Results: {}".format(results))
    write_json(results, os.path.join(args.save_model_folder, 'dev-results.json'))
    return results

def evaluate_intervention(predictions_folder, baseline_file, output, do_print, do_save, control,
                          split_reasoning, split_num_modifier, split_sam):
    gold = load_json(baseline_file)
    intervention_basename = os.path.basename(baseline_file).replace(BASELINE, INTERVENTION)
    intervention_file = baseline_file.replace(os.path.basename(baseline_file), intervention_basename)
    gold_intervention = load_json(intervention_file)
    gold_descriptor, prediction_files = match_prediction_to_gold(baseline_file, predictions_folder)
    gold_intervention_descriptor, prediction_intervention_files = match_prediction_to_gold(intervention_file,
                                                                                            predictions_folder)
    click.echo(f"Evaluation by intervention with baseline gold: {click.style(baseline_file, fg='blue')}")
    click.echo(f"And intervention gold: {click.style(intervention_file, fg='blue')}")
    if control:
        control_basename = os.path.basename(baseline_file).replace(BASELINE, CONTROL)
        control_file = baseline_file.replace(os.path.basename(baseline_file), control_basename)
        gold_control = load_json(control_file)
        _, control_prediction_files = match_prediction_to_gold(control_file, predictions_folder)
        click.echo(f"And control gold: {click.style(control_file, fg='blue')}")
        # assert c_aligned_baseline == aligned_baseline, c_aligned_intervention == aligned_intervention
    else:
        control_prediction_files = [""] * len(prediction_files)
        gold_control = None
    aligned_baseline, aligned_intervention, aligned_control = align(gold, gold_intervention, gold_control)
    result = dict()
    for predictions_file, prediction_intervention_file, control_prediction_file in \
            zip(sorted(prediction_files), sorted(prediction_intervention_files), sorted(control_prediction_files)):
        predictions: Dict[str, str] = load_json(predictions_file)
        predictions_intervention: Dict[str, str] = load_json(prediction_intervention_file)
        model_name = extract_model_name(gold_descriptor, predictions_file)
        click.echo(f"Evaluating predictions of model {click.style(model_name, fg='green')}")
        click.echo(f"Evaluating {click.style(str(len(aligned_baseline)), fg='green', bold=True)} sample(s).")
        predictions_control: Dict[str, str] = load_json(control_prediction_file) if control_prediction_file else None
        (
            overall_result, results_baseline, results_intervention, results_control,
            correct_before_intervention, correct_change_correct, correct_keep_wrong, correct_change_wrong,
            wrong_change_right, wrong_keep_right, correct_baseline_control, correct_baseline_control_intervention
        ) = eval_intervention(aligned_baseline, aligned_intervention, aligned_control,
                              predictions, predictions_intervention, predictions_control)
        click.echo(f"Got {sum(results_baseline)} correct for baseline.")
        click.echo(f"Got {sum(results_intervention)} correct for intervention.")
        click.echo(f"Out of {sum(results_baseline)} correct baseline results, got {len(correct_change_correct)} "
                   f"correct after intervention.")
        click.echo(f"Interventions that the model 'ignored': {len(correct_keep_wrong)}")
        click.echo(f"Interventions that left the model 'confused': {len(correct_change_wrong)}")
        click.echo(f"Wrong predictions that the model changed to correct: {len(wrong_change_right)}")
        click.echo(f"Wrong predictions that the model didn't change but that became correct: {len(wrong_keep_right)}")
        if do_print:
            print_examples(correct_baseline_control, correct_baseline_control_intervention, correct_change_correct,
                           correct_keep_wrong, correct_change_wrong, wrong_change_right, wrong_keep_right)
        mean, var, ci = get_mean_var_ci_bernoulli(overall_result)
        printable_result = f'{mean:.4f} +/- {ci:.4f}'
        if "bert" in model_name:
            dev_results = load_json(f"models/{model_name}/dev-results.json")
            original_dev_em = dev_results.get('exact', None)
            original_dev_f1 = dev_results.get('f1', None)
        elif "t5" in model_name:
            dev_results = load_json(f"models/{model_name}/dev-results.json")
            original_dev_em = dev_results['exact_final']
            original_dev_f1 = dev_results['f1_final']
        elif 'bidaf' in model_name:
            dev_results = load_json(f"models/{model_name}/metrics.json")
            original_dev_em = dev_results['best_validation_em']
            original_dev_f1 = dev_results['best_validation_f1']
        else:
            original_dev_em = None
            original_dev_f1 = None
        result[model_name] = {
            'evaluation_on_intervention': {
                'human_readable': printable_result,
                'mean': mean,
                '95ci': ci,
                'control': control
            },
            'original_dev_em': original_dev_em,
            'original_dev_f1': original_dev_f1,
            'n': len(aligned_baseline),
            'behaviour': {
                'correct_baseline': sum(results_baseline),
                'correct_intervention': sum(results_intervention),
                'right->change->right': len(correct_change_correct),
                'right->keep->wrong': len(correct_keep_wrong),
                'right->change->wrong': len(correct_change_wrong),
                'wrong->change->right': len(wrong_change_right),
                'wrong->keep->right': len(wrong_keep_right),
                'consistency': len(correct_change_correct) / len(aligned_baseline)
            }
        }
        if control:
            correct_baseline_control_ids = [d.qa_id for d, *_ in correct_baseline_control]
            assert len(correct_baseline_control_ids) == len(set(correct_baseline_control_ids))
            correct_baseline_control_ids = set(correct_baseline_control_ids)
            correct_baseline_control_keep_wrong = [
                x for x in correct_keep_wrong if x[0].qa_id in correct_baseline_control_ids
            ]
            correct_baseline_control_change_wrong = [
                x for x in correct_change_wrong if x[0].qa_id in correct_baseline_control_ids
            ]
            click.echo(f"Got {sum(results_control)} correct for control.")
            result[model_name]['behaviour'].update({
                'correct_control': sum(results_control),
                'correct_baseline_control': len(correct_baseline_control),
                'right+control->change->right': len(correct_baseline_control_intervention),
                'right+control->keep->wrong': len(correct_baseline_control_keep_wrong),
                'right+control->change->wrong': len(correct_baseline_control_change_wrong),
                'consistency+control': len(correct_baseline_control_intervention) / len(aligned_baseline)
            })
        click.echo(f"Overall result: {printable_result}.")
        click.echo(result[model_name]['behaviour'])
        click.echo()
        if split_reasoning:
            result[model_name]['by_reasoning'] = dict()
            for reasoning, gold_split in groupby(sorted(aligned_baseline, key=reasoning_key), key=reasoning_key):
                ab, ai, ac = align(gold_split, gold_intervention, gold_control)
                (
                    overall_result, results_baseline, results_intervention, results_control,
                    correct_before_intervention, correct_change_correct, correct_keep_wrong, correct_change_wrong,
                    wrong_change_right, wrong_keep_right, correct_baseline_control,
                    correct_baseline_control_intervention
                ) = eval_intervention(ab, ai, ac, predictions, predictions_intervention, predictions_control)
                mean, var, ci = get_mean_var_ci_bernoulli(overall_result)
                pr = f'{mean:.4f} +/- {ci:.4f}'
                result[model_name]['by_reasoning'][reasoning] = {
                    'mean': mean,
                    "var": var,
                    'n': len(correct_baseline_control),
                    'tp': len(correct_baseline_control_intervention),
                    '95ci': ci,
                    "printable_result": pr,
                }
                click.echo(f'{reasoning}: {pr}')
        if split_num_modifier:
            result[model_name]['by_num_modifier'] = dict()
            for num_mod, gold_split in groupby(sorted(aligned_baseline, key=num_modifier_key), key=num_modifier_key):
                ab, ai, ac = align(gold_split, gold_intervention, gold_control)
                (
                    overall_result, results_baseline, results_intervention, results_control,
                    correct_before_intervention, correct_change_correct, correct_keep_wrong, correct_change_wrong,
                    wrong_change_right, wrong_keep_right, correct_baseline_control,
                    correct_baseline_control_intervention
                ) = eval_intervention(ab, ai, ac, predictions, predictions_intervention, predictions_control)
                mean, var, ci = get_mean_var_ci_bernoulli(overall_result)
                pr = f'{mean:.4f} +/- {ci:.4f}'
                result[model_name]['by_num_modifier'][num_mod] = {
                    'mean': mean,
                    'n': len(correct_baseline_control),
                    'tp': len(correct_baseline_control_intervention),
                    "var": var,
                    '95ci': ci,
                    "printable_result": pr,
                }
                click.echo(f'{model_name}: {num_mod}: {pr}')
        if split_sam:
            result[model_name]['by_sam'] = dict()
            for sam, gold_split in groupby(sorted(aligned_baseline, key=sam_key), key=sam_key):
                ab, ai, ac = align(gold_split, gold_intervention, gold_control)
                (
                    overall_result, results_baseline, results_intervention, results_control,
                    correct_before_intervention, correct_change_correct, correct_keep_wrong, correct_change_wrong,
                    wrong_change_right, wrong_keep_right, correct_baseline_control,
                    correct_baseline_control_intervention
                ) = eval_intervention(ab, ai, ac, predictions, predictions_intervention, predictions_control)
                mean, var, ci = get_mean_var_ci_bernoulli(overall_result)
                pr = f'{mean:.4f} +/- {ci:.4f}'
                result[model_name]['by_sam'][sam] = {
                    'mean': mean,
                    "var": var,
                    'n': len(correct_baseline_control),
                    'tp': len(correct_baseline_control_intervention),
                    '95ci': ci,
                    "printable_result": pr,
                }
                click.echo(f'{model_name}: {sam}: {pr}')
    click.echo(f"Result: {result}")
    if do_save:
        write_json(result, output)

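# Illustrative-only numbers for the 'consistency' entry computed in evaluate_intervention()
# above: the fraction of all aligned samples that a model answers correctly both before and
# after the intervention. The figures below are made up.
def _consistency_demo():
    n_aligned = 200          # aligned baseline/intervention samples
    right_change_right = 90  # correct on baseline and still correct after the intervention
    consistency = right_change_right / n_aligned
    assert consistency == 0.45
    return consistency
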
def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    model_args: ModelArguments
    data_args: DataTrainingArguments
    training_args: TrainingArguments
    if training_args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    # if training_args.do_eval and not training_args.do_train and not data_args.predictions_folder:
    #     raise ValueError("Supply predictions folder destination to save the predictions!")
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.debug(model_args)
    logger.debug(training_args)
    logger.debug(data_args)
    # raise NotImplementedError
    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            f"Use --overwrite_output_dir to overcome.")

    # Set seed
    set_seed(training_args.seed)
    if training_args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()
    tokenizer = get_tokenizer(model_args.model_name_or_path, do_lower_case=False)
    if data_args.model_parallel == 4:
        model = T5ForConditionalGeneration4WayParallel.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )
    elif data_args.model_parallel == 2:
        model = T5ForConditionalGeneration2WayParallel.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )
    elif data_args.model_parallel is None:
        model = T5ForConditionalGeneration.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )
    else:
        raise ValueError(f"Can only have no, 2way or 4way model parallelism! (got: {data_args.model_parallel})")
    if training_args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Get datasets
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        eval_dataset, examples = get_dataset(data_args.eval_file_path, tokenizer, data_args, evaluate=True)
    else:
        eval_dataset, examples = None, None

    # Training
    if training_args.do_train:
        if training_args.local_rank in [-1, 0]:
            train_dataset, _ = get_dataset(data_args.train_file_path, tokenizer, data_args)
            torch.save(train_dataset, 'features.bin')
        else:
            torch.distributed.barrier()
            train_dataset = None
        if training_args.local_rank == 0:
            torch.distributed.barrier()
        else:
            train_dataset = torch.load('features.bin')

        # Initialize our Trainer
        if data_args.model_parallel:
            trainer = MyTrainer(model=model,
                                args=training_args,
                                train_dataset=train_dataset,
                                eval_dataset=eval_dataset,
                                data_collator=collate_training,
                                prediction_loss_only=True)
            model.set_parallel()
        else:
            trainer = Trainer(model=model,
                              args=training_args,
                              train_dataset=train_dataset,
                              eval_dataset=eval_dataset,
                              data_collator=collate_training,
                              prediction_loss_only=True)
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        if training_args.do_train:
            model_path = os.path.basename(training_args.output_dir)
        else:
            model_path = os.path.basename(model_args.model_name_or_path)
        checkpoints = [training_args.output_dir]
        if data_args.eval_all_checkpoints and training_args.do_train:
            logger.info("Loading checkpoints saved during training for evaluation")
            checkpoints = list(
                os.path.dirname(c)
                for c in sorted(glob.glob(training_args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
            # logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        logger.info(f"Evaluate the following checkpoints: {checkpoints}")
        results = {}
        logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1]
            if not all(s in string.digits for s in global_step):
                global_step = ''
            # no model parallelism here (didn't check model.generate)
            model = T5ForConditionalGeneration.from_pretrained(checkpoint)
            device = torch.device("cuda" if torch.cuda.is_available() and not training_args.no_cuda else "cpu")
            model.to(device)
            model_str = f'{model_path}-{global_step}' if global_step else model_path
            # Note that DistributedSampler samples randomly
            click.echo(f"Generating predictions for model {click.style(model_str, fg='blue')}, "
                       f"running on {click.style(str(training_args.device), fg='green')}")
            predictions = generate_predictions(eval_dataset, examples, model, tokenizer, training_args)
            final_metric = squad_evaluate(examples, predictions)
            if is_wandb_available():
                if training_args.do_train:
                    step = int(global_step) if global_step else trainer.global_step
                else:
                    step = 0
                # for now WANDB cannot 'log back in time'
                wandb.log(final_metric, step=step)
            print(f"GLOBAL STEP: {global_step}")
            result = dict((k + ("_{}".format(global_step) if global_step else '_final'), v)
                          for k, v in final_metric.items())
            logger.info(f"Result for {model_str}: {result}")
            results.update(result)

        # sort results by best checkpoint
        checkpoint_scores = {
            c.split('_')[-1]: v
            for c, v in results.items()
            if any(c.endswith(digit) for digit in string.digits) and c.startswith('exact')
        }
        sorted_checkpoint_scores = {
            k: v
            for k, v in sorted(checkpoint_scores.items(), key=lambda k_v: k_v[1], reverse=True)
        }
        best_cp = next((c for c, v in sorted_checkpoint_scores.items() if v > results['exact_final']), None)
        if best_cp:
            click.echo(f"Best checkpoint is: {best_cp}")
            # copy over best results
            best_cp_folder = f'checkpoint-{best_cp}'
            click.echo(f"Copying over files: from {os.path.join(training_args.output_dir, best_cp_folder)} "
                       f"to {training_args.output_dir}")
            files_to_copy = glob.glob(os.path.join(training_args.output_dir, best_cp_folder, '*'))
            for file in files_to_copy:
                shutil.copy(file, training_args.output_dir)
        else:
            click.echo("Best checkpoint is the last step...")
        # remove checkpoint folders
        folders_to_remove = [
            p for p in glob.glob(os.path.join(training_args.output_dir, '*')) if os.path.isdir(p)
        ]
        click.echo('Folders to remove: ')
        for folder in folders_to_remove:
            click.echo(f"Removing {folder}")
            shutil.rmtree(folder)
        if training_args.do_train:
            logger.info(results)
            write_json(results, os.path.join(training_args.output_dir, 'dev-results.json'))
        else:
            write_json(
                predictions,
                get_output_predictions_file_name(
                    data_args.eval_file_path, training_args.output_dir,
                    os.path.basename(os.path.normpath(model_args.model_name_or_path))))

def finetune(optimize_consistency, evaluate_on, original_dev_dataset, runs_per_trial, hyperparam_opt_runs,
             out_file, mute, baseline_gold_file, hyperparams, keep_predictions, original_ans_length, **kwargs):
    gold_files = get_baseline_intervention_control_from_baseline(baseline_gold_file)
    golds = tuple(load_json(g) for g in gold_files)
    # load eval gold for evaluation
    aligneds = align(*golds, assert_same=True)
    hyper_params = [{
        'name': hp['name'],
        'type': hp.get("type", 'range'),
        'bounds': hp['bounds'],
        'value_type': hp.get('value_type', 'float'),
        'log_scale': hp.get('log_scale', True)
    } for hp in json.loads(hyperparams)]
    logger.info(hyper_params)
    args = Args(**kwargs)
    args.debug_features = not mute
    tokenizer = get_tokenizer(args.model_path, args.do_lower_case)
    features = []
    for f in gold_files:
        args.eval_file = f
        features.append(load_or_convert(args, tokenizer, evaluate=True))
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        kwargs['n_gpu'] = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        kwargs['n_gpu'] = 1
    kwargs['device'] = device
    args.n_gpu = kwargs['n_gpu']
    args.device = kwargs['device']
    if args.seed:
        set_seed(args)
    logger.debug(args)
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    # load train dataset
    train_dataset, train_examples, train_features = load_or_convert(args, tokenizer)
    if not mute:
        debug_features_examples_dataset(train_dataset, train_examples, train_features, tokenizer)
    if original_dev_dataset:
        args.eval_file = original_dev_dataset
        original_dev_dataset = load_or_convert(args, tokenizer, evaluate=True)
    ax_client = AxClient()
    ax_client.create_experiment(
        name=f'{args.model_path}@{args.train_file}',
        parameters=hyper_params,
        objective_name=evaluate_on,
        minimize=False,
    )
    result = {
        "trials": [],
        "tried_params": defaultdict(list),
        "best_params": ...,
        'pre_eval': train_and_eval_single_step(args, train_dataset, *aligneds, *features, original_dev_dataset,
                                               *gold_files, run_nr='eval', train=False, evaluate_on=evaluate_on,
                                               original_ans_length=original_ans_length)
    }
    # first, eval and save what the performance is before training
    click.echo(f"Results: {json.dumps(result['pre_eval'], indent=4)}")
    # run hyperparameter optimisation
    predictions_folder = keep_predictions
    for i in trange(hyperparam_opt_runs):
        parameters, trial_index = ax_client.get_next_trial()
        logger.info(f"Trying parameters: {parameters}")
        single_step_args = deepcopy(kwargs)
        single_step_args.update(parameters)
        args = Args(**single_step_args)
        args.predictions_folder = str(predictions_folder)
        trial_result = train_and_eval_single_step(args, train_dataset, *aligneds, *features, original_dev_dataset,
                                                  *gold_files, run_nr=i, num_runs=runs_per_trial,
                                                  evaluate_on=evaluate_on,
                                                  original_ans_length=original_ans_length)
        if optimize_consistency:
            assert evaluate_on == 'eoi'
            mean = trial_result['consistency']
        else:
            mean = trial_result['overall' if evaluate_on == 'eoi' else 'EMRelaxed']
        if runs_per_trial > 1:
            mean, var, ci = mean
        if original_dev_dataset:
            logger.info(f"Mean: ({mean} * 100 + {trial_result['original']}) / 2")
            mean = (mean * 100 + trial_result['original']) / 2
        trial_result["mean"] = mean
        logger.info(f"Result: {mean}")
        logger.info(f"Results: {json.dumps(trial_result, indent=4)}")
        result["trials"].append(trial_result)
        result['tried_params'][i].append(parameters)
        ax_client.complete_trial(trial_index=trial_index, raw_data=mean)
    best_params, metrics = ax_client.get_best_parameters()
    result['best_params'] = best_params
    result['best_metrics'] = metrics
    click.echo(f"What is metrics? {metrics}")
    click.echo(json.dumps(result, indent=4))
    write_json(result, out_file)

def quality(input, reference, output, attr, random_seed, subsample, taaco_dir, indices):
    if random_seed:
        random.seed(random_seed)
    if not indices:
        indices = DEFAULT_INDICES
    else:
        indices = indices.split(",")
    sample = load_json(input)
    reference = load_json(reference)
    # scores: Dict[str, np.ndarray]
    getter: Callable[[Any], str] = itemgetter(attr)
    corpus: List[str] = [s['paragraphs'][0]['context'] for s in sample['data']]
    n = len(corpus)
    logger.debug(f"Evaluating sample with n={n} paragraphs.")
    if subsample:
        corpus = random.sample(corpus, subsample)
    result = apply_taaco(corpus, taaco_dir, indices)
    # for index, values in result.items():
    #     scores[index] = np.array(values, dtype=np.float)
    corpus_reference: List[str] = [getter(s) for s in reference]
    n_reference = len(corpus_reference)
    scores_reference = apply_taaco(corpus_reference, taaco_dir, indices)

    final_result = dict()
    overall = 0
    overall_ref = 0
    overall_pos = 0
    overall_pos_reference = 0
    overall_neg = 0
    overall_neg_reference = 0
    by_measure = defaultdict(list)
    by_measure_ref = defaultdict(list)
    for index, values in result.items():
        # t_975 = t.ppf(1 - .025, df=n - 1)
        # ci95 = t_975 * values.std() / math.sqrt(len(values))
        values = np.array(values)
        mean, var, ci95 = get_mean_var_ci(values, alpha=0.025)
        printable_result = f'{mean:.4f} +/- {ci95:.4f}'
        values_reference = np.array(scores_reference[index])
        mean_ref, var_ref, ci95_ref = get_mean_var_ci(values_reference, alpha=0.025)
        printable_result_reference = f'{mean_ref:.4f} +/- {ci95_ref:.4f}'
        by_measure[INDEX_TO_MEASURE[index]].append(mean)
        by_measure_ref[INDEX_TO_MEASURE[index]].append(mean_ref)
        if index in NEG:
            overall_neg += mean
            overall_neg_reference += mean_ref
            overall += (1 - mean)
            overall_ref += (1 - mean_ref)
        else:
            overall_pos += mean
            overall_pos_reference += mean_ref
            overall += mean
            overall_ref += mean_ref
        # t_975_reference = t.ppf(1 - .025, df=n_reference - 1)
        # ci95_reference = t_975_reference * values_reference.std()
        click.echo(f"Mean for index {click.style(index, fg='green')} (n={n}): "
                   f"{click.style(printable_result, fg='green', bold=True)}")
        click.echo(f"Reference mean for index {click.style(index, fg='green')} (n={n_reference}): "
                   f"{click.style(printable_result_reference, fg='green', bold=True)}")
        final_result[index] = {
            'ours': {
                'n': len(sample),
                'human_readable': printable_result,
                'mean': mean,
                'variance': var,
                '95ci': ci95,
            },
            "reference": {
                'human_readable': printable_result_reference,
                'mean': mean_ref,
                'variance': var_ref,
                '95ci': ci95_ref
            },
            "difference": {
                "difference": mean - mean_ref,
                # "within_ci": bool(within_ci)
            }
        }
    ours = ((overall_pos / len(POS)) + overall_neg / len(NEG)) / 2
    ref = ((overall_pos_reference / len(POS)) + overall_neg_reference / len(NEG)) / 2
    ours_smooth = overall / len(indices)
    ref_smooth = overall_ref / len(indices)
    by_measure_avg = [
        (sum(v) / len(v)) if k == 'w2v similarity' else 1 - (sum(v) / len(v))
        for k, v in by_measure.items()
    ]
    by_measure_ref_avg = [
        (sum(v) / len(v)) if k == 'w2v similarity' else 1 - (sum(v) / len(v))
        for k, v in by_measure_ref.items()
    ]
    rows = [[k, o, r] for k, o, r in zip(by_measure, by_measure_avg, by_measure_ref_avg)]
    click.echo(tabulate.tabulate(rows, headers=['Measure', "Ours", "Reference"]))
    by_measure_avg = sum(by_measure_avg) / len(by_measure_avg)
    by_measure_avg_ref = sum(by_measure_ref_avg) / len(by_measure_ref_avg)
    final_result['overall'] = {
        'ours': ours,
        "reference": ref,
        'ours_smooth': ours_smooth,
        'ref_smooth': ref_smooth,
        'by_measure_avg': by_measure_avg,
        'by_measure_avg_ref': by_measure_avg_ref
    }
    click.echo(f"Overall: {click.style(f'{ours:03f}', fg='green')} (n={n})")
    click.echo(f"Reference overall: {click.style(f'{ref:03f}', fg='green')} (n={n_reference})")
    click.echo(f"Overall smooth: {click.style(f'{ours_smooth:03f}', fg='green')} (n={n})")
    click.echo(f"Reference overall smooth: {click.style(f'{ref_smooth:03f}', fg='green')} (n={n_reference})")
    click.echo(f"Overall by measure: {click.style(f'{by_measure_avg:03f}', fg='green')} (n={n})")
    click.echo(f"Reference overall by measure: {click.style(f'{by_measure_avg_ref:03f}', fg='green')} "
               f"(n={n_reference})")
    write_json(final_result, output)