Example #1
0
def evaluate(prediction_folder, gold_files, output, metric, split_reasoning, split_num_modifier):
    eval_metrics = metric
    result = dict()
    for gold_file in gold_files:
        click.echo(f"Evaluating predictions on {click.style(gold_file, fg='blue')}")
        gold = list(sample_iter(load_json(gold_file)))
        gold_descriptor, prediction_files = match_prediction_to_gold(gold_file, prediction_folder)
        result[gold_file] = {'n': len(gold)}
        logger.debug(prediction_files)
        for prediction_file in sorted(prediction_files):
            model_name = extract_model_name(gold_descriptor, prediction_file)
            result[gold_file][model_name] = dict()
            click.echo(f"Evaluating predictions of model {click.style(model_name, fg='green')}")
            predictions = load_json(prediction_file)
            result[gold_file][model_name]['full'] = _get_score(gold, predictions, eval_metrics)
            click.echo()
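            # Optionally break the score down by reasoning type and by number of modifications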
            if split_reasoning:
                for reasoning, gold_split in groupby(sorted(gold, key=reasoning_key), key=reasoning_key):
                    result[gold_file][model_name][reasoning] = _get_score(list(gold_split), predictions, eval_metrics,
                                                                          reasoning)
                click.echo()
            if split_num_modifier:
                for num_mod, gold_split in groupby(sorted(gold, key=num_modifier_key), key=num_modifier_key):
                    result[gold_file][model_name][num_mod] = _get_score(list(gold_split), predictions, eval_metrics,
                                                                        f"Num modifications: {num_mod}")
            click.echo()
        write_json(result, output)
Example #2
0
def count(action, n, output, domain: Bundle, mod):
    result = {}
    templates = domain[f"templates_{mod}" if mod else "templates"]
    actions, sentences = get_templates(templates=templates,
                                       action=action,
                                       n=n,
                                       command="Counting")
    for action in actions:
        click.echo(f"For action '{click.style(action, fg='blue')}':")
        r = SizeEstimator(processor=Processor(accessor=Accessor(**templates),
                                              chooser=RandomChooser()))
        upper_bound = r.estimate_size(r.processor.accessor.sentences[action])
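        # Re-instantiate the estimator for the pessimistic (lower-bound) estimate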
        r = SizeEstimator(processor=Processor(accessor=Accessor(**templates),
                                              chooser=RandomChooser()))
        lower_bound = r.estimate_size(r.processor.accessor.sentences[action],
                                      pessimistic=True)
        click.secho(
            f"Pessimistically speaking, you can generate {click.style(str(lower_bound), fg='red', bold=True)} "
            f"distinct sentences!")

        click.secho(
            f"Optimistically speaking, you can generate {click.style(str(upper_bound), fg='green', bold=True)} "
            f"distinct sentences!")
        result[action] = {"lower": lower_bound, "upper": upper_bound}
    if output:
        write_json(result, output)
Example #3
0
def diversity(input, reference, output, attr, random_seed, subsample,
              metric: List[Type[Distance]]):
    if random_seed:
        random.seed(random_seed)
    # TODO: make metrics appendable
    sample = load_json(input)
    reference = load_json(reference)
    getter: Callable[[Any], str] = itemgetter(attr)

    # samples
    corpus: List[str] = [s['paragraphs'][0]['context'] for s in sample['data']]
    corpus_reference: List[str] = [getter(s) for s in reference]
    if subsample:
        corpus = random.sample(corpus, subsample)

    n = len(corpus)
    n_reference = len(corpus_reference)
    logger.debug(f"Evaluating sample with n={n} paragraphs.")
    results = dict()
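    # Compute point-wise average distances for our corpus and the reference corpus under each metric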
    for m in metric:
        result = np.array(pointwise_average_distance(corpus, m()))
        result_reference = np.array(
            pointwise_average_distance(corpus_reference, m()))
        mean, var, ci95 = get_mean_var_ci(result, alpha=0.025)
        mean_ref, var_ref, ci95_ref = get_mean_var_ci(result_reference,
                                                      alpha=0.025)

        printable_result = f'{mean:.4f} +/- {ci95:.4f}'
        printable_result_reference = f'{mean_ref:.4f} +/- {ci95_ref:.4f}'

        click.echo(
            f"Point-wise average distance under the {click.style(str(m.__name__), fg='green')} metric (n={n}): "
            f"{click.style(printable_result, fg='green', bold=True)}")
        click.echo(
            f"Reference point-wise average distance under the {click.style(str(m.__name__), fg='green')} "
            f"metric(n={len(corpus_reference)}): "
            f"{click.style(printable_result_reference, fg='green', bold=True)}"
        )
        results[str(m.__name__)] = {
            'ours': {
                "n": n,
                'human_readable': printable_result,
                'mean': mean,
                'variance': var,
                '95ci': ci95,
            },
            "reference": {
                'n': n_reference,
                'human_readable': printable_result_reference,
                'mean': mean_ref,
                'variance': var_ref,
                '95ci': ci95_ref
            },
            "difference": mean - mean_ref
        }

    if output:
        write_json(results, output)
Example #4
0
def predictions(in_files, out_folder, model_paths, model_types, no_cuda,
                per_gpu_eval_batch_size, do_not_lower_case, lang_id, v2,
                n_best_size, max_answer_length, verbose_logging,
                null_score_diff_threshold, do_evaluate, **kwargs):
    assert len(model_paths) == len(model_types)
    for model_path, model_type in zip(model_paths, model_types):
        model = get_model(model_path)
        args = Args(model_path=model_path,
                    model_type=model_type,
                    predictions_folder=out_folder,
                    no_cuda=no_cuda,
                    do_not_lower_case=do_not_lower_case,
                    per_gpu_eval_batch_size=per_gpu_eval_batch_size,
                    lang_id=lang_id,
                    v2=v2,
                    n_best_size=n_best_size,
                    max_answer_length=max_answer_length,
                    verbose_logging=verbose_logging,
                    null_score_diff_threshold=null_score_diff_threshold,
                    **kwargs)
        tokenizer = get_tokenizer(model_path, args.do_lower_case)
        for in_file in in_files:
            args.eval_file = in_file
            logger.debug(args)
            dataset, examples, features = load_or_convert(args,
                                                          tokenizer,
                                                          evaluate=True)
            if do_evaluate:
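                # Temporarily clear the predictions folder (restored below), presumably so evaluate() does not write prediction files itself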
                out_path = args.predictions_folder
                args.predictions_folder = None
                suffix = os.path.basename(os.path.normpath(model_path))
                score = evaluate(args,
                                 model,
                                 tokenizer,
                                 dataset,
                                 examples,
                                 features,
                                 suffix=suffix,
                                 return_raw=False)
                file_name = get_output_predictions_file_name(
                    args.eval_file, out_path, suffix)
                write_json(score, file_name)
                args.predictions_folder = out_path
            else:
                evaluate(args,
                         model,
                         tokenizer,
                         dataset,
                         examples,
                         features,
                         suffix=os.path.basename(os.path.normpath(model_path)))
Example #5
0
def predict(in_files, output_folder, models, model_classes, gpu, batch_size):
    # There is a chance I'll need to scrap all of this and do the convert-to-features approach instead
    if gpu is None:
        gpu = _is_gpu_available()
    logger.debug(fmt_dict(locals()))
    if not len(models) == len(model_classes):
        click.echo(
            f"Num models supplied ({len(models)})!= num model classes supplied ({len(model_classes)})!"
        )
        sys.exit(1)

    for cls, weights_path in zip(model_classes, models):
        model_cls: Model = do_import(cls, relative_import='stresstest.model')
        # TODO: Bidaf should also respect max answer length
        model = model_cls.make(weights_path, gpu=gpu)
        click.echo(
            f"Evaluating model '{click.style(model_cls.__name__, fg='green', bold=True)}' from weights file: "
            f"{click.style(weights_path, fg='blue')}.")
        click.echo(
            f"Running on {click.style('gpu' if gpu else 'cpu', fg='green', bold=True)}."
        )
        for in_file in in_files:
            sample = load_json(in_file)
            num_q = num_questions(sample)
            click.echo(
                f"Evaluating on sample (n={num_q}, |{{C}}|={len(sample)}): {click.style(in_file, fg='blue')}"
            )

            predictions = dict()
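            # Predict in batches and collect answers keyed by question id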
            for sample_batch in batch(tqdm(sample_iter(sample),
                                           position=1,
                                           total=num_q),
                                      batch_size=batch_size):
                sample_batch: List[Entry]
                batch_predictions = model.predict_batch(sample_batch)
                for entry, answer in zip(sample_batch, batch_predictions):
                    logger.debug(f"Passage: {entry.passage}")
                    logger.debug(f"Question: {entry.question}")
                    logger.debug(f"Prediction: {answer}")
                    predictions[entry.qa_id] = str(answer)
            output_file_name = get_output_predictions_file_name(
                in_file, output_folder, weights_path)
            click.echo(
                f"Saving predictions to {click.style(output_file_name, fg='blue')}"
            )
            write_json(predictions, output_file_name, pretty=False)
Example #6
0
def update_zone_names(output_path):
    # update all the zone names and set the right ids to be written in the poly_zone_ids.bin
    global poly_zone_ids
    global list_of_pointers
    global poly_boundaries
    global polygons
    global polygon_lengths
    global polynrs_of_holes
    global nr_of_zones
    global nr_of_polygons
    file_path = abspath(join(output_path, TIMEZONE_NAMES_FILE))
    print(f"updating the zone names in {file_path} now.")
    # write the zone names (python list) as JSON
    write_json(all_tz_names, file_path)
    print("...Done.\n\nComputing where zones start and end...")
    last_id = -1
    zone_id = 0
    poly_nr = 0
    for poly_nr, zone_id in enumerate(poly_zone_ids):
        if zone_id != last_id:
            poly_nr2zone_id.append(poly_nr)
            assert zone_id >= last_id
            last_id = zone_id
    assert nr_of_polygons == len(poly_zone_ids)

    # TODO
    # assert (
    #         zone_id == nr_of_zones - 1
    # ), f"not pointing to the last zone with id {nr_of_zones - 1}"
    # assert (
    #         poly_nr == nr_of_polygons - 1
    # ), f"not pointing to the last polygon with id {nr_of_polygons - 1}"
    # ATTENTION: add one more entry for knowing where the last zone ends!
    # ATTENTION: the last entry is one higher than the last polygon id (to be consistent with the other entries)
    poly_nr2zone_id.append(nr_of_polygons)
    # assert len(poly_nr2zone_id) == nr_of_zones + 1
    print("...Done.\n")
Example #7
0
def generate_modifier(config, out_path, seed, subsample, do_print, do_save,
                      domain, num_workers, split_templates, modifier_type):
    if seed:
        random.seed(seed)
    uuid4 = lambda: uuid.UUID(int=random.getrandbits(128)).hex

    cfg = Config(config)
    max_sents = cfg["world.num_sentences"]

    modify_event_types = cfg['modify_event_types']
    # num of modifications: f(max_sent) = |modify_event_types| * 1/3 * max_sent * (max_sent - 1) * (max_sent - 2)
    n = int(
        len(modify_event_types) * 1 / 3 * max_sents * (max_sents - 1) *
        (max_sents - 2))
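    # Optionally split the modifier templates into two disjoint sets and generate from each split separately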
    if split_templates:
        first, second = split(domain.templates_modifier,
                              event_types_to_split=modify_event_types,
                              split_ratio=split_templates)
        template_splits = [first, second]

        click.echo(f"Splitting templates with a {split_templates} ratio.")

        for event_type, templates in domain.templates_modifier[
                'sentences'].items():
            click.echo(f"For event type '{event_type}'")
            click.echo(
                f"First split: {[templates.index(t) for t in first['sentences'][event_type]]}"
            )
            click.echo(
                f"Second split: {[templates.index(t) for t in second['sentences'][event_type]]}"
            )
    else:
        template_splits = [domain.templates_modifier]

    for i, templates in enumerate(template_splits):
        answer_types = cfg.get('answer_types')
        question_types = cfg.get('reasoning')

        subsample_str = subsample if subsample else 'full'
        subsample_str = f"{subsample_str}-{i}" if split_templates else subsample_str
        file_name = f"{{}}-{'-'.join(answer_types)}-{'-'.join(question_types)}-{n}-{subsample_str}.json"
        click.echo(
            f"Generating from '{click.style(config, fg='green')}': {click.style(str(n), fg='green', bold=True)} passages, "
            f"{click.style(str(subsample_str), fg='green', bold=True)} realisation per passage."
        )
        click.echo(
            f"Saving baseline in "
            f"{click.style(os.path.join(out_path, file_name.format(BASELINE)), fg='blue', bold=True)}."
        )
        click.echo(
            f"Saving modified in "
            f"{click.style(os.path.join(out_path, file_name.format(INTERVENTION)), fg='blue', bold=True)}."
        )
        click.echo(
            f"Saving control in "
            f"{click.style(os.path.join(out_path, file_name.format(CONTROL)), fg='blue', bold=True)}."
        )

        baseline, modified, control = generate(cfg, domain, num_workers,
                                               subsample, templates, uuid4,
                                               modifier_type)
        baseline = sorted(baseline, key=lambda d: d['title'])
        modified = sorted(modified, key=lambda d: d['title'])
        control = sorted(control, key=lambda d: d['title'])
        if do_print:
            _do_print(baseline, modified, control)

        click.echo(f"Total Passages: {len(baseline)}")
        click.echo(
            f"Total Questions over baseline passages: {sum(len(b['paragraphs'][0]['qas']) for b in baseline)}"
        )
        click.echo(
            f"Total Questions over modified passages: {sum(len(b['paragraphs'][0]['qas']) for b in modified)}"
        )

        if do_save:
            write_json({
                "version": 0.1,
                "data": baseline
            },
                       os.path.join(out_path, file_name.format(BASELINE)),
                       pretty=False)
            write_json({
                "version": 0.1,
                "data": modified
            },
                       os.path.join(out_path, file_name.format(INTERVENTION)),
                       pretty=False)
            write_json({
                "version": 0.1,
                "data": control
            },
                       os.path.join(out_path, file_name.format(CONTROL)),
                       pretty=False)
Example #8
0
def train(**kwargs):
    # doc_stride = kwargs.pop("doc_stride")
    # max_query_length = kwargs.pop('max_query_length')
    # max_seq_length = kwargs.pop("max_seq_length")
    # num_workers = kwargs.pop('num_workers')
    # debug_features = kwargs.pop('debug_features')
    # do_lower_case = not kwargs.pop('do_not_lower_case')
    # kwargs['logging_steps'] = [int(i) for i in kwargs['logging_steps'].split(',')] if kwargs['logging_steps'] else []
    args = Args(**kwargs)
    args.local_rank = int(os.environ.get('LOCAL_RANK', -1))
    logger.debug(args)
    if (os.path.exists(args.save_model_folder)
            and os.listdir(args.save_model_folder)
            and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.save_model_folder))
    # os.makedirs(args.predictions_folder, exist_ok=True)
    os.makedirs(args.save_model_folder, exist_ok=True)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    if args.local_rank not in [-1, 0]:
        logger.remove()
        logger.add(sys.stdout, level="WARNING")
    logger.warning(
        f"Process rank: {args.local_rank}, device: {device}, n_gpu: "
        f"{args.n_gpu}, distributed training: "
        f"{bool(args.local_rank != -1)}, 16-bits training: {args.fp16}", )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()

    tokenizer = get_tokenizer(args.model_path, args.do_lower_case)
    model = get_model(args.model_path)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    # logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum
    # if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations.
    # Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
    train_dataset = load_or_convert(args, tokenizer, dataset_only=True)
    # train_dataset, e, f = load_examples(args.train_file)
    logger.info("loaded dataset")
    global_step, tr_loss = do_train(args, train_dataset, model, tokenizer)
    logger.info(f"global_step = {global_step}, average loss = {tr_loss}")

    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logger.info(f"Saving model checkpoint to {args.save_model_folder}")
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.save_model_folder)
        tokenizer.save_pretrained(args.save_model_folder)

        # Good practice: save your training arguments together with the trained model
        torch.save(args,
                   os.path.join(args.save_model_folder, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForQuestionAnswering.from_pretrained(
            args.save_model_folder)  # , force_download=True)
        tokenizer = AutoTokenizer.from_pretrained(
            args.save_model_folder, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval_after_training and args.local_rank in [-1, 0]:
        logger.info("Loading checkpoints saved during training for evaluation")
        checkpoints = [args.save_model_folder]
        if args.eval_all_checkpoints:
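            # Discover every checkpoint directory that contains saved weights under the output folder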
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.save_model_folder + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            # logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs

        logger.info(f"Evaluate the following checkpoints: {checkpoints}")
        dataset, examples, features = load_or_convert(args,
                                                      tokenizer,
                                                      evaluate=True)
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            model = AutoModelForQuestionAnswering.from_pretrained(
                checkpoint)  # , force_download=True)
            model.to(args.device)

            # Evaluate

            result = evaluate(args,
                              model,
                              tokenizer,
                              dataset,
                              examples,
                              features,
                              suffix=global_step)

            result = dict(
                (k + ("_{}".format(global_step) if global_step else ""), v)
                for k, v in result.items())
            results.update(result)

    logger.info("Results: {}".format(results))
    write_json(results, os.path.join(args.save_model_folder,
                                     'dev-results.json'))
    return results
Example #9
0
def evaluate_intervention(predictions_folder, baseline_file, output, do_print, do_save, control,
                          split_reasoning, split_num_modifier, split_sam):
    gold = load_json(baseline_file)
    intervention_basename = os.path.basename(baseline_file).replace(BASELINE, INTERVENTION)
    intervention_file = baseline_file.replace(os.path.basename(baseline_file), intervention_basename)
    gold_intervention = load_json(intervention_file)
    gold_descriptor, prediction_files = match_prediction_to_gold(baseline_file, predictions_folder)
    gold_intervention_descriptor, prediction_intervention_files = match_prediction_to_gold(intervention_file,
                                                                                           predictions_folder)

    click.echo(f"Evaluation by intervention with baseline gold: {click.style(baseline_file, fg='blue')}")
    click.echo(f"And intervention gold: {click.style(intervention_file, fg='blue')}")

    if control:
        control_basename = os.path.basename(baseline_file).replace(BASELINE, CONTROL)
        control_file = baseline_file.replace(os.path.basename(baseline_file), control_basename)
        gold_control = load_json(control_file)
        _, control_prediction_files = match_prediction_to_gold(control_file, predictions_folder)

        click.echo(f"And control gold: {click.style(control_file, fg='blue')}")
        # assert c_aligned_baseline == aligned_baseline, c_aligned_intervention == aligned_intervention
    else:
        control_prediction_files = [""] * len(prediction_files)
        gold_control = None

    aligned_baseline, aligned_intervention, aligned_control = align(gold, gold_intervention,
                                                                    gold_control)

    result = dict()

    for predictions_file, prediction_intervention_file, control_prediction_file in \
            zip(sorted(prediction_files), sorted(prediction_intervention_files), sorted(control_prediction_files)):
        predictions: Dict[str, str] = load_json(predictions_file)
        predictions_intervention: Dict[str, str] = load_json(prediction_intervention_file)
        model_name = extract_model_name(gold_descriptor, predictions_file)
        click.echo(f"Evaluating predictions of model {click.style(model_name, fg='green')}")
        click.echo(f"Evaluating {click.style(str(len(aligned_baseline)), fg='green', bold=True)} sample(s).")
        predictions_control: Dict[str, str] = load_json(control_prediction_file) if control_prediction_file else None
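        # eval_intervention returns the overall result plus the per-behaviour breakdowns unpacked below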
        (
            overall_result, results_baseline, results_intervention, results_control,
            correct_before_intervention, correct_change_correct, correct_keep_wrong, correct_change_wrong,
            wrong_change_right, wrong_keep_right, correct_baseline_control, correct_baseline_control_intervention
        ) = eval_intervention(aligned_baseline, aligned_intervention, aligned_control, predictions,
                              predictions_intervention, predictions_control)
        click.echo(f"Got {sum(results_baseline)} correct for baseline.")
        click.echo(f"Got {sum(results_intervention)} correct for intervention.")
        click.echo(f"Out of {sum(results_baseline)} correct baseline results, got {len(correct_change_correct)} "
                   f"correct after intervention.")
        click.echo(f"Interventions that the model 'ignored': {len(correct_keep_wrong)}")
        click.echo(f"Interventions that left the model 'confused': {len(correct_change_wrong)}")
        click.echo(f"Wrong predictions that the model changed to correct: {len(wrong_change_right)}")
        click.echo(f"Wrong predictions that the model didn't change but that became correct: {len(wrong_keep_right)}")

        if do_print:
            print_examples(correct_baseline_control, correct_baseline_control_intervention, correct_change_correct,
                           correct_keep_wrong, correct_change_wrong, wrong_change_right, wrong_keep_right)

        mean, var, ci = get_mean_var_ci_bernoulli(overall_result)
        printable_result = f'{mean:.4f} +/- {ci:.4f}'
        if "bert" in model_name:
            dev_results = load_json(f"models/{model_name}/dev-results.json")
            original_dev_em = dev_results.get('exact', None)
            original_dev_f1 = dev_results.get('f1', None)
        elif "t5" in model_name:
            dev_results = load_json(f"models/{model_name}/dev-results.json")
            original_dev_em = dev_results['exact_final']
            original_dev_f1 = dev_results['f1_final']
        elif 'bidaf' in model_name:
            dev_results = load_json(f"models/{model_name}/metrics.json")
            original_dev_em = dev_results['best_validation_em']
            original_dev_f1 = dev_results['best_validation_f1']
        else:
            original_dev_em = None
            original_dev_f1 = None
        result[model_name] = {
            'evaluation_on_intervention': {
                'human_readable': printable_result,
                'mean': mean,
                '95ci': ci,
                'control': control
            },
            'original_dev_em': original_dev_em,
            'original_dev_f1': original_dev_f1,
            'n': len(aligned_baseline),
            'behaviour': {
                'correct_baseline': sum(results_baseline),
                'correct_intervention': sum(results_intervention),
                'right->change->right': len(correct_change_correct),
                'right->keep->wrong': len(correct_keep_wrong),
                'right->change->wrong': len(correct_change_wrong),
                'wrong->change->right': len(wrong_change_right),
                'wrong->keep->right': len(wrong_keep_right),
                'consistency': len(correct_change_correct) / len(aligned_baseline)
            }
        }
        if control:
            correct_baseline_control_ids = [d.qa_id for d, *_ in correct_baseline_control]
            assert len(correct_baseline_control_ids) == len(set(correct_baseline_control_ids))
            correct_baseline_control_ids = set(correct_baseline_control_ids)
            correct_baseline_control_keep_wrong = [
                x for x in correct_keep_wrong if x[0].qa_id in correct_baseline_control_ids
            ]
            correct_baseline_control_change_wrong = [
                x for x in correct_change_wrong if x[0].qa_id in correct_baseline_control_ids
            ]
            click.echo(f"Got {sum(results_control)} correct for control.")
            result[model_name]['behaviour'].update({
                'correct_control': sum(results_control),
                'correct_baseline_control': len(correct_baseline_control),
                'right+control->change->right': len(correct_baseline_control_intervention),
                'right+control->keep->wrong': len(correct_baseline_control_keep_wrong),
                'right+control->change->wrong': len(correct_baseline_control_change_wrong),
                'consistency+control': len(correct_baseline_control_intervention) / len(aligned_baseline)
            })
        click.echo(f"Overall result: {printable_result}.")
        click.echo(result[model_name]['behaviour'])
        click.echo()

        if split_reasoning:
            result[model_name]['by_reasoning'] = dict()
            for reasoning, gold_split in groupby(sorted(aligned_baseline, key=reasoning_key), key=reasoning_key):
                ab, ai, ac = align(gold_split, gold_intervention, gold_control)
                (
                    overall_result, results_baseline, results_intervention, results_control,
                    correct_before_intervention, correct_change_correct, correct_keep_wrong, correct_change_wrong,
                    wrong_change_right, wrong_keep_right, correct_baseline_control,
                    correct_baseline_control_intervention
                ) = eval_intervention(ab, ai, ac, predictions, predictions_intervention, predictions_control)
                mean, var, ci = get_mean_var_ci_bernoulli(overall_result)
                pr = f'{mean:.4f} +/- {ci:.4f}'
                result[model_name]['by_reasoning'][reasoning] = {
                    'mean': mean,
                    "var": var,
                    'n': len(correct_baseline_control),
                    'tp': len(correct_baseline_control_intervention),
                    '95ci': ci,
                    "printable_result": pr,
                }
                click.echo(f'{reasoning}: {pr}')
        if split_num_modifier:
            result[model_name]['by_num_modifier'] = dict()
            for num_mod, gold_split in groupby(sorted(aligned_baseline, key=num_modifier_key), key=num_modifier_key):
                ab, ai, ac = align(gold_split, gold_intervention, gold_control)
                (
                    overall_result, results_baseline, results_intervention, results_control,
                    correct_before_intervention, correct_change_correct, correct_keep_wrong, correct_change_wrong,
                    wrong_change_right, wrong_keep_right, correct_baseline_control,
                    correct_baseline_control_intervention
                ) = eval_intervention(ab, ai, ac, predictions, predictions_intervention, predictions_control)
                mean, var, ci = get_mean_var_ci_bernoulli(overall_result)
                pr = f'{mean:.4f} +/- {ci:.4f}'
                result[model_name]['by_num_modifier'][num_mod] = {
                    'mean': mean,
                    'n': len(correct_baseline_control),
                    'tp': len(correct_baseline_control_intervention),
                    "var": var,
                    '95ci': ci,
                    "printable_result": pr,
                }
                click.echo(f'{model_name}: {num_mod}: {pr}')
        if split_sam:
            result[model_name]['by_sam'] = dict()
            for sam, gold_split in groupby(sorted(aligned_baseline, key=sam_key), key=sam_key):
                ab, ai, ac = align(gold_split, gold_intervention, gold_control)
                (
                    overall_result, results_baseline, results_intervention, results_control,
                    correct_before_intervention, correct_change_correct, correct_keep_wrong, correct_change_wrong,
                    wrong_change_right, wrong_keep_right, correct_baseline_control,
                    correct_baseline_control_intervention
                ) = eval_intervention(ab, ai, ac, predictions, predictions_intervention, predictions_control)
                mean, var, ci = get_mean_var_ci_bernoulli(overall_result)
                pr = f'{mean:.4f} +/- {ci:.4f}'
                result[model_name]['by_sam'][sam] = {
                    'mean': mean,
                    "var": var,
                    'n': len(correct_baseline_control),
                    'tp': len(correct_baseline_control_intervention),
                    '95ci': ci,
                    "printable_result": pr,
                }
                click.echo(f'{model_name}: {sam}: {pr}')
    click.echo(f"Result: {result}")
    if do_save:
        write_json(result, output)
Example #10
0
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    model_args: ModelArguments
    data_args: DataTrainingArguments
    training_args: TrainingArguments
    if training_args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
    # if training_args.do_eval and not training_args.do_train and not data_args.predictions_folder:
    #     raise ValueError("Supply predictions folder destination to save the predictions!")
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )

    logger.debug(model_args)
    logger.debug(training_args)
    logger.debug(data_args)
    # raise NotImplementedError
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            f"Use --overwrite_output_dir to overcome.")

    # Set seed
    set_seed(training_args.seed)
    if training_args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()
    tokenizer = get_tokenizer(model_args.model_name_or_path,
                              do_lower_case=False)
    if data_args.model_parallel == 4:
        model = T5ForConditionalGeneration4WayParallel.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )
    elif data_args.model_parallel == 2:
        model = T5ForConditionalGeneration2WayParallel.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )
    elif data_args.model_parallel is None:
        model = T5ForConditionalGeneration.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )
    else:
        raise ValueError(
            f"Can only have no, 2way or 4way model parallelism! (expected: {data_args.model_parallel})"
        )
    if training_args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()
    # Get datasets
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        eval_dataset, examples = get_dataset(data_args.eval_file_path,
                                             tokenizer,
                                             data_args,
                                             evaluate=True)
    else:
        eval_dataset, examples = None, None
    # Training
    if training_args.do_train:
        if training_args.local_rank in [-1, 0]:
            train_dataset, _ = get_dataset(data_args.train_file_path,
                                           tokenizer, data_args)
            torch.save(train_dataset, 'features.bin')
        else:
            torch.distributed.barrier()
            train_dataset = None

        if training_args.local_rank == 0:
            torch.distributed.barrier()
        else:
            train_dataset = torch.load('features.bin')
        # Initialize our Trainer
        if data_args.model_parallel:
            trainer = MyTrainer(model=model,
                                args=training_args,
                                train_dataset=train_dataset,
                                eval_dataset=eval_dataset,
                                data_collator=collate_training,
                                prediction_loss_only=True)
            model.set_parallel()
        else:
            trainer = Trainer(model=model,
                              args=training_args,
                              train_dataset=train_dataset,
                              eval_dataset=eval_dataset,
                              data_collator=collate_training,
                              prediction_loss_only=True)
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        if training_args.do_train:
            model_path = os.path.basename(training_args.output_dir)
        else:
            model_path = os.path.basename(model_args.model_name_or_path)
        checkpoints = [training_args.output_dir]
        if data_args.eval_all_checkpoints and training_args.do_train:
            logger.info(
                "Loading checkpoints saved during training for evaluation")
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(training_args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            # logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs

        logger.info(f"Evaluate the following checkpoints: {checkpoints}")
        results = {}

        logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1]
            if not all(s in string.digits for s in global_step):
                global_step = ''
            # no model parallelism here (didn't check model.generate)
            model = T5ForConditionalGeneration.from_pretrained(checkpoint)
            device = torch.device("cuda" if torch.cuda.is_available()
                                  and not training_args.no_cuda else "cpu")
            model.to(device)
            model_str = f'{model_path}-{global_step}' if global_step else model_path
            # Note that DistributedSampler samples randomly
            click.echo(
                f"Generating predictions for model {click.style(model_str, fg='blue')}, "
                f"running on {click.style(str(training_args.device), fg='green')}"
            )
            predictions = generate_predictions(eval_dataset, examples, model,
                                               tokenizer, training_args)
            final_metric = squad_evaluate(examples, predictions)

            if is_wandb_available():
                if training_args.do_train:
                    step = int(
                        global_step) if global_step else trainer.global_step
                else:
                    step = 0
                # for now WANDB cannot 'log back in time'
                wandb.log(final_metric, step=step)
            print(f"GLOBAL STEP: {global_step}")
            result = dict(
                (k + ("_{}".format(global_step) if global_step else '_final'),
                 v) for k, v in final_metric.items())

            logger.info(f"Result for {model_str}: {result}")
            results.update(result)

        # sort results by best
        checkpoint_scores = {
            c.split('_')[-1]: v
            for c, v in results.items()
            if any(c.endswith(digit)
                   for digit in string.digits) and c.startswith('exact')
        }
        sorted_checkpoint_scores = {
            k: v
            for k, v in sorted(checkpoint_scores.items(),
                               key=lambda k_v: k_v[1],
                               reverse=True)
        }
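        # Scores are sorted in descending order, so the first checkpoint beating the final score is the best one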
        best_cp = next((c for c, v in sorted_checkpoint_scores.items()
                        if v > results['exact_final']), None)

        if best_cp:
            click.echo(f"Best checkpoint is: {best_cp}")
            # copy over best results
            best_cp_folder = f'checkpoint-{best_cp}'

            click.echo(
                f"Copying over files: from {os.path.join(training_args.output_dir, best_cp_folder)} "
                f"to {training_args.output_dir}")
            files_to_copy = glob.glob(
                os.path.join(training_args.output_dir, best_cp_folder, '*'))
            for file in files_to_copy:
                shutil.copy(file, training_args.output_dir)
        else:
            click.echo("best checkpoint is the last step...")
        # remove checkpoint folders
        folders_to_remove = [
            p for p in glob.glob(os.path.join(training_args.output_dir, '*'))
            if os.path.isdir(p)
        ]
        click.echo('Folders to remove: ')
        for folder in folders_to_remove:
            click.echo(f"Removing {folder}")
            shutil.rmtree(folder)
        if training_args.do_train:
            logger.info(results)
            write_json(
                results,
                os.path.join(training_args.output_dir, 'dev-results.json'))
        else:
            write_json(
                predictions,
                get_output_predictions_file_name(
                    data_args.eval_file_path, training_args.output_dir,
                    os.path.basename(
                        os.path.normpath(model_args.model_name_or_path))))
Example #11
0
def finetune(optimize_consistency, evaluate_on, original_dev_dataset,
             runs_per_trial, hyperparam_opt_runs, out_file, mute,
             baseline_gold_file, hyperparams, keep_predictions,
             original_ans_length, **kwargs):
    gold_files = get_baseline_intervention_control_from_baseline(
        baseline_gold_file)

    golds = tuple(load_json(g) for g in gold_files)
    # load eval gold for evaluation
    aligneds = align(*golds, assert_same=True)

    hyper_params = [{
        'name': hp['name'],
        'type': hp.get("type", 'range'),
        'bounds': hp['bounds'],
        'value_type': hp.get('value_type', 'float'),
        'log_scale': hp.get('log_scale', True)
    } for hp in json.loads(hyperparams)]

    logger.info(hyper_params)

    args = Args(**kwargs)

    args.debug_features = not mute
    tokenizer = get_tokenizer(args.model_path, args.do_lower_case)
    features = []
    for f in gold_files:
        args.eval_file = f
        features.append(load_or_convert(args, tokenizer, evaluate=True))
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        kwargs['n_gpu'] = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        kwargs['n_gpu'] = 1
    kwargs['device'] = device
    args.n_gpu = kwargs['n_gpu']
    args.device = kwargs['device']
    if args.seed:
        set_seed(args)
    logger.debug(args)

    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # load train dataset

    train_dataset, train_examples, train_features = load_or_convert(
        args, tokenizer)
    if not mute:
        debug_features_examples_dataset(train_dataset, train_examples,
                                        train_features, tokenizer)
    if original_dev_dataset:
        args.eval_file = original_dev_dataset
        original_dev_dataset = load_or_convert(args, tokenizer, evaluate=True)
    ax_client = AxClient()
    ax_client.create_experiment(
        name=f'{args.model_path}@{args.train_file}',
        parameters=hyper_params,
        objective_name=evaluate_on,
        minimize=False,
    )
    result = {
        "trials": [],
        "tried_params": defaultdict(list),
        "best_params": ...,
        'pre_eval': train_and_eval_single_step(args,
                                               train_dataset,
                                               *aligneds,
                                               *features,
                                               original_dev_dataset,
                                               *gold_files,
                                               run_nr='eval',
                                               train=False,
                                               evaluate_on=evaluate_on,
                                               original_ans_length=original_ans_length)
    }
    # first, eval and save what is the performance before training

    click.echo(f"Results: {json.dumps(result['pre_eval'], indent=4)}")
    # run hyperparam optimisation
    predictions_folder = keep_predictions
    for i in trange(hyperparam_opt_runs):
        parameters, trial_index = ax_client.get_next_trial()
        logger.info(f"Trying parameters: {parameters}")
        single_step_args = deepcopy(kwargs)
        single_step_args.update(parameters)
        args = Args(**single_step_args)
        args.predictions_folder = str(predictions_folder)
        trial_result = train_and_eval_single_step(
            args,
            train_dataset,
            *aligneds,
            *features,
            original_dev_dataset,
            *gold_files,
            run_nr=i,
            num_runs=runs_per_trial,
            evaluate_on=evaluate_on,
            original_ans_length=original_ans_length)
        if optimize_consistency:
            assert evaluate_on == 'eoi'
            mean = trial_result['consistency']
        else:
            mean = trial_result['overall' if evaluate_on ==
                                'eoi' else 'EMRelaxed']
        if runs_per_trial > 1:
            mean, var, ci = mean
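        # If an original dev set is given, average the challenge-set score (scaled to percent) with the original dev score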
        if original_dev_dataset:
            logger.info(f"Mean: ({mean} * 100 + {trial_result['original']})/2")
            mean = (mean * 100 + trial_result['original']) / 2

        trial_result["mean"] = mean

        logger.info(f"Result: {mean}")
        logger.info(f"Results: {json.dumps(trial_result, indent=4)}")
        result["trials"].append(trial_result)
        result['tried_params'][i].append(parameters)
        ax_client.complete_trial(trial_index=trial_index, raw_data=mean)
    best_params, metrics = ax_client.get_best_parameters()
    result['best_params'] = best_params
    result['best_metrics'] = metrics
    click.echo(f"What is metrics? {metrics}")
    click.echo(json.dumps(result, indent=4))
    write_json(result, out_file)
Example #12
0
def quality(input, reference, output, attr, random_seed, subsample, taaco_dir,
            indices):
    if random_seed:
        random.seed(random_seed)
    if not indices:
        indices = DEFAULT_INDICES
    else:
        indices = indices.split(",")

    sample = load_json(input)
    reference = load_json(reference)
    # scores = Dict[str, np.ndarray]
    getter: Callable[[Any], str] = itemgetter(attr)

    corpus: List[str] = [s['paragraphs'][0]['context'] for s in sample['data']]
    n = len(corpus)
    logger.debug(f"Evaluating sample with n={n} paragraphs.")
    if subsample:
        corpus = random.sample(corpus, subsample)

    result = apply_taaco(corpus, taaco_dir, indices)
    # for index, values in result.items():
    #    scores[index] = np.array(values, dtype=np.float)

    corpus_reference: List[str] = [getter(s) for s in reference]
    n_reference = len(corpus_reference)
    scores_reference = apply_taaco(corpus_reference, taaco_dir, indices)

    final_result = dict()
    overall = 0
    overall_ref = 0
    overall_pos = 0
    overall_pos_reference = 0
    overall_neg = 0
    overall_neg_reference = 0
    by_measure = defaultdict(list)
    by_measure_ref = defaultdict(list)
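    # Aggregate each TAACO index; NEG indices are lower-is-better, so their contribution to the smoothed overall score is inverted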
    for index, values in result.items():
        # t_975 = t.ppf(1 - .025, df=n - 1)
        # ci95 = t_975 * values.std() / math.sqrt(len(values))
        values = np.array(values)
        mean, var, ci95 = get_mean_var_ci(values, alpha=0.025)
        printable_result = f'{mean:.4f} +/- {ci95:.4f}'

        values_reference = np.array(scores_reference[index])
        mean_ref, var_ref, ci95_ref = get_mean_var_ci(values_reference,
                                                      alpha=0.025)
        printable_result_reference = f'{mean_ref:.4f} +/- {ci95_ref:.4f}'

        by_measure[INDEX_TO_MEASURE[index]].append(mean)
        by_measure_ref[INDEX_TO_MEASURE[index]].append(mean_ref)

        if index in NEG:
            overall_neg += mean
            overall_neg_reference += mean_ref
            overall += (1 - mean)
            overall_ref += (1 - mean_ref)
        else:
            overall_pos += mean
            overall_pos_reference += mean_ref
            overall += mean
            overall_ref += mean_ref
        # t_975_reference = t.ppf(1 - .025, df=n_reference - 1)
        # ci95_reference = t_975_reference * values_reference.std()
        click.echo(f"Mean for index {click.style(index, fg='green')} (n={n}): "
                   f"{click.style(printable_result, fg='green', bold=True)}")
        click.echo(
            f"Reference mean for index {click.style(index, fg='green')} (n={n_reference}): "
            f"{click.style(printable_result_reference, fg='green', bold=True)}"
        )
        final_result[index] = {
            'ours': {
                'n': n,
                'human_readable': printable_result,
                'mean': mean,
                'variance': var,
                '95ci': ci95,
            },
            "reference": {
                'human_readable': printable_result_reference,
                'mean': mean_ref,
                'variance': var_ref,
                '95ci': ci95_ref
            },
            "difference": {
                "difference": mean - mean_ref,
                # "within_ci": bool(within_ci)
            }
        }
    ours = ((overall_pos / len(POS)) + overall_neg / len(NEG)) / 2
    ref = ((overall_pos_reference / len(POS)) +
           overall_neg_reference / len(NEG)) / 2

    ours_smooth = overall / len(indices)
    ref_smooth = overall_ref / len(indices)

    by_measure_avg = [
        (sum(v) / len(v)) if k == 'w2v similarity' else 1 - (sum(v) / len(v))
        for k, v in by_measure.items()
    ]
    by_measure_ref_avg = [
        (sum(v) / len(v)) if k == 'w2v similarity' else 1 - (sum(v) / len(v))
        for k, v in by_measure_ref.items()
    ]
    rows = [[k, o, r]
            for k, o, r in zip(by_measure, by_measure_avg, by_measure_ref_avg)]
    click.echo(
        tabulate.tabulate(rows, headers=['Measure', "Ours", "Reference"]))
    by_measure_avg = sum(by_measure_avg) / len(by_measure_avg)

    by_measure_avg_ref = sum(by_measure_ref_avg) / len(by_measure_ref_avg)
    final_result['overall'] = {
        'ours': ours,
        "reference": ref,
        'ours_smooth': ours_smooth,
        'ref_smooth': ref_smooth,
        'by_measure_avg': by_measure_avg,
        'by_measure_avg_ref': by_measure_avg_ref
    }
    click.echo(f"Overall: {click.style(f'{ours:03f}', fg='green')} (n={n}): ")
    click.echo(
        f"Reference overall: {click.style(f'{ref:03f}', fg='green')} (n={n_reference}): "
    )
    click.echo(
        f"Overall smooth: {click.style(f'{ours_smooth:03f}', fg='green')} (n={n}): "
    )
    click.echo(
        f"Reference overall smooth: {click.style(f'{ref_smooth:03f}', fg='green')} (n={n_reference}): "
    )
    click.echo(
        f"Overall by measure: {click.style(f'{by_measure_avg:03f}', fg='green')} (n={n}): "
    )
    click.echo(
        f"Reference overall by measure: {click.style(f'{by_measure_avg_ref:03f}', fg='green')} (n={n_reference}): "
    )
    write_json(final_result, output)