Example #1
def predictions(in_files, out_folder, model_paths, model_types, no_cuda,
                per_gpu_eval_batch_size, do_not_lower_case, lang_id, v2,
                n_best_size, max_answer_length, verbose_logging,
                null_score_diff_threshold, do_evaluate, **kwargs):
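    # Runs inference for each (model_path, model_type) pair over every input file.
    # With do_evaluate=True the SQuAD-style metrics returned by evaluate() are written
    # to the predictions folder; otherwise evaluate() itself writes the raw predictions.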
    assert len(model_paths) == len(model_types)
    for model_path, model_type in zip(model_paths, model_types):
        model = get_model(model_path)
        args = Args(model_path=model_path,
                    model_type=model_type,
                    predictions_folder=out_folder,
                    no_cuda=no_cuda,
                    do_not_lower_case=do_not_lower_case,
                    per_gpu_eval_batch_size=per_gpu_eval_batch_size,
                    lang_id=lang_id,
                    v2=v2,
                    n_best_size=n_best_size,
                    max_answer_length=max_answer_length,
                    verbose_logging=verbose_logging,
                    null_score_diff_threshold=null_score_diff_threshold,
                    **kwargs)
        tokenizer = get_tokenizer(model_path, args.do_lower_case)
        for in_file in in_files:
            args.eval_file = in_file
            logger.debug(args)
            dataset, examples, features = load_or_convert(args,
                                                          tokenizer,
                                                          evaluate=True)
            if do_evaluate:
                out_path = args.predictions_folder
                args.predictions_folder = None
                suffix = os.path.basename(os.path.normpath(model_path))
                score = evaluate(args,
                                 model,
                                 tokenizer,
                                 dataset,
                                 examples,
                                 features,
                                 suffix=suffix,
                                 return_raw=False)
                file_name = get_output_predictions_file_name(
                    args.eval_file, out_path, suffix)
                write_json(score, file_name)
                args.predictions_folder = out_path
            else:
                evaluate(args,
                         model,
                         tokenizer,
                         dataset,
                         examples,
                         features,
                         suffix=os.path.basename(os.path.normpath(model_path)))
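
A minimal sketch of how this command might be invoked directly (illustrative paths and option values; any CLI decorator that normally parses these arguments is not shown in the snippet above):

predictions(
    in_files=["data/football/full/baseline.json"],
    out_folder="data/football/full/predictions/squad1",
    model_paths=["models/bert-base-uncased-squad1"],
    model_types=["bert"],
    no_cuda=None,                      # auto-detect GPU inside evaluate()
    per_gpu_eval_batch_size=64,
    do_not_lower_case=False,
    lang_id=0,                         # only used by lang_id-sensitive XLM models
    v2=False,                          # SQuAD v2-style null answers
    n_best_size=20,
    max_answer_length=10,
    verbose_logging=False,
    null_score_diff_threshold=0.0,
    do_evaluate=True,                  # write metrics instead of raw predictions
)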
Example #2
def predict(in_files, output_folder, models, model_classes, gpu, batch_size):
    # NOTE: this may all need to be scrapped in favour of the convert-to-features approach
    if gpu is None:
        gpu = _is_gpu_available()
    logger.debug(fmt_dict(locals()))
    if len(models) != len(model_classes):
        click.echo(
            f"Num models supplied ({len(models)}) != num model classes supplied ({len(model_classes)})!"
        )
        sys.exit(1)

    for cls, weights_path in zip(model_classes, models):
        model_cls: Model = do_import(cls, relative_import='stresstest.model')
        # TODO: Bidaf should also respect max answer length
        model = model_cls.make(weights_path, gpu=gpu)
        click.echo(
            f"Evaluating model '{click.style(model_cls.__name__, fg='green', bold=True)}' from weights file: "
            f"{click.style(weights_path, fg='blue')}.")
        click.echo(
            f"Running on {click.style('gpu' if gpu else 'cpu', fg='green', bold=True)}."
        )
        for in_file in in_files:
            sample = load_json(in_file)
            num_q = num_questions(sample)
            click.echo(
                f"Evaluating on sample (n={num_q}, |{{C}}|={len(sample)}): {click.style(in_file, fg='blue')}"
            )

            predictions = dict()
            for sample_batch in batch(tqdm(sample_iter(sample),
                                           position=1,
                                           total=num_q),
                                      batch_size=batch_size):
                sample_batch: List[Entry]
                batch_predictions = model.predict_batch(sample_batch)
                for entry, answer in zip(sample_batch, batch_predictions):
                    logger.debug(f"Passage: {entry.passage}")
                    logger.debug(f"Question: {entry.question}")
                    logger.debug(f"Prediction: {answer}")
                    predictions[entry.qa_id] = str(answer)
            output_file_name = get_output_predictions_file_name(
                in_file, output_folder, weights_path)
            click.echo(
                f"Saving predictions to {click.style(output_file_name, fg='blue')}"
            )
            write_json(predictions, output_file_name, pretty=False)
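
An illustrative call, mirroring the predict-baselines stage built in Example #3 (assumes the function receives already-parsed options; the CLI wrapper is not shown here):

predict(
    in_files=["data/football/split/test/baseline-test.json"],
    output_folder="data/football/split/test/predictions/",
    models=["random-baseline"],
    model_classes=["RandomBaseline"],
    gpu=None,        # None means "auto-detect"
    batch_size=64,   # illustrative
)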
Example #3
def generate_dvc(command, dataset_name,
                 train_file, eval_file, batch_size, extra_args,
                 notify, model_names, model_types, gradient_accumulation_steps):
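    # For the selected pipeline stage this builds two things: the underlying
    # python/bash command (cmd) and the `dvc run` wrapper (dvc_cmd) that declares
    # the stage's dependencies (-d), outputs (-o) and metrics (-M); both are printed at the end.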
    model_name = model_names[0]
    model_type = model_types[0]
    root_data_path = 'data/football'
    baseline_file = f"{root_data_path}/full/baseline.json"
    intervention_file = f"{root_data_path}/full/intervention.json"
    control_file = f"{root_data_path}/full/control.json"
    predictions_folder = f"{root_data_path}/full/predictions/{dataset_name}"
    model_folder = f"{model_name}-{dataset_name}"
    model_path = f"models/{model_folder}"
    train_path = f"data/datasets/{dataset_name}/{train_file}"
    eval_path = f"data/datasets/{dataset_name}/{eval_file}"
    baseline_predictions = get_output_predictions_file_name(baseline_file, predictions_folder, model_folder)
    intervention_predictions = get_output_predictions_file_name(intervention_file, predictions_folder, model_folder)
    control_predictions = get_output_predictions_file_name(control_file, predictions_folder, model_folder)
    if not batch_size:
        batch_size = 8 if "large" in model_name else 24

    if command == 'train-transformers':
        cmd = (
            f"python main.py --debug --notify {notify} train {train_path} --model-path {model_name} "
            f"--model-type {model_type} --eval-file {eval_path} --save-model-folder {model_path} "
            f"--do-eval-after-training --num-workers 8 --per-gpu-train-batch-size {batch_size} "
            f"--max-answer-length 30 {extra_args}"
        )
        stage_name = f"train-{model_name}-on-{dataset_name}"
        dvc_cmd = (
            f"dvc run -n {stage_name} -d {train_path} -d {eval_path} -o {model_path} "
            f"{cmd}"
        )
    elif command == 'train-transformers-hotpotqa':
        train_path = f"data/datasets/hotpotqa/train.json"
        eval_path = f"data/datasets/hotpotqa/dev.json"
        model_folder = f"{model_name}-hotpotqa"
        model_path = f"models/{model_folder}"
        cmd = (
            f"MODEL={model_name} CACHE_LOCATION=~/localscratch/hotpotqa/ SAVE_TO={model_path} "
            f"BATCH_SIZE={batch_size} ACC_STEPS={gradient_accumulation_steps} MODEL_TYPE={model_type} "
            f"bash scripts/cache_and_train_hotpotqa.sh"
        )
        stage_name = f"train-{model_name}-on-hotpotqa"
        dvc_cmd = (
            f"dvc run -n {stage_name} -d {train_path} -d {eval_path} -o {model_path} "
            f"{cmd}"
        )
    elif command == 'train-transformers-wikihop':
        train_path = f"data/datasets/wikihop/train.json"
        eval_path = f"data/datasets/wikihop/dev.json"
        model_folder = f"{model_name}-wikihop"
        model_path = f"models/{model_folder}"
        cmd = (
            f"MODEL={model_name} CACHE_LOCATION=~/localscratch/wikihop/ SAVE_TO={model_path} "
            f"BATCH_SIZE={batch_size} ACC_STEPS={gradient_accumulation_steps} MODEL_TYPE={model_type} "
            f"bash scripts/cache_and_train_wikihop.sh"
        )
        stage_name = f"train-{model_name}-on-wikihop"
        dvc_cmd = (
            f"dvc run -n {stage_name} -d {train_path} -d {eval_path} -o {model_path} "
            f"{cmd}"
        )
    elif command == 'train-transformers-newsqa':
        train_path = f"data/datasets/newsqa/train.json"
        eval_path = f"data/datasets/newsqa/dev.json"
        model_folder = f"{model_name}-newsqa"
        model_path = f"models/{model_folder}"
        cmd = (
            f"MODEL={model_name} CACHE_LOCATION=~/localscratch/newsqa/ SAVE_TO={model_path} "
            f"BATCH_SIZE={batch_size} ACC_STEPS={gradient_accumulation_steps} MODEL_TYPE={model_type} "
            f"bash scripts/cache_and_train_newsqa.sh"
        )
        stage_name = f"train-{model_name}-on-newsqa"
        dvc_cmd = (
            f"dvc run -n {stage_name} -d {train_path} -d {eval_path} -o {model_path} "
            f"{cmd}"
        )
    elif command == 'train-transformers-searchqa':
        train_path = f"data/datasets/searchqa/train.json"
        eval_path = f"data/datasets/searchqa/dev.json"
        model_folder = f"{model_name}-searchqa"
        model_path = f"models/{model_folder}"
        cmd = (
            f"MODEL={model_name} CACHE_LOCATION=~/localscratch/searchqa/ SAVE_TO={model_path} "
            f"BATCH_SIZE={batch_size} ACC_STEPS={gradient_accumulation_steps} MODEL_TYPE={model_type} "
            f"bash scripts/cache_and_train_searchqa.sh"
        )
        stage_name = f"train-{model_name}-on-searchqa"
        dvc_cmd = (
            f"dvc run -n {stage_name} -d {train_path} -d {eval_path} -o {model_path} "
            f"{cmd}"
        )
    elif command == 'train-transformers-drop':
        train_path = f"data/datasets/drop/train.json"
        eval_path = f"data/datasets/drop/dev.json"
        model_folder = f"{model_name}-drop"
        model_path = f"models/{model_folder}"
        cmd = (
            f"MODEL={model_name} CACHE_LOCATION=~/localscratch/drop/ SAVE_TO={model_path} "
            f"BATCH_SIZE={batch_size} ACC_STEPS={gradient_accumulation_steps} MODEL_TYPE={model_type} "
            f"bash scripts/cache_and_train_drop.sh"
        )
        stage_name = f"train-{model_name}-on-drop"
        dvc_cmd = (
            f"dvc run -n {stage_name} -d {train_path} -d {eval_path} -o {model_path} "
            f"{cmd}"
        )
    elif command == 'train-transformers-combined':
        train_path = f"data/datasets/combined/train.json"
        model_folder = f"{model_name}-combined"
        model_path = f"models/{model_folder}"
        cmd = (
            f"MODEL={model_name} CACHE_LOCATION=~/localscratch/combined/ SAVE_TO={model_path} "
            f"BATCH_SIZE={batch_size} ACC_STEPS={gradient_accumulation_steps} MODEL_TYPE={model_type} "
            f"bash scripts/cache_and_train_combined.sh"
        )
        stage_name = f"train-{model_name}-on-combined"
        dvc_cmd = (
            f"dvc run -n {stage_name} -d {train_path} -d {eval_path} -o {model_path} "
            f"{cmd}"
        )
    elif command == 'predict-transformers':
        model_names = ("albert-base-v2", "albert-large-v2", "albert-xlarge-v2", "albert-xxlarge-v2",
                       'bert-base-uncased', 'bert-large-uncased', 'roberta-base', 'roberta-large')
        model_types = ["albert"] * 4 + ["bert"] * 2 + ["roberta"] * 2
        models_str = " ".join(f"--model-path models/{mp}-{dataset_name} --model-type {mt}" for mp, mt in
                              zip(model_names, model_types))
        if extra_args == DEFAULT_EXTRA_ARGS:
            extra_args = ''
        cmd = (f"python main.py --debug --notify {notify} "
               f"predictions {baseline_file} {intervention_file} {control_file} "
               f" {models_str} "
               f"--out-folder {predictions_folder} --max-answer-length 10 {extra_args}")

        stage_name = f"predict-transformers-{dataset_name}"
        deps_str = " ".join(f"-d models/{mp}-{dataset_name}" for mp in model_names)
        outs_str = ' '.join(
            f"-o {get_output_predictions_file_name(baseline_file, predictions_folder, f'{mp}-{dataset_name}')} "
            f"-o {get_output_predictions_file_name(intervention_file, predictions_folder, f'{mp}-{dataset_name}')} "
            f"-o {get_output_predictions_file_name(control_file, predictions_folder, f'{mp}-{dataset_name}')}"
            for mp in model_names
        )
        dvc_cmd = (f"dvc run -n {stage_name} {deps_str} -d {baseline_file} "
                   f"-d {intervention_file} -d {control_file}  {outs_str} {cmd}")

    elif command == 'evaluate':
        eoi_metric_file = f"eoi.json"
        eoi_metric_path = f"metrics/football/{dataset_name}/{eoi_metric_file}"
        cmd = (f"python main.py evaluate-intervention --baseline-file {baseline_file} "
               f"--predictions-folder {predictions_folder} --control --output {eoi_metric_path} "
               f"--split-reasoning --split-num-modifier --split-sam --do-save")

        stage_name = f"evaluate-intervention-{dataset_name}"

        dvc_cmd = (f"dvc run -n {stage_name} -d {baseline_file} -d {intervention_file} -d {control_file} "
                   f"-d scripts/evaluate_intervention.py -d {predictions_folder} -M {eoi_metric_path} {cmd}")
    elif command == "generate":
        # conf_name = f'conf/{dataset_name}.json'
        cmd = (f"python main.py generate-balanced --config conf/evaluate.json --seed 56 "
               f"--num-workers 8 --do-save --out-path {root_data_path}/full --multiplier 35")
        stage_name = f"generate-sam"
        dvc_cmd = (f"dvc run -n {stage_name} -d scripts/generate_balanced.py "
                   f"-d conf/evaluate.json -o {baseline_file} -o {intervention_file} -o {control_file} {cmd}")

    elif command == 'predict-allennlp':
        stage_name = f"predict-{model_name}-on-{dataset_name}"
        model_path = os.path.join(model_path, "model.tar.gz")
        cmd = (
            f"mkdir -p {predictions_folder} &&"
            f"allennlp predict {model_path} {baseline_file} --output-file {baseline_predictions} --cuda-device 0 "
            f"--use-dataset-reader --silent && python main.py convert-allennlp {baseline_file} {baseline_predictions} "
            f"&& allennlp predict {model_path} {intervention_file} --output-file {intervention_predictions} --cuda-device 0 "
            f"--use-dataset-reader --silent &&"
            f"python main.py convert-allennlp {intervention_file} {intervention_predictions} &&"
            f"allennlp predict {model_path} {control_file} --output-file {control_predictions} --cuda-device 0 "
            f"--use-dataset-reader --silent && python main.py convert-allennlp {control_file} {control_predictions}"
        )

        dvc_cmd = (f"dvc run -n {stage_name} -d {model_path} -d {baseline_file} "
                   f"-d {intervention_file} -d {control_file} -o {baseline_predictions} -o {control_predictions} "
                   f"-o {intervention_predictions}  '{cmd}'")

    elif command == 'train-allennlp':
        cmd = f"TRAIN_SET={train_path} EVAL_SET={eval_path} CUDA=0 MULT={extra_args} " \
              f"allennlp train conf/{model_name}.jsonnet -s {model_path}"
        stage_name = f"train-{model_name}-on-{dataset_name}"
        dvc_cmd = f"dvc run -n {stage_name} -d {train_path} -d {eval_path} -o {model_path} {cmd}"
    elif command == 'train-baselines':
        cmds = []
        dvc_cmds = []
        for masking in [None, 'q', 'p']:
            if masking:
                mask = f"-mask-{masking}"
            else:
                mask = ''
            train_file = f"{root_data_path}/split{mask}/train/combined-train.json"
            model_folder = f"models/bert{mask}-baseline"
            cmd = ("python main.py --debug --notify '*****@*****.**' train "
                   f" {train_file} "
                   "--model-path bert-base-uncased --model-type bert "
                   f"--save-model-folder {model_folder} "
                   "--num-workers 8 --per-gpu-train-batch-size 24 --max-answer-length 10  --debug-features "
                   "--num-train-epochs 15 --overwrite-output-dir "
                   "--save-steps 0 --per-gpu-eval-batch-size 64 --gradient-accumulation-steps 3 --learning-rate 5e-5")
            cmds.append(cmd)
            stage_name = f"train-bert{mask}-baseline"
            dvc_cmd = (
                f"dvc run -n {stage_name} -d {train_file} -o {model_folder} "
                f"{cmd}"
            )
            dvc_cmds.append(dvc_cmd)
        cmd = "\n".join(cmds)
        dvc_cmd = "\n".join(dvc_cmds)
    elif command == 'predict-baselines':
        cmds = []
        dvc_cmds = []

        for masking in [None, 'q', 'p']:
            if masking:
                mask = f"-mask-{masking}"
            else:
                mask = ''
            out_path = f"{root_data_path}/split{mask}/test/"
            predictions_folder = f'{out_path}predictions/'
            baseline_file = f"{out_path}baseline-test.json"
            intervention_file = f"{out_path}intervention-test.json"
            control_file = f"{out_path}control-test.json"
            if not masking:
                # the random/educated/informed baselines are only generated for the unmasked split
                stage_name = "predict-random-baselines"
                cmd = (
                    f"python main.py predict {baseline_file} {control_file} {intervention_file} "
                    f"--output-folder {predictions_folder} "
                    f"--model 'random-baseline' --cls RandomBaseline "
                    f"--model 'educated-baseline' --cls EducatedBaseline "
                    f"--model 'informed-baseline' --cls InformedBaseline"
                )
                outs = " ".join(
                    f"-o {get_output_predictions_file_name(f, predictions_folder, n)}"
                    for f in [baseline_file, intervention_file, control_file] for n in
                    ['random-baseline', 'educated-baseline', 'informed-baseline']
                )
                # f"-o {get_output_predictions_file_name(intervention_file, predictions_folder, 'random')}"
                # f"-o {get_output_predictions_file_name(control_file, predictions_folder, 'random')}"]
                cmds.append(cmd)
                dvc_cmd = (
                    f"dvc run -n {stage_name} -d {baseline_file} -d {intervention_file} -d {control_file} {outs} {cmd}"
                )
                dvc_cmds.append(dvc_cmd)
            model_name = f"bert{mask}-baseline"
            model_folder = f"models/bert{mask}-baseline"
            cmd = (f"python main.py --debug --notify {notify} "
                   f"predictions {baseline_file} {intervention_file} {control_file} "
                   f"--model-path {model_folder} --model-type bert "
                   f"--out-folder {predictions_folder} --max-answer-length 10 --per-gpu-eval-batch-size 64")
            cmds.append(cmd)
            stage_name = f"predict-bert{mask}-baseline"
            outs = " ".join(
                f"-o {get_output_predictions_file_name(f, predictions_folder, model_name)}"
                for f in [baseline_file, intervention_file, control_file]
            )
            dvc_cmd = (
                f"dvc run -n {stage_name} -d {model_folder} -d {baseline_file} -d {intervention_file} "
                f"-d {control_file} {outs} "
                f"{cmd}"
            )
            dvc_cmds.append(dvc_cmd)
        cmd = "\n".join(cmds)
        dvc_cmd = "\n".join(dvc_cmds)
    elif command == 'finetune':
        raise NotImplementedError()
    elif command == 'train-t5':

        cmd = (f"PYTHONPATH='.' python scripts/t5.py --model_name_or_path {model_name} --output_dir {model_path} "
               f"--train_file {train_path} --eval_file {eval_path} "
               f"--do_train --do_eval --num_workers 8 --per_device_train_batch_size {batch_size} "
               f"{extra_args}")
        stage_name = f"train-{model_name}-on-{dataset_name}"
        dvc_cmd = (
            f"dvc run -n {stage_name} -d {train_path} -d {eval_path} -o {model_path} "
            f"{cmd}"
        )
    elif command == 'predict-t5':
        model_names = [f"t5-{s}" for s in ('small', 'base', 'large')]
        dataset_names = ['drop', 'hotpotqa', 'newsqa', 'squad1']
        models_str = " ".join(f"--model-path models/{m}-{dataset_name}" for m in model_names)
        if extra_args == DEFAULT_EXTRA_ARGS:
            extra_args = ''
        cmd = (f"python main.py --debug "
               f"t5-predictions {baseline_file} {intervention_file} {control_file} "
               f" {models_str} "
               f"--out-folder {predictions_folder} --max-answer-length 10 {extra_args}")

        stage_name = f"predict-t5-{dataset_name}"
        deps_str = " ".join(f"-d models/{m}-{dataset_name}" for m in model_names)
        outs_str = ' '.join(
            f"-o {get_output_predictions_file_name(baseline_file, predictions_folder, f'{m}-{dataset_name}')} "
            f"-o {get_output_predictions_file_name(intervention_file, predictions_folder, f'{m}-{dataset_name}')} "
            f"-o {get_output_predictions_file_name(control_file, predictions_folder, f'{m}-{dataset_name}')}"
            for m in model_names
        )
        dvc_cmd = (f"dvc run -n {stage_name} {deps_str} -d {baseline_file} "
                   f"-d {intervention_file} -d {control_file}  {outs_str} {cmd}")
    elif command == "generate-baselines":
        cmds = []
        dvc_cmds = []
        for masking in [None, 'q', 'p']:
            # conf_name = f'conf/{dataset_name}.json'
            if masking:
                mask = f"-mask-{masking}"
                mask_opt = f'--mask-{masking}' + (' --keep-answer-candidates' if masking == 'p' else '')
            else:
                mask = ''
                mask_opt = ''
            for split, multiplier, seed in [("train", 100, 56), ("test", 20, 38676)]:
                out_path = f"{root_data_path}/split{mask}/{split}/"
                baseline_file = f"{out_path}baseline-{split}.json"
                intervention_file = f"{out_path}intervention-{split}.json"
                control_file = f"{out_path}control-{split}.json"
                combined_file = f"{out_path}combined-{split}.json"
                if split == 'train':
                    combine = '--combine'
                    outs = f"-o {combined_file}"
                else:
                    combine = ''
                    outs = f' -o {baseline_file} -o {intervention_file} -o {control_file} '
                cmd = (f"python main.py generate-balanced --config conf/finetune.json "
                       f"--seed {seed} --num-workers 8 --do-save --out-path {out_path} "
                       f"--multiplier {multiplier} --split {split} "
                       f"{combine} {mask_opt}")
                cmds.append(cmd)

                stage_name = f"generate-{split}{mask}"
                dvc_cmd = (f"dvc run -n {stage_name} -d scripts/generate_balanced.py "
                           f"-d conf/finetune.json {outs} "
                           f"{cmd}")
                dvc_cmds.append(dvc_cmd)
        cmd = "\n".join(cmds)
        dvc_cmd = "\n".join(dvc_cmds)
    elif command == 'evaluate-baselines':
        cmds = []
        dvc_cmds = []
        for masking in [None, 'q', 'p']:
            split = 'test'
            mask = f"-mask-{masking}" if masking else ''
            out_path = f"{root_data_path}/split{mask}/{split}/"
            baseline_file = f"{out_path}baseline-{split}.json"
            predictions_folder = f'{out_path}predictions/'
            intervention_file = f"{out_path}intervention-{split}.json"''
            control_file = f"{out_path}control-{split}.json"
            metric = f"metrics/football/baselines/baselines{mask}.json"
            cmd = (f"python main.py evaluate {baseline_file} "
                   f"{intervention_file} {control_file} --prediction-folder {predictions_folder} "
                   f"--output {metric} --metric EMRelaxed")
            stage_name = f"evaluate-baselines{mask}"
            dvc_cmd = (
                (f"dvc run -n {stage_name} -d {baseline_file} -d {intervention_file} -d {control_file} "
                 f"-d {predictions_folder} -M {metric} "
                 f"'{cmd}'")
            )
            cmds.append(cmd)
            dvc_cmds.append(dvc_cmd)
        cmd = "\n\n".join(cmds)
        dvc_cmd = "\n\n".join(dvc_cmds)
    else:
        raise NotImplementedError()

    click.secho("Python command:", fg='green', bold=True)
    click.echo(cmd)
    click.secho("DVC command:", fg='green', bold=True)
    click.echo(dvc_cmd)
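
For instance, with command='evaluate' and dataset_name='squad1', substituting the default paths defined at the top of the function, the printed Python command would be roughly:

python main.py evaluate-intervention --baseline-file data/football/full/baseline.json --predictions-folder data/football/full/predictions/squad1 --control --output metrics/football/squad1/eoi.json --split-reasoning --split-num-modifier --split-sam --do-save

and the DVC command wraps it with the matching -d dependencies and the -M metrics flag, as built in that branch.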
Example #4
def evaluate(args: Args,
             model,
             tokenizer,
             dataset,
             examples,
             features,
             suffix="",
             return_raw=False):
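    # Returns the raw predictions dict when return_raw=True, otherwise the metrics
    # produced by squad_evaluate(); if args.predictions_folder is set, predictions
    # (and null log-odds for v2) are also written there.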
    if args.no_cuda is None:
        args.no_cuda = not _is_gpu_available()
    if args.predictions_folder:
        assert args.eval_file, "Need name of the eval file to save predictions!"
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = 0 if args.no_cuda else torch.cuda.device_count()

    eval_batch_size = args.per_gpu_eval_batch_size * max(1, n_gpu)

    # Note: DistributedSampler would sample randomly, so a SequentialSampler is used for a deterministic order

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=eval_batch_size)
    model.to(device)
    # multi-gpu evaluate
    if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    click.echo(
        f"Generating predictions for model {click.style(args.model_path, fg='blue')}, "
        f"running on {click.style(str(device), fg='green')}")
    click.echo("  Num examples = %d" % len(dataset))
    click.echo("  Batch size = %d" % eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(device)
                    })
            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    eval_time = timeit.default_timer() - start_time
    logger.info(
        f"Evaluation done in total {eval_time} secs ({eval_time / len(dataset)} sec per example)"
    )
    eval_file = args.eval_file
    predictions_folder = args.predictions_folder
    v2 = args.v2
    if predictions_folder:
        out_file = get_output_predictions_file_name(eval_file,
                                                    predictions_folder, suffix)
        logger.info(f"Saving predictions in {out_file}")

        # Compute predictions
        file_name = os.path.basename(out_file)
        output_prediction_file = os.path.join(predictions_folder, file_name)
        # output_nbest_file = os.path.join(predictions_folder, f"nbest-{file_name}")
        output_nbest_file = None

        if v2:
            output_null_log_odds_file = os.path.join(predictions_folder,
                                                     f"null-odds-{file_name}")
        else:
            output_null_log_odds_file = None
    else:
        logger.info("Not saving predictions...")
        output_prediction_file = None
        output_nbest_file = None
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.v2,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.v2,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    # results = squad_evaluate(examples, predictions)
    # return results
    if return_raw:
        return predictions
    else:
        return squad_evaluate(examples, predictions)
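
The to_list helper used above is not defined in this snippet; in the Hugging Face SQuAD example code it is typically the following one-liner, which is assumed here:

def to_list(tensor):
    return tensor.detach().cpu().tolist()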
Example #5
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    model_args: ModelArguments
    data_args: DataTrainingArguments
    training_args: TrainingArguments
    if training_args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
    # if training_args.do_eval and not training_args.do_train and not data_args.predictions_folder:
    #     raise ValueError("Supply predictions folder destination to save the predictions!")
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )

    logger.debug(model_args)
    logger.debug(training_args)
    logger.debug(data_args)
    # raise NotImplementedError
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            f"Use --overwrite_output_dir to overcome.")

    # Set seed
    set_seed(training_args.seed)
    if training_args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()
    tokenizer = get_tokenizer(model_args.model_name_or_path,
                              do_lower_case=False)
    if data_args.model_parallel == 4:
        model = T5ForConditionalGeneration4WayParallel.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )
    elif data_args.model_parallel == 2:
        model = T5ForConditionalGeneration2WayParallel.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )
    elif data_args.model_parallel is None:
        model = T5ForConditionalGeneration.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )
    else:
        raise ValueError(
            f"Can only have no, 2-way or 4-way model parallelism! (got: {data_args.model_parallel})"
        )
    if training_args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()
    # Get datasets
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        eval_dataset, examples = get_dataset(data_args.eval_file_path,
                                             tokenizer,
                                             data_args,
                                             evaluate=True)
    else:
        eval_dataset, examples = None, None
    # Training
    if training_args.do_train:
        if training_args.local_rank in [-1, 0]:
            train_dataset, _ = get_dataset(data_args.train_file_path,
                                           tokenizer, data_args)
            torch.save(train_dataset, 'features.bin')
        else:
            torch.distributed.barrier()
            train_dataset = None

        if training_args.local_rank == 0:
            torch.distributed.barrier()
        else:
            train_dataset = torch.load('features.bin')
        # Initialize our Trainer
        if data_args.model_parallel:
            trainer = MyTrainer(model=model,
                                args=training_args,
                                train_dataset=train_dataset,
                                eval_dataset=eval_dataset,
                                data_collator=collate_training,
                                prediction_loss_only=True)
            model.set_parallel()
        else:
            trainer = Trainer(model=model,
                              args=training_args,
                              train_dataset=train_dataset,
                              eval_dataset=eval_dataset,
                              data_collator=collate_training,
                              prediction_loss_only=True)
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        if training_args.do_train:
            model_path = os.path.basename(training_args.output_dir)
        else:
            model_path = os.path.basename(model_args.model_name_or_path)
        checkpoints = [training_args.output_dir]
        if data_args.eval_all_checkpoints and training_args.do_train:
            logger.info(
                "Loading checkpoints saved during training for evaluation")
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(training_args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            # logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs

        logger.info(f"Evaluate the following checkpoints: {checkpoints}")
        results = {}

        logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1]
            if not all(s in string.digits for s in global_step):
                global_step = ''
            # no model parallelism here (didn't check model.generate)
            model = T5ForConditionalGeneration.from_pretrained(checkpoint)
            device = torch.device("cuda" if torch.cuda.is_available()
                                  and not training_args.no_cuda else "cpu")
            model.to(device)
            model_str = f'{model_path}-{global_step}' if global_step else model_path
            click.echo(
                f"Generating predictions for model {click.style(model_str, fg='blue')}, "
                f"running on {click.style(str(training_args.device), fg='green')}"
            )
            predictions = generate_predictions(eval_dataset, examples, model,
                                               tokenizer, training_args)
            final_metric = squad_evaluate(examples, predictions)

            if is_wandb_available():
                if training_args.do_train:
                    step = int(
                        global_step) if global_step else trainer.global_step
                else:
                    step = 0
                # for now WANDB cannot 'log back in time'
                wandb.log(final_metric, step=step)
            print(f"GLOBAL STEP: {global_step}")
            result = dict(
                (k + ("_{}".format(global_step) if global_step else '_final'),
                 v) for k, v in final_metric.items())

            logger.info(f"Result for {model_str}: {result}")
            results.update(result)

        # sort results by best
        checkpoint_scores = {
            c.split('_')[-1]: v
            for c, v in results.items()
            if any(c.endswith(digit)
                   for digit in string.digits) and c.startswith('exact')
        }
        sorted_checkpoint_scores = {
            k: v
            for k, v in sorted(checkpoint_scores.items(),
                               key=lambda k_v: k_v[1],
                               reverse=True)
        }
        best_cp = next((c for c, v in sorted_checkpoint_scores.items()
                        if v > results['exact_final']), None)

        if best_cp:
            click.echo(f"Best checkpoint is: {best_cp}")
            # copy over best results
            best_cp_folder = f'checkpoint-{best_cp}'

            click.echo(
                f"Copying over files: from {os.path.join(training_args.output_dir, best_cp_folder)} "
                f"to {training_args.output_dir}")
            files_to_copy = glob.glob(
                os.path.join(training_args.output_dir, best_cp_folder, '*'))
            for file in files_to_copy:
                shutil.copy(file, training_args.output_dir)
        else:
            click.echo("best checkpoint is the last step...")
        # remove 'kek'points
        folders_to_remove = [
            p for p in glob.glob(os.path.join(training_args.output_dir, '*'))
            if os.path.isdir(p)
        ]
        click.echo('Folders to remove: ')
        for folder in folders_to_remove:
            click.echo(f"Removing {folder}")
            shutil.rmtree(folder)
        if training_args.do_train:
            logger.info(results)
            write_json(
                results,
                os.path.join(training_args.output_dir, 'dev-results.json'))
        else:
            write_json(
                predictions,
                get_output_predictions_file_name(
                    data_args.eval_file_path, training_args.output_dir,
                    os.path.basename(
                        os.path.normpath(model_args.model_name_or_path))))
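
For context, the train-t5 stage in Example #3 launches this script; with illustrative values substituted into that command template the invocation looks like:

PYTHONPATH='.' python scripts/t5.py --model_name_or_path t5-small --output_dir models/t5-small-squad1 --train_file data/datasets/squad1/train.json --eval_file data/datasets/squad1/dev.json --do_train --do_eval --num_workers 8 --per_device_train_batch_size 24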