Code example #1
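Combines the per-rank rank*.json partial results written by a distributed evaluation run into a single metrics file plus combined prediction/target text files.
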
def combine_partial_results(result_dir: str,
                            save_dir: str = None,
                            save_prefix=None,
                            calc_bleu=False,
                            just_metrics=False):
    """Write first n lines of each file f in src_dir to dest_dir/f """
    src_dir = Path(result_dir)
    save_dir = Path(save_dir)
    save_dir.mkdir(exist_ok=True)
    paths_to_combine = list(src_dir.glob("rank*.json"))
    records = []
    for partial_result in paths_to_combine:
        records.extend(load_json(partial_result))
    preds = [x["pred"] for x in records]
    labels = [x["label"] for x in records]
    score_fn = calculate_bleu if calc_bleu else calculate_rouge
    metrics = score_fn(preds, labels)
    save_json(metrics, save_dir.joinpath(
        "metrics.json"))  # better would be {prefix}_{rouge|bleu}.json
    print(metrics)
    if just_metrics:
        return

    if save_prefix is None:
        save_prefix = "generated"
        print("using generated as prefix")

    tgt_path = save_dir.joinpath(f"{save_prefix}.target")
    write_txt_file(labels, tgt_path)
    pred_path = save_dir.joinpath(f"{save_prefix}.pred_target")
    write_txt_file(preds, pred_path)
    if "source" in records[0]:
        src_path = save_dir.joinpath(f"{save_prefix}.source")
        write_txt_file([x["source"] for x in records], src_path)
Code example #2
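Fine-tuning entry point pinned to BART: parses ModelArguments/DataTrainingArguments/Seq2SeqTrainingArguments, sets up logging and seeding, builds the datasets and Seq2SeqTrainer, then runs the optional train/eval/predict phases and saves all metrics to all_results.json.
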
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    check_output_dir(training_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = BartConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout",
                          "attention_dropout")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(
                config, p
            ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    tokenizer = BartTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = BartForConditionalGeneration.from_pretrained(
        model_args.model_name_or_path,
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(
            tokenizer, MBartTokenizer):
        assert (data_args.tgt_lang is not None and data_args.src_lang
                is not None), "mBart requires --tgt_lang and --src_lang"
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
            data_args.tgt_lang]

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset

    # Get datasets
    train_dataset = (dataset_class(
        tokenizer,
        type_path="train",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_train,
        max_target_length=data_args.max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_train else None)
    eval_dataset = (dataset_class(
        tokenizer,
        type_path="val",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_val,
        max_target_length=data_args.val_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_eval or
                    training_args.evaluation_strategy != EvaluationStrategy.NO
                    else None)
    test_dataset = (dataset_class(
        tokenizer,
        type_path="test",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_test,
        max_target_length=data_args.test_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_predict else None)

    # Initialize our Trainer
    compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer)
                          if training_args.predict_with_generate else None)
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args,
                                          training_args.tpu_num_cores),
        compute_metrics=compute_metrics_fn,
        tokenizer=tokenizer,
    )

    all_metrics = {}
    # Training
    if training_args.do_train:
        logger.info("*** Train ***")

        train_result = trainer.train(
            model_path=(model_args.model_name_or_path
                        if os.path.isdir(model_args.model_name_or_path)
                        else None))
        metrics = train_result.metrics
        metrics["train_n_objs"] = data_args.n_train

        trainer.save_model()  # this also saves the tokenizer

        if trainer.is_world_process_zero():
            handle_metrics("train", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate(metric_key_prefix="val",
                                   max_length=data_args.val_max_target_length,
                                   num_beams=data_args.eval_beams)
        metrics["val_n_objs"] = data_args.n_val
        metrics["val_loss"] = round(metrics["val_loss"], 4)

        if trainer.is_world_process_zero():

            handle_metrics("val", metrics, training_args.output_dir)
            all_metrics.update(metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_output = trainer.predict(
            test_dataset=test_dataset,
            metric_key_prefix="test",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics = test_output.metrics
        metrics["test_n_objs"] = data_args.n_test

        if trainer.is_world_process_zero():
            metrics["test_loss"] = round(metrics["test_loss"], 4)
            handle_metrics("test", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(
                    test_output.predictions,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True)
                test_preds = lmap(str.strip, test_preds)
                write_txt_file(
                    test_preds,
                    os.path.join(training_args.output_dir,
                                 "test_generations.txt"))

    if trainer.is_world_process_zero():
        save_json(all_metrics,
                  os.path.join(training_args.output_dir, "all_results.json"))

    return all_metrics
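
handle_metrics is used here (and in code example #5) but defined elsewhere in the script; a minimal sketch of what it plausibly does, assuming it only logs the metrics for the given split and saves them as {split}_results.json via save_json (format and filename are assumptions):

def handle_metrics(split, metrics, output_dir):
    """Sketch: log the metrics for `split` (train/val/test) and save them
    as JSON; the real helper may format or filter differently."""
    logger.info("***** %s metrics *****", split)
    for key in sorted(metrics.keys()):
        logger.info("  %s = %s", key, metrics[key])
    save_json(metrics, os.path.join(output_dir, f"{split}_results.json"))
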
Code example #3
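Distributed-generation driver: every process evaluates its shard and writes rank-local JSON files, then the rank-0 process gathers them, computes BLEU or ROUGE against the reference targets, and saves the generations and metrics.
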
def run_generate():
    parser = argparse.ArgumentParser(
        epilog=
        "Unspecified args like --num_beams=2 --decoder_start_token_id=4 are passed to model.generate"
    )
    parser.add_argument("--data_dir", type=str, help="like cnn_dm/test.source")
    parser.add_argument(
        "--model_name",
        type=str,
        help="like facebook/bart-large-cnn,t5-base, etc.",
        default="sshleifer/distilbart-xsum-12-3",
    )
    parser.add_argument("--save_dir",
                        type=str,
                        help="where to save",
                        default="tmp_gen")
    parser.add_argument("--max_source_length", type=int, default=None)
    parser.add_argument(
        "--type_path",
        type=str,
        default="test",
        help="which subset to evaluate typically train/val/test")
    parser.add_argument("--reference_path",
                        type=str,
                        required=False,
                        help="like cnn_dm/test.target")
    parser.add_argument("--task",
                        type=str,
                        default="summarization",
                        help="used for task_specific_params + metrics")
    parser.add_argument("--bs",
                        type=int,
                        default=8,
                        required=False,
                        help="batch size")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        required=False,
                        help="should be passed by distributed.launch")

    parser.add_argument("--n_obs",
                        type=int,
                        default=None,
                        required=False,
                        help="How many observations. Defaults to all.")
    parser.add_argument(
        "--sync_timeout",
        type=int,
        default=600,
        required=False,
        help=
        "How long the master process should wait for the other processes to finish.",
    )
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--debug", action="store_true")
    start_time = time.time()
    args, rest = parser.parse_known_args()
    generate_kwargs = parse_numeric_n_bool_cl_kwargs(rest)
    if generate_kwargs and args.local_rank <= 0:
        print(f"parsed the following generate kwargs: {generate_kwargs}")
    json_save_dir = Path(args.save_dir + "_tmp")
    json_save_dir.mkdir(exist_ok=True)  # this handles locking.
    intermediate_files = list(json_save_dir.glob("rank_*.json"))
    if intermediate_files:
        raise ValueError(
            f"Found files at {json_save_dir} please move or remove them.")
        # In theory, a node could finish and save before another node hits this. If this happens, we can address later.

    Path(args.save_dir).mkdir(exist_ok=True)
    results, num_replicas = eval_data_dir(
        args.data_dir,
        json_save_dir,
        args.model_name,
        type_path=args.type_path,
        bs=args.bs,
        fp16=args.fp16,
        task=args.task,
        local_rank=args.local_rank,
        n_obs=args.n_obs,
        max_source_length=args.max_source_length,
        **generate_kwargs,
    )

    if args.local_rank <= 0:
        save_dir = Path(args.save_dir)
        save_dir.mkdir(exist_ok=True)
        partial_results = gather_results_from_each_node(
            num_replicas, json_save_dir, args.sync_timeout)
        preds = combine_partial_results(partial_results)
        tgt_file = Path(args.data_dir).joinpath(args.type_path + ".target")
        labels = [x.rstrip()
                  for x in tgt_file.read_text().splitlines()][:len(preds)]

        # Calculate metrics, save metrics,  and save _generations.txt
        calc_bleu = "translation" in args.task
        score_fn = calculate_bleu if calc_bleu else calculate_rouge
        metric_name = "bleu" if calc_bleu else "rouge"
        metrics: Dict = score_fn(preds, labels)
        metrics["n_obs"] = len(preds)
        runtime = time.time() - start_time
        metrics["seconds_per_sample"] = round(runtime / metrics["n_obs"], 4)
        metrics["n_gpus"] = num_replicas
        # TODO(@stas00): add whatever metadata to metrics
        metrics_save_path = save_dir.joinpath(
            f"{args.type_path}_{metric_name}.json")
        save_json(metrics, metrics_save_path, indent=None)
        print(metrics)
        write_txt_file(preds,
                       save_dir.joinpath(f"{args.type_path}_generations.txt"))
        if args.debug:
            write_txt_file(labels,
                           save_dir.joinpath(f"{args.type_path}.target"))
        else:
            shutil.rmtree(json_save_dir)
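
parse_numeric_n_bool_cl_kwargs comes from the script's utils and is not shown here; a rough sketch of the behavior the epilog describes, assuming leftover arguments arrive as --key=value tokens with numeric or true/false values (the real helper may accept other forms):

def parse_numeric_n_bool_cl_kwargs(unparsed_args):
    """Sketch: turn leftover args such as --num_beams=2 --early_stopping=true
    into {"num_beams": 2, "early_stopping": True}."""
    result = {}
    for arg in unparsed_args:
        assert arg.startswith("--") and "=" in arg, f"cannot parse {arg}"
        key, value = arg[2:].split("=", 1)
        if value.lower() in ("true", "false"):
            result[key] = value.lower() == "true"
        else:
            try:
                result[key] = int(value)
            except ValueError:
                result[key] = float(value)  # non-numeric values raise here
    return result
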
Code example #4
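A variant of the fine-tuning entry point that uses the Auto* classes (AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM), performs the output-directory check inline, and writes eval_results.json / test_results.json per phase.
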
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout",
                          "attention_dropout")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(
                config, p
            ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(
            tokenizer, MBartTokenizer):
        assert (data_args.tgt_lang is not None and data_args.src_lang
                is not None), "mBart requires --tgt_lang and --src_lang"
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
            data_args.tgt_lang]

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset if hasattr(
        tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset

    # Get datasets
    train_dataset = (dataset_class(
        tokenizer,
        type_path="train",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_train,
        max_target_length=data_args.max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_train else None)
    eval_dataset = (dataset_class(
        tokenizer,
        type_path="val",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_val,
        max_target_length=data_args.val_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_eval or
                    training_args.evaluation_strategy != EvaluationStrategy.NO
                    else None)
    test_dataset = (dataset_class(
        tokenizer,
        type_path="test",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_test,
        max_target_length=data_args.test_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_predict else None)

    # Initialize our Trainer
    compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer)
                          if training_args.predict_with_generate else None)
    trainer = Seq2SeqTrainer(
        model=model,
        config=config,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args,
                                          training_args.tpu_num_cores),
        compute_metrics=compute_metrics_fn,
        data_args=data_args,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=(model_args.model_name_or_path
                        if os.path.isdir(model_args.model_name_or_path)
                        else None))
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        if trainer.is_world_process_zero():
            logger.info("***** Eval results *****")
            for key, value in result.items():
                logger.info("  %s = %s", key, value)
            save_json(
                result,
                os.path.join(training_args.output_dir, "eval_results.json"))
            eval_results.update(result)

    if training_args.do_predict:
        logging.info("*** Test ***")

        test_output = trainer.predict(test_dataset=test_dataset)
        test_metrics = {
            k.replace("eval", "test"): v
            for k, v in test_output.metrics.items()
        }

        if trainer.is_world_process_zero():
            logger.info("***** Test results *****")
            for key, value in test_metrics.items():
                logger.info("  %s = %s", key, value)

            save_json(
                test_metrics,
                os.path.join(training_args.output_dir, "test_results.json"))
            eval_results.update(test_metrics)

            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(
                    test_output.predictions,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True)
                test_preds = lmap(str.strip, test_preds)
                write_txt_file(
                    test_preds,
                    os.path.join(training_args.output_dir,
                                 "test_generations.txt"))

    if trainer.is_world_process_zero():
        save_json(eval_results,
                  os.path.join(training_args.output_dir, "all_results.json"))
    return eval_results
Code example #5
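The fine-tuning entry point extended with Intel LPOT hooks: --tune runs accuracy-driven post-training quantization, while --benchmark and --accuracy_only evaluate an optionally int8-quantized checkpoint before the usual eval/predict phases.
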
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    check_output_dir(training_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout",
                          "attention_dropout")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(
                config, p
            ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(
            tokenizer, MBartTokenizer):
        assert (data_args.tgt_lang is not None and data_args.src_lang
                is not None), "mBart requires --tgt_lang and --src_lang"
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
            data_args.tgt_lang]

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset

    # Get datasets
    train_dataset = (dataset_class(
        tokenizer,
        type_path="train",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_train,
        max_target_length=data_args.max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_train else None)
    eval_dataset = (dataset_class(
        tokenizer,
        type_path="val",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_val,
        max_target_length=data_args.val_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_eval or
                    training_args.evaluation_strategy != EvaluationStrategy.NO
                    else None)
    test_dataset = (dataset_class(
        tokenizer,
        type_path="test",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_test,
        max_target_length=data_args.test_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_predict else None)

    # Initialize our Trainer
    compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer)
                          if training_args.predict_with_generate else None)
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args,
                                          training_args.tpu_num_cores),
        compute_metrics=compute_metrics_fn,
        tokenizer=tokenizer,
    )

    all_metrics = {}
    # Training
    if training_args.do_train:
        logger.info("*** Train ***")

        train_result = trainer.train(
            model_path=(model_args.model_name_or_path
                        if os.path.isdir(model_args.model_name_or_path)
                        else None))
        metrics = train_result.metrics
        metrics["train_n_objs"] = data_args.n_train

        trainer.save_model()  # this also saves the tokenizer

        if trainer.is_world_process_zero():
            handle_metrics("train", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            tokenizer.save_pretrained(training_args.output_dir)

    if training_args.tune:

        def eval_func_for_lpot(model):
            trainer.model = model
            results = trainer.evaluate(
                eval_dataset=eval_dataset,
                metric_key_prefix="val",
                max_length=data_args.val_max_target_length,
                num_beams=data_args.eval_beams)
            assert data_args.task.startswith("summarization") or data_args.task.startswith("translation") , \
                "data_args.task should startswith summarization or translation"
            task_metrics_keys = [
                'val_bleu', 'val_rouge1', 'val_rouge2', 'val_rougeL',
                'val_rougeLsum'
            ]
            for key in task_metrics_keys:
                if key in results.keys():
                    logger.info("Finally Eval {}:{}".format(key, results[key]))
                    if 'bleu' in key:
                        acc = results[key]
                        break
                    if 'rouge' in key:
                        acc = sum(
                            [v
                             for k, v in results.items() if "rouge" in k]) / 4
                        break
            return acc

        from lpot.experimental import Quantization, common
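        # Accuracy-driven post-training quantization: LPOT reads the tuning
        # strategy and accuracy criterion from conf.yaml, calibrates the model
        # on calib_dataloader, and scores candidate configs with eval_func.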
        quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        quantizer.calib_dataloader = common.DataLoader(
            eval_dataset,
            batch_size=training_args.eval_batch_size,
            collate_fn=Seq2SeqDataCollator_lpot(tokenizer, data_args,
                                                training_args.tpu_num_cores))
        quantizer.eval_func = eval_func_for_lpot
        q_model = quantizer()
        q_model.save(training_args.tuned_checkpoint)
        exit(0)

    if training_args.benchmark:
        if training_args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(
                    os.path.expanduser(training_args.tuned_checkpoint)), model)
        else:
            new_model = model
        trainer.model = new_model
        results = trainer.evaluate(
            eval_dataset=eval_dataset,
            metric_key_prefix="val",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
            iters=training_args.iters,
            warmup_iter=training_args.warmup_iter,
        )
        if data_args.task.startswith("summarization"):
            print('Accuracy: %.4f' %
                  (sum([v for k, v in results.items() if "rouge" in k]) / 4))
        if data_args.task.startswith("translation"):
            print('Accuracy: %.4f' % (results['val_bleu']))
        print('Throughput: %.3f samples/sec' %
              (results["val_samples_per_second"]))
        print('Latency: %.3f ms' %
              (1000 / results["val_samples_per_second"]))
        print('Batch size = %d' % training_args.per_device_eval_batch_size)
        exit(0)

    if training_args.accuracy_only:
        if training_args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(
                    os.path.expanduser(training_args.tuned_checkpoint)), model)
        else:
            new_model = model
        trainer.model = new_model
        results = trainer.evaluate(
            eval_dataset=eval_dataset,
            metric_key_prefix="val",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        if data_args.task.startswith("summarization"):
            print('Accuracy: %.4f' %
                  (sum([v for k, v in results.items() if "rouge" in k]) / 4))
        if data_args.task.startswith("translation"):
            print('Accuracy: %.4f' % (results['val_bleu']))
        print('Latency: %.3f ms' %
              (1000 / results["val_samples_per_second"]))
        print('Batch size = %d' % training_args.per_device_eval_batch_size)
        exit(0)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(
            metric_key_prefix="val",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics["val_n_objs"] = data_args.n_val
        metrics["val_loss"] = round(metrics["val_loss"], 4)

        if trainer.is_world_process_zero():

            handle_metrics("val", metrics, training_args.output_dir)
            all_metrics.update(metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_output = trainer.predict(
            test_dataset=test_dataset,
            metric_key_prefix="test",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics = test_output.metrics
        metrics["test_n_objs"] = data_args.n_test

        if trainer.is_world_process_zero():
            metrics["test_loss"] = round(metrics["test_loss"], 4)
            handle_metrics("test", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(
                    test_output.predictions,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True)
                test_preds = lmap(str.strip, test_preds)
                write_txt_file(
                    test_preds,
                    os.path.join(training_args.output_dir,
                                 "test_generations.txt"))

    if trainer.is_world_process_zero():
        save_json(all_metrics,
                  os.path.join(training_args.output_dir, "all_results.json"))

    return all_metrics
Code example #6
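Fragment of an ALU test-vector generator: the tail of alu_data_gen() (src1, src2, and mod are built in the elided part of the function), followed by top-level code that renders the vectors as binary strings via the repo-local itb helper and writes them to a .tv file.
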
    rst_vecfunc = np.vectorize(make_result)
    brc_vecfunc = np.vectorize(make_branch)
    ovf_vecfunc = np.vectorize(make_overflow)

    rst = rst_vecfunc(src1, src2, mod)
    brc = brc_vecfunc(src1, src2, mod)
    ovf = ovf_vecfunc(src1, src2, mod)

    return [src1, src2, mod, rst, brc, ovf]


data_list = alu_data_gen()
binary_data = [
    itb(data_list[0:2], 8),
    itb([data_list[2]], 3),
    itb([data_list[3]], 8),
    itb(data_list[4:], 1)
]

# print(binary_data)
# print( itb(binary_data, 1))

for i in range(data_list[0].shape[0]):
    print(
        f'src1 : {data_list[0][i]}\t src2 : {data_list[1][i]}\t mod : {tell_mod(data_list[2][i])}\n \
    result : {data_list[3][i]}\n')

concated_binary = itb(binary_data, 1)

utils.write_txt_file('test1.tv', concated_binary)
Code example #7
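Standalone test-vector generator: draws random signed operands, a mode selector, and placeholder results, then writes sample binary vectors using the repo-local vec_bin_array and write_txt_file helpers.
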
import numpy as np
import numpy.random as rd

import utils
from utils import vec_bin_array


def random_datagen(num):

    weight1 = rd.randint(-2**30, 2**30, num)
    weight2 = rd.randint(-2**30, 2**30, num)

    act1 = rd.randint(-2**30, 2**30, num)
    act2 = rd.randint(-2**30, 2**30, num)

    mod = rd.randint(0, 3, num, dtype=np.int32)

    result1 = rd.randint(-2**30, 2**30, num)
    result2 = rd.randint(-2**30, 2**30, num)

    print(weight1[0], weight2[0])

    return weight1, weight2, act1, act2, mod, result1, result2


weight1, weight2, act1, act2, mod, result1, result2 = random_datagen(4)

# utils.write_txt_file(4, 'test.tv', vec_bin_array(weight1, 64), vec_bin_array(weight2, 64), vec_bin_array(act1, 64), vec_bin_array(act2, 64), vec_bin_array(mod, 3), vec_bin_array(result1, 64), vec_bin_array(result2, 64)    )

utils.write_txt_file(1, 'test3.tv',
                     vec_bin_array(np.array([-5], dtype=np.int8), 8),
                     vec_bin_array(np.array([0]), 4))
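
vec_bin_array is imported from the local utils module and is not shown; a plausible stand-in, assuming it maps each integer in an array to its m-bit two's-complement bit vector (this sketch is an assumption, not the repo's implementation):

import numpy as np

def vec_bin_array(arr, m):
    """Sketch: convert each int in arr to an m-bit two's-complement bit
    vector, returning an int8 array of shape arr.shape + (m,)."""
    strs = np.vectorize(lambda x: np.binary_repr(int(x), width=m))(np.asarray(arr))
    out = np.zeros(list(strs.shape) + [m], dtype=np.int8)
    for bit in range(m):
        out[..., bit] = np.vectorize(lambda s: s[bit] == "1")(strs)
    return out

# e.g. vec_bin_array(np.array([-5], dtype=np.int8), 8) -> [[1 1 1 1 1 0 1 1]]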