Example #1
File: dry_run.py Project: lxindex/allennlp
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    instances = [
        instance for key, dataset in all_datasets.items()
        for instance in dataset if key in datasets_for_vocab_creation
    ]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)

    stdout_handler = prepare_global_logging(serialization_dir, False)

    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    cleanup_global_logging(stdout_handler)
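
A minimal driver for the dry run above might look like the sketch below; the config path is an illustrative assumption, and Params.from_file is the usual AllenNLP way to load an experiment file (as in the later examples).

# Sketch only: load an experiment config and dry-run it.
from allennlp.common.params import Params

params = Params.from_file("experiment.jsonnet")  # hypothetical config path
dry_run_from_params(params, serialization_dir="runs/dry_run")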
Example #2
def main(args):
    params = Params.from_file(args.config_path)
    stdout_handler = prepare_global_logging(args.output_dir, False)
    prepare_environment(params)

    reader = DatasetReader.from_params(params["dataset_reader"])
    train_dataset = reader.read(params.pop("train_data_path", None))
    valid_dataset = reader.read(params.pop("validation_data_path", None))
    test_data_path = params.pop("test_data_path", None)
    if test_data_path:
        test_dataset = reader.read(test_data_path)
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset +
                                          test_dataset)
    else:
        test_dataset = None
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

    model_params = params.pop("model", None)
    model = Model.from_params(model_params.duplicate(), vocab=vocab)
    vocab.save_to_files(os.path.join(args.output_dir, "vocabulary"))
    # copy config file
    with open(args.config_path, "r", encoding="utf-8") as f_in:
        with open(os.path.join(args.output_dir, "config.json"),
                  "w",
                  encoding="utf-8") as f_out:
            f_out.write(f_in.read())

    iterator = DataIterator.from_params(params.pop("iterator", None))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer", None)
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=args.output_dir,
                                  iterator=iterator,
                                  train_data=train_dataset,
                                  validation_data=valid_dataset,
                                  params=trainer_params.duplicate())
    trainer.train()

    # evaluate on the test set
    if test_dataset:
        logging.info("Evaluating on the test set")
        import torch  # import here to ensure reproducibility of the experiment
        model.load_state_dict(
            torch.load(os.path.join(args.output_dir, "best.th")))
        test_metrics = evaluate(model,
                                test_dataset,
                                iterator,
                                cuda_device=trainer_params.pop(
                                    "cuda_device", 0),
                                batch_weight_key=None)
        logging.info(f"Metrics on the test set: {test_metrics}")
        with open(os.path.join(args.output_dir, "test_metrics.txt"),
                  "w",
                  encoding="utf-8") as f_out:
            f_out.write(f"Metrics on the test set: {test_metrics}")

    cleanup_global_logging(stdout_handler)
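
The main() above reads args.config_path and args.output_dir; a matching argument parser could be sketched as follows (the flag names are assumptions, only the attribute names come from the example).

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--config-path", dest="config_path", required=True)
parser.add_argument("--output-dir", dest="output_dir", required=True)

if __name__ == "__main__":
    main(parser.parse_args())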
Example #3
def elmo_command(args):
    elmo_embedder = ElmoEmbedder(args.options_file, args.weight_file,
                                 args.cuda_device)
    output_format = u""
    if args.all:
        output_format = u"all"
    elif args.top:
        output_format = u"top"
    elif args.average:
        output_format = u"average"

    prepare_global_logging(os.path.realpath(os.path.dirname(args.output_file)),
                           args.file_friendly_logging)

    with torch.no_grad():
        elmo_embedder.embed_file(args.input_file, args.output_file,
                                 output_format, args.batch_size,
                                 args.forget_sentences, args.use_sentence_keys)
Example #4
File: elmo.py Project: apmoore1/allennlp
def elmo_command(args):
    elmo_embedder = ElmoEmbedder(args.options_file, args.weight_file, args.cuda_device)
    output_format = ""
    if args.all:
        output_format = "all"
    elif args.top:
        output_format = "top"
    elif args.average:
        output_format = "average"

    prepare_global_logging(os.path.realpath(os.path.dirname(args.output_file)), args.file_friendly_logging)

    with torch.no_grad():
        elmo_embedder.embed_file(
                args.input_file,
                args.output_file,
                output_format,
                args.batch_size,
                args.forget_sentences,
                args.use_sentence_keys)
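
Both elmo_command variants read the same attributes from args; a plausible argparse setup is sketched below (flag names and defaults are assumptions, the attribute names mirror what the function uses).

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("input_file")
parser.add_argument("output_file")
parser.add_argument("--options-file", dest="options_file", required=True)
parser.add_argument("--weight-file", dest="weight_file", required=True)
parser.add_argument("--cuda-device", dest="cuda_device", type=int, default=-1)
parser.add_argument("--batch-size", dest="batch_size", type=int, default=64)
parser.add_argument("--all", action="store_true")
parser.add_argument("--top", action="store_true")
parser.add_argument("--average", action="store_true")
parser.add_argument("--file-friendly-logging", dest="file_friendly_logging", action="store_true")
parser.add_argument("--forget-sentences", dest="forget_sentences", action="store_true")
parser.add_argument("--use-sentence-keys", dest="use_sentence_keys", action="store_true")

elmo_command(parser.parse_args())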
Example #5
def train_model(params: Params,
                serialization_dir: str,
                selector: str,
                num_ensemble_models: Optional[int],
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model_params = params.pop('model')
    if selector == 'qbc':
        assert num_ensemble_models is not None
        models_list = [Model.from_params(vocab=vocab, params=model_params.duplicate()) for i in range(num_ensemble_models)]
        ensemble_model = CorefEnsemble(models_list)
        model = ensemble_model.submodels[0]
    else:
        model = Model.from_params(vocab=vocab, params=model_params)
        ensemble_model = None

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None
    held_out_iterator_params = params.pop("held_out_iterator", None)
    if held_out_iterator_params:
        held_out_iterator = DataIterator.from_params(held_out_iterator_params)
        held_out_iterator.index_with(vocab)
    else:
        held_out_iterator = None

    train_data = all_datasets['train']
    held_out_train_data = all_datasets.get('held_out_train')
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop("type")
    trainer = ALCorefTrainer.by_name(trainer_choice).from_params(model=model,
                                                                serialization_dir=serialization_dir,
                                                                iterator=iterator,
                                                                train_data=train_data,
                                                                held_out_train_data=held_out_train_data,
                                                                validation_data=validation_data,
                                                                params=trainer_params,
                                                                validation_iterator=validation_iterator,
                                                                held_out_iterator=held_out_iterator,
                                                                ensemble_model=ensemble_model)
    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics, query_info = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    best_model = None
    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)
    
    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0],
            batch_weight_key="",
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    return best_model, metrics, query_info
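
Despite the -> Model annotation, this variant returns a (model, metrics, query_info) tuple. An illustrative call (the config path and selector value are assumptions, not taken from the example):

from allennlp.common.params import Params

params = Params.from_file("coref_experiment.jsonnet")  # hypothetical config
best_model, metrics, query_info = train_model(params,
                                              serialization_dir="runs/coref_qbc",
                                              selector="qbc",
                                              num_ensemble_models=3)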
Example #6
def debug_vocab(parameter_filename: str,
                serialization_dir: str,
                overrides: str = "",
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    A wrapper around :func:`train_model` which loads the params from a file.

    Parameters
    ----------
    parameter_filename : ``str``
        A json parameter file specifying an AllenNLP experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs. We just pass this along to
        :func:`train_model`.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`train_model`.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    """
    # Load the experiment config from a file and pass it to ``train_model``.
    params = Params.from_file(parameter_filename, overrides)

    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.get('trainer').get('cuda_device', -1))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    vocab = model.vocab
    vocab_namespace_dict = vocab._token_to_index
    vocab_oov_token = vocab._oov_token
    vocab_non_padded_namespaces = vocab._non_padded_namespaces  # this is a set

    vocab_tokens_dict = vocab_namespace_dict['tokens']
    vocab_labels_dict = vocab_namespace_dict['labels']

    print()
    print("Vocab's OOV token: " + vocab_oov_token)
    print("Non-padded namespaces in vocab: " +
          str(list(vocab_non_padded_namespaces)))
    print()

    print("Number of words in vocab's tokens dict: " +
          str(len(vocab_tokens_dict)))
    if any(
            namespace_match(pattern, 'tokens')
            for pattern in vocab_non_padded_namespaces):
        is_padded = False
    else:
        is_padded = True
    print("tokens will return True for is_padded: " + str(is_padded))
    print("Vocab's OOV token is in its tokens dict (should be True): " +
          str(vocab_oov_token in vocab_tokens_dict))
    print()

    print("Number of words in vocab's labels dict: " +
          str(len(vocab_labels_dict)))
    if any(
            namespace_match(pattern, 'labels')
            for pattern in vocab_non_padded_namespaces):
        is_padded = False
    else:
        is_padded = True
    print("labels will return True for is_padded: " + str(is_padded))
    print("Vocab's OOV token is in its labels dict (should be False): " +
          str(vocab_oov_token in vocab_labels_dict))
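
Since debug_vocab loads the config file itself, a call only needs the parameter file and an output directory; both paths below are illustrative.

debug_vocab("experiments/classifier.jsonnet", serialization_dir="runs/vocab_debug")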
Example #7
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

############################################################################################################################################
    prev_state_dict = torch.load("/home/ubuntu/Hurricanes/model/bestS.th", map_location='cpu')
    for n, p in model.named_parameters():
        if (
                n in prev_state_dict
                and n != 'linear.weight'
                and n != 'linear.bias'
                and n != 'classifier_feedforward._linear_layers.1.weight'
                and n != 'classifier_feedforward._linear_layers.1.bias'
                and n != 'classifier_feedforward._linear_layers.0.weight'
                and n != 'classifier_feedforward._linear_layers.0.bias'
        ):
            w = prev_state_dict[n]
            p.data.copy_(w.data)




    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
                best_model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
Example #8
    is_master_rank = (dist.get_rank() == args.local_rank)

    serialize_config_file = os.path.join(serialization_dir, CONFIG_NAME)
    recover = os.path.exists(serialize_config_file)
    if is_master_rank:
        if not os.path.exists(serialization_dir):
            os.makedirs(serialization_dir, exist_ok=True)
            params = ConstParams.from_file(param_fil)
            params.to_file(serialize_config_file)
    dist.barrier()
    params = ConstParams.from_file(serialize_config_file)

    log_dir = os.path.join(serialization_dir, str(dist.get_rank()))
    os.makedirs(log_dir, exist_ok=True)
    stdout_handler = prepare_global_logging(log_dir,
                                            file_friendly_logging=False)
    prepare_environment(params)

    cuda_device = params.trainer.get('cuda_device', -1)
    check_for_gpu(cuda_device)

    trainer_type = params.trainer.type

    trainer = TrainerBase.from_params(params, serialization_dir, recover)
    params_cnt, params_trainable_cnt = count_parameters(trainer.model)
    print("all params cnt: ", params_cnt)
    print("all trainable params cnt: ", params_trainable_cnt)

    metrics = trainer.train()

    cleanup_global_logging(stdout_handler)
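
This snippet assumes torch.distributed is already initialized (it calls dist.get_rank() and dist.barrier()). A minimal setup sketch, with the backend and the env:// launch method as assumptions, would run once per process before the per-rank log directories are created:

import torch.distributed as dist

# Typically done once per worker, e.g. when launched via torchrun / torch.distributed.launch.
dist.init_process_group(backend="nccl", init_method="env://")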
Example #9
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
) -> Model:
    create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    stdout_handler = prepare_global_logging(serialization_dir,
                                            file_friendly_logging)
    prepare_environment(params)

    cuda_device = params.params.get("trainer").get("cuda_device", -1)
    check_for_gpu(cuda_device)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if True:
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover,
                                           cache_directory, cache_prefix)
        logger.info("Using MultiTrainer")
        from lm.trainining.MultiTaskTrainer import MultiTaskTrainer
        # MultiTrainer
        trainer = MultiTaskTrainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
        )

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    else:
        if evaluate_on_test:
            raise ValueError(
                "--evaluate-on-test only works with the default Trainer. "
                "If you're using the CallbackTrainer you can use a callback "
                "to evaluate at Events.TRAINING_END; otherwise you'll have "
                "to run allennlp evaluate separately.")
        """
        The only main difference
        """
        print("Using MultuTrainer")
        logger.info("Using MultiTrainer")
        trainer = MultiTrainer.from_params(params, serialization_dir, recover,
                                           cache_directory, cache_prefix)
        evaluation_dataset = None

    params.assert_empty("base train command")

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],
            # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
            batch_weight_key="",
        )

        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    cleanup_global_logging(stdout_handler)

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics,
                 log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
Example #10
def train_model(params,
                serialization_dir,
                file_friendly_logging=False,
                recover=False,
                model="bidaf"):
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    """
    print("Starting training models...")
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    print("get all of the dataset.")
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    print("creatig vocaburary...")
    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    if model == "self":
        model = BiDAFSelfAttention.from_params(vocab, params.pop("model"))
    else:
        model = BidirectionalAttentionFlow.from_params(vocab,
                                                       params.pop("model"))
    print("Initialized a BiDAF model.")
    # This is for debugging.
    print(model)
    print(serialization_dir)

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    print("create iterator")

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    print("initalizing a trainer")
    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model,
                                test_data,
                                iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"),
              "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
Example #11
parser.add_argument("-a", "--alpha", type=float, help="SIA alpha")
parser.add_argument("-b", "--beta", type=float, help="SIA alpha")
parser.add_argument("-o", "--output_dir", help="SIA alpha", default="results")
args = parser.parse_args()

# put it at the start of the file
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    level=logging.INFO)
logger = logging.getLogger(__name__)

create_serialization_dir(Params({"seed": 123}),
                         args.output_dir,
                         recover=False,
                         force=True)
stdout_handler = prepare_global_logging(serialization_dir=args.output_dir,
                                        file_friendly_logging=False)
checkpointer = Checkpointer(args.output_dir,
                            keep_serialized_model_every_num_seconds=3600)

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")


class SIAHeadlineDataset(Dataset):
    def __init__(self, data):
        super().__init__()
        self.dataset = data
        self.CLS = [101]
        self.SEP = [102]

    def __len__(self):
        return len(self.dataset)
Example #12
File: test.py Project: wjn922/allennlp
def main():
	###############################################################################################
	prepare_global_logging(serialization_dir=args.serialization_dir, file_friendly_logging=False)
	#DATA
	reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
	                        target_tokenizer=CharacterTokenizer(),
	                        source_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
	                        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
	                        target=False,
	                        label=True,
	                        lazy=False)
	# train_data = reader.read("../../datasets/math/label-data/train-all")
	# val_data = reader.read("../../datasets/math/label-data/interpolate")
	val_data = reader.read("./generate_files")


	vocab = Vocabulary()
	vocab.add_tokens_to_namespace([START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-', '.', '/',
	                                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?',
	                                    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
	                                    'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b',
	                                    'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
	                                    'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}'], namespace='tokens')
	vocab.add_tokens_to_namespace(['algebra', 'arithmetic', 'calculus', 'comparison',
	  								 'measurement', 'numbers', 'polynomials', 'probability'], namespace='labels')



	# MODEL
	embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
	                             embedding_dim=EMBEDDING_DIM)
	source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

	if args.model == 'lstm':
		encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, 
											num_layers=NUM_LAYERS, batch_first=True))
	elif args.model == 'cnn':
		encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=NUM_FILTERS, output_dim=HIDDEN_DIM)
	else:
		raise NotImplementedError("The classifier model should be LSTM or CNN")


	model = TextClassifier(vocab=vocab,
				source_text_embedder=source_embedder,
	            encoder=encoder,
	            )
	model.to(device)


	if not Path(args.serialization_dir).exists() or not Path(args.serialization_dir).is_dir():
		raise NotImplementedError("The model seems not to exist")
	with open(Path(args.serialization_dir) / "best.th", "rb") as model_path:
		model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1))
		model.load_state_dict(model_state)
	model.eval()

	predictor = TextClassifierPredictor(model, dataset_reader=reader)

	# TEST
	correct = 0
	total = 0

	pbar = tqdm(val_data)
	batch_instance = list()
	batch_gt = list()

	idx_last = 0
	for idx, instance in enumerate(pbar):
		if idx != (idx_last + BATCH_SIZE):
			batch_instance.append(instance)
			batch_gt.append(instance.fields["labels"].label) # str
		else:
			idx_last = idx
			outputs = predictor.predict(batch_instance)
			for i, output in enumerate(outputs):
				if batch_gt[i] == output['predict_labels']:
					correct += 1
				total += 1
			batch_instance = list()
			batch_gt = list()
			pbar.set_description("correct/total %.3f" % (correct / total))
Example #13
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    extend_vocab: bool = False,
                    file_friendly_logging: bool = False,
                    batch_weight_key: str = "",
                    embedding_sources_mapping: Dict[str, str] = None,
                    in_fold = None,
                    num_folds = None,
                    ewc_weight=None) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment
    serialization_dir : ``str``
        The directory in which to save results and logs.
    extend_vocab: ``bool``, optional (default=False)
        If ``True``, we use the new instances to extend your vocabulary.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.
    embedding_sources_mapping: ``Dict[str, str]``, optional (default=None)
        mapping from model paths to the pretrained embedding filepaths
        used during fine-tuning.
    """
    prepare_environment(params)
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f"Serialization directory ({serialization_dir}) "
                                 f"already exists and is not empty.")

    os.makedirs(serialization_dir, exist_ok=True)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop('vocabulary', {})
    if vocabulary_params.get('directory_path', None):
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored. ")

    all_datasets = datasets_from_params(params)
    vocab = model.vocab

    if extend_vocab:
        datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
        vocab.extend_from_instances(vocabulary_params,
                                    (instance for key, dataset in all_datasets.items()
                                     for instance in dataset
                                     if key in datasets_for_vocab_creation))

        model.extend_embedder_vocab(embedding_sources_mapping)

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    dl_params = params.pop("data_loader")
    if test_data is not None:
        rand = random.Random(1234)
        test_data.index_with(vocab)
        shuffled_test = copy(test_data.instances)
        rand.shuffle(shuffled_test)
        extra_test = shuffled_test[:2000]

        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": AllennlpDataset(extra_test, vocab)})
        extra_test_loader = DataLoader.from_params(params.pop("test_data_loader", keys))

        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": test_data})
        test_loader = DataLoader.from_params(params.pop("test_data_loader", keys))

    master_model = model
    global_metrics = {}
    training_metrics = []
    final_metrics = {}
    master_trainer = trainer_params.as_dict()

    if num_folds is not None:

        rand = random.Random(1234)

        fold_train = []
        fold_test = []

        fold_train_loader = []
        fold_test_loader = []

        shuffled_instances = copy(train_data.instances)
        rand.shuffle(shuffled_instances)



        kfold = KFold(n_splits=num_folds, random_state=None, shuffle=False)
        computed_folds = list(kfold.split(shuffled_instances))

        for fold in range(num_folds):
            train_indexes, test_indexes = computed_folds[fold]
            new_train = [shuffled_instances[i] for i in train_indexes]
            new_test = [shuffled_instances[i] for i in test_indexes]
            fold_train.append(AllennlpDataset(new_train, vocab=vocab))
            fold_test.append(AllennlpDataset(new_test, vocab=vocab))

            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": fold_test[-1]})
            fold_test_loader.append(DataLoader.from_params(params.pop("fold_test_data_loader",keys)))

            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": fold_train[-1]})
            fold_train_loader.append(DataLoader.from_params(params.pop("fold_train_data_loader", keys)))

        for fold in ([in_fold] if in_fold is not None else range(num_folds)):
            fold_model = deepcopy(master_model)
            eval_epoch_callback = EvalEpochCallback(fold, fold_test_loader[fold], test_loader, global_metrics)
            callbacks = [eval_epoch_callback]
            if ewc_weight is not None:
                ewc = EWC(extra_test_loader)

                def ewc_forward(*args, **kwargs) -> Dict[str, torch.Tensor]:
                    ewc_loss = 0
                    if ewc.model.training:
                        ewc_loss = ewc.penalty(ewc.model)
                    ret = ewc.model.old_forward(*args, **kwargs)
                    ret["loss"] += ewc_weight * ewc_loss
                    return ret

                fold_model.old_forward = fold_model.forward
                fold_model.forward = ewc_forward
                callbacks.append(CallLossCallback(ewc))

            trainer = Trainer.from_params(model=fold_model,
                                          serialization_dir=serialization_dir,
                                          data_loader=fold_train_loader[fold],
                                          train_data=train_data,
                                          validation_data=None,
                                          params=Params(deepcopy(master_trainer)),
                                          validation_data_loader=None,
                                          epoch_callbacks=callbacks)

            training_metrics.append(trainer.train())
            del fold_model
            del trainer
            del eval_epoch_callback

            state = glob(serialization_dir+"/*.th")
            for file in state:
                logger.info("deleting state - {}".format(file))
                os.unlink(file)
    else:
        callbacks = []
        if ewc_weight is not None:
            ewc = EWC(extra_test_loader)

            def ewc_forward(*args, **kwargs) -> Dict[str, torch.Tensor]:
                ewc_loss = 0
                if ewc.model.training:
                    ewc_loss = ewc.penalty(ewc.model)
                ret = ewc.model.old_forward(*args, **kwargs)
                ret["loss"] += ewc_weight * ewc_loss
                return ret

            model.old_forward = model.forward
            model.forward = ewc_forward
            callbacks.append(CallLossCallback(ewc))

        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": train_data})
        train_data.index_with(vocab)
        train_data_loader = DataLoader.from_params(params.pop("train_loader",keys))

        if validation_data is not None:
            validation_data.index_with(vocab)
            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": validation_data})

            validation_data_loader = DataLoader.from_params(params.pop("validation_loader", keys))
        else:
            validation_data_loader = None

        if "finetune" in dir(model):
            model.finetune()
            logger.info("Fine tuning model")
        trainer = Trainer.from_params(model=model,
                                      serialization_dir=serialization_dir,
                                      data_loader=train_data_loader,
                                      train_data=train_data,
                                      validation_data=None,
                                      params=Params(deepcopy(master_trainer)),
                                      validation_data_loader=validation_data_loader,
                                      epoch_callbacks=callbacks)

        training_metrics = trainer.train()
        archive_model(serialization_dir)

    final_metrics["fine_tune"] = global_metrics
    final_metrics["training"] = training_metrics

    metrics_json = json.dumps(final_metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)
    return model
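
fine_tune_model expects an already-instantiated Model, typically recovered from a training archive. A sketch of such a call, using load_archive from allennlp.models.archival and illustrative paths:

from allennlp.common.params import Params
from allennlp.models.archival import load_archive

archive = load_archive("runs/base_model/model.tar.gz")    # illustrative archive path
fine_tune_params = Params.from_file("fine_tune.jsonnet")  # illustrative config
fine_tune_model(model=archive.model,
                params=fine_tune_params,
                serialization_dir="runs/fine_tuned",
                extend_vocab=True,
                num_folds=5)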
Example #14
File: train.py Project: wjn922/allennlp
def main():
    ###############################################################################################
    prepare_global_logging(serialization_dir=args.serialization_dir,
                           file_friendly_logging=False)
    #DATA
    reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
                               target_tokenizer=CharacterTokenizer(),
                               source_token_indexers={
                                   'tokens':
                                   SingleIdTokenIndexer(namespace='tokens')
                               },
                               target_token_indexers={
                                   'tokens':
                                   SingleIdTokenIndexer(namespace='tokens')
                               },
                               target=False,
                               label=True,
                               lazy=True)
    train_data = reader.read("../../datasets/math/label-data/train-all")
    # val_data = reader.read("../../datasets/math/label-data/interpolate")

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([
        START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-',
        '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<',
        '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
        'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y',
        'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{',
        '}'
    ],
                                  namespace='tokens')
    vocab.add_tokens_to_namespace([
        'algebra', 'arithmetic', 'calculus', 'comparison', 'measurement',
        'numbers', 'polynomials', 'probability'
    ],
                                  namespace='labels')

    # MODEL
    embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                          embedding_dim=EMBEDDING_DIM)
    source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

    if args.model == 'lstm':
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(EMBEDDING_DIM,
                          HIDDEN_DIM,
                          num_layers=NUM_LAYERS,
                          batch_first=True))
    elif args.model == 'cnn':
        encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             output_dim=HIDDEN_DIM)
    else:
        raise NotImplementedError("The classifier model should be LSTM or CNN")

    model = TextClassifier(
        vocab=vocab,
        source_text_embedder=source_embedder,
        encoder=encoder,
    )
    model.to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=1e-3,
                           betas=(0.9, 0.995),
                           eps=1e-6)

    train_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                    max_instances_in_memory=1024,
                                    sorting_keys=[("source_tokens",
                                                   "num_tokens")])
    train_iterator = MultiprocessIterator(train_iterator, num_workers=16)
    train_iterator.index_with(vocab)

    val_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                  max_instances_in_memory=1024,
                                  sorting_keys=[("source_tokens", "num_tokens")
                                                ])
    val_iterator = MultiprocessIterator(val_iterator, num_workers=16)
    val_iterator.index_with(vocab)
    #pdb.set_trace()

    LR_SCHEDULER = {"type": "exponential", "gamma": 0.5, "last_epoch": -1}
    lr_scheduler = LearningRateScheduler.from_params(optimizer,
                                                     Params(LR_SCHEDULER))

    # TRAIN
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=train_iterator,
                      validation_iterator=None,
                      train_dataset=train_data,
                      validation_dataset=None,
                      patience=None,
                      shuffle=True,
                      num_epochs=1,
                      summary_interval=100,
                      learning_rate_scheduler=lr_scheduler,
                      cuda_device=CUDA_DEVICES,
                      grad_norm=5,
                      grad_clipping=5,
                      model_save_interval=600,
                      serialization_dir=args.serialization_dir,
                      keep_serialized_model_every_num_seconds=3600,
                      should_log_parameter_statistics=True,
                      should_log_learning_rate=True)
    trainer.train()
Example #15
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                del_models: bool = False,
                del_vocab: bool = False,
                convert: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    del_models : ``bool``, optional (default=False)
        If ``True``, we will delete existing models and logs if they already exist.
    del_vocab : ``bool``, optional (default=False)
        If ``True``, we will delete existing vocabulary if it already exists.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if cuda_device >= 0:
        check_for_gpu(cuda_device)
        torch.cuda.set_device(cuda_device)

    # Sometimes we might change the config a bit but still want to continue training
    # if recover:
    #     create_serialization_dir(
    #         params, serialization_dir, recover, del_models)
    if del_models:
        for path in glob(f'{serialization_dir}/*'):
            if os.path.isfile(path) and not path.endswith('config.yaml'):
                os.remove(path)
        log_path = f'{serialization_dir}/log'
        if os.path.isdir(log_path):
            shutil.rmtree(log_path)
    if del_vocab:
        vocab_path = f'{serialization_dir}/vocabulary'
        if os.path.isdir(vocab_path):
            shutil.rmtree(vocab_path)

    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == 'default':
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.corpus.train,
            validation_data=pieces.corpus.valid,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator)
        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.corpus.test
        batch_weight_key = pieces.batch_weight_key

    elif trainer_type == 'trainer_fp16_single':
        params.get("trainer").pop('type')
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member
        trainer = TrainerF16SingleTask.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            files_to_archive=params.files_to_archive,
            iterator=pieces.iterator,
            train_data=pieces.corpus.train,
            validation_data=pieces.corpus.valid,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator)
        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.corpus.test
        batch_weight_key = pieces.batch_weight_key

    else:
        trainer = TrainerBase.from_params(params, serialization_dir, recover)
        # TODO(joelgrus): handle evaluation in the general case
        evaluation_iterator = evaluation_dataset = None

    params.assert_empty('base train command')

    if convert:
        logging.info('In conversion mode.')
        trainer._save_checkpoint(epoch=0)
        create_model_archive(serialization_dir, params)
        sys.exit(0)

    try:
        metrics = trainer.train()
    except (KeyboardInterrupt, RuntimeError):
        # if we have completed an epoch, try to create a model archive.
        logging.info("Training stopped. Attempting to create "
                     "a model archive using the current best epoch weights.")
        create_model_archive(serialization_dir, params)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
            # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
            batch_weight_key=batch_weight_key)

        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics,
                 log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
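A minimal sketch of how this train_model variant might be driven from a config file; Params.from_file is the standard AllenNLP helper, while the config path and output directory below are hypothetical placeholders rather than values taken from the example above.

# Hypothetical driver for the train_model defined above (paths are placeholders).
from allennlp.common.params import Params

if __name__ == "__main__":
    params = Params.from_file("experiments/my_experiment.jsonnet")
    best_model = train_model(params,
                             serialization_dir="output/my_experiment",
                             file_friendly_logging=True,
                             recover=False,
                             del_models=False,
                             del_vocab=False,
                             convert=False)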
Example #16
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    """
    prepare_environment(params)
    os.makedirs(serialization_dir)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning(
            "You passed parameters for the model in your configuration file, but we "
            "are ignoring them, using instead the model parameters in the archive."
        )

    if params.pop('vocabulary', None):
        logger.warning(
            "You passed parameters for the vocabulary in your configuration file, but "
            "we are ignoring them, using instead the vocabulary from the saved model."
        )

    vocab = model.vocab
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    all_datasets = datasets_from_params(params)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Fine-tuning interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model,
                                test_data,
                                iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"),
              "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
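A minimal sketch of calling this fine_tune_model on a model restored from a previous training run; load_archive and Params are the standard AllenNLP helpers, and the archive, config, and output paths are hypothetical placeholders.

# Hypothetical driver for the fine_tune_model defined above (paths are placeholders).
from allennlp.common.params import Params
from allennlp.models.archival import load_archive

archive = load_archive("output/base_run/model.tar.gz")
fine_tune_params = Params.from_file("experiments/fine_tune.jsonnet")
fine_tuned_model = fine_tune_model(model=archive.model,
                                   params=fine_tune_params,
                                   serialization_dir="output/fine_tune_run",
                                   file_friendly_logging=True)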
Example #17
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
    else:
        vocab = Vocabulary.from_params(
                params.pop("vocabulary", {}),
                (instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation)
        )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
Example #18
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.get('trainer').get('cuda_device', -1))

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets, all_datasets_aux, all_datasets_aux2 = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))
    datasets_for_vocab_creation_aux = set(params.pop("auxiliary_datasets_for_vocab_creation", all_datasets_aux))
    datasets_for_vocab_creation_aux2 = set(params.pop("auxiliary_datasets_for_vocab_creation_2", all_datasets_aux2))


    mixing_ratio = params.pop_float("mixing_ratio")
    mixing_ratio2 = params.pop_float("mixing_ratio2")

    cutoff_epoch = params.pop("cutoff_epoch", -1)

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab_instances_aux = [
        instance for key, dataset in all_datasets_aux.items()
        for instance in dataset
        if key in datasets_for_vocab_creation_aux
    ]
    vocab_instances_aux.extend([
        instance for key, dataset in all_datasets_aux2.items()
        for instance in dataset
        if key in datasets_for_vocab_creation_aux2
    ])
    vocab = VocabularyMultitask.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation),
            instances_aux=vocab_instances_aux
    )
    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    iterator_aux = DataIterator.from_params(params.pop("iterator_aux"))
    iterator_aux.index_with(vocab)

    iterator_aux2 = DataIterator.from_params(params.pop("iterator_aux2"))
    iterator_aux2.index_with(vocab)

    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    # TODO: if validation in multi-task need to add validation iterator as above

    train_data = all_datasets.get('train')
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    train_data_aux = all_datasets_aux.get('train_aux')
    validation_data_aux = all_datasets_aux.get('validation_aux')
    test_data_aux = all_datasets_aux.get('test_aux')

    train_data_aux2 = all_datasets_aux2.get('train_aux')
    validation_data_aux2 = all_datasets_aux2.get('validation_aux')
    test_data_aux2 = all_datasets_aux2.get('test_aux')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer = MultiTaskTrainer2.from_params(model=model,
                                            serialization_dir=serialization_dir,
                                            iterator=iterator,
                                            iterator_aux=iterator_aux,
                                            iterator_aux2=iterator_aux2,
                                            train_data=train_data,
                                            train_data_aux=train_data_aux,
                                            train_data_aux2=train_data_aux2,
                                            mixing_ratio=mixing_ratio,
                                            mixing_ratio2=mixing_ratio2,
                                            cutoff_epoch=cutoff_epoch,
                                            validation_data_aux=validation_data_aux,
                                            validation_data_aux2=validation_data_aux2,
                                            validation_data=validation_data,
                                            params=trainer_params,
                                            validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    evaluate_aux_on_test = params.pop_bool("evaluate_aux_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
                best_model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    if test_data_aux and evaluate_aux_on_test:
        # for instance in test_data_aux:
        #     instance.index_fields(vocab)
        # for instance in test_data_aux2:
        #     instance.index_fields(vocab)
        test_metrics_aux = evaluate(best_model, test_data_aux, iterator_aux,
                                    cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        test_metrics_aux2 = evaluate(best_model, test_data_aux2, iterator_aux2,
                                     cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access

        for key, value in test_metrics_aux.items():
            metrics["test_aux_" + key] = value
        for key, value in test_metrics_aux2.items():
            metrics["test_aux2_" + key] = value

    elif test_data_aux:
        logger.info("To evaluate on the auxiliary test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
Example #19
def train_model(params: Params,
                serialization_dir: str,
                results_fn: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Tuple[Model, Dict[str, Any]]:
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None
    held_out_iterator_params = params.pop("held_out_iterator", None)
    if held_out_iterator_params:
        held_out_iterator = DataIterator.from_params(held_out_iterator_params)
        held_out_iterator.index_with(vocab)
    else:
        held_out_iterator = None

    train_data = all_datasets['train']
    held_out_train_data = all_datasets.get('held_out_train')
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        held_out_train_data=held_out_train_data,
        validation_data=validation_data,
        params=trainer_params,
        validation_iterator=validation_iterator,
        held_out_iterator=held_out_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model,
            test_data,
            validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, results_fn), metrics, log=True)

    return best_model, metrics
Example #20
def modified_train_model(serialization_dir,
                         training_config_filename,
                         cuda_device=-1,
                         file_friendly_logging: bool = False) -> Model:
    """
        Function not currently in use. This is from back when I was trying to keep each successive
        addition to the model's training in the same serialization directory.

    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.
    Parameters
    ----------
    serialization_dir : ``str``
        The directory in which to save results and logs.
    training_config_filename : ``str``
        Name of the training configuration file to load alongside the serialized model.
    cuda_device : ``int``, optional (default=-1)
        The CUDA device on which to train, or ``-1`` for CPU.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    model, params, prev_optimizer_params, cur_optimizer_params = \
        load_model_from_serialization_dir(serialization_dir, training_config_filename, cuda_device=cuda_device)
    prepare_environment(params)

    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))

    params.pop('model')

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    list_of_cur_optimizer_param_keys = [
        key for key in cur_optimizer_params.as_flat_dict().keys()
    ]
    list_of_prev_optimizer_param_keys = [
        key for key in prev_optimizer_params.as_flat_dict().keys()
    ]
    optimizer_params_match = True
    for key in list_of_cur_optimizer_param_keys:
        if key not in list_of_prev_optimizer_param_keys:
            optimizer_params_match = False
            break
    for key in list_of_prev_optimizer_param_keys:
        if key not in list_of_cur_optimizer_param_keys:
            optimizer_params_match = False
            break
    if not optimizer_params_match:
        # a list of each p is what will be passed to the optimizer constructor while constructing Trainer--
        # adjust if necessary (i.e., if we changed optimizers)
        model_params = [[n, p] for n, p in model.named_parameters()
                        if p.requires_grad]
        assert "parameter_groups" not in list_of_cur_optimizer_param_keys, \
            "Current way of dealing with optimizer change doesn't take parameter groups into account"
        assert "parameter_groups" not in list_of_prev_optimizer_param_keys, \
            "Current way of dealing with optimizer change doesn't take parameter groups into account"
        for param_tup in model_params:
            # modify the second element of param_tup in-place (it's a dict) to match the keys specified in
            # cur_optimizer_params
            param_dict = param_tup[1]
            keys_to_del = []
            keys_already_in_dict = []
            try:
                for key in param_dict.keys():
                    if not key in list_of_cur_optimizer_param_keys:
                        keys_to_del.append(key)
                    else:
                        keys_already_in_dict.append(key)
                for key in keys_to_del:
                    del param_dict[key]
                for key_to_have in list_of_cur_optimizer_param_keys:
                    if key_to_have != "type" and key_to_have not in keys_already_in_dict:
                        param_dict[key_to_have] = cur_optimizer_params.get(
                            key_to_have)
            except:
                pass

    trainer = Trainer.from_params(model=model,
                                  serialization_dir=serialization_dir,
                                  iterator=iterator,
                                  train_data=train_data,
                                  validation_data=validation_data,
                                  params=trainer_params,
                                  validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model,
            test_data,
            validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics,
                 log=True)

    return best_model
Example #21
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
Example #22
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    extend_vocab: bool = False,
                    file_friendly_logging: bool = False,
                    batch_weight_key: str = "") -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    extend_vocab: ``bool``, optional (default=False)
        If ``True``, we use the new instances to extend your vocabulary.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.
    """
    prepare_environment(params)
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(
            f"Serialization directory ({serialization_dir}) "
            f"already exists and is not empty.")

    os.makedirs(serialization_dir, exist_ok=True)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning(
            "You passed parameters for the model in your configuration file, but we "
            "are ignoring them, using instead the model parameters in the archive."
        )

    vocabulary_params = params.pop('vocabulary', {})
    if vocabulary_params.get('directory_path', None):
        logger.warning(
            "You passed `directory_path` in parameters for the vocabulary in "
            "your configuration file, but it will be ignored. ")

    all_datasets = datasets_from_params(params)
    vocab = model.vocab

    if extend_vocab:
        datasets_for_vocab_creation = set(
            params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(
                    f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info("Extending model vocabulary using %s data.",
                    ", ".join(datasets_for_vocab_creation))
        vocab.extend_from_instances(
            vocabulary_params,
            (instance for key, dataset in all_datasets.items()
             for instance in dataset if key in datasets_for_vocab_creation))

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_type = trainer_params.pop("type", "default")
    if trainer_type == "default":
        trainer = Trainer.from_params(model=model,
                                      serialization_dir=serialization_dir,
                                      iterator=iterator,
                                      train_data=train_data,
                                      validation_data=validation_data,
                                      params=trainer_params,
                                      validation_iterator=validation_iterator)
    else:
        raise ConfigurationError(
            "currently fine-tune only works with the default Trainer")

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    params.assert_empty('base train command')
    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Fine-tuning interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            model,
            test_data,
            validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
            batch_weight_key=batch_weight_key)

        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"),
              "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
Example #23
        #if semi_supervision:
        params['unlabelled_train_data_path'] = os.path.join(
            params['data_dir'], 'shuffle' + str(shuffle_id),
            params['unlabelled_train_data_file'])
        params['unlabelled_dataset_reader']['start_from'] = train_size

        params['dataset_reader']['how_many_sentences'] = train_size
        #params['model']['train_size'] = train_size
        params['serialization_dir'] = os.path.join(
            os.path.dirname(os.path.dirname(params['serialization_dir'])),
            'shuffle' + str(shuffle_id), 'ts' + str(params['train_size']))

        serialization_dir = params['serialization_dir']
        training_util.create_serialization_dir(params, serialization_dir,
                                               args.recover, args.force)
        common_util.prepare_global_logging(serialization_dir, True)
        params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

        for key in [
                'warmup_epochs', 'unlabelled_train_data_file',
                'test_data_file', 'data_dir', 'cuda_device',
                'serialization_dir', 'train_data_file', 'validation_data_file',
                'constraints_wt', 'train_size', 'shuffle_id',
                'semi_supervised', 'which_mixer', 'distributed_lambda_update'
        ]:
            params.pop(key, None)
        pieces = gan_trainer_hm.TrainerPiecesForSemi.from_params(
            params, serialization_dir, args.recover, semi_supervision)  # pylint: disable=no-member

        trainer = Trainer.from_params(
Example #24
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    """
    prepare_environment(params)
    os.makedirs(serialization_dir)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop('vocabulary', {})
    if vocabulary_params.get('directory_path', None):
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored. "
                       "Vocabulary from the saved model will be extended with current data.")

    all_datasets = datasets_from_params(params)

    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = model.vocab
    vocab.extend_from_instances(vocabulary_params,
                                (instance for key, dataset in all_datasets.items()
                                 for instance in dataset
                                 if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Fine-tuning interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
Example #25
folds = [train_dataset[:ltd // 2], train_dataset[ltd // 2:]]

# FIXME: their code trains on each fold before updating either one. Consider doing this also...
for big_iter in range(iterations):
    print(" ---- BIG ITERATION: {} ---- ".format(big_iter))
    for i, fold in enumerate(folds):

        model, optimizer, cuda_device = get_model(pretrained_file,
                                                  WORD_EMB_DIM, vocab,
                                                  len(reader.alltags))

        iterator = BasicIterator(batch_size=batch_size)
        iterator.index_with(vocab)

        ser_dir_iter = serialization_dir + "/iter-{}-{}".format(big_iter, i)
        prepare_global_logging(ser_dir_iter, False)

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=fold,
                          validation_dataset=validation_dataset,
                          patience=10,
                          num_epochs=45,
                          validation_metric="+f1-measure-overall",
                          cuda_device=cuda_device,
                          num_serialized_models_to_keep=3,
                          serialization_dir=ser_dir_iter)

        metrics = trainer.train()
Example #26
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
                best_model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
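
A minimal usage sketch for the function above. The config path and serialization directory are placeholders, not part of the original snippet; it assumes a standard AllenNLP experiment config.

# Hypothetical invocation of train_model; paths are placeholders.
from allennlp.common.params import Params

params = Params.from_file("experiments/my_experiment.jsonnet")
best_model = train_model(params, serialization_dir="runs/my_experiment")
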
Example #27
0
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
    include_package: List[str] = None,
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the ``Model`` object and in distributed
    training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index that is initialized using the GPU device id.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    node_rank : ``int``, optional
        Rank of the node.
    master_addr : ``str``, optional (default="127.0.0.1")
        Address of the master node for distributed training.
    master_port : ``int``, optional (default=29500)
        Port of the master node for distributed training.
    world_size : ``int``, optional
        The number of processes involved in distributed training.
    distributed_device_ids : ``List[str]``, optional
        IDs of the devices involved in distributed training.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    prepare_global_logging(serialization_dir,
                           file_friendly_logging,
                           rank=process_rank,
                           world_size=world_size)
    prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    if distributed:
        # Since the worker is spawned and not forked, the extra imports
        # need to be done again.
        if include_package is not None:
            for package_name in include_package:
                import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        torch.cuda.set_device(gpu_id)
        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=global_rank,
        )
        logging.info(f"Process group of world size {world_size} initialized "
                     f"for distributed training in worker {global_rank}")

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover,
                                           cache_directory, cache_prefix)
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
        )

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    else:
        if evaluate_on_test:
            raise ValueError(
                "--evaluate-on-test only works with the default Trainer. "
                "If you're using the CallbackTrainer you can use a callback "
                "to evaluate at Events.TRAINING_END; otherwise you'll have "
                "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover,
                                          cache_directory, cache_prefix)
        evaluation_dataset = None

    params.assert_empty("base train command")

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(
                os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    if master:
        if evaluation_dataset and evaluate_on_test:
            logger.info(
                "The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                trainer.model,
                evaluation_dataset,
                evaluation_iterator,
                cuda_device=trainer.cuda_device,
                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                batch_weight_key="",
            )

            for key, value in test_metrics.items():
                metrics["test_" + key] = value
        elif evaluation_dataset:
            logger.info(
                "To evaluate on the test set after training, pass the "
                "'evaluate_on_test' flag, or use the 'allennlp evaluate' command."
            )
        dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                     metrics,
                     log=True)

    if not distributed:
        return trainer.model

    return None  # to make mypy happy
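
A hedged sketch of how a worker like this might be spawned across local GPUs with ``torch.multiprocessing.spawn``. The device ids, config path, and serialization directory are assumptions; AllenNLP's own train command normally does this wiring.

# Illustrative only: launch one _train_worker per local GPU.
import torch.multiprocessing as mp
from allennlp.common.params import Params

device_ids = [0, 1]  # hypothetical local GPU ids
params = Params.from_file("experiments/my_experiment.jsonnet")
mp.spawn(
    _train_worker,
    args=(params, "runs/distributed_run",  # serialization_dir
          False, False,                    # file_friendly_logging, recover
          None, None,                      # cache_directory, cache_prefix
          [],                              # include_package
          0, "127.0.0.1", 29500,           # node_rank, master_addr, master_port
          len(device_ids), device_ids),    # world_size, distributed_device_ids
    nprocs=len(device_ids),
)
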
Example #28
0
    print("serialization directory exists... ")
    r = input(
        "Serialization dir {} exists. Remove? y/n  ".format(serialization_dir))
    if r == "y":
        shutil.rmtree(serialization_dir)
    else:
        print("Not removing directory")
        sys.exit()

iterator = BasicIterator(batch_size=batch_size)
iterator.index_with(vocab)

model, optimizer, cuda_device = get_model(pretrained_file, WORD_EMB_DIM, vocab,
                                          len(reader.alltags))

prepare_global_logging(serialization_dir, False)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=75,
                  validation_metric="+f1-measure-overall",
                  cuda_device=cuda_device,
                  num_serialized_models_to_keep=3,
                  serialization_dir=serialization_dir)

metrics = trainer.train()
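
A possible follow-up for a script like this, not part of the original snippet: persist the vocabulary and the final metrics next to the checkpoints so the run can be reloaded and inspected later.

# Assumes the vocab, metrics, and serialization_dir from the script above.
import json
import os

vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
    json.dump(metrics, metrics_file, indent=2)
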
Example #29
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    stdout_handler = prepare_global_logging(serialization_dir,
                                            file_friendly_logging)
    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(
            params,  # pylint: disable=no-member
            serialization_dir,
            recover,
            cache_directory,
            cache_prefix)
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator)

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    else:
        if evaluate_on_test:
            raise ValueError(
                "--evaluate-on-test only works with the default Trainer. "
                "If you're using the CallbackTrainer you can use a callback "
                "to evaluate at Events.TRAINING_END; otherwise you'll have "
                "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover,
                                          cache_directory, cache_prefix)
        evaluation_dataset = None

    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
            # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
            batch_weight_key="")

        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    cleanup_global_logging(stdout_handler)

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics,
                 log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
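
A hedged usage sketch for this variant, resuming an interrupted run and reusing cached pre-processing. The paths and cache names are placeholders.

# Hypothetical call resuming a crashed run with data caching enabled.
from allennlp.common.params import Params

params = Params.from_file("experiments/my_experiment.jsonnet")
model = train_model(
    params,
    serialization_dir="runs/my_experiment",
    recover=True,
    cache_directory="caches/my_dataset",
    cache_prefix="v1",
)
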
Example #30
0
    default=False,
    help='outputs tqdm status on separate lines and slows tqdm refresh rate')

args = parser.parse_args()
params = Params.from_file(args.param_path, args.overrides)
random_seed, numpy_seed, pytorch_seed = random.randint(
    0, 999999999), random.randint(0, 999999999), random.randint(0, 999999999)
params["random_seed"] = random_seed
params["numpy_seed"] = numpy_seed
params["pytorch_seed"] = pytorch_seed
prepare_environment(params)
from graph_dependency_parser.components.evaluation.predictors import Evaluator, EmptyMRPEvaluator
from graph_dependency_parser.train.amtrainer import AMTrainer, TrainerPieces
serialization_dir = args.serialization_dir
create_serialization_dir(params, serialization_dir, args.recover, args.force)
stdout_handler = prepare_global_logging(serialization_dir,
                                        args.file_friendly_logging)

cuda_device = params.params.get('trainer').get('cuda_device', -1)
check_for_gpu(cuda_device)

params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

evaluate_on_test = params.pop_bool("evaluate_on_test", False)

test_evaluators = params.pop("test_evaluators", [])
test_evaluators: List[Tuple[str, Evaluator]] = [
    (name, Evaluator.from_params(evaluator)) for formalism in test_evaluators
    for name, evaluator in formalism
]
if len(test_evaluators) == 0:
    logger.warning(
Example #31
0
File: train.py Project: wyxingyuX/allennlp
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.params.get('trainer').get('cuda_device', -1))

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params,
                                  validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model,
            test_data,
            validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"),
              "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return best_model
Example #32
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                debate_mode: List[str] = ('f',),
                judge_filename: str = None,
                update_judge: bool = False,
                eval_mode: bool = False,
                reward_method: str = None,
                detach_value_head: bool = False,
                breakpoint_level: int = 0,
                search_outputs_path: str = None,
                accumulation_steps: int = 1,
                multi_gpu: bool = False,
                choice_mode: str = None,
                qa_loss_weight: float = 0.,
                influence_reward: bool = False,
                theory_of_mind: bool = False,
                num_pred_rounds: int = -1,
                x_order_prob: float = 0.,
                require_action: bool = False,
                single_shot: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    debate_mode : ``List[str]``
        List of debate turns (e.g. aa, ar, rr, Ar); capitalization implies a search agent.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    judge_filename : ``str``, optional (default=None)
        Path to a judge config or a pre-trained judge model. If a config is given, the judge is trained
        during the debate. Required when running in debate mode.
    update_judge : ``bool``, optional (default=False)
        Whether to update the judge model during debate training.
    eval_mode : ``bool``, optional (default=False)
        Whether to run in eval-only mode on test data. Does not update/train any of the models.
    reward_method : ``str``, optional (default=None)
        Choice of reward function (RL) or loss function (supervised learning) for training debate agents.
    detach_value_head : ``bool``, optional (default=False)
        Whether to detach value-head gradient updates from the policy network. This prevents
        value function gradients from affecting policy network parameters.
    breakpoint_level : ``int``, optional (default=0)
        Debugging option to set breakpoint sensitivity (0 - no breakpoints).
    search_outputs_path : ``str``, optional (default=None)
        Path to a file with search predictions for each agent - necessary for supervised training.
    accumulation_steps : ``int``, optional (default=1)
        Number of gradient steps to accumulate before performing an update. A poor man's batching for
        cases where the number of examples per batch is small (limited GPU memory).
    multi_gpu : ``bool``, optional (default=False)
        Whether to run models/training in model-parallel mode. Requires specifying GPU allocations for
        the trainer, judge, and debaters in the training config file (see
        training_config/bidaf.race.size=0.5.gpu=2.jsonnet for example usage).

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    assert (not single_shot) or eval_mode, \
        'Using single shot prediction outside eval_mode not yet supported.'
    assert (not single_shot) or (num_pred_rounds == -1), \
        'Using single shot prediction for a specific number of rounds is not yet supported.'
    # Get number of debate turns, and assert that not performing judge-only training
    num_no_qa_turns = sum([(('l' in debate_turn) or ('w' in debate_turn))
                           for debate_turn in debate_mode])
    if (qa_loss_weight > 0) and (num_no_qa_turns == 0):
        warnings.warn(
            'Unused argument qa_loss_weight in debate mode ' +
            str(debate_mode) +
            '. If this was unintentional, please remove the -q flag.',
            UserWarning)
    not_using_trained_debater = len(
        set('ablwⅰⅱⅲⅳ').intersection(''.join(debate_mode))) == 0
    if (judge_filename is not None) and not_using_trained_debater:
        warnings.warn(
            'Unnecessary to have debaters in debate mode ' + str(debate_mode) +
            '. If this was unintentional, please remove the -j flag.',
            UserWarning)

    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Check that all Desired CUDA Devices exist => trainer => cuda_devices should contain list of required devices
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    # Build Allocation Dictionary (to be passed to all future functions)
    if multi_gpu:
        gpu_allocations = params.params.pop('gpu_allocations', {})
        allocation_dict = {}
        assert len(gpu_allocations) == 3, 'Must set gpu_allocations in config if multi-gpu'
        for k in ['debate', 'judge', 'trainer']:
            assert gpu_allocations[k] in cuda_device, \
                "Desired GPU not available... current: %s" % str(cuda_device)
            allocation_dict[k] = gpu_allocations[k]
    else:
        allocation_dict = {}

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        params['dataset_reader']['debate_mode'] = debate_mode  # If debate_mode requires sample duplicates
        pieces = TrainerPieces.from_params(params,
                                           serialization_dir,
                                           cuda_device,
                                           recover,
                                           judge_filename=judge_filename,
                                           update_judge=update_judge,
                                           eval_mode=eval_mode,
                                           reward_method=reward_method,
                                           detach_value_head=detach_value_head,
                                           allocation_dict=allocation_dict,
                                           qa_loss_weight=qa_loss_weight,
                                           influence_reward=influence_reward,
                                           theory_of_mind=theory_of_mind)  # pylint: disable=no-member
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            debate_mode=debate_mode,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
            eval_mode=eval_mode,
            breakpoint_level=breakpoint_level,
            search_outputs_path=search_outputs_path,
            accumulation_steps=accumulation_steps,
            allocation_dict=allocation_dict,
            choice_mode=choice_mode,
            num_pred_rounds=num_pred_rounds,
            x_order_prob=x_order_prob,
            require_action=require_action,
            single_shot=single_shot)
        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset
    else:
        assert (len(debate_mode) == 1) and (debate_mode[0] == 'f'), \
            'TrainerBase untested for debate training.'
        trainer = TrainerBase.from_params(params, serialization_dir, recover)
        evaluation_iterator = evaluation_dataset = None

    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir,
                                       _DEFAULT_WEIGHTS)) and not eval_mode:
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
            batch_weight_key="")

        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    # Now tar up results
    if not eval_mode:
        archive_model(serialization_dir,
                      files_to_archive=params.files_to_archive)
        dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                     metrics,
                     log=True)
    else:
        dump_metrics(os.path.join(
            serialization_dir,
            "metrics.eval.d=" + '-'.join(debate_mode) + ".json"),
                     metrics,
                     log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
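
A purely illustrative call for this debate-mode variant. The debate turns follow the docstring's examples, and the config and judge paths are placeholders.

# Hypothetical invocation; all values are illustrative only.
from allennlp.common.params import Params

params = Params.from_file("training_config/bidaf.race.size=0.5.gpu=2.jsonnet")
model = train_model(
    params,
    serialization_dir="runs/debate_ar",
    debate_mode=['ar'],
    judge_filename="runs/judge/model.tar.gz",
    eval_mode=False,
)
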
Example #33
0
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    include_package: List[str] = None,
    batch_weight_key: str = "",
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the ``Model`` object and in distributed
    training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index that is initialized using the GPU device id.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.
    node_rank : ``int``, optional
        Rank of the node.
    master_addr : ``str``, optional (default="127.0.0.1")
        Address of the master node for distributed training.
    master_port : ``int``, optional (default=29500)
        Port of the master node for distributed training.
    world_size : ``int``, optional
        The number of processes involved in distributed training.
    distributed_device_ids : ``List[str]``, optional
        IDs of the devices involved in distributed training.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    common_util.prepare_global_logging(serialization_dir,
                                       file_friendly_logging,
                                       rank=process_rank,
                                       world_size=world_size)
    common_util.prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    include_package = include_package or []

    if distributed:
        # Since the worker is spawned and not forked, the extra imports need to be done again.
        import_plugins()
        for package_name in include_package:
            common_util.import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # The number of processes per node is useful to know whether a process
        # is the master in its local node (the node in which it is running).
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        torch.cuda.set_device(int(gpu_id))
        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=global_rank,
        )
        logging.info(f"Process group of world size {world_size} initialized "
                     f"for distributed training in worker {global_rank}")

    train_loop = TrainModel.from_params(
        params=params,
        serialization_dir=serialization_dir,
        local_rank=process_rank,
        batch_weight_key=batch_weight_key,
    )

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = train_loop.run()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(
                os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir)
        raise

    if master:
        train_loop.finish(metrics)

    if not distributed:
        return train_loop.model

    return None  # to make mypy happy
def main(serialization_dir, evaluation_data_file, split, cuda_device, weights_file, overrides):

    archive_file = os.path.join(serialization_dir, "model.tar.gz")

    logging_dir = os.path.join(serialization_dir, "logging")

    if os.path.isfile(archive_file):
        weights_file = None
        archive = load_archive(archive_file, cuda_device, overrides, weights_file)
        config = archive.config
        prepare_environment(config)
        prepare_global_logging(logging_dir, file_friendly_logging=False, file_name=split)
        model = archive.model
    else:
        # Load config
        config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
        prepare_environment(config)
        prepare_global_logging(logging_dir, file_friendly_logging=False, file_name=split)

        if weights_file:
            weights_path = os.path.join(serialization_dir, weights_file)
        else:
            weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

        logger.info("Using weights_file located at : %s", weights_path)
        # Instantiate model. Use a duplicate of the config, as it will get consumed.
        model = Model.load(config.duplicate(),
                           weights_file=weights_path,
                           serialization_dir=serialization_dir,
                           cuda_device=cuda_device)

    # Eval mode ON
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))

    if evaluation_data_file is None:
        logger.info("--evaluation_data_file not provided. So using --split=%s to read data", split)
        data_path_key = split + '_data_path'
        evaluation_data_path = config.pop(data_path_key)
    else:
        evaluation_data_path = evaluation_data_file

    logger.info("Reading evaluation data from %s", evaluation_data_path)

    instances = dataset_reader.read(evaluation_data_path)
    logger.info("No. of instances = %d", len(instances))

    iterator = BasicIterator(batch_size=128)
    iterator.index_with(model.vocab)

    metrics, model_predictions = get_model_predictions(model, instances, iterator, cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    write_predictions(serialization_dir=serialization_dir, instances=instances,
                      model_predictions=model_predictions, split=split)

    analyze_gold_data(serialization_dir=serialization_dir, instances=instances, split=split)

    analyze_model_predictions(serialization_dir=serialization_dir, instances=instances,
                              model_predictions=model_predictions, split=split)

    analyze_bio_violations(instances=instances, model_predictions=model_predictions)
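
A hedged sketch of the argparse wiring a script like this might use to call ``main``. The flag names and defaults are assumptions; the original file's parser is not shown.

# Hypothetical command-line entry point; flag names are assumptions.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate a trained model on a data split.")
    parser.add_argument("--serialization-dir", required=True)
    parser.add_argument("--evaluation-data-file", default=None)
    parser.add_argument("--split", default="validation")
    parser.add_argument("--cuda-device", type=int, default=-1)
    parser.add_argument("--weights-file", default=None)
    parser.add_argument("--overrides", default="")
    args = parser.parse_args()

    main(args.serialization_dir, args.evaluation_data_file, args.split,
         args.cuda_device, args.weights_file, args.overrides)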