Example no. 1
0
def make_vocab_from_params(params: Params, serialization_dir: str):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
Example no. 2
0
def make_vocab_from_params(params: Params):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    vocab_dir = vocab_params.get('directory_path')
    if vocab_dir is None:
        raise ConfigurationError("To use `make-vocab` your configuration must contain a value "
                                 "at vocabulary.directory_path")

    os.makedirs(vocab_dir, exist_ok=True)

    all_datasets = datasets_from_params(params)

    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(Params({}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))

    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
Example no. 3
0
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
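The no_grad freezing step above is just a regex match over parameter names; a standalone sketch on a toy torch module (not the AllenNLP Model class, with an illustrative regex only):

import re
import torch

toy_model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.Linear(8, 2))
no_grad_regexes = [r"^0\."]   # freeze everything in the first layer

for name, parameter in toy_model.named_parameters():
    if any(re.search(regex, name) for regex in no_grad_regexes):
        parameter.requires_grad_(False)

print([name for name, p in toy_model.named_parameters() if not p.requires_grad])
# ['0.weight', '0.bias']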
Example no. 4
0
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device, args.batch_weight_key)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    output_file = args.output_file
    if output_file:
        with open(output_file, "w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
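A hypothetical way to call evaluate_from_args without the command line; the attribute names mirror exactly what the function reads, and the file paths are placeholders:

import argparse

args = argparse.Namespace(
    archive_file="runs/my_experiment/model.tar.gz",
    cuda_device=-1,
    overrides="",
    weights_file=None,
    input_file="data/test.jsonl",
    batch_weight_key="",
    output_file="runs/my_experiment/test_metrics.json",
)
metrics = evaluate_from_args(args)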
Example no. 5
0
def main(args: argparse.Namespace):

    for package_name in args.include_package:
        import_module_and_submodules(package_name)

    params = Params.from_file(args.param_path, args.overrides)

    random_seed, numpy_seed, pytorch_seed = 41, 11, 302
    if not args.fix:
        random_seed, numpy_seed, pytorch_seed = random.randint(
            0, 999999999), random.randint(0, 999999999), random.randint(
                0, 999999999)

    params["random_seed"] = random_seed
    params["numpy_seed"] = numpy_seed
    params["pytorch_seed"] = pytorch_seed
    prepare_environment(params)
    serialization_dir = args.serialization_dir
    create_serialization_dir(params, serialization_dir, args.recover,
                             args.force)
    prepare_global_logging(serialization_dir, args.file_friendly_logging)

    hyperparams = list(
        get_hyperparams(params.as_dict(infer_type_and_cast=True)))

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    test_file = params.params.get("test_data_path", None)
    validation_data_path = params.get("validation_data_path", None)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    test_command = None
    if evaluate_on_test:
        test_command = BaseEvaluationCommand.from_params(
            params.pop("test_command"))

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    train_model = TrainPipelineModel.from_params(
        params=params, serialization_dir=serialization_dir, local_rank=0)

    trainer = train_model.trainer

    if trainer.validation_command is not None:
        trainer.validation_command.maybe_set_gold_file(validation_data_path)

    params.assert_empty('base train command')

    if args.comet is not None:
        experiment = Experiment(api_key=args.comet,
                                workspace=args.workspace,
                                project_name=args.project,
                                parse_args=False,
                                auto_output_logging=None)
        if args.tags:
            experiment.add_tags(args.tags)
        with open(args.param_path) as fil:
            code = "".join(fil.readlines())
        code += "\n\n#=============Full details=============\n\n"
        full_details = _jsonnet.evaluate_file(args.param_path)
        code += full_details
        code += "\n\n#=============IMPORTANT: overwritten options============\n\n"
        code += args.overrides
        experiment.set_code(code, overwrite=True)

        for key, val in hyperparams:
            experiment.log_parameter(key, val)

        experiment.log_parameter("model_directory", serialization_dir)
        experiment.log_parameter("cuda_device", cuda_device)
        experiment.log_parameter("hostname", socket.gethostname())
        experiment.log_parameter("random_seed", random_seed)
        experiment.log_parameter("numpy_seed", numpy_seed)
        experiment.log_parameter("pytorch_seed", pytorch_seed)
    else:
        experiment = None

    try:
        metrics = trainer.train(experiment)
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir)
        raise

    # Evaluate
    if test_file and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights (see pred_test.txt)."
        )
        trainer.annotator.annotate_file(
            trainer.model, test_file,
            os.path.join(serialization_dir, "pred_test.txt"))

        if test_command:
            logger.info("Comparing against gold standard.")
            test_command.maybe_set_gold_file(test_file)
            test_metrics = test_command.evaluate(
                os.path.join(serialization_dir, "pred_test.txt"))
            if experiment:
                with experiment.test():
                    experiment.log_metrics({
                        k: v
                        for k, v in test_metrics.items() if np.isscalar(v)
                    })
            metrics = merge_dicts(metrics, "test", test_metrics)

    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics,
                 log=True)

    if not args.no_archive:
        # Now tar up results
        archive_model(serialization_dir)
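A standalone sketch of the seed-selection logic used above: fixed seeds when --fix is passed, fresh random seeds otherwise.

import random

def pick_seeds(fix: bool):
    if fix:
        return 41, 11, 302
    return tuple(random.randint(0, 999999999) for _ in range(3))

print(pick_seeds(fix=True))   # (41, 11, 302)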
Example no. 6
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
                best_model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
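A toy illustration of the best-weights reload at the end of training, using plain torch and a hypothetical path instead of the serialization directory:

import torch

toy_model = torch.nn.Linear(3, 1)
torch.save(toy_model.state_dict(), "/tmp/best.th")   # what the trainer writes as best.th
best_state = torch.load("/tmp/best.th")
toy_model.load_state_dict(best_state)                # same mechanism as above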
Example no. 7
0
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running any Pytorch or
    numpy code that affects the reproducibility of your experiment before you
    import and use this function: these libraries rely on random seeds, which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr)  # type: ignore
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets: Dict[str, Dataset] = {"train": train_data}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets["validation"] = validation_data
    else:
        validation_data = None

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        all_datasets["test"] = test_data
    else:
        test_data = None

    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   Dataset([instance for key, dataset in all_datasets.items()
                                            for instance in dataset.instances
                                            if key in datasets_for_vocab_creation]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_data.index_instances(vocab)
        evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device)  # pylint: disable=protected-access

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    return model
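The TeeLogger lines above duplicate stdout/stderr into log files; a minimal plain-Python analogue of that idea (not the actual AllenNLP class):

import sys

class Tee:
    def __init__(self, path, stream):
        self._file = open(path, "a")
        self._stream = stream

    def write(self, text):
        self._file.write(text)
        self._stream.write(text)

    def flush(self):
        self._file.flush()
        self._stream.flush()

sys.stdout = Tee("/tmp/stdout.log", sys.stdout)
print("this line also lands in /tmp/stdout.log")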
Example no. 8
0
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    include_package: List[str] = None,
    batch_weight_key: str = "",
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
    # For fine-tuning:
    model: Model = None,
    extend_vocab: bool = False,
    embedding_sources_mapping: Dict[str, str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the ``Model`` object and in distributed
    training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index that is initialized using the GPU device id.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    node_rank : ``int``, optional
        Rank of the node
    world_size : ``int``, optional
        The number of processes involved in distributed training.

    # Returns

    best_model : ``Optional[Model]``
        The model with the best epoch weights, or ``None`` if the run is distributed.
    """
    common_util.prepare_global_logging(
        serialization_dir, file_friendly_logging, rank=process_rank, world_size=world_size
    )
    common_util.prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    if distributed:
        # Since the worker is spawned and not forked, the extra imports
        # need to be done again.
        if include_package is not None:
            for package_name in include_package:
                common_util.import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # Number of processes per node is useful to know if a process
        # is a master in the local node(node in which it is running)
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        torch.cuda.set_device(int(gpu_id))
        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=global_rank,
        )
        logging.info(
            f"Process group of world size {world_size} initialized "
            f"for distributed training in worker {global_rank}"
        )

    train_loop = TrainModel.from_params(
        params=params,
        serialization_dir=serialization_dir,
        local_rank=process_rank,
        batch_weight_key=batch_weight_key,
    )

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = train_loop.run()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights."
            )
            archive_model(serialization_dir)
        raise

    if master:
        train_loop.finish(metrics)

    if not distributed:
        return train_loop.model

    return None  # to make mypy happy
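A worked example of the global-rank arithmetic used above: with 4 processes per node, local process 2 on node 1 gets global rank 6.

node_rank, num_procs_per_node, process_rank = 1, 4, 2
global_rank = node_rank * num_procs_per_node + process_rank
print(global_rank)   # 6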
Example no. 9
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data
    dataset_reader_config = config.pop('dataset_reader')

    if "evaluator_type" in config:
        eval_type = config.get("evaluator_type")
    else:
        dataset_reader_type = dataset_reader_config.get("type")
        eval_type = dataset_reader_type
    dataset_reader = DatasetReader.from_params(dataset_reader_config)

    evaluation_data_paths_list = []
    evaluation_data_short_names = []
    output_files_list = args.output_file.split(";")

    if args.evaluation_data_file:
        evaluation_data_paths_list.append(args.evaluation_data_file)
        evaluation_data_short_names.append("input")
    else:
        if "validation_data_path" in config:
            evaluation_data_paths_list.append(config["validation_data_path"])
            evaluation_data_short_names.append("dev")

        if "test_data_path" in config:
            evaluation_data_paths_list.append(config["test_data_path"])
            evaluation_data_short_names.append("test")

    metrics_out = {}

    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(model.vocab)

    for i in range(len(evaluation_data_paths_list)):
        evaluation_data_path = evaluation_data_paths_list[i]
        evaluation_data_short_name = evaluation_data_path if len(evaluation_data_short_names) - 1 < i \
                                                          else evaluation_data_short_names[i]

        if len(output_files_list) == len(evaluation_data_paths_list):
            out_file = output_files_list[i]
        else:
            out_file = "{0}_{1}.txt".format(output_files_list[0],
                                            evaluation_data_short_name)

        logger.info("Reading evaluation data from %s", evaluation_data_path)
        dataset = dataset_reader.read(evaluation_data_path)

        metrics = evaluate(model, dataset, iterator, out_file, eval_type)
        if out_file is not None:
            logging.info("Predictions exported to {0}".format(out_file))

        logger.info("Finished evaluating.")
        logger.info("Metrics:")
        for key, metric in metrics.items():
            logger.info("%s: %s", key, metric)

        if len(evaluation_data_paths_list) == 1:
            metrics_out = metrics
        else:
            metrics_out[evaluation_data_short_name] = metrics

    return metrics_out
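A standalone sketch of the per-dataset output naming used above: when the number of output files does not match the number of evaluation sets, names are derived from the first entry plus the dataset short name.

output_files_list = ["results/metrics"]
evaluation_data_short_names = ["dev", "test"]

out_files = ["{0}_{1}.txt".format(output_files_list[0], name)
             for name in evaluation_data_short_names]
print(out_files)   # ['results/metrics_dev.txt', 'results/metrics_test.txt']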
Example no. 10
def main(serialization_dir, evaluation_data_file, split, cuda_device, weights_file, overrides):

    archive_file = os.path.join(serialization_dir, "model.tar.gz")

    logging_dir = os.path.join(serialization_dir, "logging")

    if os.path.isfile(archive_file):
        weights_file = None
        archive = load_archive(archive_file, cuda_device, overrides, weights_file)
        config = archive.config
        prepare_environment(config)
        prepare_global_logging(logging_dir, file_friendly_logging=False, file_name=split)
        model = archive.model
    else:
        # Load config
        config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
        prepare_environment(config)
        prepare_global_logging(logging_dir, file_friendly_logging=False, file_name=split)

        if weights_file:
            weights_path = os.path.join(serialization_dir, weights_file)
        else:
            weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

        logger.info("Using weights_file located at : %s", weights_path)
        # Instantiate model. Use a duplicate of the config, as it will get consumed.
        model = Model.load(config.duplicate(),
                           weights_file=weights_path,
                           serialization_dir=serialization_dir,
                           cuda_device=cuda_device)

    # Eval mode ON
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))

    if evaluation_data_file is None:
        logger.info("--evaluation_data_file not provided. So using --split=%s to read data", split)
        data_path_key = split + '_data_path'
        evaluation_data_path = config.pop(data_path_key)
    else:
        evaluation_data_path = evaluation_data_file

    logger.info("Reading evaluation data from %s", evaluation_data_path)

    instances = dataset_reader.read(evaluation_data_path)
    logger.info("No. of instances = %d", len(instances))

    iterator = BasicIterator(batch_size=128)
    iterator.index_with(model.vocab)

    metrics, model_predictions = get_model_predictions(model, instances, iterator, cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    write_predictions(serialization_dir=serialization_dir, instances=instances,
                      model_predictions=model_predictions, split=split)

    analyze_gold_data(serialization_dir=serialization_dir, instances=instances, split=split)

    analyze_model_predictions(serialization_dir=serialization_dir, instances=instances,
                              model_predictions=model_predictions, split=split)

    analyze_bio_violations(instances=instances, model_predictions=model_predictions)
Example no. 11
0
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    include_package: List[str] = None,
    dry_run: bool = False,
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[int] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the `Model` object and in distributed
    training, nothing is returned.

    # Parameters

    process_rank : `int`
        The process index that is initialized using the GPU device id.
    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results and logs.
    include_package : `List[str]`, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other training
        information.
    node_rank : `int`, optional
        Rank of the node.
    master_addr : `str`, optional (default=`"127.0.0.1"`)
        Address of the master node for distributed training.
    master_port : `int`, optional (default=`29500`)
        Port of the master node for distributed training.
    world_size : `int`, optional
        The number of processes involved in distributed training.
    distributed_device_ids : `List[int]`, optional
        IDs of the devices involved in distributed training.

    # Returns

    best_model : `Optional[Model]`
        The model with the best epoch weights or `None` if in distributed training or in dry run.
    """
    prepare_global_logging(
        serialization_dir, rank=process_rank, world_size=world_size,
    )
    common_util.prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    include_package = include_package or []

    if distributed:
        # Since the worker is spawned and not forked, the extra imports need to be done again.
        # Both the ones from the plugins and the ones from `include_package`.
        import_plugins()
        for package_name in include_package:
            common_util.import_module_and_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # Number of processes per node is useful to know if a process
        # is a master in the local node(node in which it is running)
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        if gpu_id >= 0:
            torch.cuda.set_device(int(gpu_id))
            dist.init_process_group(
                backend="nccl",
                init_method=f"tcp://{master_addr}:{master_port}",
                world_size=world_size,
                rank=global_rank,
            )
        else:
            dist.init_process_group(
                backend="gloo",
                init_method=f"tcp://{master_addr}:{master_port}",
                world_size=world_size,
                rank=global_rank,
            )
        logging.info(
            f"Process group of world size {world_size} initialized "
            f"for distributed training in worker {global_rank}"
        )

    train_loop = TrainModel.from_params(
        params=params, serialization_dir=serialization_dir, local_rank=process_rank,
    )

    if dry_run:
        return None

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = train_loop.run()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights."
            )
            archive_model(serialization_dir)
        raise

    if master:
        train_loop.finish(metrics)

    if not distributed:
        return train_loop.model

    return None
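A sketch of the backend choice above: NCCL requires a CUDA device, so CPU-only workers (gpu_id < 0) fall back to gloo.

def pick_backend(gpu_id: int) -> str:
    return "nccl" if gpu_id >= 0 else "gloo"

print(pick_backend(0), pick_backend(-1))   # nccl gloo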
Example no. 12
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.params.get('trainer').get('cuda_device', -1))

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = RegistrableVocabulary.from_params(
        params.pop("vocabulary", {"type": "vocabulary"}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    nograd_parameter_names = []
    grad_parameter_names = []
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)
            nograd_parameter_names.append(name)
        else:
            grad_parameter_names.append(name)

    logger.info("Following parameters are Frozen  (without gradient):")
    for name in nograd_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in grad_parameter_names:
        logger.info(name)

    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(best_model,
                                test_data,
                                iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"),
              "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return best_model
Example no. 13
0
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    include_package: List[str] = None,
    batch_weight_key: str = "",
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
    # For fine-tuning:
    model: Model = None,
    extend_vocab: bool = False,
    embedding_sources_mapping: Dict[str, str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the ``Model`` object and in distributed
    training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index that is initialized using the GPU device id.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    node_rank : ``int``, optional
        Rank of the node
    world_size : ``int``, optional
        The number of processes involved in distributed training.

    # Returns

    best_model : ``Optional[Model]``
        The model with the best epoch weights, or ``None`` if the run is distributed.
    """
    prepare_global_logging(serialization_dir,
                           file_friendly_logging,
                           rank=process_rank,
                           world_size=world_size)
    prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    if distributed:
        # Since the worker is spawned and not forked, the extra imports
        # need to be done again.
        if include_package is not None:
            for package_name in include_package:
                import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # Number of processes per node is useful to know if a process
        # is a master in the local node(node in which it is running)
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        torch.cuda.set_device(int(gpu_id))
        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=global_rank,
        )
        logging.info(f"Process group of world size {world_size} initialized "
                     f"for distributed training in worker {global_rank}")

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(
            params=params,
            serialization_dir=serialization_dir,
            recover=recover,
            model=model,
            embedding_sources_mapping=embedding_sources_mapping,
            extend_vocab=extend_vocab,
        )
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
            local_rank=process_rank,
        )

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    else:
        if evaluate_on_test:
            raise ValueError(
                "--evaluate-on-test only works with the default Trainer. "
                "If you're using the CallbackTrainer you can use a callback "
                "to evaluate at Events.TRAINING_END; otherwise you'll have "
                "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover)
        evaluation_dataset = None
        evaluation_iterator = None

    params.assert_empty("base train command")

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(
                os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    if master:
        if evaluation_dataset and evaluate_on_test:
            logger.info(
                "The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                trainer.model,
                evaluation_dataset,
                evaluation_iterator,
                cuda_device=trainer.cuda_device,
                batch_weight_key=batch_weight_key,
            )

            for key, value in test_metrics.items():
                metrics["test_" + key] = value
        elif evaluation_dataset:
            logger.info(
                "To evaluate on the test set after training, pass the "
                "'evaluate_on_test' flag, or use the 'allennlp evaluate' command."
            )
        dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                     metrics,
                     log=True)

    if not distributed:
        return trainer.model

    return None  # to make mypy happy
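A toy version of the test-metric merging done by the master worker above: test-set numbers are folded into the training metrics under a "test_" prefix before metrics.json is written.

metrics = {"best_validation_accuracy": 0.91}
test_metrics = {"accuracy": 0.89}
for key, value in test_metrics.items():
    metrics["test_" + key] = value
print(metrics)   # {'best_validation_accuracy': 0.91, 'test_accuracy': 0.89}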
Example no. 14
0
    help='a JSON structure used to override the experiment configuration')

parser.add_argument(
    '--file-friendly-logging',
    action='store_true',
    default=False,
    help='outputs tqdm status on separate lines and slows tqdm refresh rate')

args = parser.parse_args()
params = Params.from_file(args.param_path, args.overrides)
random_seed, numpy_seed, pytorch_seed = random.randint(
    0, 999999999), random.randint(0, 999999999), random.randint(0, 999999999)
params["random_seed"] = random_seed
params["numpy_seed"] = numpy_seed
params["pytorch_seed"] = pytorch_seed
prepare_environment(params)
from graph_dependency_parser.components.evaluation.predictors import Evaluator, EmptyMRPEvaluator
from graph_dependency_parser.train.amtrainer import AMTrainer, TrainerPieces
serialization_dir = args.serialization_dir
create_serialization_dir(params, serialization_dir, args.recover, args.force)
stdout_handler = prepare_global_logging(serialization_dir,
                                        args.file_friendly_logging)

cuda_device = params.params.get('trainer').get('cuda_device', -1)
check_for_gpu(cuda_device)

params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

evaluate_on_test = params.pop_bool("evaluate_on_test", False)

test_evaluators = params.pop("test_evaluators", [])
Example no. 15
0
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    logging.info("Parameters:")
    for arg in vars(args):
        logging.info("{0}: {1}".format(arg, getattr(args, arg)))

    # Load from archive
    cuda_device = args.cuda_device

    logging.info("cuda_device:{0}".format(cuda_device))
    archive = load_archive(args.archive_file,
                           cuda_device=cuda_device,
                           overrides=args.overrides)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data
    dataset_reader = DatasetReader.from_params(
        config.pop('validation_dataset_reader') if "validation_dataset_reader"
        in config else config.pop('dataset_reader'))
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)

    batch_size = args.batch_size
    start_id = args.start_id
    end_id = args.end_id

    dataset = dataset_reader.read(evaluation_data_path)
    file_mode = args.file_open_mode
    if start_id > 0 or end_id > 0:
        if not isinstance(dataset, list):
            raise ValueError(
                "dataset must be list when start_id and end_id are set")

        start_id = max(start_id, 0)
        if end_id <= 0:
            end_id = len(dataset)

        dataset = dataset[start_id:end_id]

    iterator_config = config.pop(
        'validation_iterator'
    ) if "validation_iterator" in config else config.pop('iterator')
    if batch_size > -1:
        if "base_iterator" in iterator_config:
            iterator_config["base_iterator"]["batch_size"] = batch_size
        else:
            iterator_config["batch_size"] = batch_size

    iterator = DataIterator.from_params(iterator_config)

    iterator.index_with(model.vocab)

    metrics = evaluate(model,
                       dataset,
                       iterator,
                       args.output_file,
                       file_mode=file_mode)
    if args.output_file:
        absolute_path = os.path.abspath(args.output_file)
        logging.info("Output saved to \n{}".format(absolute_path))
    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
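A plain-dict sketch of the batch-size override above: wrapping iterators keep their batch size on a nested "base_iterator" entry, so the override has to follow the nesting.

def override_batch_size(iterator_config: dict, batch_size: int) -> dict:
    if "base_iterator" in iterator_config:
        iterator_config["base_iterator"]["batch_size"] = batch_size
    else:
        iterator_config["batch_size"] = batch_size
    return iterator_config

print(override_batch_size({"type": "basic", "batch_size": 32}, 8))
# {'type': 'basic', 'batch_size': 8}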
Example no. 16
0
def main():
    parse = argparse.ArgumentParser("")
    parse.add_argument("command", type=str, help="one of the following options train, evaluate, generalize")
    parse.add_argument("--datasets", type=str, help="", default=None)
    parse.add_argument("--model", type=str, help="", default=None)
    parse.add_argument("--serialization_dir", type=str, help="the directory storing the intermediate files and output", default=None)
    parse.add_argument("--cuda_device", type=str, help="Cuda device ID", default="-1")
    parse.add_argument("--split", type=str, help="dev / test", default="dev")
    parse.add_argument("--bert_type", type=str, help="Base / Large /", default="Base")
    parse.add_argument("--config", type=str, help="dev / test", default=None)
    parse.add_argument("--output_path", type=str, help="directory to which results JSONs of eval will written", default='results/eval/')
    parse.add_argument("--models_dir", type=str, help="directory containing the models used for eval , (please add '/' at the end)", default=None)
    parse.add_argument("--data_dir", type=str, help="directory containing the multiqa format datasets , (please add '/' at the end and make sure to have a headers directory with all headers under your specified path)", default='https://multiqa.s3.amazonaws.com/data/')
    parse.add_argument("--t_total", type=str, help="used for training, see BERT's learning rate schedule for details", default=None)
    parse.add_argument("--sample_size", type=str, help="used for sampling a subset of the training data", default=-1)
    parse.add_argument("--validation_sample_size", type=str, help="used for sampling a subset of the training data", default=-1)
    parse.add_argument("--batch_size", type=str, help="the batch size", default=8)
    parse.add_argument("--max_instances_in_memory", type=str, help="max number instances in memrory during training", default=5000)
    parse.add_argument("--num_epochs", type=str, help="", default=2)
    parse.add_argument("--lr", type=str, help="learning rate", default=0.00003)
    args = parse.parse_args()

    import_submodules("models")

    # TODO add best config for specific datasets as default, not one general default...
    if args.config is None:
        config = 'models/MultiQA_BERT' + args.bert_type + '.jsonnet'
    else:
        config = args.config
    config_params = Params(json.loads(_jsonnet.evaluate_file(config)))

    if args.command == 'train':
        # building the default dataset urls
        train_datasets = [args.data_dir + dataset + '_train.jsonl.gz' for dataset in args.datasets.split(',')]
        val_datasets = [args.data_dir + dataset + '_' + args.split + '.jsonl.gz' for dataset in args.datasets.split(',')]

        # calculating the t_total (when --t_total is not given, estimate it from the dataset headers)
        if args.t_total is None:
            logging.info('getting headers of the chosen dataset in order to compute learning rate schedule t_total')
            total_number_of_examples = 0
            for header_url in [args.data_dir + 'headers/' + dataset + '_train.json' for dataset in
                              args.datasets.split(',')]:
                with open(cached_path(header_url), 'r') as f:
                    header = json.load(f)
                    total_number_of_examples += header['number_of_qas']
            t_total = int(total_number_of_examples / float(config_params['iterator']['batch_size'])
                          * float(config_params['trainer']['num_epochs'])
                          / len(args.cuda_device.split(',')))
        else:
            t_total = args.t_total

        if args.serialization_dir is None:
            serialization_dir = 'models/' + args.datasets.replace(',','_') + f"_num_epochs_{args.num_epochs}_batch_size_{args.batch_size}_lr_{args.lr}"
        else:
            serialization_dir = args.serialization_dir

            
        print(" >>>>>>>> overriding the parameters <<<<<<<<<<< ") 
        overrides = {
            'train_data_path': ','.join(train_datasets),
            'validation_data_path': ','.join(val_datasets),
            'dataset_reader': {
                'sample_size': args.sample_size, 
            },
            'validation_dataset_reader': {
                'sample_size': args.validation_sample_size,
            },
            'iterator': {
                'batch_size': args.batch_size,
                'max_instances_in_memory': args.max_instances_in_memory,
             },
            'trainer': {
                'cuda_device': args.cuda_device,
                'num_epochs': args.num_epochs,
                'optimizer': {
                  't_total': t_total, 
                  'lr': args.lr,
                }
            }
        }

        overrides_str = str(overrides).replace('True', 'true').replace('False', 'false')
        train_model_from_file(config, serialization_dir, overrides_str, True, False, True, "", "")
    elif args.command == 'evaluate':
        print(" evaluate . . . ")
        if args.models_dir is None:
            model_path = 'https://multiqa.s3.amazonaws.com/models/BERT' + args.bert_type + '/' + args.model + '.tar.gz'
        else:
            model_path = args.models_dir + args.model + '.tar.gz'
        model_cached_path = cached_path(model_path)

        print(" loading models . . . .")
        overrides_str = ''
        # Load from archive
        archive = load_archive(model_cached_path, int(args.cuda_device), overrides_str, '')
        prepare_environment(config_params)
        model = archive.model
        model.eval()

        print(" loading data . . . .")

        # Load the evaluation data
        validation_dataset_reader_params = config_params.get('validation_dataset_reader', None)
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

        # print(" *  *  *  *  *  *  * ")
        # print(validation_dataset_reader_params)
        # print(dataset_reader)

        print(" looping over datasets . . . .")

        # running over all validation datasets specified
        val_dataset_names = args.datasets.split(',')
        val_datasets = [args.data_dir + dataset + '_' + args.split + '.jsonl.gz' for dataset in val_dataset_names]

        for val_dataset_path,val_dataset_name in zip(val_datasets,val_dataset_names):
            print(f" * * * val_dataset_name: {val_dataset_name}")
            # This is a bit strange but there is a lot of config "popping" going on implicitly in allennlp
            # so we need to have the full config reloaded every iteration...
            config_params = Params(json.loads(_jsonnet.evaluate_file(config)))

            print("Reading evaluation data from %s", val_dataset_path)
            logger.info("Reading evaluation data from %s", val_dataset_path)
            instances = dataset_reader.read(val_dataset_path)

            # loading iterator
            iterator_params = config_params.get("validation_iterator", None)
            iterator = DataIterator.from_params(iterator_params)
            iterator.index_with(model.vocab)

            metrics = evaluate(model, instances, iterator, int(args.cuda_device), '')

            logger.info("Finished evaluating " + val_dataset_name)
            print("Finished evaluating " + val_dataset_name)
            logger.info("Metrics:")
            for key, metric in metrics.items():
                logger.info("%s: %s", key, metric)

            if not os.path.exists(args.output_path):
                os.makedirs(args.output_path)
            output_path = args.output_path + args.model + '_BERT' + args.bert_type + '_eval-on_' \
                          + val_dataset_name + '_' + args.split + '.json'
            with open(output_path, "w") as file:
                json.dump(metrics, file, indent=4)
        return metrics

    elif args.command == 'generalize':
        logging.error('The command %s is not yet supported' % (args.command))
    else:
        logging.error('The command %s is not supported' % (args.command))
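
A sketch of how the CLI above might be invoked, assuming it runs inside the project that provides the models package and the MultiQA_BERT*.jsonnet configs it references; the script name and dataset name are placeholders:

import sys

# Equivalent to: python run.py train --datasets SQuAD1-1 --cuda_device 0
# (script and dataset names are hypothetical; the remaining flags keep their defaults).
sys.argv = [
    "run.py", "train",
    "--datasets", "SQuAD1-1",
    "--cuda_device", "0",
    "--num_epochs", "2",
    "--batch_size", "8",
]
main()
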
Example n. 17
0
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running any Pytorch or
    numpy code that affects the reproducibility of your experiment before you
    import and use this function, as these libraries rely on random seeds,
    which can be set in this function via the JSON specification file. Note
    that this function performs training and will also evaluate the trained
    model on the development and test sets, if they are provided in the
    parameter JSON.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr)  # type: ignore
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        combined_data = Dataset(train_data.instances + validation_data.instances)
    else:
        validation_data = None
        combined_data = train_data

    vocab = Vocabulary.from_params(params.pop("vocabulary", {}), combined_data)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
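
A minimal usage sketch for this early-style entry point, with hypothetical paths; the Params object is built from a plain JSON dict, so the sketch does not depend on any particular Params helper:

import json
from allennlp.common.params import Params

# Hypothetical experiment file and output directory.
with open("experiments/my_experiment.json") as config_file:
    params = Params(json.load(config_file))

model = train_model(params, serialization_dir="serialization/my_run")
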
Example n. 18
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.get('trainer').get('cuda_device', 0))

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    vocab = Vocabulary.from_files(vocab_dir)

    # vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    print(model)
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    # train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    # trainer_params = params.pop("trainer")
    # no_grad_regexes = trainer_params.pop("no_grad", ())
    # for name, parameter in model.named_parameters():
    #     if any(re.search(regex, name) for regex in no_grad_regexes):
    #         parameter.requires_grad_(False)

    # frozen_parameter_names, tunable_parameter_names = \
    #                get_frozen_and_tunable_parameter_names(model)
    # logger.info("Following parameters are Frozen  (without gradient):")
    # for name in frozen_parameter_names:
    #     logger.info(name)
    # logger.info("Following parameters are Tunable (with gradient):")
    # for name in tunable_parameter_names:
    #     logger.info(name)

    # trainer = Trainer.from_params(model,
    #                               serialization_dir,
    #                               iterator,
    #                               train_data,
    #                               validation_data,
    #                               trainer_params,
    #                               validation_iterator=validation_iterator)

    evaluate_on_validation = True
    evaluate_on_test = True
    # params.assert_empty('base train command')

    # try:
    #     metrics = trainer.train()
    # except KeyboardInterrupt:
    #     # if we have completed an epoch, try to create a model archive.
    #     if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
    #         logging.info("Training interrupted by the user. Attempting to create "
    #                      "a model archive using the current best epoch weights.")
    #         archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    #     raise

    # Now tar up results
    # archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    metrics = {}

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)
    best_model.cuda()

    if validation_data and evaluate_on_validation:
        logger.info(
            "The model will be evaluated using the best epoch weights on validation"
        )
        validation_metrics = evaluate(
            best_model,
            validation_data,
            validation_iterator or iterator,
            cuda_device=0  # pylint: disable=protected-access
        )
        for key, value in validation_metrics.items():
            metrics["validation_" + key] = value

    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights on test")
        test_metrics = evaluate(
            best_model,
            test_data,
            validation_iterator or iterator,
            cuda_device=0  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    dump_metrics(os.path.join(serialization_dir, "metrics_new.json"),
                 metrics,
                 log=True)

    return best_model
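
Unlike the standard train command, this variant never actually trains: the Trainer block is commented out, and the function instead reloads best.th and the saved vocabulary from the serialization directory and re-evaluates them on the validation and test sets (on GPU, since it calls .cuda()). A usage sketch under that assumption, with hypothetical paths:

from allennlp.common.params import Params

# The directory is assumed to already hold best.th and a vocabulary/ folder from a
# previous run; recover=True because the directory exists and is non-empty.
params = Params.from_file("experiments/my_experiment.jsonnet")
best_model = train_model(params,
                         serialization_dir="serialization/existing_run",
                         recover=True)
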
Example n. 19
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--input-file', type=str, help='path to the file containing the evaluation data')
    parser.add_argument('--output-file', type=str, help='path to output file')
    parser.add_argument('--weights-file',
                               type=str,
                               help='a path that overrides which weights file to use')
    parser.add_argument('--cuda-device',
                                 type=int,
                                 default=-1,
                                 help='id of GPU to use (if any)')
    parser.add_argument('--overrides',
                               type=str,
                               default="",
                               help='a JSON structure used to override the experiment configuration')
    parser.add_argument('--include-package', type=str, default='')
    parser.add_argument('--archive-file', type=str)
    args = parser.parse_args()
    
    if '/' in args.weights_file:
        label_file = args.weights_file[:args.weights_file.rfind('/') + 1]
    else:
        label_file = ''
    label_file += (args.input_file[args.input_file.rfind('/') + 1: args.input_file.rfind('.')] if '/' in args.input_file else
                   args.input_file[:args.input_file.rfind('.')])
    label_file += '_reallabel_guessedlabel.csv'
    print("Will write labels to " + label_file)
    print("Evaluating on " + args.input_file)
    print("Archive file being used is " + args.archive_file)
    print("Weights file being used is " + args.weights_file)
    print()

    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    if args.include_package.strip() != '':
        import_submodules(args.include_package)
    import_submodules("attn_tests_lib")
    import_submodules("textcat")

    if args.overrides != '':
        with open(args.overrides, 'r') as f:
            args.overrides = " ".join([l.strip() for l in f.readlines()])
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    if model._output_logit.get_output_dim() == 2:
        model.calculate_f1 = True
        model._f1 = F1Measure(1)

    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    new_param_dict = {'type': 'basic'}
    if 'batch_size' in iterator_params.params:
        new_param_dict['batch_size'] = iterator_params.params['batch_size']
    if 'maximum_samples_per_batch' in iterator_params.params:
        new_param_dict['maximum_samples_per_batch'] = iterator_params.params['maximum_samples_per_batch']
    iterator_params.params = new_param_dict
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device, label_file)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    print('\n' + json.dumps(metrics, indent=4))
    print("Successfully wrote labels to " + label_file)

    output_file = args.output_file
    if output_file:
        with open(output_file, "w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
Example n. 20
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    stdout_handler = prepare_global_logging(serialization_dir,
                                            file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator)
        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    else:
        trainer = TrainerBase.from_params(params, serialization_dir, recover)
        # TODO(joelgrus): handle evaluation in the general case
        evaluation_iterator = evaluation_dataset = None

    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
            # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
            batch_weight_key="")

        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    cleanup_global_logging(stdout_handler)

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics,
                 log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
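
A minimal usage sketch for the standard train entry point above, with a hypothetical config path; setting "evaluate_on_test": true in that config activates the test-set evaluation branch shown above:

from allennlp.common.params import Params

# Hypothetical experiment file; "evaluate_on_test": true enables the final test evaluation.
params = Params.from_file("experiments/my_experiment.jsonnet")
best_model = train_model(params,
                         serialization_dir="serialization/my_run",
                         file_friendly_logging=True,
                         recover=False,
                         force=False)
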
Example n. 21
0
if root_path not in sys.path:
    sys.path.insert(0, root_path)


import torch
import torch.nn.functional as F


import numpy
from overrides import overrides
from typing import Dict, Optional


from allennlp.common.util import prepare_environment
from allennlp.common.params import Params
prepare_environment(Params(params={}))
from allennlp.common import Params
from allennlp.common.checks import ConfigurationError
from allennlp.data import Vocabulary
from allennlp.modules import FeedForward, Seq2VecEncoder, TextFieldEmbedder
from allennlp.models.model import Model
from allennlp.nn import InitializerApplicator, RegularizerApplicator
from allennlp.nn import util
from allennlp.training.metrics import CategoricalAccuracy, Average


@Model.register("basic_classifier")
class BasicClassifier(Model):
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 sentence_encoder: Seq2VecEncoder,
Example n. 22
0
def find_learning_rate_model(params: Params,
                             serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir``

    Parameters
    ----------
    trainer: :class:`~allennlp.common.registrable.Registrable`
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate to start the search.
    end_lr: ``float``
        Learning rate upto which search is done.
    num_batches: ``int``
        Number of mini-batches to run Learning rate finder.
    linear_steps: ``bool``
        Increase learning rate linearly if False exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(
            f'Serialization directory {serialization_dir} already exists and is '
            f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(
        f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.'
    )
    learning_rates, losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor)
    logger.info(f'Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses,
               os.path.join(serialization_dir, 'lr-losses.png'))
Example n. 23
0
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides,
                           args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader',
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources: Dict[str,
                            str] = (json.loads(args.embedding_sources_mapping)
                                    if args.embedding_sources_mapping else {})
    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(Params({}), instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    thrs = args.thresholds.replace("_", ",").split(",")

    for thr in thrs:
        model._temperature_threshold = float(thr)
        metrics = evaluate(model, instances, iterator, args.cuda_device,
                           args.batch_weight_key)

        logger.info("Finished evaluating.")
        logger.info("Metrics:")
        for key, metric in metrics.items():
            logger.info("%s: %s: %s", thr, key, metric)

        output_file = args.output_file
        if output_file:
            with open(output_file + "_" + thr, "w") as file:
                json.dump(metrics, file, indent=4)
    return metrics
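
A minimal usage sketch for the threshold-sweep evaluation above; the attribute names mirror what the function reads, and all paths are hypothetical. Because of the replace("_", ",") above, "0.1_0.5_0.9" and "0.1,0.5,0.9" are equivalent threshold specifications:

import argparse

# Hypothetical arguments; one metrics file per threshold is written
# (here: metrics_0.1, metrics_0.5 and metrics_0.9).
sweep_args = argparse.Namespace(
    archive_file="serialization/model.tar.gz",
    weights_file=None,
    cuda_device=-1,
    overrides="",
    input_file="data/dev.jsonl",
    embedding_sources_mapping="",
    extend_vocab=False,
    thresholds="0.1_0.5_0.9",
    batch_weight_key="",
    output_file="metrics",
)
metrics = evaluate_from_args(sweep_args)
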
Example n. 24
0
def find_learning_rate_model(
    params: Params,
    serialization_dir: str,
    start_lr: float = 1e-5,
    end_lr: float = 10,
    num_batches: int = 100,
    linear_steps: bool = False,
    stopping_factor: float = None,
    force: bool = False,
) -> None:
    """
    Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir``

    # Parameters

    params : [`Params`](../common/params.md#params)
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results.
    start_lr : `float`
        Learning rate to start the search.
    end_lr : `float`
        Learning rate upto which search is done.
    num_batches : `int`
        Number of mini-batches to run Learning rate finder.
    linear_steps : `bool`
        Increase learning rate linearly if False exponentially.
    stopping_factor : `float`
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If `None` search proceeds till the `end_lr`
    force : `bool`
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    create_serialization_dir(params,
                             serialization_dir,
                             recover=False,
                             force=force)

    prepare_environment(params)

    cuda_device = params.params.get("trainer").get("cuda_device", -1)
    check_for_gpu(cuda_device)
    distributed_params = params.params.get("distributed")
    # See https://github.com/allenai/allennlp/issues/3658
    assert not distributed_params, "find-lr is not compatible with DistributedDataParallel."

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        instances=(instance for key, dataset in all_datasets.items()
                   for instance in dataset
                   if key in datasets_for_vocab_creation),
    )

    train_data = all_datasets["train"]
    train_data.index_with(vocab)
    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    data_loader = DataLoader.from_params(dataset=train_data,
                                         params=params.pop("data_loader"))

    trainer_params = params.pop("trainer")

    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer_choice = trainer_params.pop("type", "gradient_descent")
    if trainer_choice != "gradient_descent":
        raise ConfigurationError(
            "currently find-learning-rate only works with the GradientDescentTrainer"
        )
    trainer: GradientDescentTrainer = Trainer.from_params(  # type: ignore
        model=model,
        serialization_dir=serialization_dir,
        data_loader=data_loader,
        params=trainer_params,
    )

    logger.info(
        f"Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations."
    )
    learning_rates, losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor,
    )
    logger.info(f"Finished learning rate search.")
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses,
               os.path.join(serialization_dir, "lr-losses.png"))
Example n. 25
0
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running any Pytorch or
    numpy code that affects the reproducibility of your experiment before you
    import and use this function, as these libraries rely on random seeds,
    which can be set in this function via the JSON specification file. Note
    that this function performs training and will also evaluate the trained
    model on the development and test sets, if they are provided in the
    parameter JSON.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.

    # 1. Primary training data.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    # 2. Auxiliary training data.
    dataset_reader_aux = DatasetReader.from_params(
        params.pop('dataset_reader_aux'))
    train_data_path_aux = params.pop('train_data_path_aux')
    logger.info("Reading auxilliary training data from %s",
                train_data_path_aux)
    train_data_aux = dataset_reader_aux.read(train_data_path_aux)

    # If only using a fraction of the auxiliary data.
    aux_sample_fraction = params.pop("aux_sample_fraction", 1.0)
    if aux_sample_fraction < 1.0:
        sample_size = int(aux_sample_fraction * len(train_data_aux.instances))
        train_data_aux = Dataset(
            random.sample(train_data_aux.instances, sample_size))

    # Balance the two datasets by inflating the size of the smaller dataset to the size of the larger dataset.
    train_size = len(train_data.instances)
    aux_train_size = len(train_data_aux.instances)
    mixing_ratio = params.pop("mixing_ratio")
    # mixing_ratio = float(train_size)/aux_train_size

    if train_size > aux_train_size:  # case for PB scaffold.
        difference = train_size - aux_train_size
        aux_sample = [
            random.choice(train_data_aux.instances) for _ in range(difference)
        ]
        train_data_aux = Dataset(train_data_aux.instances + aux_sample)
        logger.info(
            "Inflating auxiliary train data from {} to {} samples".format(
                aux_train_size, len(train_data_aux.instances)))
    # else: # case for FN scaffold.
    #     difference = aux_train_size - train_size
    #     train_sample = [random.choice(train_data.instances) for _ in range(difference)]
    #     train_data = Dataset(train_data.instances + train_sample)
    #     logger.info("Inflating train data from {} to {} samples".format(
    #         train_size, len(train_data.instances)))

    all_datasets: Dict[str, Dataset] = {"train": train_data}
    all_datasets_aux: Dict[str, Dataset] = {"train_aux": train_data_aux}

    # 3. Primary validation data.
    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets["validation"] = validation_data
    else:
        validation_data = None

    # 4. Auxiliary validation data.
    validation_data_path_aux = params.pop('validation_data_path_aux', None)
    if validation_data_path_aux is not None:
        logger.info("Reading auxilliary validation data from %s",
                    validation_data_path_aux)
        validation_data_aux = dataset_reader_aux.read(validation_data_path_aux)
        all_datasets_aux["validation_aux"] = validation_data_aux
    else:
        validation_data_aux = None

    # 5. Primary test data
    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        all_datasets["test"] = test_data
    else:
        test_data = None

    # 6. Auxiliary test data
    test_data_path_aux = params.pop("test_data_path_aux", None)
    if test_data_path_aux is not None:
        logger.info("Reading auxillary test data from %s", test_data_path_aux)
        test_data_aux = dataset_reader_aux.read(test_data_path_aux)
        all_datasets_aux["test_aux"] = test_data_aux
    else:
        test_data_aux = None

    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))
    datasets_for_vocab_creation_aux = set(
        params.pop("auxillary_datasets_for_vocab_creation", all_datasets_aux))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "Creating a vocabulary using %s data. Auxillary also included.",
        ", ".join(datasets_for_vocab_creation))
    dataset_primary = Dataset([
        instance for key, dataset in all_datasets.items()
        for instance in dataset.instances if key in datasets_for_vocab_creation
    ])
    dataset_aux = Dataset([
        instance for key, dataset in all_datasets_aux.items()
        for instance in dataset.instances
        if key in datasets_for_vocab_creation_aux
    ])
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   dataset_primary,
                                   dataset_aux=dataset_aux)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator_aux = DataIterator.from_params(params.pop("iterator_aux"))

    train_data.index_instances(vocab)
    train_data_aux.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)
    if validation_data_aux:
        validation_data_aux.index_instances(vocab)

    cutoff_epoch = params.pop("cutoff_epoch", -1)

    trainer_params = params.pop("trainer")
    trainer = MultiTaskTrainer.from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        iterator_aux=iterator_aux,
        train_dataset=train_data,
        train_dataset_aux=train_data_aux,
        mixing_ratio=mixing_ratio,
        cutoff_epoch=cutoff_epoch,
        validation_dataset=validation_data,
        validation_dataset_aux=validation_data_aux,
        params=trainer_params,
        files_to_archive=params.files_to_archive)

    evaluate_on_test = params.pop("evaluate_on_test", False)
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_data.index_instances(vocab)
        evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device)  # pylint: disable=protected-access

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    if test_data_aux and evaluate_on_test:
        test_data_aux.index_instances(vocab)
        evaluate(model,
                 test_data_aux,
                 iterator_aux,
                 cuda_device=trainer._cuda_device)  # pylint: disable=protected-access

    elif test_data_aux:
        logger.info(
            "To evaluate on the auxillary test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    return model
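
Beyond the usual dataset_reader / train_data_path / model / iterator / trainer blocks, the scaffold variant above pops several extra keys from the experiment config. A schematic of those keys, with purely hypothetical values:

# Extra configuration keys read by the multi-task (scaffold) train_model above;
# every value here is a hypothetical placeholder.
scaffold_extras = {
    "dataset_reader_aux": {"type": "my_aux_reader"},      # reader for the auxiliary task
    "train_data_path_aux": "data/aux_train.json",
    "validation_data_path_aux": "data/aux_dev.json",      # optional
    "test_data_path_aux": "data/aux_test.json",           # optional
    "aux_sample_fraction": 0.5,     # optional: use only a fraction of the auxiliary data
    "mixing_ratio": 0.2,            # required: passed through to MultiTaskTrainer
    "cutoff_epoch": -1,             # optional: passed through to MultiTaskTrainer
    "iterator_aux": {"type": "basic", "batch_size": 32},  # iterator for the auxiliary data
    "evaluate_on_test": True,       # optional: evaluate on both test sets after training
}
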
Example n. 26
0
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment; its data, iterator and
        trainer sections are used for fine-tuning, while any ``model`` section is ignored.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir)

    # TODO(mattg): pull this block out into a separate function (maybe just add this to
    # `prepare_environment`?)
    Tqdm.set_slower_interval(file_friendly_logging)
    sys.stdout = TeeLogger(
        os.path.join(serialization_dir, "stdout.log"),  # type: ignore
        sys.stdout,
        file_friendly_logging)
    sys.stderr = TeeLogger(
        os.path.join(serialization_dir, "stderr.log"),  # type: ignore
        sys.stderr,
        file_friendly_logging)
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning(
            "You passed parameters for the model in your configuration file, but we "
            "are ignoring them, using instead the model parameters in the archive."
        )

    if params.pop('vocabulary', None):
        logger.warning(
            "You passed parameters for the vocabulary in your configuration file, but "
            "we are ignoring them, using instead the vocabulary from the saved model."
        )

    vocab = model.vocab
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    all_datasets = datasets_from_params(params)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model,
                                test_data,
                                iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"),
              "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
Example n. 27
0
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    extend_vocab: bool = False,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment; its data, iterator and
        trainer sections are used for fine-tuning, while any ``model`` section is ignored.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    extend_vocab: ``bool``, optional (default=False)
        If ``True``, we use the new instances to extend your vocabulary.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    """
    prepare_environment(params)
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(
            f"Serialization directory ({serialization_dir}) "
            f"already exists and is not empty.")

    os.makedirs(serialization_dir, exist_ok=True)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning(
            "You passed parameters for the model in your configuration file, but we "
            "are ignoring them, using instead the model parameters in the archive."
        )

    vocabulary_params = params.pop('vocabulary', {})
    if vocabulary_params.get('directory_path', None):
        logger.warning(
            "You passed `directory_path` in parameters for the vocabulary in "
            "your configuration file, but it will be ignored. ")

    all_datasets = datasets_from_params(params)
    vocab = model.vocab

    if extend_vocab:
        datasets_for_vocab_creation = set(
            params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(
                    f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info("Extending model vocabulary using %s data.",
                    ", ".join(datasets_for_vocab_creation))
        vocab.extend_from_instances(
            vocabulary_params,
            (instance for key, dataset in all_datasets.items()
             for instance in dataset if key in datasets_for_vocab_creation))

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        validation_data=validation_data,
        params=trainer_params,
        validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Fine-tuning interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model,
                                test_data,
                                iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"),
              "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
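For context, a minimal hypothetical sketch of the trainer section such a config might contain; the snippet above pops the ``no_grad`` list and freezes every parameter whose name matches one of the regexes via ``re.search``. The key names other than ``type``, ``cuda_device`` and ``no_grad``, and the regexes themselves, are illustrative rather than taken from the code above.

trainer_config = {
    "type": "default",        # resolved against Trainer.list_available()
    "cuda_device": -1,
    "no_grad": [
        r"^text_field_embedder\..*",   # freeze all embedder weights
        r".*bias$",                    # freeze every bias term
    ],
}
# A parameter named "text_field_embedder.token_embedder_tokens.weight" would be
# frozen, because re.search matches the first pattern against its name.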
Esempio n. 28
0
if args.v:
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        level=logging.INFO)  # turn on logging.
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

# Load from archive
archive = load_archive(args.archive_file, args.cuda_device, args.overrides,
                       args.weights_file)
config = archive.config
config.formalism = "DUMMY"
prepare_environment(config)
model = archive.model
model.eval()
if not isinstance(model, GraphDependencyParser):
    raise ConfigurationError(
        "The loaded model seems not to be an am-parser (GraphDependencyParser)"
    )

# Load the evaluation data

# Try to use the validation dataset reader if there is one - otherwise fall back
# to the default dataset_reader used for both training and validation.
validation_dataset_reader_params = config.pop('validation_dataset_reader',
                                              None)
if validation_dataset_reader_params is not None:
    dataset_reader = DatasetReader.from_params(
Esempio n. 29
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
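A rough usage sketch for the function above (the file paths are placeholders, and ``Params.from_file`` is assumed to be available in the same AllenNLP version):

from allennlp.common.params import Params

# Hypothetical driver script; adjust the paths to your own experiment.
params = Params.from_file("experiments/my_experiment.json")
model = train_model(params,
                    serialization_dir="output/my_experiment",
                    file_friendly_logging=True,
                    recover=False)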
Esempio n. 30
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    if recover and os.path.exists(os.path.join(serialization_dir,
                                               "vocabulary")):
        vocab = Vocabulary.from_files(
            os.path.join(serialization_dir, "vocabulary"))
    else:
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        validation_data=validation_data,
        params=trainer_params,
        validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model,
            test_data,
            validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics,
                 log=True)

    return best_model
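For orientation, a hypothetical experiment config, written as a plain Python dict, showing the top-level keys this variant consumes before ``params.assert_empty``; the nested values are placeholders rather than a working configuration.

config = {
    "dataset_reader": {"type": "..."},          # read by datasets_from_params
    "train_data_path": "...",
    "validation_data_path": "...",
    "datasets_for_vocab_creation": ["train"],   # only the train split feeds the vocabulary
    "vocabulary": {},                           # handed to Vocabulary.from_params
    "model": {"type": "..."},
    "iterator": {"type": "basic"},
    "validation_iterator": {"type": "basic"},   # optional; falls back to "iterator"
    "trainer": {"num_epochs": 1, "cuda_device": -1, "no_grad": []},
    "evaluate_on_test": False,                  # read with params.pop_bool
}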
Esempio n. 31
0
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune, such as one loaded from a saved archive.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    """
    prepare_environment(params)
    os.makedirs(serialization_dir)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning(
            "You passed parameters for the model in your configuration file, but we "
            "are ignoring them, using instead the model parameters in the archive."
        )

    if params.pop('vocabulary', None):
        logger.warning(
            "You passed parameters for the vocabulary in your configuration file, but "
            "we are ignoring them, using instead the vocabulary from the saved model."
        )

    vocab = model.vocab
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    all_datasets = datasets_from_params(params)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Fine-tuning interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model,
                                test_data,
                                iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"),
              "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
Esempio n. 32
0
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune, such as one loaded from a saved archive.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    """
    prepare_environment(params)
    os.makedirs(serialization_dir)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop('vocabulary', {})
    if vocabulary_params.get('directory_path', None):
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored. "
                       "Vocabulary from the saved model will be extended with current data.")

    all_datasets = datasets_from_params(params)

    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = model.vocab
    vocab.extend_from_instances(vocabulary_params,
                                (instance for key, dataset in all_datasets.items()
                                 for instance in dataset
                                 if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Fine-tuning interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
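A hedged sketch of how this entry point is typically reached: load a previously trained archive, take its model, and pass it in here together with a fresh config. The paths are placeholders, and ``load_archive``/``Params`` are assumed to come from the same AllenNLP version as the function above.

from allennlp.common.params import Params
from allennlp.models.archival import load_archive

archive = load_archive("output/base_run/model.tar.gz", cuda_device=-1)
fine_tune_params = Params.from_file("experiments/fine_tune.json")
fine_tuned = fine_tune_model(model=archive.model,
                             params=fine_tune_params,
                             serialization_dir="output/fine_tuned_run")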
Esempio n. 33
0
def train_model(db: FeverDocDB, params: Union[Params, Dict[str, Any]],
                cuda_device: int, serialization_dir: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running any PyTorch or
    numpy code that affects your experiment's reproducibility before you
    import and use this function; these libraries rely on random seeds, which
    this function can set via the JSON specification file. Note also that this
    function performs training and will evaluate the trained model on the
    development and test sets, if they are provided in the parameter JSON.

    Parameters
    ----------
    db: FeverDocDB, required.
        The FEVER document database handed to the ``FEVERSentenceReader``.
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    cuda_device: int, required.
        If not ``None``, overrides the ``cuda_device`` entry in the trainer
        parameters.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)

    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    dataset_reader = FEVERSentenceReader(
        db,
        wiki_tokenizer=Tokenizer.from_params(
            ds_params.pop('wiki_tokenizer', {})),
        claim_tokenizer=Tokenizer.from_params(
            ds_params.pop('claim_tokenizer', {})),
        token_indexers=TokenIndexer.dict_from_params(
            ds_params.pop('token_indexers', {})))

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets: List[Dataset] = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        Dataset([
            instance for dataset in all_datasets
            for instance in dataset.instances
        ]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
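A rough driver for the function above; it assumes a ``FeverDocDB`` instance ``db`` has already been constructed elsewhere (its constructor is not shown here), and the paths are placeholders.

from allennlp.common.params import Params

params = Params.from_file("experiments/fever_sentence.json")
model = train_model(db, params,
                    cuda_device=-1,
                    serialization_dir="output/fever_sentence")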
Esempio n. 34
0
def find_learning_rate_model(params: Params, serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs a learning rate search for the given ``num_batches`` and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run Learning rate finder.
    linear_steps: ``bool``
        If ``True``, increase the learning rate linearly; otherwise, increase it exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        a multiple of the stopping factor. If ``None``, the search proceeds up to ``end_lr``.
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info(f'Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
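The ``linear_steps`` flag only changes how the learning rate moves from ``start_lr`` to ``end_lr`` over the ``num_batches`` mini-batches. A small illustrative sketch of the two schedules (an assumption about what ``search_learning_rate`` does internally, not taken from its source):

def lr_at(i: int, start_lr: float, end_lr: float,
          num_batches: int, linear_steps: bool) -> float:
    # Learning rate for 0-indexed batch i; the real search_learning_rate may
    # handle the endpoints differently.
    frac = i / max(1, num_batches - 1)
    if linear_steps:
        return start_lr + frac * (end_lr - start_lr)   # evenly spaced steps
    return start_lr * (end_lr / start_lr) ** frac      # geometric spacing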
Esempio n. 35
0
def train_model(args: argparse.Namespace,
                params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    if args.archive_file is not None:
        params.pop("vocabulary", {})
        params.pop('model')
        archive = load_archive(args.archive_file,
                               weights_file=args.weights_file,
                               cuda_device=args.cuda_device,
                               overrides=args.overrides)
        model = archive.model
        model.eval()
        vocab = model.vocab
    else:
        vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                       (instance for key, dataset in all_datasets.items()
                                        for instance in dataset
                                        if key in datasets_for_vocab_creation))
        vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
        model = Model.from_params(vocab, params.pop('model'))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    if args.archive_file is not None:
        trainer._patience = 10000
        trainer._num_epochs = 10000

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
Esempio n. 36
0
def main(args):
    params = Params.from_file(args.params_file)

    # print('Data seed:{}, Percent data: {}'.format(shuffle_id, train_size))
    settings.cuda = params['cuda_device'] != -1
    common_util.prepare_environment(params)

    serialization_dir = params['serialization_dir']
    training_util.create_serialization_dir(params, serialization_dir,
                                           args.recover, args.force)
    common_util.prepare_global_logging(serialization_dir, True)
    logging.info(
        "torch version: {}, allennlp version: {}, allennlp path: {}".format(
            torch.__version__, allennlp.__version__, allennlp.__path__))
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    semi_supervision = params.get('semi_supervised', False)
    which_mixer = params.get('which_mixer', 'cm')
    dd_warmup_iters = params.pop('dd_warmup_iters', 1)
    dd_semi_warmup_iters = params.pop('dd_semi_warmup_iters', 1)
    dd_update_freq = params.pop('dd_update_freq', 2)
    constraints_wt = params.get('constraints_wt', 0)
    calc_valid_freq = params.get('calc_valid_freq', 1)
    backprop_after_xbatches = params.pop('backprop_after_xbatches', 1)
    min_pct_of_unlabelled = params.pop('min_pct_of_unlabelled', 0.0)
    dd_increase_freq_after = params.pop('dd_increase_freq_after', 0)
    dd_increase_freq_by = params.pop('dd_increase_freq_by', 0)
    dd_decay_lr = params.pop('dd_decay_lr', 0)
    dd_decay_lr_after = params.pop('dd_decay_lr_after', 1.0)
    grad_norm_before_warmup = params.pop('grad_norm_before_warmup', 0)
    if semi_supervision:
        print("Semi Supervision On")

    for key in [
            'warmup_epochs', 'unlabelled_train_data_file', 'test_data_file',
            'data_dir', 'cuda_device', 'serialization_dir', 'train_data_file',
            'validation_data_file', 'constraints_wt', 'train_size',
            'shuffle_id', 'semi_supervised', 'which_mixer',
            'distributed_lambda_update', 'calc_valid_freq'
    ]:
        params.pop(key, None)

    print("Trainer pieces")
    pieces = gan_trainer.TrainerPiecesForSemi.from_params(
        params, serialization_dir, args.recover, semi_supervision)  # pylint: disable=no-member

    # Pieces for constrained learning
    print("Constraint model")
    constraints_model = Model.from_params(vocab=pieces.model.vocab,
                                          params=params.pop('dd_constraints'))
    dd_params = [[n, p] for n, p in constraints_model.named_parameters()
                 if p.requires_grad]
    dd_optimizer = None
    dd_optim_params = params.pop('dd_optimizer', None)
    if len(dd_params) > 0:
        dd_optimizer = Optimizer.from_params(dd_params, dd_optim_params)

    cp = None
    chfile = None
    #Pdb().set_trace()
    if args.weight_dir is not None:
        #Pdb().set_trace()
        flag = True
        if args.weight_file is not None:
            logging.info("Loading  Model weights from :{}".format(
                os.path.join(args.weight_dir, args.weight_file)))
            model_states = torch.load(
                os.path.join(args.weight_dir, args.weight_file))
            pieces.model.load_state_dict(model_states)
            flag = False
        if args.dd_file is not None:
            logging.info("Loading Constraint Model from :{}".format(
                os.path.join(args.weight_dir, args.dd_file)))
            flag = False
            chfile = os.path.join(args.weight_dir, args.dd_file)
            # cp = torch.load(chfile)
            # constraints_model.load_state_dict(cp['constraints_model'])
            # if 'dd_update_freq' in cp:
            #     dd_update_freq  = cp['dd_update_freq']
            #     print("New dd_update_freq:" , dd_update_freq)

        if flag:
            raise ValueError(
                "Why provide args.weight_dir when both weight_file and dd_file are None?"
            )
    print("Trainer")
    trainer = Trainer.from_params(
        model=pieces.model,
        serialization_dir=serialization_dir,
        iterator=pieces.iterator,
        train_data=pieces.train_dataset,
        validation_data=pieces.validation_dataset,
        params=pieces.params,
        validation_iterator=pieces.validation_iterator)

    if args.weight_dir is not None and args.training_state_file is not None:
        logging.info("Loading Training state from :{}".format(
            os.path.join(args.weight_dir, args.training_state_file)))
        training_state = torch.load(
            os.path.join(args.weight_dir, args.training_state_file))
        trainer.optimizer.load_state_dict(training_state["optimizer"])

    params.assert_empty('base train command')

    try:
        #if backprop_after_xbatches == 1:
        #    print("Training setup")
        #    semi_trainer= gan_trainer.SemiSupervisedTrainer(trainer, constraints_model, dd_optimizer, pieces.validation_iterator,  pieces.unlabelled_dataset, semi_supervision, which_mixer, dd_warmup_iters, dd_update_freq, constraints_wt, calc_valid_freq)
        #    print("Training start")
        #    metrics = semi_trainer.custom_train()
        #else:
        print("Training setup")
        semi_trainer = gan_trainer.SemiSupervisedTrainer(
            trainer,
            constraints_model,
            dd_optimizer,
            pieces.validation_iterator,
            pieces.unlabelled_dataset,
            semi_supervision,
            which_mixer,
            dd_warmup_iters,
            dd_update_freq,
            constraints_wt,
            calc_valid_freq,
            backprop_after_xbatches,
            min_pct_of_unlabelled,
            dd_semi_warmup_iters,
            dd_increase_freq_after,
            dd_increase_freq_by,
            dd_decay_lr,
            args.debug,
            chfile=chfile,
            shuffle=args.shuffle,
            dd_decay_lr_after=dd_decay_lr_after,
            grad_norm_before_warmup=grad_norm_before_warmup)

        print("Training start")
        #print(yatin)
        metrics = semi_trainer.custom_train()

    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    common_util.dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                             metrics,
                             log=True)
Esempio n. 37
0
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    common_logging.FILE_FRIENDLY_LOGGING = args.file_friendly_logging

    # Disable some of the more verbose logging statements
    logging.getLogger("allennlp.common.params").disabled = True
    logging.getLogger("allennlp.nn.initializers").disabled = True
    logging.getLogger("allennlp.modules.token_embedders.embedding").setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(
        args.archive_file,
        weights_file=args.weights_file,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop("validation_dataset_reader",
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop("dataset_reader"))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources = (json.loads(args.embedding_sources_mapping)
                         if args.embedding_sources_mapping else {})

    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    instances.index_with(model.vocab)
    data_loader_params = config.pop("validation_data_loader", None)
    if data_loader_params is None:
        data_loader_params = config.pop("data_loader")
    if args.batch_size:
        data_loader_params["batch_size"] = args.batch_size
    data_loader = DataLoader.from_params(dataset=instances,
                                         params=data_loader_params)

    metrics = evaluate(model, data_loader, args.cuda_device,
                       args.batch_weight_key)

    logger.info("Finished evaluating.")

    dump_metrics(args.output_file, metrics, log=True)

    return metrics
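A hedged sketch of the ``argparse.Namespace`` this command expects; only attributes actually read in the function body are listed, and the values are placeholders.

import argparse

args = argparse.Namespace(
    archive_file="output/run/model.tar.gz",
    weights_file=None,
    cuda_device=-1,
    overrides="",
    input_file="data/test.jsonl",
    output_file="output/run/test_metrics.json",
    batch_size=None,
    batch_weight_key="",
    extend_vocab=False,
    embedding_sources_mapping="",
    file_friendly_logging=False,
)
metrics = evaluate_from_args(args)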