Example 1
    def from_params(self, params: Params, **extras) -> PytorchSeq2SeqWrapper:
        if not params.pop_bool('batch_first', True):
            raise ConfigurationError(
                "Our encoder semantics assumes batch is always first!")
        if self._module_class in self.PYTORCH_MODELS:
            params['batch_first'] = True
        stateful = params.pop_bool('stateful', False)
        weight_dropout = params.pop_float('weight_dropout', 0.0)
        variational = params.pop_bool('variational', True)
        num_layers = params.get('num_layers', 1)
        bidirectional = params.get('bidirectional', False)
        all_recurrent_weights = [
            f"weight_hh_l{layer}{suffix}"
            for layer, suffix in product(range(num_layers), [""] +
                                         ["_reverse"] *
                                         (1 if bidirectional else 0))
        ]

        if weight_dropout > 0.0:
            module = weight_drop_factory(self._module_class)(
                module_args=params.as_dict(infer_type_and_cast=True),
                weights=all_recurrent_weights,
                wdrop=weight_dropout,
                variational=variational,
            )
        else:
            module = self._module_class(**params.as_dict(
                infer_type_and_cast=True))

        return PytorchSeq2SeqWrapper(module, stateful=stateful)
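A minimal usage sketch of the pattern above, assuming allennlp's ``Params`` API; the config values here are hypothetical:

    from allennlp.common import Params

    params = Params({
        "type": "lstm",          # consumed by the registry before this method runs
        "input_size": 100,
        "hidden_size": 200,
        "num_layers": 2,
        "bidirectional": True,
        "stateful": False,
        "weight_dropout": 0.5,
    })
    # pop_bool/pop_float remove their keys, so whatever remains in `params`
    # can be forwarded verbatim to the wrapped PyTorch module via as_dict().
    stateful = params.pop_bool("stateful", False)
    weight_dropout = params.pop_float("weight_dropout", 0.0)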
Example 2
    def from_params(cls, params: Params) -> 'LinearTransformSumReprCombination':
        tensor1_dim = params.get("tensor_1_dim", 0)
        tensor2_dim = params.get("tensor_2_dim", 0)
        tensor3_dim = params.get("tensor_3_dim", 0)
        output_dim = params.get("output_dim", 0)

        activation = Activation.by_name(params.get("activation", "linear"))()
        return cls(tensor1_dim, tensor2_dim, tensor3_dim, output_dim, activation)
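Both this example and the next resolve the activation by name through allennlp's registry; a short sketch of that lookup (assuming the ``Activation`` registrable):

    from allennlp.common import Params
    from allennlp.nn import Activation

    params = Params({"activation": "relu", "tensor_1_dim": 128})
    # by_name returns the registered class; the trailing () instantiates it.
    activation = Activation.by_name(params.get("activation", "linear"))()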
Example 3
    def from_params(cls, params: Params) -> 'WeightedSumReprCombination':
        keep_context_threshold = params.get("keep_context_threshold", 0.5)
        tensor1_dim = params.get("tensor1_dim", 0)
        tensor2_dim = params.get("tensor2_dim", 0)
        output_dim = params.get("output_dim", 0)

        activation = Activation.by_name(params.get("activation", "linear"))()
        return cls(tensor1_dim, tensor2_dim, output_dim,
                   keep_context_threshold, activation)
Example 4
    def create_or_extend_vocab(
        params: Params,
        datasets: Dict[str, Dict[str, Iterable[Instance]]],
        vocabulary_params: Params,
        vocabulary_path: str,
        vocab: Vocabulary = None,
        recover: bool = False,
    ) -> Vocabulary:
        datasets_for_vocab_creation = set(
            params.pop("datasets_for_vocab_creation", datasets))

        for key in datasets_for_vocab_creation:
            if key not in datasets:
                raise ConfigurationError(
                    f"invalid 'dataset_for_vocab_creation' {key}")

        datasets = {
            key: dataset
            for key, dataset in datasets.items()
            if key in datasets_for_vocab_creation
        }
        flat_datasets = training_util.as_flat_dict(datasets)
        instance_generator = (instance
                              for key, dataset in flat_datasets.items()
                              for instance in dataset)
        dataset_keys_to_use_str = ", ".join(datasets_for_vocab_creation)

        if vocab:
            logger.info(
                f"Extending model vocabulary using {dataset_keys_to_use_str} data."
            )
            vocab.extend_from_instances(instances=instance_generator)
        else:
            logger.info(
                "From dataset instances, %s will be considered for vocabulary creation.",
                dataset_keys_to_use_str,
            )

            if recover and os.path.exists(vocabulary_path):
                vocab = Vocabulary.from_files(
                    vocabulary_path,
                    vocabulary_params.get("padding_token", None),
                    vocabulary_params.get("oov_token", None),
                )
            else:
                # Using a generator comprehension here is important because, by being lazy,
                # it allows us to not iterate over the dataset when directory_path is specified.
                vocab = Vocabulary.from_params(vocabulary_params,
                                               instances=instance_generator)

        return vocab
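The comment about laziness is the crux of this helper; a plain-Python illustration (no allennlp required) of why the generator expression costs nothing unless it is consumed:

    def expensive_read():
        print("reading dataset...")   # side effect stands in for real I/O
        yield from range(3)

    instance_generator = (instance for instance in expensive_read())
    # Nothing has been printed yet: the generator body only runs when iterated,
    # so Vocabulary.from_params can ignore it entirely when a prebuilt
    # vocabulary directory is configured.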
Example 5
    def from_params(
            cls,  # type: ignore
            params: Params,
            serialization_dir: str,
            recover: bool = False):
        # pylint: disable=arguments-differ
        typ3 = params.get("trainer", {}).pop("type", "default")

        if typ3 == "default":
            # Special logic to keep old from_params behavior.
            from allennlp.training.trainer import Trainer, TrainerPieces

            pieces = TrainerPieces.from_params(params, serialization_dir,
                                               recover)  # pylint: disable=no-member
            return Trainer.from_params(
                model=pieces.model,
                serialization_dir=serialization_dir,
                iterator=pieces.iterator,
                train_data=pieces.train_dataset,
                validation_data=pieces.validation_dataset,
                params=pieces.params,
                validation_iterator=pieces.validation_iterator)
        else:
            return TrainerBase.by_name(typ3).from_params(
                params, serialization_dir, recover)
Example 6
    def from_params(cls, params: Params) -> 'ArcMultiChoiceJsonReader':
        tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
        token_indexers = token_indexer_dict_from_params(
            params.pop('token_indexers', {}))

        choice_value_type = params.get('choice_value_type', None)
        question_value_type = params.get('question_value_type', None)

        lazy = params.pop('lazy', False)

        return ArcMultiChoiceJsonReader(
            tokenizer=tokenizer,
            token_indexers=token_indexers,
            choice_value_type=choice_value_type,
            question_value_type=question_value_type,
            lazy=lazy)
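Note the mix of ``pop`` and ``get`` above: ``pop`` consumes a key while ``get`` leaves it in place, which matters if ``params.assert_empty`` is later called. A small sketch:

    from allennlp.common import Params

    p = Params({"lazy": True, "choice_value_type": "text"})
    lazy = p.pop("lazy", False)               # removed from p
    cvt = p.get("choice_value_type", None)    # still present in p
    # p.assert_empty("Reader") would now raise, because one key was never consumed.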
Example 7
    def __init__(
        self,
        model: Model,
        train_dataset: List[Instance],
        iterator: DataIterator,
        subtrainer_params: Params,
        cross_validation_splitter: CrossValidationSplitter,
        serialization_dir: str,
        group_key: Optional[str] = None,
        leave_model_trained: bool = False,
        validation_dataset: Optional[List[Instance]] = None,
        recover: bool = False
    ) -> None:  # FIXME: does recover make sense? Maybe to continue the CV.
        # To use the same device as the subtrainers, in case `self._cuda_devices` is queried.
        cuda_device = parse_cuda_device(
            subtrainer_params.get('cuda_device', -1))
        super().__init__(serialization_dir, cuda_device=cuda_device)
        self.model = model
        self.train_dataset = train_dataset
        self.iterator = iterator
        self.subtrainer_params = subtrainer_params
        self.cross_validation_splitter = cross_validation_splitter
        self.group_key = group_key
        self.leave_model_trained = leave_model_trained
        self.validation_dataset = validation_dataset
        self.recover = recover
Example 8
    def from_params(
            cls,  # type: ignore
            params: Params,
            serialization_dir: str,
            recover: bool = False):
        # pylint: disable=arguments-differ
        typ3 = params.get("trainer", {}).pop("type", "default")

        if typ3 == "default":
            # Special logic to keep old from_params behavior.
            from allennlp.training.trainer import Trainer
            from allennlp.training.trainer_pieces import TrainerPieces

            pieces = TrainerPieces.from_params(params, serialization_dir,
                                               recover)  # pylint: disable=no-member
            return Trainer.from_params(
                model=pieces.model,
                serialization_dir=serialization_dir,
                iterator=pieces.iterator,
                train_data=pieces.train_dataset,
                validation_data=pieces.validation_dataset,
                params=pieces.params,
                validation_iterator=pieces.validation_iterator)
        else:
            klass = TrainerBase.by_name(typ3)
            # Explicit check to prevent recursion.
            is_overridden = klass.from_params.__func__ != TrainerBase.from_params.__func__  # type: ignore
            assert is_overridden, f"Class {klass.__name__} must override `from_params`."
            return klass.from_params(params, serialization_dir, recover)
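The ``__func__`` comparison above is how the guard detects whether a subclass really overrides a classmethod; the same check in isolation (plain Python):

    class Base:
        @classmethod
        def from_params(cls):
            raise NotImplementedError

    class Custom(Base):
        @classmethod
        def from_params(cls):
            return cls()

    class Lazy(Base):
        pass

    # Bound classmethod objects compare unequal even when they wrap the same
    # function, so the underlying functions are compared instead.
    assert Custom.from_params.__func__ is not Base.from_params.__func__
    assert Lazy.from_params.__func__ is Base.from_params.__func__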
Example 9
    def _load(
        cls,
        config: Params,
        serialization_dir: str,
        weights_file: Optional[str] = None,
        cuda_device: int = -1,
        opt_level: Optional[str] = None,
    ) -> Model:
        """
        Ensembles don't have vocabularies or weights of their own, so they override _load.
        """
        if opt_level is not None:
            raise NotImplementedError(f"{cls.__name__} does not support AMP yet.")

        model_params = config.get("model")

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_weights_related_keys_from_params(model_params)
        model = Model.from_params(vocab=None, params=model_params)

        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
        # in sync with the weights
        if cuda_device >= 0:
            model.cuda(cuda_device)
        else:
            model.cpu()

        return model
Example 10
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'QAMultiChoice_OneVsRest_Choices_v1':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(
            vocab, embedder_params)

        embeddings_dropout_value = params.pop("embeddings_dropout", 0.0)

        # question encoder
        question_encoder_params = params.pop("question_encoder", None)
        question_enc_aggregate = params.pop("question_encoder_aggregate",
                                            "max")
        share_encoders = params.pop("share_encoders", False)

        # condition the choices or facts encoding on question output states
        choices_init_from_question_states = params.pop(
            "choices_init_from_question_states", False)

        if question_encoder_params is not None:
            question_encoder = Seq2SeqEncoder.from_params(
                question_encoder_params)
        else:
            question_encoder = None

        if share_encoders:
            choice_encoder = question_encoder
            choice_enc_aggregate = question_enc_aggregate
        else:
            # choice encoder
            choice_encoder_params = params.pop("choice_encoder", None)
            choice_enc_aggregate = params.pop("choice_encoder_aggregate",
                                              "max")

            if choice_encoder_params is not None:
                choice_encoder = Seq2SeqEncoder.from_params(
                    choice_encoder_params)
            else:
                choice_encoder = None

        use_choice_sum_instead_of_question = params.get(
            "use_choice_sum_instead_of_question", False)
        init_params = params.pop('initializer', None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None else InitializerApplicator())

        return cls(
            vocab=vocab,
            text_field_embedder=text_field_embedder,
            question_encoder=question_encoder,
            choice_encoder=choice_encoder,
            initializer=initializer,
            aggregate_choice=choice_enc_aggregate,
            aggregate_question=question_enc_aggregate,
            embeddings_dropout_value=embeddings_dropout_value,
            share_encoders=share_encoders,
            choices_init_from_question_states=choices_init_from_question_states,
            use_choice_sum_instead_of_question=
            use_choice_sum_instead_of_question,
            params=params)
Example 11
    def __init__(self, vocab_size, params: Params):
        super().__init__()
        self.vocab_size = vocab_size
        self.char_emb_size = params.get("char_emb_size")
        self.dropout_rate = params.get("dropout_rate")

        self.init_char_embs = Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.char_emb_size,
            padding_idx=0,
        )
        self.cnn = CNN(params)
        self.highway_network = HighwayNetwork(
            Params(
                {"dropout_rate": self.dropout_rate, "input_size": self.char_emb_size}
            )
        )
Example 12
def setup_output_dir(config: Params, loglevel: Optional[str] = None) -> str:
    """Setup the Experiment Folder
    Note that the output_dir stores each run as run-1, ....
    Makes the next run directory. This also sets up the logger
    A run directory has the following structure
    - run-1
        - models
                * modelname*.tar.gz
                - vocabulary
                    * namespace_1.txt
                    * namespace_2.txt ...
        * config.json
        * githash.log of current run
        * gitdiff.log of current run
        * logfile.log (the log of the current run)
    Arguments:
        config (``allennlp.common.Params``): The experiment parameters
        loglevel (str): The logger mode [INFO/DEBUG/ERROR]
    Returns
        str, allennlp.common.Params: The filename, and the modified config
    """
    output_dir = config.get('base_output_dir', "./Outputs")
    make_directory(output_dir)
    last_run = -1
    for dirname in os.listdir(output_dir):
        if dirname.startswith('run-'):
            last_run = max(last_run, int(dirname.split('-')[1]))
    new_dirname = os.path.join(output_dir, 'run-%d' % (last_run + 1))
    make_directory(new_dirname)
    best_model_dirname = os.path.join(new_dirname, 'models')
    make_directory(best_model_dirname)
    vocab_dirname = os.path.join(best_model_dirname, 'vocabulary')
    make_directory(vocab_dirname)

    config_file = os.path.join(new_dirname, 'config.jsonnet')
    write_config_to_file(config_file, config)

    # Save the git hash
    process = Popen('git log -1 --format="%H"'.split(),
                    stdout=PIPE,
                    stderr=PIPE)
    stdout, _ = process.communicate()
    stdout = stdout.decode('ascii').strip('\n').strip('"')
    with open(os.path.join(new_dirname, "githash.log"), "w") as fp:
        fp.write(stdout)

    # Save the git diff
    process = Popen('git diff'.split(), stdout=PIPE, stderr=PIPE)
    stdout, _ = process.communicate()
    with open(os.path.join(new_dirname, "gitdiff.log"), "w") as fp:
        stdout = stdout.decode('ascii', errors="ignore")
        fp.write(stdout)

    if loglevel:
        # Set up the logger
        logfile = os.path.join(new_dirname, 'logfile.log')
        setup_logger(logfile, loglevel)
    return best_model_dirname
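A hypothetical invocation of the helper above (the config key is the one it reads; the path is a placeholder):

    from allennlp.common import Params

    config = Params({"base_output_dir": "./Outputs"})
    models_dir = setup_output_dir(config, loglevel="INFO")
    # models_dir now points at ./Outputs/run-N/models for the next free N.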
Example 13
def _sanitize_config(config: Params) -> None:
    """
    There are some elements of the model config that we need to get rid of
    when we load from archive, as they refer to paths on the training machine
    that are unlikely to exist on the un-archiving machine. To be extra-safe
    we just remove them from the config.

    This is a temporary fix; once we implement https://github.com/allenai/allennlp/issues/244
    it should become unnecessary.
    """
    evaluation_json_file = config.get("model", {}).get('evaluation_json_file',
                                                       None)

    if evaluation_json_file and not os.path.exists(evaluation_json_file):
        logger.warning(
            "specified evaluation_json_file %s does not exist, removing key",
            evaluation_json_file)
        config.get("model", {}).pop('evaluation_json_file')
Example 14
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'QAMultiChoiceMaxAttention':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(
            vocab, embedder_params)

        embeddings_dropout_value = params.pop("embeddings_dropout", 0.0)

        # question encoder
        question_encoder_params = params.pop("question_encoder", None)
        question_enc_aggregate = params.pop("question_encoder_aggregate",
                                            "max")
        share_encoders = params.pop("share_encoders", False)

        if question_encoder_params is not None:
            question_encoder = Seq2SeqEncoder.from_params(
                question_encoder_params)
        else:
            question_encoder = None

        if share_encoders:
            choice_encoder = question_encoder
            choice_enc_aggregate = question_enc_aggregate
        else:
            # choice encoder
            choice_encoder_params = params.pop("choice_encoder", None)
            choice_enc_aggregate = params.pop("choice_encoder_aggregate",
                                              "max")

            if choice_encoder_params is not None:
                choice_encoder = Seq2SeqEncoder.from_params(
                    choice_encoder_params)
            else:
                choice_encoder = None

        # question to choice attention
        att_question_to_choice_params = params.get("att_question_to_choice")
        att_question_to_choice = SimilarityFunction.from_params(
            att_question_to_choice_params)

        init_params = params.pop('initializer', None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None else InitializerApplicator())

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   question_encoder=question_encoder,
                   choice_encoder=choice_encoder,
                   initializer=initializer,
                   aggregate_choice=choice_enc_aggregate,
                   aggregate_question=question_enc_aggregate,
                   embeddings_dropout_value=embeddings_dropout_value,
                   att_question_to_choice=att_question_to_choice)
Example 15
    def from_params(cls, params: Params, **extras) -> "Tokenizer":  # type: ignore

        # Backwards compatibility for the legacy "word" Tokenizer,
        # which provided arguments to initialize current tokenizers
        # inside the "word_splitter" key.
        tokenizer_type = params.get("type")
        splitter_params = params.get("word_splitter")
        if tokenizer_type == "word" or (tokenizer_type is None and splitter_params):
            if not splitter_params:
                splitter_params = Params({"type": "spacy"})
            elif isinstance(splitter_params, str):
                splitter_params = Params({"type": splitter_params})

            if params.get("word_filter") or params.get("word_stemmer"):
                raise ConfigurationError(
                    "Support for word_filter, word_stemmer is dropped in the current default tokenizer."
                )

            start_tokens = params.get("start_tokens")
            end_tokens = params.get("end_tokens")
            if start_tokens:
                splitter_params["start_tokens"] = start_tokens
            if end_tokens:
                splitter_params["end_tokens"] = end_tokens

            logger.warning(
                "Converting old WordTokenizer params - %s \n" "to new params %s.",
                str(params),
                str(splitter_params),
            )

            params = splitter_params

        return super().from_params(params, **extras)
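For a concrete legacy config, the rewrite above is equivalent to the following (hypothetical values):

    from allennlp.common import Params

    legacy = Params({"type": "word", "word_splitter": "spacy",
                     "start_tokens": ["@start@"]})
    # After the conversion, dispatch proceeds as if the user had written:
    modern = Params({"type": "spacy", "start_tokens": ["@start@"]})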
Example 16
    def from_params(
            cls, params: Params
    ) -> 'ArcMultiChoiceWithFactsTextJsonReaderMultiSource':
        # read tokenizers
        field_tokenizers = tokenizer_dict_from_params(
            params.get('tokenizers', {}))
        token_indexers = token_indexer_dict_from_params(
            params.get('token_indexers', {}))

        # external knowledge
        external_knowledge_params = params.pop('external_knowledge')

        choice_value_type = params.get('choice_value_type', None)
        question_value_type = params.get('question_value_type', None)

        no_relevant_fact_add = params.get('no_relevant_fact_add', False)
        no_relevant_fact_text = params.get('no_relevant_fact_text',
                                           NO_RELEVANT_FACT_TEXT)

        lazy = params.pop('lazy', False)
        # params.assert_empty(cls.__name__)

        return ArcMultiChoiceWithFactsTextJsonReaderMultiSource(
            field_tokenizers=field_tokenizers,
            token_indexers=token_indexers,
            external_know_config=external_knowledge_params,
            choice_value_type=choice_value_type,
            question_value_type=question_value_type,
            no_relevant_fact_add=no_relevant_fact_add,
            no_relevant_fact_text=no_relevant_fact_text,
            lazy=lazy)
Example 17
def setup_datasets(params: Params) -> Dict[str, Iterable[Instance]]:
    dataset_reader_params = params.get('dataset_reader')
    validation_dataset_reader_params = params.get('validation_dataset_reader', None)
    dataset_reader = DatasetReader.from_params(dataset_reader_params)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        validation_and_test_dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

    train_data_path = params.get('train_data_path')
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.get('validation_data_path', None)
    if validation_data_path is not None:
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.get("test_data_path", None)
    if test_data_path is not None:
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data
    return datasets
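A hypothetical driver for the helper above (the reader type and paths are placeholders):

    from allennlp.common import Params

    params = Params({
        "dataset_reader": {"type": "snli"},
        "train_data_path": "data/train.jsonl",
        "validation_data_path": "data/dev.jsonl",
    })
    datasets = setup_datasets(params)
    # datasets == {"train": <instances>, "validation": <instances>}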
Example 18
    def from_config(cls,
                    config: Params,
                    predictor_name: str = None) -> 'Predictor':
        dataset_reader_params = config["dataset_reader"]
        dataset_reader = DatasetReader.from_params(dataset_reader_params)

        tokenizer = dataset_reader._tokenizer or WordTokenizer()  # pylint: disable=protected-access
        token_indexers = dataset_reader._token_indexers  # pylint: disable=protected-access

        model_name = config.get("model").get("type")
        model = Model.load(config)
        model.eval()

        predictor_name = predictor_name or DEFAULT_PREDICTORS[model_name]
        return Predictor.by_name(predictor_name)(model, tokenizer,
                                                 token_indexers)
Example 19
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.
    """
    for data_name in [
            "train_data_path", "validation_data_path", "test_data_path"
    ]:
        data_path = params.get(data_name, None)
        if data_path is not None:
            check_for_data_path(data_path, data_name)

    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    validation_dataset_reader_params = params.pop("validation_dataset_reader",
                                                  None)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info(
            "Using a separate dataset reader to load validation and test data."
        )
        validation_and_test_dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(
            validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets
Example 20
    def from_params(  # type: ignore
        cls,
        params: Params,
        serialization_dir: str,
        recover: bool = False,
        cache_directory: str = None,
        cache_prefix: str = None,
    ):

        typ3 = params.get("trainer", {}).pop("type", "default")

        if typ3 == "default":
            # Special logic to keep old from_params behavior.
            from allennlp.training.trainer import Trainer
            from allennlp.training.trainer_pieces import TrainerPieces

            pieces = TrainerPieces.from_params(params, serialization_dir,
                                               recover, cache_directory,
                                               cache_prefix)
            return Trainer.from_params(
                model=pieces.model,
                serialization_dir=serialization_dir,
                iterator=pieces.iterator,
                train_data=pieces.train_dataset,
                validation_data=pieces.validation_dataset,
                params=pieces.params,
                validation_iterator=pieces.validation_iterator,
            )
        else:
            klass = TrainerBase.by_name(typ3)
            # Explicit check to prevent recursion.
            is_overridden = (
                klass.from_params.__func__ !=
                TrainerBase.from_params.__func__  # type: ignore
            )
            assert is_overridden, f"Class {klass.__name__} must override `from_params`."
            return klass.from_params(params, serialization_dir, recover,
                                     cache_directory, cache_prefix)
Example 21
def modified_model_load(config: Params,
                        serialization_dir: str,
                        weights_file: str = None,
                        cuda_device: int = -1) -> Model:
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.
    """
    weights_file = weights_file or os.path.join(serialization_dir,
                                                _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    # If the config specifies a vocabulary subclass, we need to use it.
    vocab = Vocabulary.from_files(vocab_dir)

    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from.  We're now _loading_ the model, so those embeddings will already be
    # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)
    model_state = torch.load(weights_file,
                             map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state, strict=False)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
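The load-then-place pattern at the end mirrors plain PyTorch usage; a minimal sketch (the path is a placeholder):

    import torch

    state = torch.load("best.th", map_location="cpu")  # always safe to land on CPU
    # model.load_state_dict(state, strict=False)
    # Calling model.cuda(device) or model.cpu() afterwards keeps the embeddings
    # in sync with the loaded weights.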
Example 22
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.
    """
    for data_name in ["train_data_path", "validation_data_path", "test_data_path"]:
        data_path = params.get(data_name, None)
        if data_path is not None:
            check_for_data_path(data_path, data_name)

    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    validation_dataset_reader_params = params.pop("validation_dataset_reader", None)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info("Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets
Example 23
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    stdout_handler = prepare_global_logging(serialization_dir,
                                            file_friendly_logging)
    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(
            params,  # pylint: disable=no-member
            serialization_dir,
            recover,
            cache_directory,
            cache_prefix)
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator)

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    else:
        if evaluate_on_test:
            raise ValueError(
                "--evaluate-on-test only works with the default Trainer. "
                "If you're using the CallbackTrainer you can use a callback "
                "to evaluate at Events.TRAINING_END; otherwise you'll have "
                "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover,
                                          cache_directory, cache_prefix)
        evaluation_dataset = None

    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access
            # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
            batch_weight_key="")

        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    cleanup_global_logging(stdout_handler)

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics,
                 log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
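The ``recover``/``force`` semantics that ``create_serialization_dir`` enforces can be summarized with a plain sketch (this is an illustration, not allennlp's implementation):

    import os
    import shutil

    def ensure_serialization_dir(path: str, recover: bool = False,
                                 force: bool = False) -> None:
        if force and os.path.exists(path):
            shutil.rmtree(path)                    # force wipes a previous run
        if os.path.exists(path) and os.listdir(path) and not recover:
            raise RuntimeError(f"{path} exists and is not empty")
        os.makedirs(path, exist_ok=True)           # recover reuses the directory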
Example 24
def _load(config: Params,
          adapters_dir: str,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.
    """
    weights_file = weights_file or os.path.join(serialization_dir, "best.th")

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    # If the config specifies a vocabulary subclass, we need to use it.
    vocab_params = config.get("vocabulary", Params({}))
    vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
    vocab = Vocabulary.by_name(vocab_choice).from_files(vocab_dir)

    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from.  We're now _loading_ the model, so those embeddings will already be
    # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    # If vocab+embedding extension was done, the model initialized from from_params
    # and one defined by state dict in weights_file might not have same embedding shapes.
    # Eg. when model embedder module was transferred along with vocab extension, the
    # initialized embedding weight shape would be smaller than one in the state_dict.
    # So calling model embedding extension is required before load_state_dict.
    # If vocab and model embeddings are in sync, following would be just a no-op.
    model.extend_embedder_vocab()

    # model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    # model.load_state_dict(model_state, strict=False)

    for file in os.listdir(adapters_dir):
        logger.info(f"{file} is loading..")

    # loop over the adapters folder and load weights into a dictionary
    for i, layer in enumerate(model.text_field_embedder.token_embedder_bert.bert_model.encoder.layer):
        try:
            for j, (file, attention_adapter, output_attention) in enumerate(zip(os.listdir(adapters_dir), layer.attention.output.adapter, layer.output.adapter)):
                adapter_state = torch.load(os.path.join(adapters_dir, file))
                attention_adapter.load_state_dict(adapter_state['attention_adapter_' + str(i)])
                output_attention.load_state_dict(adapter_state['output_adapter_' + str(i)])
        except AttributeError:
            logger.warning(f"Could not find the adapter model inside the archive {adapters_dir}")
            traceback.print_exc()
            return

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
Example 25
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
    include_package: List[str] = None,
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single-GPU experiment this returns the ``Model`` object; in distributed
    training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index that is initialized using the GPU device id.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    node_rank : ``int``, optional
        Rank of the node
    world_size : ``int``, optional
        The number of processes involved in distributed training.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    prepare_global_logging(serialization_dir,
                           file_friendly_logging,
                           rank=process_rank,
                           world_size=world_size)
    prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    if distributed:
        # Since the worker is spawned and not forked, the extra imports
        # need to be done again.
        if include_package is not None:
            for package_name in include_package:
                import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        torch.cuda.set_device(gpu_id)
        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=global_rank,
        )
        logging.info(f"Process group of world size {world_size} initialized "
                     f"for distributed training in worker {global_rank}")

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover,
                                           cache_directory, cache_prefix)
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
        )

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    else:
        if evaluate_on_test:
            raise ValueError(
                "--evaluate-on-test only works with the default Trainer. "
                "If you're using the CallbackTrainer you can use a callback "
                "to evaluate at Events.TRAINING_END; otherwise you'll have "
                "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover,
                                          cache_directory, cache_prefix)
        evaluation_dataset = None

    params.assert_empty("base train command")

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(
                os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    if master:
        if evaluation_dataset and evaluate_on_test:
            logger.info(
                "The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                trainer.model,
                evaluation_dataset,
                evaluation_iterator,
                cuda_device=trainer.cuda_device,
                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                batch_weight_key="",
            )

            for key, value in test_metrics.items():
                metrics["test_" + key] = value
        elif evaluation_dataset:
            logger.info(
                "To evaluate on the test set after training, pass the "
                "'evaluate_on_test' flag, or use the 'allennlp evaluate' command."
            )
        dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                     metrics,
                     log=True)

    if not distributed:
        return trainer.model

    return None  # to make mypy happy
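The global-rank arithmetic above, in isolation: with 2 nodes of 4 GPUs each, the third worker (process_rank=2) on the second node (node_rank=1) gets global rank 6.

    num_procs_per_node = 4
    node_rank, process_rank = 1, 2
    global_rank = node_rank * num_procs_per_node + process_rank
    assert global_rank == 6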
Example 26
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'QAMultiChoiceKnowReader_v1':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(
            vocab, embedder_params)

        embeddings_dropout_value = params.pop("embeddings_dropout", 0.0)

        # whether we want to use knowledge
        use_knowledge = params.pop("use_knowledge", True)
        use_ctx2facts_retrieval_map_as_mask = params.pop(
            "use_ctx2facts_retrieval_map_as_mask", False)

        # question encoder
        question_encoder_params = params.pop("question_encoder", None)
        question_enc_aggregate = params.pop("question_encoder_aggregate",
                                            "max")
        share_encoders = params.pop("share_encoders", False)

        # condition the choices or facts encoding on question output states
        choices_init_from_question_states = params.pop(
            "choices_init_from_question_states", False)
        facts_init_from_question_states = params.pop(
            "facts_init_from_question_states", False)

        if question_encoder_params is not None:
            question_encoder = Seq2SeqEncoder.from_params(
                question_encoder_params)
        else:
            question_encoder = None

        knowledge_encoder = None
        knowledge_enc_aggregate = "max"

        if share_encoders:
            choice_encoder = question_encoder
            choice_enc_aggregate = question_enc_aggregate

            if use_knowledge:
                knowledge_encoder = question_encoder
                knowledge_enc_aggregate = question_enc_aggregate
        else:
            # choice encoder
            choice_encoder_params = params.pop("choice_encoder", None)
            choice_enc_aggregate = params.pop("choice_encoder_aggregate",
                                              "max")

            if choice_encoder_params is not None:
                choice_encoder = Seq2SeqEncoder.from_params(
                    choice_encoder_params)
            else:
                choice_encoder = None

            if use_knowledge:
                knowledge_encoder_params = params.pop("knowledge_encoder",
                                                      None)
                knowledge_enc_aggregate = params.pop(
                    "knowledge_encoder_aggregate", "max")

                if knowledge_encoder_params is not None:
                    knowledge_encoder = Seq2SeqEncoder.from_params(
                        knowledge_encoder_params)
                else:
                    knowledge_encoder = None

        know_interactions_params = params.get("know_interactions")
        know_interactions_aggregate_ffw_params = know_interactions_params.get(
            'aggregate_feedforward')

        # aggregate knowledge input state is inferred automatically
        update_params(know_interactions_aggregate_ffw_params, {
            "input_dim":
            len(know_interactions_params.get("interactions", []))
        })
        know_aggregate_feedforward = FeedForward.from_params(
            params.get("know_interactions").get('aggregate_feedforward'))

        init_params = params.pop('initializer', None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None else InitializerApplicator())

        return cls(
            vocab=vocab,
            text_field_embedder=text_field_embedder,
            question_encoder=question_encoder,
            choice_encoder=choice_encoder,
            use_knowledge=use_knowledge,
            facts_encoder=knowledge_encoder,
            know_aggregate_feedforward=know_aggregate_feedforward,
            initializer=initializer,
            aggregate_choice=choice_enc_aggregate,
            aggregate_question=question_enc_aggregate,
            aggregate_facts=knowledge_enc_aggregate,
            embeddings_dropout_value=embeddings_dropout_value,
            share_encoders=share_encoders,
            choices_init_from_question_states=choices_init_from_question_states,
            facts_init_from_question_states=facts_init_from_question_states,
            use_ctx2facts_retrieval_map_as_mask=
            use_ctx2facts_retrieval_map_as_mask,
            params=params)
Example 27
def find_learning_rate_model(params: Params,
                             serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir``

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate at which to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps: ``bool``
        If ``True``, increase the learning rate linearly; otherwise exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by a
        multiple of the stopping factor. If ``None``, the search proceeds until ``end_lr``.
    force: ``bool``
        If ``True`` and the serialization directory already exists, everything in it
        will be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(
            f'Serialization directory {serialization_dir} already exists and is '
            f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    check_for_gpu(params.get('trainer').get('cuda_device', -1))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(
        f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.'
    )
    learning_rates, losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses,
               os.path.join(serialization_dir, 'lr-losses.png'))
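The two search schedules differ only in how the step is computed; a sketch of the arithmetic (an illustration, not allennlp's ``search_learning_rate``):

    start_lr, end_lr, num_batches = 1e-5, 10.0, 100
    # linear: constant additive increment per batch
    linear_delta = (end_lr - start_lr) / num_batches
    # exponential: constant multiplicative factor per batch
    exp_factor = (end_lr / start_lr) ** (1.0 / num_batches)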
Example 28
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'ThriftyEmbedding':  # type: ignore
        weights_file = params.get('weights_file')
        params.params['pretrained_file'] = weights_file
        obj = super(ThriftyEmbedding, cls).from_params(vocab, params)
        return obj
Example 29
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.get('trainer').get('cuda_device', -1))

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have the side effect of expanding the vocabulary.
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("The following parameters are frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("The following parameters are tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
                best_model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
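A minimal sketch of the `get_frozen_and_tunable_parameter_names` helper called above, assuming it simply partitions the model's parameters by their `requires_grad` flag:

from typing import List, Tuple

def get_frozen_and_tunable_parameter_names(model) -> Tuple[List[str], List[str]]:
    # Frozen parameters are those whose gradients were disabled, e.g. by the
    # 'no_grad' regexes applied earlier in train_model.
    frozen = [name for name, parameter in model.named_parameters()
              if not parameter.requires_grad]
    tunable = [name for name, parameter in model.named_parameters()
               if parameter.requires_grad]
    return frozen, tunable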
Example no. 30
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    batch_weight_key: str = "",
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see ``Model.from_archive``.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    node_rank : ``int``, optional
        Rank of the current node in distributed training
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    training_util.create_serialization_dir(params, serialization_dir, recover,
                                           force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            include_package=include_package,
            batch_weight_key=batch_weight_key,
        )
        archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured. "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}")

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data iterators in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        if params.get("vocabulary", Params({})).get("type",
                                                    "") != "from_files":
            vocab = training_util.make_vocab_from_params(
                params.duplicate(), serialization_dir)
            params["vocabulary"] = {
                "type": "from_files",
                "directory": os.path.join(serialization_dir, "vocabulary"),
                "padding_token": vocab._padding_token,
                "oov_token": vocab._oov_token,
            }

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                include_package,
                batch_weight_key,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir)
        model = Model.load(params, serialization_dir)
        return model
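A hedged sketch of the "distributed" config block this branch expects; the key names are taken from the pops above, while the device list and port values are illustrative:

# Hypothetical top-level config fragment for multi-process training.
distributed_config = {
    "distributed": {
        "cuda_devices": [0, 1],         # required: two or more devices (or num_nodes > 1)
        "num_nodes": 1,                 # optional, defaults to 1
        "master_address": "127.0.0.1",  # optional, default shown
        "master_port": 29500,           # optional, default shown
    }
}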
Example no. 31
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                debate_mode: List[str] = ('f',),
                judge_filename: str = None,
                update_judge: bool = False,
                eval_mode: bool = False,
                reward_method: str = None,
                detach_value_head: bool = False,
                breakpoint_level: int = 0,
                search_outputs_path: str = None,
                accumulation_steps: int = 1,
                multi_gpu: bool = False,
                choice_mode: str = None,
                qa_loss_weight: float = 0.,
                influence_reward: bool = False,
                theory_of_mind: bool = False,
                num_pred_rounds: int = -1,
                x_order_prob: float = 0.,
                require_action: bool = False,
                single_shot: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    debate_mode : ``List[str]``
        List of debate turns (e.g. aa, ar, rr, Ar); capitalization implies a search agent.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    judge_filename : ``str``, optional (default=None)
        Path to judge config or pre-trained judge model. If config, judge trained during debate. Necessary parameter
        if running in debate mode.
    update_judge : ``bool``, optional (default=False)
        Boolean whether or not to update Judge model during debate training.
    eval_mode : ``bool``, optional (default=False)
        Boolean whether or not to run in eval-only mode, on test data. Does not update/train any of the models.
    reward_method : ``str``, optional (default=None)
        Choice of reward function (RL) or loss function (supervised learning) for training debate agents.
    detach_value_head : ``bool``, optional (default=False)
        Boolean whether or not to detach value function gradient updates from the policy network. This prevents
        value function gradients from affecting policy network parameters.
    breakpoint_level : ``int``, optional (default=0)
        Debugging option to set breakpoint sensitivity (0 = no breakpoints).
    search_outputs_path : ``str``, optional (default=None)
        Path to a file with search predictions for each agent - necessary for supervised training.
    accumulation_steps : ``int``, optional (default=1)
        Number of gradient steps to accumulate over before performing an update. A poor man's larger batch
        size for when limited GPU memory keeps the number of examples per batch small.
    multi_gpu : ``bool``, optional (default=False)
        Boolean whether or not to run models/training in model parallel mode. Requires specifying GPU allocations for
        trainer, judge, and debaters in the training config file (see training_config/bidaf.race.size=0.5.gpu=2.jsonnet
        for example usage).

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    assert (
        not single_shot
    ) or eval_mode, 'Using single shot prediction outside eval_mode not yet supported.'
    assert (not single_shot) or (num_pred_rounds == -1), \
        'Using single shot prediction for a specific number of rounds is not yet supported.'
    # Inspect the debate turns and warn about command-line flags that are unused in this debate mode.
    num_no_qa_turns = sum([(('l' in debate_turn) or ('w' in debate_turn))
                           for debate_turn in debate_mode])
    if (qa_loss_weight > 0) and (num_no_qa_turns == 0):
        warnings.warn(
            'Unused argument qa_loss_weight in debate mode ' +
            str(debate_mode) +
            '. If this was unintentional, please remove the -q flag.',
            UserWarning)
    not_using_trained_debater = len(
        set('ablwⅰⅱⅲⅳ').intersection(''.join(debate_mode))) == 0
    if (judge_filename is not None) and not_using_trained_debater:
        warnings.warn(
            'Unnecessary to have debaters in debate mode ' + str(debate_mode) +
            '. If this was unintentional, please remove the -j flag.',
            UserWarning)

    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Check that all desired CUDA devices exist; the trainer's 'cuda_device' should list every required device.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    # Build the GPU allocation dictionary (passed to all downstream functions).
    if multi_gpu:
        gpu_allocations = params.params.pop('gpu_allocations', {})
        assert len(gpu_allocations) == 3, 'Must set gpu_allocations in config if multi-gpu'
        allocation_dict = {}
        for k in ['debate', 'judge', 'trainer']:
            assert gpu_allocations[k] in cuda_device, \
                "Desired GPU not available... current: %s" % str(cuda_device)
            allocation_dict[k] = gpu_allocations[k]
    else:
        allocation_dict = {}

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        # Pass debate_mode to the dataset reader, in case it requires sample duplicates.
        params['dataset_reader']['debate_mode'] = debate_mode
        pieces = TrainerPieces.from_params(params,
                                           serialization_dir,
                                           cuda_device,
                                           recover,
                                           judge_filename=judge_filename,
                                           update_judge=update_judge,
                                           eval_mode=eval_mode,
                                           reward_method=reward_method,
                                           detach_value_head=detach_value_head,
                                           allocation_dict=allocation_dict,
                                           qa_loss_weight=qa_loss_weight,
                                           influence_reward=influence_reward,
                                           theory_of_mind=theory_of_mind)  # pylint: disable=no-member
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            debate_mode=debate_mode,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
            eval_mode=eval_mode,
            breakpoint_level=breakpoint_level,
            search_outputs_path=search_outputs_path,
            accumulation_steps=accumulation_steps,
            allocation_dict=allocation_dict,
            choice_mode=choice_mode,
            num_pred_rounds=num_pred_rounds,
            x_order_prob=x_order_prob,
            require_action=require_action,
            single_shot=single_shot)
        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset
    else:
        assert len(debate_mode) == 1 and debate_mode[0] == 'f', \
            'TrainerBase untested for debate training.'
        trainer = TrainerBase.from_params(params, serialization_dir, recover)
        evaluation_iterator = evaluation_dataset = None

    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir,
                                       _DEFAULT_WEIGHTS)) and not eval_mode:
            logger.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access
            batch_weight_key="")

        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    # Now tar up results
    if not eval_mode:
        archive_model(serialization_dir,
                      files_to_archive=params.files_to_archive)
        dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                     metrics,
                     log=True)
    else:
        metrics_filename = "metrics.eval.d=" + '-'.join(debate_mode) + ".json"
        dump_metrics(os.path.join(serialization_dir, metrics_filename),
                     metrics,
                     log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
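A hypothetical invocation sketch for the debate-mode trainer above; the judge archive path is a placeholder, and the config file is the one the docstring cites for multi-GPU usage:

from allennlp.common import Params

# Illustrative only: a single 'ar' debate turn, with a pre-trained judge archive.
params = Params.from_file("training_config/bidaf.race.size=0.5.gpu=2.jsonnet")
model = train_model(params,
                    serialization_dir="runs/debate-ar",
                    debate_mode=["ar"],
                    judge_filename="judge/model.tar.gz")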