Example #1
    def test_knowbert_wiki_wordnet(self):
        from kb.testing import get_bert_pretraining_reader_with_kg

        reader = get_bert_pretraining_reader_with_kg(
            mask_candidate_strategy='full_mask',
            masked_lm_prob=0.35,
            include_wiki=True)
        instances = reader.read("tests/fixtures/bert_pretraining/shard1.txt")

        vocab = Vocabulary.from_params(
            Params({
                "directory_path": "tests/fixtures/wordnet_wiki_vocab",
            }))

        iterator = BasicIterator()
        iterator.index_with(vocab)

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            pass

        # hack, incompatible fixtures...
        batch['tokens']['tokens'] = torch.min(batch['tokens']['tokens'],
                                              torch.tensor([17]))
        batch['lm_label_ids']['lm_labels'] = torch.min(
            batch['lm_label_ids']['lm_labels'], torch.tensor([17]))
        model = get_knowbert(vocab, None, include_wiki=True)
        output = model(**batch)
        loss = output['loss']
        loss.backward()
        self.assertTrue(True)
Example #2
def make_vocab_from_params(params: Params, serialization_dir: str):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    instances = [
        instance for key, dataset in all_datasets.items()
        for instance in dataset if key in datasets_for_vocab_creation
    ]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
Example #3
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
        instances = list(reader.read(str(dataset_file)))
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if 'vocabulary' in params:
            vocab_params = params['vocabulary']
            vocab = Vocabulary.from_params(params=vocab_params,
                                           instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(vocab=self.vocab,
                                       params=params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)
Example #4
def setup_vocab(params):
    all_datasets = setup_datasets(params)
    vocab: Vocabulary = Vocabulary.from_params(
        params.get("vocabulary", {}),
        (instance for key, dataset in all_datasets.items() for instance in dataset)
    )
    return vocab
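
A short sketch of calling this helper; the config path is a placeholder and `setup_datasets` is assumed to be defined alongside it:

from allennlp.common import Params

params = Params.from_file("experiments/my_experiment.jsonnet")  # hypothetical config
vocab = setup_vocab(params)
print(vocab.get_vocab_size("tokens"))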
Example #5
def write_for_official_eval(model_archive_file, test_file, output_file,
                            label_ids_to_label):
    archive = load_archive(model_archive_file)
    model = archive.model

    reader = DatasetReader.from_params(archive.config['dataset_reader'])

    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 4}))
    vocab = Vocabulary.from_params(archive.config['vocabulary'])
    iterator.index_with(vocab)

    model.cuda()
    model.eval()

    instances = reader.read(test_file)
    predictions = []
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        batch = move_to_device(batch, cuda_device=0)
        output = model(**batch)

        batch_labels = [
            label_ids_to_label[i]
            for i in output['predictions'].cpu().numpy().tolist()
        ]

        predictions.extend(batch_labels)


    with open(output_file, 'w') as fout:
        for p in predictions:
            fout.write("{}\n".format(p))
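
A sketch of invoking this evaluation helper; the archive, data paths, and label mapping below are placeholders:

label_ids_to_label = {0: "no_relation", 1: "per:employee_of", 2: "org:founded_by"}  # hypothetical labels
write_for_official_eval(
    model_archive_file="output/model.tar.gz",   # hypothetical archive
    test_file="data/test.json",                 # hypothetical test set
    output_file="output/predictions.txt",
    label_ids_to_label=label_ids_to_label,
)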
Example #6
def make_vocab_from_params(params: Params):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    vocab_dir = vocab_params.get('directory_path')
    if vocab_dir is None:
        raise ConfigurationError("To use `make-vocab` your configuration must contain a value "
                                 "at vocabulary.directory_path")

    os.makedirs(vocab_dir, exist_ok=True)

    all_datasets = datasets_from_params(params)

    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(Params({}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))

    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
Example #7
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        initial_working_dir = os.getcwd()
        # Change directory to module root.
        os.chdir(self.MODULE_ROOT)

        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        instances = reader.read(dataset_file)
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if 'vocabulary' in params:
            vocab_params = params['vocabulary']
            vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(self.vocab, params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)

        # Change directory back to what it was initially
        os.chdir(initial_working_dir)
Example #8
def make_vocab_from_params(params: Params):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    vocab_dir = vocab_params.get('directory_path')
    if vocab_dir is None:
        raise ConfigurationError(
            "To use `make-vocab` your configuration must contain a value "
            "at vocabulary.directory_path")

    os.makedirs(vocab_dir, exist_ok=True)

    all_datasets = datasets_from_params(params)

    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(Params(
        {}), (instance for key, dataset in all_datasets.items()
              for instance in dataset if key in datasets_for_vocab_creation))

    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
Example #9
    def setUp(self) -> None:
        super().setUp()
        param_file = FIXTURES_ROOT / "pointer_rewrite" / "lstm_lstm_pointer_rewrite.jsonnet"
        dataset_file = FIXTURES_ROOT / "test_pointer_rewrite.txt"
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        # Get the dataset reader
        reader = DatasetReader.from_params(params["dataset_reader"])
        instances = reader.read(str(dataset_file))
        # If vocabulary params are present in the config, build the vocab from them
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(
                params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)

        self.vocab = vocab
        self.instances = instances
        self.instances.index_with(vocab)
        # Load the model
        self.model = Model.from_params(params=params["model"], vocab=self.vocab)

        self.dataset = Batch(list(self.instances))
        self.dataset.index_instances(self.vocab)
        self.TEST_DIR = Path(tempfile.mkdtemp(prefix="allennlp_tests"))
Example #10
    def setUp(self):
        super().setUp()
        params = Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                },
            },
            "dataset_reader": {
                "type": "sequence_tagging"
            },
            "train_data_path":
            str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "validation_data_path":
            str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "iterator": {
                "type": "basic",
                "batch_size": 2
            },
            "trainer": {
                "cuda_device": -1,
                "num_epochs": 2,
                "optimizer": "adam"
            },
        })
        all_datasets = datasets_from_params(params)
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            instances=(instance for dataset in all_datasets.values()
                       for instance in dataset),
        )
        model = Model.from_params(vocab=vocab, params=params.pop("model"))
        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(vocab)
        train_data = all_datasets["train"]
        trainer_params = params.pop("trainer")
        serialization_dir = os.path.join(self.TEST_DIR,
                                         "test_search_learning_rate")

        self.trainer = TrainerBase.from_params(
            model=model,
            serialization_dir=serialization_dir,
            iterator=iterator,
            train_data=train_data,
            params=trainer_params,
            validation_data=None,
            validation_iterator=None,
        )
Example #11
def make_vocab_from_params(params: Params, serialization_dir: str):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
Example #12
    def test_no_eval(self):
        reader = KGTupleReader()
        instances = reader.read('tests/fixtures/kg_embeddings/wn18rr_train.txt')

        self.assertTrue(len(instances) == 8)

        # create the vocab and index to make sure things look good
        vocab = Vocabulary.from_params(Params({}), instances)
        # (+2 for @@PADDING@@ and @@UNKNOWN@@)
        self.assertEqual(vocab.get_vocab_size("entity"), 5 + 2)
        self.assertEqual(vocab.get_vocab_size("relation"), 4 + 2)

        # now get a batch
        iterator = BasicIterator(batch_size=32)
        iterator.index_with(vocab)
        for batch in iterator(instances, num_epochs=1, shuffle=False):
            pass

        # check it!
        expected_entity = [1, 2, 1, 3, 3, 4, 1, 5]
        expected_relation = ['_hypernym', '_hypernym_reverse',
                              '_derivationally_related_form', '_derivationally_related_form_reverse',
                              '_hypernym_reverse', '_hypernym', '_hypernym_reverse',
                              '_hypernym_reverse']
        expected_entity2 = [[2, 3], [1], [3], [1], [1], [1, 5], [4], [4]]

        self._check_batch(batch, vocab,
                          expected_entity, expected_relation, expected_entity2)
Example #13
def make_vocab_from_params(
    params: Params, serialization_dir: str, print_statistics: bool = False
) -> Vocabulary:
    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError(
            "The 'vocabulary' directory in the provided serialization directory is non-empty"
        )

    datasets_for_vocab_creation: Optional[List[str]] = params.pop(
        "datasets_for_vocab_creation", None
    )
    # Do a quick sanity check here. There's no need to load any datasets if the vocab
    # type is "empty".
    if datasets_for_vocab_creation is None and vocab_params.get("type") in ("empty", "from_files"):
        datasets_for_vocab_creation = []

    datasets: Dict[str, Dataset]
    if datasets_for_vocab_creation is None:
        # If `datasets_for_vocab_creation` was not specified, we'll use all datasets
        # from the config.
        datasets = datasets_from_params(params)
    else:
        for dataset_name in datasets_for_vocab_creation:
            data_path = f"{dataset_name}_data_path"
            if data_path not in params:
                raise ConfigurationError(f"invalid 'datasets_for_vocab_creation' {dataset_name}")
        datasets = datasets_from_params(
            params,
            train=("train" in datasets_for_vocab_creation),
            validation=("validation" in datasets_for_vocab_creation),
            test=("test" in datasets_for_vocab_creation),
        )

    instances: Iterable[Instance] = (
        instance
        for key, dataset in datasets.items()
        if datasets_for_vocab_creation is None or key in datasets_for_vocab_creation
        for instance in dataset
    )

    if print_statistics:
        instances = list(instances)

    vocab = Vocabulary.from_params(vocab_params, instances=instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")

    if print_statistics:
        dataset = Batch(instances)
        dataset.index_instances(vocab)
        dataset.print_statistics()
        vocab.print_statistics()

    return vocab
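
A sketch of calling this newer variant with statistics enabled; the config and serialization directory are placeholders:

from allennlp.common import Params

params = Params.from_file("experiments/my_experiment.jsonnet")  # hypothetical config
vocab = make_vocab_from_params(params, serialization_dir="output/my_run",
                               print_statistics=True)
print(vocab.get_vocab_size("tokens"))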
Example #14
    def set_up_model(self, param_file, dataset_file):

        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params["dataset_reader"])
        # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
        instances = reader.read(str(dataset_file))
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(params=vocab_params,
                                           instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.instances.index_with(vocab)
        self.model = Model.from_params(vocab=self.vocab,
                                       params=params["model"])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(list(self.instances))
        self.dataset.index_instances(self.vocab)
Example #15
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    instances = [
        instance for key, dataset in all_datasets.items()
        for instance in dataset if key in datasets_for_vocab_creation
    ]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)

    stdout_handler = prepare_global_logging(serialization_dir, False)

    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    cleanup_global_logging(stdout_handler)
Example #16
def tasks_and_vocab_from_params(
        params: Params,
        serialization_dir: str) -> Tuple[List[Task], Vocabulary]:
    """
    Load each of the tasks in the model from the ``params`` file
    and load the datasets associated with each of these tasks.
    Create the vocabulary from ``params`` using the concatenation of the ``datasets_for_vocab_creation``
    from each of the task-specific datasets.
    
    Parameters
    ----------
    params: ``Params``
        A parameter object specifying an experiment.
    serialization_dir: ``str``
        Directory in which to save the model and its logs.
    Returns
    -------
    task_list: ``List[Task]``
        A list containing the tasks of the model to train.
    vocab: ``Vocabulary``
        The vocabulary fitted on the datasets_for_vocab_creation.
    """
    ### Instantiate the different tasks ###
    task_list = []
    instances_for_vocab_creation = itertools.chain()
    datasets_for_vocab_creation = {}
    task_keys = [key for key in params.keys() if re.search("^task_", key)]

    for key in task_keys:
        logger.info("Creating %s", key)
        task_params = params.pop(key)
        task_description = task_params.pop("task_description")
        task_data_params = task_params.pop("data_params")

        task = Task.from_params(params=task_description)
        task_list.append(task)

        task_instances_for_vocab, task_datasets_for_vocab = task.load_data_from_params(
            params=task_data_params)
        instances_for_vocab_creation = itertools.chain(
            instances_for_vocab_creation, task_instances_for_vocab)
        datasets_for_vocab_creation[task._name] = task_datasets_for_vocab

    ### Create and save the vocabulary ###
    for task_name, task_dataset_list in datasets_for_vocab_creation.items():
        logger.info("Creating a vocabulary using %s data from %s.",
                    ", ".join(task_dataset_list), task_name)

    logger.info("Fitting vocabulary from dataset")
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   instances_for_vocab_creation)

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
    logger.info("Vocabulary saved to %s",
                os.path.join(serialization_dir, "vocabulary"))

    return task_list, vocab
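
A sketch of using the multi-task helper above; the config path is a placeholder, and the config is assumed to contain `task_*` sections as the function expects:

from allennlp.common import Params

params = Params.from_file("experiments/multitask.jsonnet")  # hypothetical config with task_* sections
task_list, vocab = tasks_and_vocab_from_params(params, serialization_dir="output/multitask_run")
for task in task_list:
    print(task._name)
print(vocab.get_vocab_size("tokens"))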
Example #17
    def test_reader(self):
        reader = get_reader(masked_lm_prob=0.15)

        np.random.seed(5)
        instances = reader.read("tests/fixtures/bert_pretraining/shard1.txt")

        vocab = Vocabulary.from_params(Params({
            "directory_path": "tests/fixtures/bert/vocab_dir_with_entities_for_tokenizer_and_generator"
        }))
        iterator = DataIterator.from_params(Params({"type": "basic"}))
        iterator.index_with(vocab)

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        actual_tokens_ids = batch['tokens']['tokens']
        expected_tokens_ids = torch.tensor(
                [[16, 18, 19, 20,  1, 19, 21, 13, 17, 21,  3,  4, 12, 13, 17],
                [16,  1, 13, 17, 21,  1,  1, 13, 17,  0,  0,  0,  0,  0,  0]])

        self.assertEqual(actual_tokens_ids.tolist(), expected_tokens_ids.tolist())

        actual_entities = batch['candidates']['wordnet']['candidate_entities']['ids']
        expected_entities = torch.tensor(
                    [[[29, 30],
                     [31,  0],
                     [31,  0]],
                   
                    [[ 0,  0],
                     [ 0,  0],
                     [ 0,  0]]])
        self.assertEqual(actual_entities.tolist(), expected_entities.tolist())

        expected_spans = torch.tensor(
                       [[[ 1,  3],
                         [ 2,  3],
                         [ 5,  6]],
                
                        [[-1, -1],
                         [-1, -1],
                         [-1, -1]]])
        actual_spans = batch['candidates']['wordnet']['candidate_spans']
        self.assertEqual(actual_spans.tolist(), expected_spans.tolist())

        expected_lm_labels = torch.tensor(
                [[ 0,  0,  0,  0,  0,  0, 20,  0,  0,  2,  0,  0,  0,  0,  0],
                 [ 0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
        actual_lm_labels = batch['lm_label_ids']['lm_labels']
        self.assertEqual(actual_lm_labels.tolist(), expected_lm_labels.tolist())

        expected_segment_ids = torch.tensor(
            [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
             [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])
        self.assertEqual(batch['segment_ids'].tolist(), expected_segment_ids.tolist())
        self.assertTrue(batch['segment_ids'].dtype == torch.long)
Example #18
    def test_multi_iterator(self):
        params, file_paths = get_dataset_params_paths(['ner', 'ccg'])

        multitask_reader = DatasetReader.from_params(params)
        dataset = multitask_reader.read(file_paths)

        iterator_params = Params({
            "type": "multitask_iterator",
            "iterators": {
                "ner": {
                    "type": "bucket",
                    "sorting_keys": [["tokens", "num_tokens"]],
                    "padding_noise": 0.0,
                    "batch_size": 2
                },
                "ccg": {
                    "type": "basic",
                    "batch_size": 1
                }
            },
            "names_to_index": ["ner", "ccg"],
        })

        multi_iterator = DataIterator.from_params(iterator_params)

        # make the vocab
        vocab = Vocabulary.from_params(Params({}),
                                       (instance for instance in dataset))
        multi_iterator.index_with(vocab)

        all_batches = []
        for epoch in range(2):
            all_batches.append([])
            for batch in multi_iterator(dataset, shuffle=True, num_epochs=1):
                all_batches[-1].append(batch)

        # 3 batches per epoch -
        self.assertEqual([len(b) for b in all_batches], [3, 3])

        ner_batches = []
        ccg_batches = []
        for epoch_batches in all_batches:
            ner_batches.append(0)
            ccg_batches.append(0)
            for batch in epoch_batches:
                if 'original_pos_tags' not in batch:
                    ner_batches[-1] += 1
                if 'original_pos_tags' in batch:
                    ccg_batches[-1] += 1

        # 1 NER batch per epoch, 2 CCG per epoch
        self.assertEqual(ner_batches, [1, 1])
        self.assertEqual(ccg_batches, [2, 2])
Example #19
    def setUp(self):
        super().setUp()
        params = Params({
                "model": {
                    "type": "simple_tagger",
                    "text_field_embedder": {
                        "token_embedders": {
                            "tokens": {
                                "type": "embedding",
                                "embedding_dim": 5
                            }
                        }
                    },
                    "encoder": {
                        "type": "lstm",
                        "input_size": 5,
                        "hidden_size": 7,
                        "num_layers": 2
                    }
                },
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
                "validation_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": {
                    "cuda_device": -1,
                    "num_epochs": 2,
                    "optimizer": "adam"
                }
            })
        all_datasets = datasets_from_params(params)
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for dataset in all_datasets.values()
             for instance in dataset)
        )
        model = Model.from_params(vocab=vocab, params=params.pop('model'))
        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(vocab)
        train_data = all_datasets['train']
        trainer_params = params.pop("trainer")
        serialization_dir = os.path.join(self.TEST_DIR, 'test_search_learning_rate')

        self.trainer = Trainer.from_params(model,
                                           serialization_dir,
                                           iterator,
                                           train_data,
                                           params=trainer_params,
                                           validation_data=None,
                                           validation_iterator=None)
Example #20
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
Example #21
    def setUp(self) -> None:
        super().setUp()
        param_file = FIXTURES_ROOT / "pointer_rewrite" / "bert_transformer_pointer_rewrite.jsonnet"
        dataset_file = FIXTURES_ROOT / "test_pointer_rewrite.txt"
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        # Build a vocab file for the BERT model, consistent with the Vocabulary files
        vocab_path = params["dataset_reader"]["vocab_path"]
        # Path for the newly generated BERT vocab file
        bert_temp_dir = tempfile.mkdtemp(suffix="bert")
        with open(Path(vocab_path) / "tokens.txt", 'r', encoding="utf-8") as f, \
            open(Path(bert_temp_dir) / "vocab.txt", 'w', encoding="utf-8") as fp:
            fp.write("[PAD]" + "\n")
            for line in f:
                line = line.strip()
                fp.write(line)
                fp.write("\n")

        # Override some of the parameters in the config
        overrides_config = {
            "dataset_reader.model_name": bert_temp_dir,
            "model.model_name": params["model"]["model_name"] + "/config.json"
        }
        self.overrides_config = json.dumps(overrides_config)
        params = Params.from_file(self.param_file,
                                  params_overrides=self.overrides_config)
        # Get the dataset reader
        reader = DatasetReader.from_params(params["dataset_reader"])
        instances = reader.read(str(dataset_file))
        # If vocabulary params are present in the config, build the vocab from them
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(params=vocab_params,
                                           instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)

        self.vocab = vocab
        self.instances = instances
        self.instances.index_with(vocab)
        # Load the model
        # model_name was overridden above to point at the corresponding config file
        self.model = Model.from_params(params=params["model"],
                                       vocab=self.vocab)

        self.dataset = Batch(list(self.instances))
        self.dataset.index_instances(self.vocab)
        self.TEST_DIR = Path(tempfile.mkdtemp(prefix="allennlp_tests"))
Example #22
def make_vocab_from_params(
    params: Params, serialization_dir: str, print_statistics: bool = False
) -> Vocabulary:
    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError(
            "The 'vocabulary' directory in the provided serialization directory is non-empty"
        )

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )

    instances: Iterable[Instance] = (
        instance
        for key, dataset in all_datasets.items()
        if key in datasets_for_vocab_creation
        for instance in dataset
    )

    if print_statistics:
        instances = list(instances)

    vocab = Vocabulary.from_params(vocab_params, instances=instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")

    if print_statistics:
        dataset = Batch(instances)
        dataset.index_instances(vocab)
        dataset.print_statistics()
        vocab.print_statistics()

    return vocab
Example #23
def get_knowbert_model():
    vocab = Vocabulary.from_params(
        Params({
            "directory_path":
            "tests/fixtures/kg_embeddings/tucker_wordnet/vocabulary",
        }))

    params = Params({
        "type":
        "knowbert",
        "soldered_kgs": {
            "wordnet": {
                "type": "soldered_kg",
                "entity_linker": {
                    "type": "entity_linking_with_candidate_mentions",
                    "kg_model": {
                        "type":
                        "from_archive",
                        "archive_file":
                        "tests/fixtures/kg_embeddings/tucker_wordnet/model.tar.gz",
                    },
                    "contextual_embedding_dim": 12,
                    "max_sequence_length": 64,
                    "span_encoder_config": {
                        "hidden_size": 24,
                        "num_hidden_layers": 1,
                        "num_attention_heads": 3,
                        "intermediate_size": 37
                    },
                },
                "span_attention_config": {
                    "hidden_size": 24,
                    "num_hidden_layers": 2,
                    "num_attention_heads": 4,
                    "intermediate_size": 55
                }
            },
        },
        "soldered_layers": {
            "wordnet": 1
        },
        "bert_model_name":
        "tests/fixtures/bert/bert_test_fixture.tar.gz",
    })

    model = Model.from_params(params, vocab=vocab)
    return model, vocab
Example #24
def setup_model(params_file, dataset_file):
    params = Params.from_file(params_file)

    #reader = DatasetReader.from_params(params['dataset_reader'])
    reader = ToxicReader()
    instances = reader.read(str(dataset_file))
    Vocabulary.from_instances(instances)
    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    
    vocab.save_to_files("new_vocab2")
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    
    print(dataset.as_tensor_dict())
Example #25
    def __init__(self,
                 model_archive,
                 batch_size=32,
                 masking_strategy=None,
                 wordnet_entity_file=None,
                 vocab_dir=None):

        # get bert_tokenizer_and_candidate_generator
        if os.path.isdir(model_archive):
            config = Params.from_file(
                os.path.join(model_archive, 'config.json'))
        else:
            config = _extract_config_from_archive(cached_path(model_archive))

        # look for the bert_tokenizers and candidate_generator
        candidate_generator_params = _find_key(
            config['dataset_reader'].as_dict(),
            'tokenizer_and_candidate_generator')

        if wordnet_entity_file is not None:
            candidate_generator_params['entity_candidate_generators'][
                'wordnet']['entity_file'] = wordnet_entity_file

        self.tokenizer_and_candidate_generator = TokenizerAndCandidateGenerator.\
                from_params(Params(candidate_generator_params))
        self.tokenizer_and_candidate_generator.whitespace_tokenize = False

        assert masking_strategy is None or masking_strategy == 'full_mask'
        self.masking_strategy = masking_strategy

        # need bert_tokenizer_and_candidate_generator
        if vocab_dir is not None:
            vocab_params = Params({"directory_path": vocab_dir})
        else:
            vocab_params = config['vocabulary']
        self.vocab = Vocabulary.from_params(vocab_params)

        self.iterator = DataIterator.from_params(
            Params({
                "type": "basic",
                "batch_size": batch_size
            }))
        self.iterator.index_with(self.vocab)
Example #26
File: testing.py Project: zxlzr/kb
def get_wsd_reader(is_training,
                   use_bert_indexer=False,
                   wordnet_entity_file=None):
    if wordnet_entity_file is None:
        wordnet_entity_file = "tests/fixtures/wordnet/entities_cat_hat.jsonl"

    if use_bert_indexer:
        bert_fixtures = get_bert_test_fixture()
        indexer_params = bert_fixtures["indexer_params"]
    else:
        indexer_params = {"type": "single_id", "lowercase_tokens": True}

    reader_params = {
        "type": "wordnet_fine_grained",
        "wordnet_entity_file": wordnet_entity_file,
        "token_indexers": {
            "tokens": indexer_params,
        },
        "entity_indexer": {
            "type": "characters_tokenizer",
            "tokenizer": {
                "type": "word",
                "word_splitter": {
                    "type": "just_spaces"
                },
            },
            "namespace": "entity"
        },
        "is_training": is_training,
        "use_surface_form": False
    }
    reader = DatasetReader.from_params(Params(reader_params))

    vocab_params = {
        "directory_path": "tests/fixtures/wordnet/cat_hat_vocabdir"
    }
    vocab = Vocabulary.from_params(Params(vocab_params))

    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(vocab)

    return reader, vocab, iterator
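
A sketch of consuming the (reader, vocab, iterator) triple returned above, mirroring the batch loops used elsewhere in these examples; the dataset path is a placeholder:

reader, vocab, iterator = get_wsd_reader(is_training=True)
instances = reader.read("tests/fixtures/wordnet/wsd_dataset.json")  # hypothetical fixture path
for batch in iterator(instances, num_epochs=1, shuffle=False):
    print(batch["tokens"]["tokens"].shape)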
Example #27
    def test_bert_transformer_predictor(self):
        param_file = FIXTURES_ROOT / "pointer_rewrite" / "bert_transformer_pointer_rewrite.jsonnet"
        params = Params.from_file(param_file)
        # Build a vocab file for the BERT model, consistent with the Vocabulary files
        vocab_path = params["dataset_reader"]["vocab_path"]
        # Path for the newly generated BERT vocab file
        bert_temp_dir = tempfile.mkdtemp(suffix="bert")
        with open(Path(vocab_path) / "tokens.txt", 'r', encoding="utf-8") as f, \
            open(Path(bert_temp_dir) / "vocab.txt", 'w', encoding="utf-8") as fp:
            fp.write("[PAD]"+"\n")
            for line in f:
                line = line.strip()
                fp.write(line)
                fp.write("\n")

        # Override some of the parameters in the config
        overrides_config = {
            "dataset_reader.model_name": bert_temp_dir,
            "model.model_name": params["model"]["model_name"] + "/config.json"
        }
        overrides_config = json.dumps(overrides_config)
        # Reload the params with these overrides applied
        params = Params.from_file(param_file, params_overrides=overrides_config)

        # Get the dataset reader
        reader = DatasetReader.from_params(params["dataset_reader"])
        # If vocabulary params are present in the config, build the vocab from them
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(params=vocab_params)
        else:
            vocab = Vocabulary()
        # Load the model
        # model_name was overridden above to point at the corresponding config file
        model = Model.from_params(params=params["model"], vocab=vocab)

        predictor = PointerRewritePredictor(dataset_reader=reader, model=model)
        result = predictor.predict(self.context, self.query)

        self.assertTrue("rewrite_results" in result)
        assert isinstance(result["rewrite_results"], str)
Example #28
    def test_lstm_lstm_predictor(self):
        param_file = FIXTURES_ROOT / "pointer_rewrite" / "lstm_lstm_pointer_rewrite.jsonnet"
        params = Params.from_file(param_file)
        # Get the dataset reader
        reader = DatasetReader.from_params(params["dataset_reader"])
        # Get the model
        # If vocabulary params are present in the config, build the vocab from them
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(params=vocab_params)
        else:
            vocab = Vocabulary()

        # Load the model
        model = Model.from_params(params=params["model"], vocab=vocab)

        predictor = PointerRewritePredictor(dataset_reader=reader, model=model)
        result = predictor.predict(self.context, self.query)

        self.assertTrue("rewrite_results" in result)
        assert isinstance(result["rewrite_results"], str)
Example #29
    def test_kg_reader_with_eval(self):
        train_file = 'tests/fixtures/kg_embeddings/wn18rr_train.txt'
        dev_file = 'tests/fixtures/kg_embeddings/wn18rr_dev.txt'

        train_instances = KGTupleReader().read(train_file)

        reader = KGTupleReader(extra_files_for_gold_pairs=[train_file])
        instances = reader.read(dev_file)
        self.assertEqual(len(instances), 2)

        vocab = Vocabulary.from_params(Params({}), train_instances + instances)
        iterator = BasicIterator(batch_size=32)
        iterator.index_with(vocab)
        for batch in iterator(instances, num_epochs=1, shuffle=False):
            pass

        expected_entity = [1, 5]
        expected_relation = ['_hypernym', '_hypernym_reverse']
        expected_entity2 = [[5, 2, 3], [1, 4]]
        self._check_batch(batch, vocab,
                          expected_entity, expected_relation, expected_entity2)
Example #30
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        instances = reader.read(dataset_file)
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if 'vocabulary' in params:
            vocab_params = params['vocabulary']
            vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(vocab=self.vocab, params=params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)
Example #31
def knowbert_fill2(sentences, model, batcher, vocab, mask_start=0, mask_end=0, config_file=None, top=10):
    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 32}))
    config = Params.from_file(config_file)
    vocab_params = config['vocabulary']
    iterator.index_with(Vocabulary.from_params(vocab_params))
    instances = []
    for sent in sentences:
        token_candidates = batcher.tokenizer_and_candidate_generator.tokenize_and_generate_candidates(sent.replace('[MASK]', ' [MASK] '))
        masked_tokens = token_candidates['tokens'].copy()
        for i in range(mask_start, mask_end):
            masked_tokens[i] = '[MASK]'
        token_candidates['tokens'] = masked_tokens

        # mask out the entity candidates
        candidates = token_candidates['candidates']
        for candidate_key in candidates.keys():
            indices_to_mask = []
            for k, candidate_span in enumerate(candidates[candidate_key]['candidate_spans']):
                if (candidate_span[0] >= mask_start and candidate_span[0] <= mask_end-1) or (candidate_span[1] >= mask_start and candidate_span[1] <= mask_end-1):
                    indices_to_mask.append(k)
            for ind in indices_to_mask:
                candidates[candidate_key]['candidate_entities'][ind] = ['@@MASK@@']
                candidates[candidate_key]['candidate_entity_priors'][ind] = [1.0]
            if len(indices_to_mask) == 0:
                candidates[candidate_key]['candidate_spans'].append([mask_start, mask_end-1])
                candidates[candidate_key]['candidate_entities'].append(['@@MASK@@'])
                candidates[candidate_key]['candidate_entity_priors'].append([1.0])
                candidates[candidate_key]['candidate_segment_ids'].append(0)
        fields = batcher.tokenizer_and_candidate_generator.convert_tokens_candidates_to_fields(token_candidates)
        instances.append(Instance(fields))
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        print(batch['tokens']['tokens'])
        model_output = model(**batch)
        print([vocab[w] for w in batch['tokens']['tokens'][0].numpy()])
        logits, _ = model.pretraining_heads(model_output['contextual_embeddings'], model_output['pooled_output'])
        log_probs = F.log_softmax(logits, dim=-1).cpu()
        for mask_ind in range(mask_start, mask_end):
            topk = torch.topk(log_probs[0, mask_ind], top, -1)[1]
            print([vocab[t.item()] for t in topk])
Example #32
    def set_up_model(self, param_file, dataset_file):
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        instances = reader.read(dataset_file)
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if 'vocabulary' in params:
            vocab_params = params['vocabulary']
            vocab = Vocabulary.from_params(params=vocab_params,
                                           instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(vocab=self.vocab,
                                       params=params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)
Example #33
    def set_up_model(
        self,
        param_file: PathLike,
        dataset_file: PathLike,
        serialization_dir: PathLike = None,
        seed: int = None,
    ):
        if seed is not None:
            random.seed(seed)
            numpy.random.seed(seed)
            torch.manual_seed(seed)

        self.param_file = str(param_file)
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(
            params["dataset_reader"], serialization_dir=serialization_dir
        )
        # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
        instances = list(reader.read(str(dataset_file)))
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(
            vocab=self.vocab, params=params["model"], serialization_dir=serialization_dir
        )

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)
Example #34
def test_iterator():
    indexer = StaticFasttextTokenIndexer(
        model_path="./data/fasttext_embedding.model",
        model_params_path="./data/fasttext_embedding.model.params")

    loader = MenionsLoader(
        category_mapping_file='./data/test_category_mapping.json',
        token_indexers={"tokens": indexer},
        tokenizer=WordTokenizer(word_splitter=FastSplitter()))

    vocab = Vocabulary.from_params(Params({"directory_path":
                                           "./data/vocab2/"}))

    iterator = BasicIterator(batch_size=32)

    iterator.index_with(vocab)

    limit = 50
    for _ in tqdm.tqdm(iterator(loader.read('./data/train_data_aa.tsv'),
                                num_epochs=1),
                       mininterval=2):
        limit -= 1
        if limit <= 0:
            break
Example #35
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
                best_model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
Example #36
0
def find_learning_rate_model(params: Params, serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs a learning rate search for the given ``num_batches`` and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        The learning rate at which to start the search.
    end_lr: ``float``
        The learning rate at which to end the search.
    num_batches: ``int``
        The number of mini-batches to run the learning rate finder for.
    linear_steps: ``bool``
        If ``True``, increase the learning rate linearly; otherwise, increase it
        exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best recorded loss by a
        multiple of the stopping factor. If ``None``, the search proceeds until ``end_lr``.
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
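
# Usage sketch (not part of the original example): a hypothetical invocation of
# find_learning_rate_model, assuming an experiment config at 'experiment.jsonnet'
# and a fresh 'lr_search/' output directory; the hyperparameter values shown are
# illustrative, not values taken from the original source.
from allennlp.common import Params

lr_params = Params.from_file('experiment.jsonnet')
find_learning_rate_model(lr_params,
                         'lr_search',
                         start_lr=1e-5,
                         end_lr=1.0,
                         num_batches=100,
                         linear_steps=False,
                         stopping_factor=None,
                         force=True)
# The smoothed loss-vs-learning-rate curve is saved to 'lr_search/lr-losses.png'.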
Example #37
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.params.get('trainer').get('cuda_device', -1))

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params,
                                  validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logger.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model,
            test_data,
            validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"),
              "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return best_model
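
# Usage sketch (not part of the original example): a hypothetical call of the
# train_model command above, assuming an experiment config at 'experiment.jsonnet'
# and a serialization directory 'output/' that does not exist yet.
from allennlp.common import Params

train_params = Params.from_file('experiment.jsonnet')
best = train_model(train_params,
                   serialization_dir='output',
                   file_friendly_logging=False,
                   recover=False)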
Example #38
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logger.info("Training interrupted by the user. Attempting to create "
                        "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
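
# Usage sketch (not part of the original example): a hypothetical way to resume
# an interrupted run with the train_model variant above, as described by its
# ``recover`` parameter; the config path and 'output/' directory are placeholders.
from allennlp.common import Params

resume_params = Params.from_file('experiment.jsonnet')
model = train_model(resume_params, 'output', recover=True)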