Example #1
    def test_elmo_empty_token_list(self):
        indexer = CustomELMoTokenCharactersIndexer()
        indexer = {'elmo': indexer}

        tokens_1 = TextField([Token('Apple')], indexer)
        targets_1 = ListField([TextField([Token('Apple')], indexer)])
        tokens_2 = TextField([Token('Screen'), Token('device')], indexer)
        targets_2 = ListField([
            TextField([Token('Screen')], indexer),
            TextField([Token('Device')], indexer)
        ])
        instance_1 = Instance({'tokens': tokens_1, 'targets': targets_1})
        instance_2 = Instance({'tokens': tokens_2, 'targets': targets_2})
        a_batch = Batch([instance_1, instance_2])
        a_batch.index_instances(Vocabulary())
        batch_tensor = a_batch.as_tensor_dict()
        elmo_target_token_indices = batch_tensor['targets']['elmo']['tokens']
        empty_target = elmo_target_token_indices[0][1].numpy()
        np.testing.assert_array_equal(np.zeros((1, 50)), empty_target)
        non_empty_targets = [
            elmo_target_token_indices[0][0], elmo_target_token_indices[1][0],
            elmo_target_token_indices[1][1]
        ]
        for non_empty_target in non_empty_targets:
            with pytest.raises(AssertionError):
                np.testing.assert_array_equal(np.zeros((1, 50)),
                                              non_empty_target)
Example #2
    def forward_on_instances(self, instances: List[Instance],
                             **kwargs) -> List[Dict[str, np.ndarray]]:
        # An exact copy of the original method, but supports kwargs
        batch_size = len(instances)
        with torch.no_grad():
            cuda_device = self._get_prediction_device()
            dataset = Batch(instances)
            dataset.index_instances(self.vocab)
            model_input = util.move_to_device(dataset.as_tensor_dict(),
                                              cuda_device)
            outputs = self.make_output_human_readable(
                self(**model_input, **kwargs))
            instance_separated_output: List[Dict[str, np.ndarray]] = [
                {} for _ in dataset.instances
            ]
            for name, output in list(outputs.items()):
                if isinstance(output, torch.Tensor):
                    if output.dim() == 0:
                        output = output.unsqueeze(0)

                    if output.size(0) != batch_size:
                        self._maybe_warn_for_unseparable_batches(name)
                        continue
                    output = output.detach().cpu().numpy()
                elif len(output) != batch_size:
                    self._maybe_warn_for_unseparable_batches(name)
                    continue
                for instance_output, batch_element in zip(
                        instance_separated_output, output):
                    instance_output[name] = batch_element
            return instance_separated_output
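Because the copied method forwards `**kwargs` into `self(...)`, callers can thread extra arguments straight through to `forward()`. A minimal usage sketch (the model, instances, and the `return_attention` flag are all hypothetical):

# `return_attention` stands in for any extra kwarg this model's forward() accepts
outputs = model.forward_on_instances(instances, return_attention=True)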
Example #3
def allennlp_collate(instances: List[Instance]) -> TensorDict:
    """
    This is the default function used to turn a list of `Instance`s into a `TensorDict`
    batch.
    """
    batch = Batch(instances)
    return batch.as_tensor_dict()
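A minimal usage sketch, assuming `instances` is a plain list of `Instance`s that has already been indexed with a `Vocabulary` (the names are placeholders):

from torch.utils.data import DataLoader

loader = DataLoader(instances, batch_size=32, collate_fn=allennlp_collate)
for tensor_dict in loader:
    pass  # each `tensor_dict` is one padded TensorDict batch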
Example #4
    def set_up_model(self, param_file, dataset_file):

        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params["dataset_reader"])
        # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
        instances = reader.read(str(dataset_file))
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(params=vocab_params,
                                           instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.instances.index_with(vocab)
        self.model = Model.from_params(vocab=self.vocab,
                                       params=params["model"])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(list(self.instances))
        self.dataset.index_instances(self.vocab)
Example #5
 def test_elmo_empty_token_list(self):
     # Basic test
     indexer = ELMoTokenCharactersIndexer()
     assert {"elmo_tokens": []} == indexer.get_empty_token_list()
     # Real world test
     indexer = {"elmo": indexer}
     tokens_1 = TextField([Token("Apple")], indexer)
     targets_1 = ListField([TextField([Token("Apple")], indexer)])
     tokens_2 = TextField([Token("Screen"), Token("device")], indexer)
     targets_2 = ListField([
         TextField([Token("Screen")], indexer),
         TextField([Token("Device")], indexer)
     ])
     instance_1 = Instance({"tokens": tokens_1, "targets": targets_1})
     instance_2 = Instance({"tokens": tokens_2, "targets": targets_2})
     a_batch = Batch([instance_1, instance_2])
     a_batch.index_instances(Vocabulary())
     batch_tensor = a_batch.as_tensor_dict()
     elmo_target_token_indices = batch_tensor["targets"]["elmo"][
         "elmo_tokens"]
     # The TextField that is empty should have been created using the
     # `get_empty_token_list` and then padded with zeros.
     empty_target = elmo_target_token_indices[0][1].numpy()
     np.testing.assert_array_equal(np.zeros((1, 50)), empty_target)
     non_empty_targets = [
         elmo_target_token_indices[0][0],
         elmo_target_token_indices[1][0],
         elmo_target_token_indices[1][1],
     ]
     for non_empty_target in non_empty_targets:
         with pytest.raises(AssertionError):
             np.testing.assert_array_equal(np.zeros((1, 50)),
                                           non_empty_target)
Example #6
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    (len(batch), max sentence length, max word length).

    # Parameters

    batch : `List[List[str]]`, required
        A list of tokenized sentences.

    # Returns

        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {"character_ids": indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()["elmo"]["character_ids"]["tokens"]
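For instance (a hypothetical two-sentence batch): ELMo represents every token as 50 character ids, and the shorter sentence is zero-padded to the batch maximum.

character_ids = batch_to_ids([["I", "ate", "an", "apple"], ["I", "ate"]])
assert character_ids.shape == (2, 4, 50)  # (batch, max sentence length, 50 char ids)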
Example #7
    def setUp(self) -> None:
        super().setUp()
        param_file = FIXTURES_ROOT / "pointer_rewrite" / "lstm_lstm_pointer_rewrite.jsonnet"
        dataset_file = FIXTURES_ROOT / "test_pointer_rewrite.txt"
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        # Get the dataset reader
        reader = DatasetReader.from_params(params["dataset_reader"])
        instances = reader.read(str(dataset_file))
        # If vocabulary parameters are present in the config, build the vocab from them
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(
                params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)

        self.vocab = vocab
        self.instances = instances
        self.instances.index_with(vocab)
        # Load the model
        self.model = Model.from_params(params=params["model"], vocab=self.vocab)

        self.dataset = Batch(list(self.instances))
        self.dataset.index_instances(self.vocab)
        self.TEST_DIR = Path(tempfile.mkdtemp(prefix="allennlp_tests"))
Example #8
    def _create_batches(self, instances: Iterable[Instance],
                        shuffle: bool) -> Iterable[Batch]:
        """
        This method should return one epoch worth of batches.
        """
        hoppers: Dict[Any, List[Instance]] = defaultdict(list)

        for instance in instances:
            # Which hopper do we put this instance in?
            if self.allow_mixed_batches:
                instance_type = ""
            else:
                instance_type = instance.fields[
                    self.type_field_name].metadata  # type: ignore

            hoppers[instance_type].append(instance)

            # If the hopper is full, yield up the batch and clear it.
            if len(hoppers[instance_type]) >= self._batch_size:
                yield Batch(hoppers[instance_type])
                hoppers[instance_type].clear()

        # Deal with leftovers
        for remaining in hoppers.values():
            if remaining:
                yield Batch(remaining)
Example #9
 def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
     dataset = Batch(self.instances)
     dataset.index_instances(self.vocab)
     training_tensors = dataset.as_tensor_dict()
     output_dict = self.model(**training_tensors)
     probs = output_dict["class_probabilities"]
     assert probs.size() == (2, 7,
                             self.model.vocab.get_vocab_size("labels"))
Example #10
 def convert_documents_to_batch(self, documents: List[Tuple[List[Token],
                                                            List[Token]]],
                                vocabulary) -> Dict[str, Any]:
     batch = Batch(
         [self.convert_tokens_to_instance(tokens) for tokens in documents])
     batch.index_instances(vocabulary)
     batch = batch.as_tensor_dict()
     return batch["document"]
Example #11
 def test_padding_lengths_uses_max_instance_lengths(self):
     dataset = Batch(self.instances)
     dataset.index_instances(self.vocab)
     padding_lengths = dataset.get_padding_lengths()
     assert padding_lengths == {
         "text1": {
             "tokens___tokens": 5
         },
         "text2": {
             "tokens___tokens": 6
         }
     }
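`get_padding_lengths` reports, per field and padding key, the maximum length over the batch, and `Batch.as_tensor_dict` accepts such a dict, so the lengths can be inspected or enlarged before tensorization. A sketch against the same fixture (padding `text1` out to 10 tokens is an illustrative choice):

padding_lengths = dataset.get_padding_lengths()
padding_lengths["text1"]["tokens___tokens"] = 10  # pad beyond the batch maximum
tensors = dataset.as_tensor_dict(padding_lengths)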
Example #12
    def get_gradients(
            self, instances: List[Instance]
    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """
        Gets the gradients of the loss with respect to the model inputs.

        # Parameters

        instances: List[Instance]

        # Returns

        Tuple[Dict[str, Any], Dict[str, Any]]
        The first item is a Dict of gradient entries for each input.
        The keys have the form  `{grad_input_1: ..., grad_input_2: ... }`
        up to the number of inputs given. The second item is the model's output.

        Notes
        -----
        Takes a `JsonDict` representing the inputs of the model and converts
        them to [`Instances`](../data/instance.md), sends these through
        the model [`forward`](../models/model.md#forward) function after registering hooks on the embedding
        layer of the model. Calls `backward` on the loss and then removes the
        hooks.
        """
        embedding_gradients: List[Tensor] = []
        hooks: List[RemovableHandle] = self._register_embedding_gradient_hooks(
            embedding_gradients)

        dataset = Batch(instances)
        dataset.index_instances(self._model.vocab)
        dataset_tensor_dict = util.move_to_device(dataset.as_tensor_dict(),
                                                  self.cuda_device)
        # To bypass "RuntimeError: cudnn RNN backward can only be called in training mode"
        with backends.cudnn.flags(enabled=False):
            outputs = self._model.make_output_human_readable(
                self._model.forward(**dataset_tensor_dict)  # type: ignore
            )

            loss = outputs["loss"]
            self._model.zero_grad()
            loss.backward()

        for hook in hooks:
            hook.remove()

        grad_dict = dict()
        for idx, grad in enumerate(embedding_gradients):
            key = "grad_input_" + str(idx + 1)
            grad_dict[key] = grad.detach().cpu().numpy()

        return grad_dict, outputs
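A minimal usage sketch (the interpreter object and labeled instances are hypothetical):

grads, outputs = interpreter.get_gradients(labeled_instances)
for name, grad in grads.items():
    print(name, grad.shape)  # e.g. grad_input_1 -> (batch_size, seq_len, embedding_dim)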
Example #13
def compose_batch_stream(ins_stream: Generator[Instance, None, None],
                         batch_size: int = 12) -> Generator[Batch, None, None]:
    buffer = []
    while True:
        try:
            buffer.append(next(ins_stream))
            if len(buffer) == batch_size:
                yield Batch(buffer)
                # Re-bind rather than clear(): Batch keeps a reference to this
                # list, so buffer.clear() would also empty the batch just yielded.
                buffer = []
        except StopIteration:
            break
    if len(buffer) != 0:
        yield Batch(buffer)
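A usage sketch (the instance generator and vocab are hypothetical); each yielded `Batch` still has to be indexed before it can be tensorized:

for batch in compose_batch_stream(instance_generator, batch_size=32):
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict()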
Example #14
    def forward_on_instances(
            self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of  :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        `torch.Tensors` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.

        # Parameters

        instances : List[Instance], required
            The instances to run the model on.

        # Returns

        A list of the model's outputs for each instance.
        """
        batch_size = len(instances)
        with torch.no_grad():
            cuda_device = self._get_prediction_device()
            dataset = Batch(instances)
            dataset.index_instances(self.vocab)
            model_input = util.move_to_device(dataset.as_tensor_dict(),
                                              cuda_device)
            outputs = self.decode(self(**model_input))

            instance_separated_output: List[Dict[str, numpy.ndarray]] = [
                {} for _ in dataset.instances
            ]
            for name, output in list(outputs.items()):
                if isinstance(output, torch.Tensor):
                    # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable.
                    # This occurs with batch size 1, because we still want to include the loss in that case.
                    if output.dim() == 0:
                        output = output.unsqueeze(0)

                    if output.size(0) != batch_size:
                        self._maybe_warn_for_unseparable_batches(name)
                        continue
                    output = output.detach().cpu().numpy()
                elif len(output) != batch_size:
                    self._maybe_warn_for_unseparable_batches(name)
                    continue
                for instance_output, batch_element in zip(
                        instance_separated_output, output):
                    instance_output[name] = batch_element
            return instance_separated_output
Example #15
    def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {"character_ids": indexer, "tokens": indexer2})
            instance = Instance({"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()["elmo"]
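The returned tensor dict has one entry per indexer name; roughly (note the inner key of the character indexer, `tokens` here vs. `elmo_tokens` in newer releases, varies across AllenNLP versions):

vocab, tensors = get_vocab_and_both_elmo_indexed_ids([["the", "cat"], ["a", "dog", "ran"]])
# tensors["character_ids"][...]  -> (2, 3, 50) character ids per token
# tensors["tokens"]["tokens"]    -> (2, 3) single word ids from `vocab`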
Example #16
    def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
        # Shuffle the documents if requested.
        maybe_shuffled_instances = self._shuffle_documents(instances) if shuffle else instances

        for instance_list in self._memory_sized_lists(maybe_shuffled_instances):
            iterator = iter(instance_list)
            excess: Deque[Instance] = deque()
            # Then break each memory-sized list into batches.
            for batch_instances in lazy_groups_of(iterator, self._batch_size):
                for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances, excess):
                    batch = Batch(possibly_smaller_batches)
                    yield batch
            if excess:
                yield Batch(excess)
Example #17
 def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
     dataset = Batch(self.instances)
     dataset.index_instances(self.vocab)
     training_tensors = dataset.as_tensor_dict()
     output_dict = self.model(**training_tensors)
     tags = output_dict["tags"]
     assert len(tags) == 2
     assert len(tags[0]) == 7
     assert len(tags[1]) == 7
     for example_tags in tags:
         for tag_id in example_tags:
             tag = self.model.vocab.get_token_from_index(tag_id,
                                                         namespace="labels")
             assert tag in {"O", "I-ORG", "I-PER", "I-LOC"}
Example #18
    def setUp(self) -> None:
        super().setUp()
        param_file = FIXTURES_ROOT / "pointer_rewrite" / "bert_transformer_pointer_rewrite.jsonnet"
        dataset_file = FIXTURES_ROOT / "test_pointer_rewrite.txt"
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        # Build a vocab usable by the BERT model, kept consistent with the `vocabulary` vocab
        vocab_path = params["dataset_reader"]["vocab_path"]
        # Path where the newly generated BERT vocab is written
        bert_temp_dir = tempfile.mkdtemp(suffix="bert")
        with open(Path(vocab_path) / "tokens.txt", 'r', encoding="utf-8") as f, \
            open(Path(bert_temp_dir) / "vocab.txt", 'w', encoding="utf-8") as fp:
            fp.write("[PAD]" + "\n")
            for line in f:
                line = line.strip()
                fp.write(line)
                fp.write("\n")

        # Override some of the parameters in the config
        overrides_config = {
            "dataset_reader.model_name": bert_temp_dir,
            "model.model_name": params["model"]["model_name"] + "/config.json"
        }
        self.overrides_config = json.dumps(overrides_config)
        params = Params.from_file(self.param_file,
                                  params_overrides=self.overrides_config)
        # Get the dataset reader
        reader = DatasetReader.from_params(params["dataset_reader"])
        instances = reader.read(str(dataset_file))
        # If vocabulary parameters are present in the config, build the vocab from them
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(params=vocab_params,
                                           instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)

        self.vocab = vocab
        self.instances = instances
        self.instances.index_with(vocab)
        # Load the model
        # (the model's model_name was rewritten above to point at the config file)
        self.model = Model.from_params(params=params["model"],
                                       vocab=self.vocab)

        self.dataset = Batch(list(self.instances))
        self.dataset.index_instances(self.vocab)
        self.TEST_DIR = Path(tempfile.mkdtemp(prefix="allennlp_tests"))
Example #19
    def _sentences_to_ids(self, sentences):
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {"character_ids": indexer})
            instance = Instance({"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        return dataset.as_tensor_dict()["elmo"]["character_ids"]["elmo_tokens"]
Example #20
    def _create_batches(self, instances: Iterable[Instance],
                        shuffle: bool) -> Iterable[Batch]:
        # First break the dataset into memory-sized lists:
        for instance_list in self._memory_sized_lists(instances):
            if shuffle:
                random.shuffle(instance_list)

            # Divvy up the instances based on their value of the "partition_key" field.
            hoppers: Dict[str, List[Instance]] = defaultdict(list)
            for instance in instance_list:
                partition = instance.fields[
                    self._partition_key].metadata  # type: ignore
                hoppers[partition].append(instance)

            # Get a `lazy_groups_of` iterator over each set of homogeneous instances.
            batches = {
                key: lazy_groups_of(iter(hopper), self._batch_size)
                for key, hopper in hoppers.items()
            }

            remaining = set(batches)

            # Yield batches in a round-robin fashion until none are left.
            while remaining:
                for key, lazy_batches in batches.items():
                    if key in remaining:
                        try:
                            batch = next(lazy_batches)
                            if not self._skip_smaller_batches or len(
                                    batch) == self._batch_size:
                                yield Batch(batch)
                        except StopIteration:
                            remaining.remove(key)
Example #21
 def instances_to_captum_inputs(self, labeled_instances):
     batch_size = len(labeled_instances)
     with torch.no_grad():
         cuda_device = self._get_prediction_device()
         batch = Batch(labeled_instances)
         batch.index_instances(self.vocab)
         model_input = util.move_to_device(batch.as_tensor_dict(),
                                           cuda_device)
         input_ids = model_input["tokens"]["tokens"]["token_ids"]
         label = model_input["label"]
         attention_mask = model_input["tokens"]["tokens"]["mask"]
         embedded_tokens = self.embeddings(input_ids)
         output_dict = {}
         output_dict["embedding"] = embedded_tokens
         return (embedded_tokens, ), None, (attention_mask, label,
                                            output_dict)
Example #22
 def _create_batches(self, instances: Iterable[Instance],
                     shuffle: bool) -> Iterable[Batch]:
     # First break the dataset into memory-sized lists:
     for instance_list in self._memory_sized_lists(instances):
         if shuffle:
             random.shuffle(instance_list)
         iterator = iter(instance_list)
         excess: Deque[Instance] = deque()
         # Then break each memory-sized list into batches.
         for batch_instances in lazy_groups_of(iterator, self._batch_size):
             for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                     batch_instances, excess):
                 batch = Batch(possibly_smaller_batches)
                 yield batch
         if excess:
             yield Batch(excess)
Example #23
 def forward_on_instances(
         self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
     """
     The elaborate per-output checks are omitted here, since they could leave the model with no output at all
     :param instances:
     :return:
     """
     batch_size = len(instances)
     with torch.no_grad():
         cuda_device = self._get_prediction_device()
         dataset = Batch(instances)
         dataset.index_instances(self.vocab)
         model_input = util.move_to_device(dataset.as_tensor_dict(),
                                           cuda_device)
         outputs = self.decode(self(**model_input))
         return outputs
Example #24
    def setup_method(self):
        token_indexer = SingleIdTokenIndexer("tokens")

        self.pairs_fname = (
            "https://raw.githubusercontent.com/tolga-b/debiaswe/"
            "4c3fa843ffff45115c43fe112d4283c91d225c09/data/definitional_pairs.json"
        )
        with open(cached_path(self.pairs_fname)) as f:
            pairs_list = []
            for w1, w2 in json.load(f):
                pairs_list.extend([
                    w1.lower(), w2.lower(),
                    w1.title(), w2.title(),
                    w1.upper(), w2.upper(),
                ])

        text_field = TextField(
            [Token(t) for t in pairs_list],
            {"tokens": token_indexer},
        )
        instance = Instance({"text": text_field})
        dataset = Batch([instance])
        self.pairs_vocab = Vocabulary.from_instances(dataset)
        self.num_pairs = len(set(pairs_list))

        self.singles_fname = (
            "https://raw.githubusercontent.com/tolga-b/debiaswe/"
            "4c3fa843ffff45115c43fe112d4283c91d225c09/data/gender_specific_full.json"
        )
        with open(cached_path(self.singles_fname)) as f:
            singles_list = json.load(f)

        text_field = TextField(
            [Token(t) for t in singles_list],
            {"tokens": token_indexer},
        )
        instance = Instance({"text": text_field})
        dataset = Batch([instance])
        self.singles_vocab = Vocabulary.from_instances(dataset)
        self.num_singles = len(set(singles_list))

        super().setup_method()
Example #25
    def preprocess(self, token_batch):
        seq_lens = [len(sequence) for sequence in token_batch if sequence]
        if not seq_lens:
            return []
        max_len = min(max(seq_lens), self.max_len)
        batches = []
        for indexer in self.indexers:
            batch = []
            for sequence in token_batch:
                tokens = sequence[:max_len]
                tokens = [Token(token) for token in ['$START'] + tokens]
                batch.append(Instance({'tokens': TextField(tokens, indexer)}))
            batch = Batch(batch)
            batch.index_instances(self.vocab)
            batches.append(batch)

        return batches
Example #26
    def test_batch_count(self):
        dataset = AllennlpDataset(self.instances, vocab=self.vocab)
        sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0, sorting_keys=["text"])
        # We use a custom collate_fn for testing, which doesn't actually create tensors,
        # just the allennlp Batches.
        dataloader = DataLoader(dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x))

        assert len(dataloader) == 3
Example #27
class TestLSTMPointerForRewrite(TestCase):
    def setUp(self) -> None:
        super().setUp()
        param_file = FIXTURES_ROOT / "pointer_rewrite" / "lstm_lstm_pointer_rewrite.jsonnet"
        dataset_file = FIXTURES_ROOT / "test_pointer_rewrite.txt"
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        # Get the dataset reader
        reader = DatasetReader.from_params(params["dataset_reader"])
        instances = reader.read(str(dataset_file))
        # If vocabulary parameters are present in the config, build the vocab from them
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(
                params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)

        self.vocab = vocab
        self.instances = instances
        self.instances.index_with(vocab)
        # Load the model
        self.model = Model.from_params(params=params["model"], vocab=self.vocab)

        self.dataset = Batch(list(self.instances))
        self.dataset.index_instances(self.vocab)
        self.TEST_DIR = Path(tempfile.mkdtemp(prefix="allennlp_tests"))

    def test_model_can_train_save_and_load(self):
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        # test train and save
        model = train_model_from_file(self.param_file, save_dir)
        # test load
        loaded_model = load_archive(archive_file, cuda_device=-1).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # make sure that the state dict (the parameters) are the same
        # for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
Example #28
 def setUp(self):
     token_indexer = SingleIdTokenIndexer("tokens")
     text_field = TextField(
         [Token(t) for t in ["a", "a", "a", "a", "b", "b", "c", "c", "c"]],
         {"tokens": token_indexer},
     )
     self.instance = Instance({"text": text_field})
     self.dataset = Batch([self.instance])
     super().setUp()
Example #29
 def __iter__(self) -> Iterator[TensorDict]:
     epoch_instances = self._get_instances_for_epoch()
     return (
         nn_util.move_to_device(
             Batch(instances).as_tensor_dict(),
             -1 if self.cuda_device is None else self.cuda_device,
         )
         for instances in self.scheduler.batch_instances(epoch_instances)
     )
Example #30
def setup_model(params_file, dataset_file):
    params = Params.from_file(params_file)

    # reader = DatasetReader.from_params(params['dataset_reader'])
    reader = ToxicReader()
    instances = reader.read(str(dataset_file))
    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    
    vocab.save_to_files("new_vocab2")
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    
    print(dataset.as_tensor_dict())