Example 1
def test_language_model_data_collator():
    """
    Ensure `LanguageModelingDataCollator` works
    """
    norm_loader = MultiProcessDataLoader(MockDatasetReader(),
                                         "some path",
                                         batch_size=16)
    vocab = Vocabulary.from_instances(norm_loader.iter_instances())
    norm_loader.index_with(vocab)
    batch0 = list(norm_loader)[0]

    model_name = "epwalsh/bert-xsmall-dummy"
    data_collate = LanguageModelingDataCollator(model_name)
    mlm_loader = MultiProcessDataLoader(MockDatasetReader(),
                                        "some path",
                                        batch_size=16,
                                        collate_fn=data_collate)
    vocab = Vocabulary.from_instances(mlm_loader.iter_instances())
    mlm_loader.index_with(vocab)
    batch1 = list(mlm_loader)[0]

    norm_inputs = batch0["source"]["tokens"]["token_ids"]
    mlm_inputs = batch1["source"]["tokens"]["token_ids"]
    mlm_labels = batch1["source"]["tokens"]["labels"]

    # If we put the labels back in at the masked positions, we should recover the original inputs.
    assert torch.where(mlm_labels != -100, mlm_labels,
                       mlm_inputs).tolist() == norm_inputs.tolist()
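This snippet assumes the surrounding AllenNLP test module; a minimal sketch of the imports it relies on (MockDatasetReader is a local test fixture, not a library class) might be:

import torch

from allennlp.data import Vocabulary
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.data_loaders.data_collator import LanguageModelingDataCollator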
Example 2
def main():
    reader = StanfordSentimentTreeBankDatasetReader()
    train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
    dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'

    sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
    train_data_loader = MultiProcessDataLoader(reader,
                                               train_path,
                                               batch_sampler=sampler)
    dev_data_loader = MultiProcessDataLoader(reader,
                                             dev_path,
                                             batch_sampler=sampler)

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens': 3}` here means that any token appearing fewer than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(chain(train_data_loader.iter_instances(),
                                            dev_data_loader.iter_instances()),
                                      min_count={'tokens': 3})
    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification task.
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(model=model,
                                     optimizer=optimizer,
                                     data_loader=train_data_loader,
                                     validation_data_loader=dev_data_loader,
                                     patience=10,
                                     num_epochs=20,
                                     cuda_device=-1)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
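This script leaves out its imports and constants; one plausible sketch of the missing pieces, assuming the realworldnlp companion code and arbitrary dimension values, is:

from itertools import chain

import numpy as np
import torch
import torch.optim as optim

from allennlp.data import Vocabulary
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.training import GradientDescentTrainer
from allennlp_models.classification.dataset_readers import StanfordSentimentTreeBankDatasetReader

# LstmClassifier and SentenceClassifierPredictor are assumed to come from the
# book's companion repository (realworldnlp), not from AllenNLP itself.
from examples.sentiment.sst_classifier import LstmClassifier
from realworldnlp.predictors import SentenceClassifierPredictor

# Hyperparameters; the exact values here are an assumption.
EMBEDDING_DIM = 128
HIDDEN_DIM = 128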
Example 3
class TrainerTestBase(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()
        self.data_path = str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
        self.reader = SequenceTaggingDatasetReader()
        self.data_loader = MultiProcessDataLoader(self.reader, self.data_path, batch_size=2)
        self.data_loader_lazy = MultiProcessDataLoader(
            self.reader, self.data_path, batch_size=2, max_instances_in_memory=10
        )
        self.instances = list(self.data_loader.iter_instances())
        self.vocab = Vocabulary.from_instances(self.instances)
        self.data_loader.index_with(self.vocab)
        self.data_loader_lazy.index_with(self.vocab)
        self.model_params = Params(
            {
                "text_field_embedder": {
                    "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                },
                "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
            }
        )
        self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
        self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
        self.validation_data_loader = MultiProcessDataLoader(
            self.reader, self.data_path, batch_size=2
        )
        self.validation_data_loader.index_with(self.vocab)
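A concrete test built on this base class would typically wire these fixtures into a trainer; a minimal sketch (the subclass and assertions are illustrative, not taken from the original suite) could look like:

from allennlp.training import GradientDescentTrainer


class TestTrainerSmoke(TrainerTestBase):
    def test_trains_for_one_epoch(self):
        trainer = GradientDescentTrainer(
            model=self.model,
            optimizer=self.optimizer,
            data_loader=self.data_loader,
            validation_data_loader=self.validation_data_loader,
            num_epochs=1,
        )
        metrics = trainer.train()
        # train() returns a metrics dict; a per-epoch training loss should be in it.
        assert "training_loss" in metrics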
Example 4
    def read_and_check_instances(self, filepath: str, num_workers: int = 0):
        data_loader = MultiProcessDataLoader(self.reader,
                                             filepath,
                                             num_workers=num_workers,
                                             batch_size=1,
                                             start_method="spawn")
        all_instances = []
        for instance in data_loader.iter_instances():
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # Should see the same four sentences, each appearing exactly 100 times.
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100
Example 5
def test_batches_per_epoch():
    loader = MultiProcessDataLoader(MockDatasetReader(),
                                    "some path",
                                    batch_size=4,
                                    batches_per_epoch=10)
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)

    assert len(loader) == 10
    assert len(list(loader)) == 10
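Several of these tests depend on a MockDatasetReader fixture that is not shown. One plausible sketch, assuming a recent AllenNLP 2.x and guessing the field names from how the tests index into batches, is:

import torch

from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import MetadataField, TensorField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token


class MockDatasetReader(DatasetReader):
    NUM_INSTANCES = 100

    def __init__(self, **kwargs):
        # Manual sharding flags let the same reader be used with worker processes.
        super().__init__(
            manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs
        )

    def _read(self, file_path):
        for i in self.shard_iterable(range(self.NUM_INSTANCES)):
            yield Instance({
                "index": MetadataField(i),
                "source": TextField([Token("hello"), Token("world")]),
                "tensor": TensorField(torch.randn(3)),
            })

    def apply_token_indexers(self, instance):
        # Indexers are attached after instance creation so they never have to be
        # serialized along with instances coming out of worker processes.
        instance.fields["source"].token_indexers = {"tokens": SingleIdTokenIndexer()}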
Example 6
def test_load_to_cuda(options):
    reader = MockDatasetReader()
    loader = MultiProcessDataLoader(
        reader=reader,
        data_path="this doens't matter",
        cuda_device=0,
        **options,
    )
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)
    for batch in loader:
        assert batch["tensor"].device == torch.device("cuda:0")
Example 7
def test_drop_last():
    """
    Ensures that the `drop_last` option is respected.
    """
    loader = MultiProcessDataLoader(MockDatasetReader(),
                                    "some path",
                                    batch_size=16,
                                    drop_last=True)
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)

    # Should still load all instances. `drop_last` only affects batches.
    assert len(list(loader.iter_instances())) == MockDatasetReader.NUM_INSTANCES

    # Just here because the assertions below depend on the exact value of NUM_INSTANCES.
    assert MockDatasetReader.NUM_INSTANCES == 100
    batches = list(loader)
    for batch in batches:
        assert len(batch["index"]) == 16
    assert len(batches) == 6
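For contrast, with the same 100-instance fixture and drop_last left at its default of False, the final partial batch is kept. A short sketch of the complementary check (the test name and assertions are illustrative):

def test_keep_last():
    loader = MultiProcessDataLoader(MockDatasetReader(),
                                    "some path",
                                    batch_size=16,
                                    drop_last=False)
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)

    batches = list(loader)
    # 100 instances = 6 full batches of 16 plus one trailing batch of 4.
    assert len(batches) == 7
    assert sum(len(batch["index"]) for batch in batches) == 100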
Example 8
    def test_with_multi_process_loading(self, lazy):
        readers = {"a": PlainTextReader(), "b": PlainTextReader(), "c": PlainTextReader()}
        reader = InterleavingDatasetReader(readers)
        data_dir = self.FIXTURES_ROOT / "data"
        file_path = {
            "a": data_dir / "babi.txt",
            "b": data_dir / "conll2003.txt",
            "c": data_dir / "conll2003.txt",
        }
        vocab = Vocabulary.from_instances(reader.read(file_path))
        loader = MultiProcessDataLoader(
            reader,
            file_path,
            num_workers=1,
            batch_size=1,
            max_instances_in_memory=2 if lazy else None,
        )
        loader.index_with(vocab)

        list(loader.iter_instances())
        list(loader)
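The `lazy` flag is provided by a pytest parametrization on the test class, which the snippet omits; a minimal sketch of that wiring (the class name is assumed) is:

import pytest


class TestInterleavingDatasetReader(AllenNlpTestCase):
    @pytest.mark.parametrize("lazy", (True, False))
    def test_with_multi_process_loading(self, lazy):
        ...  # body as above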
Example 9
def test_error_raised_when_text_fields_contain_token_indexers(
        max_instances_in_memory):
    """
    This tests that the MultiProcessDataLoader raises an error when num_workers > 0
    but the dataset reader doesn't implement apply_token_indexers().

    It also tests that errors raised within a worker process are propagated upwards
    to the main process, and that when that happens, all workers will be successfully
    killed.
    """

    with pytest.raises(
            WorkerError,
            match="Make sure your dataset reader's text_to_instance()"):
        loader = MultiProcessDataLoader(
            MockOldDatasetReader(),
            "this-path-doesn't-matter",
            num_workers=2,
            max_instances_in_memory=max_instances_in_memory,
            batch_size=1,
        )
        list(loader.iter_instances())
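The error message points at the fix: build TextFields without indexers in text_to_instance() and attach them in apply_token_indexers() instead, so instances can be produced in worker processes. A minimal sketch of a reader that satisfies this contract (class, field, and data names are illustrative):

from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token


class FixedReader(DatasetReader):
    def _read(self, file_path):
        for line in ["cats are animals .", "dogs are animals ."]:
            yield self.text_to_instance(line)

    def text_to_instance(self, line: str) -> Instance:
        # No token_indexers passed to the TextField here; attaching them this early
        # is what triggers the WorkerError when num_workers > 0.
        return Instance({"tokens": TextField([Token(t) for t in line.split()])})

    def apply_token_indexers(self, instance: Instance) -> None:
        # Attach the (potentially heavy) indexers afterwards so they are not
        # serialized with every instance produced in a worker process.
        instance.fields["tokens"].token_indexers = {"tokens": SingleIdTokenIndexer()}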
Example 10
def test_multiprocess_data_loader(options):
    reader = MockDatasetReader()
    data_path = "this doesn't matter"

    loader = MultiProcessDataLoader(reader=reader,
                                    data_path=data_path,
                                    **options)
    if not options.get("max_instances_in_memory"):
        # Instances should be loaded immediately if max_instances_in_memory is None.
        assert loader._instances

    instances: Iterable[Instance] = loader.iter_instances()
    # This should be a generator.
    assert not isinstance(instances, (list, tuple))
    instances = list(instances)
    assert len(instances) == MockDatasetReader.NUM_INSTANCES

    # Now build vocab.
    vocab = Vocabulary.from_instances(instances)

    # Before indexing the loader, trying to iterate through batches will raise an error.
    with pytest.raises(ValueError,
                       match="Did you forget to call DataLoader.index_with"):
        list(loader)

    loader.index_with(vocab)

    # Run through a couple epochs to make sure we collect all of the instances.
    for epoch in range(2):
        indices: List[int] = []
        for batch in loader:
            for index in batch["index"]:
                indices.append(index)  # type: ignore
        # Ensure no duplicates.
        assert len(indices) == len(set(indices)), indices
        # Ensure all collected.
        assert len(indices) == MockDatasetReader.NUM_INSTANCES, epoch
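As with the CUDA test above, `options` comes from a parametrization that covers both the eager path and the lazy max_instances_in_memory path; one plausible (assumed) set of combinations is:

import pytest


@pytest.mark.parametrize(
    "options",
    [
        {"batch_size": 2},
        {"batch_size": 2, "num_workers": 1},
        {"batch_size": 2, "max_instances_in_memory": 10},
        {"batch_size": 2, "num_workers": 1, "max_instances_in_memory": 10, "start_method": "spawn"},
    ],
)
def test_multiprocess_data_loader(options):
    ...  # body as above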