Example #1
    def test_drop_last_works(self):
        sampler = BucketBatchSampler(
            batch_size=2,
            padding_noise=0,
            sorting_keys=["text"],
            drop_last=True,
        )

        # We use a custom collate_fn for testing, which doesn't actually create tensors,
        # just the allennlp Batches.
        def collate_fn(x, **kwargs):
            return Batch(x)

        data_loader = MultiProcessDataLoader(
            self.get_mock_reader(),
            "fake_path",
            batch_sampler=sampler,
        )
        data_loader.collate_fn = collate_fn
        data_loader.index_with(self.vocab)
        batches = list(data_loader)
        stats = self.get_batches_stats(batches)

        # all batches have length batch_size
        assert all(batch_len == 2 for batch_len in stats["batch_lengths"])

        # we should have lost one instance by skipping the last batch
        assert stats["total_instances"] == len(self.instances) - 1
Example #2
    def read_and_check_instances(self, filepath: str, num_workers: int = 0):
        data_loader = MultiProcessDataLoader(self.reader,
                                             filepath,
                                             num_workers=num_workers,
                                             batch_size=1,
                                             start_method="spawn")
        all_instances = []
        for instance in data_loader.iter_instances():
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should contain exactly the same four sentences, each seen 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100
Example #3
def test_language_model_data_collator():
    """
    Ensure `LanguageModelingDataCollator` works
    """
    norm_loader = MultiProcessDataLoader(MockDatasetReader(),
                                         "some path",
                                         batch_size=16)
    vocab = Vocabulary.from_instances(norm_loader.iter_instances())
    norm_loader.index_with(vocab)
    batch0 = list(norm_loader)[0]

    model_name = "epwalsh/bert-xsmall-dummy"
    data_collate = LanguageModelingDataCollator(model_name)
    mlm_loader = MultiProcessDataLoader(MockDatasetReader(),
                                        "some path",
                                        batch_size=16,
                                        collate_fn=data_collate)
    vocab = Vocabulary.from_instances(mlm_loader.iter_instances())
    mlm_loader.index_with(vocab)
    batch1 = list(mlm_loader)[0]

    norm_inputs = batch0["source"]["tokens"]["token_ids"]
    mlm_inputs = batch1["source"]["tokens"]["token_ids"]
    mlm_labels = batch1["source"]["tokens"]["labels"]

    # replacing the masked MLM inputs with their labels should recover the original inputs
    assert torch.where(mlm_labels != -100, mlm_labels,
                       mlm_inputs).tolist() == norm_inputs.tolist()
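
The final assertion relies on the usual Hugging Face masked-LM labeling convention: a masked position keeps its original token id in `labels`, and every other position is set to -100. A minimal standalone sketch of that reconstruction (the token ids below are made up):

import torch

mlm_inputs = torch.tensor([[101, 103, 2003, 102]])     # 103 stands in for a [MASK] id
mlm_labels = torch.tensor([[-100, 7592, -100, -100]])  # original id kept only at the masked slot
restored = torch.where(mlm_labels != -100, mlm_labels, mlm_inputs)
assert restored.tolist() == [[101, 7592, 2003, 102]]
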
Example #4
def build_data_loaders(
    config,
    dataset_reader: DatasetReader,
) -> Tuple[MultiProcessDataLoader, MultiProcessDataLoader, MultiProcessDataLoader]:
    train_loader = MultiProcessDataLoader(dataset_reader,
                                          data_path='train',
                                          batch_size=config.batch_size_for_train,
                                          shuffle=False)
    dev_loader = MultiProcessDataLoader(dataset_reader,
                                        data_path='dev',
                                        batch_size=config.batch_size_for_eval,
                                        shuffle=False)
    test_loader = MultiProcessDataLoader(dataset_reader,
                                         data_path='test',
                                         batch_size=config.batch_size_for_eval,
                                         shuffle=False)

    return train_loader, dev_loader, test_loader
Example #5
    def test_batch_count(self):
        sampler = BucketBatchSampler(batch_size=2,
                                     padding_noise=0,
                                     sorting_keys=["text"])
        data_loader = MultiProcessDataLoader(self.get_mock_reader(),
                                             "fake_path",
                                             batch_sampler=sampler)
        data_loader.index_with(self.vocab)
        assert len(data_loader) == 3
Example #6
def main():
    reader = StanfordSentimentTreeBankDatasetReader()
    train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
    dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'

    sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
    train_data_loader = MultiProcessDataLoader(reader,
                                               train_path,
                                               batch_sampler=sampler)
    dev_data_loader = MultiProcessDataLoader(reader,
                                             dev_path,
                                             batch_sampler=sampler)

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(chain(train_data_loader.iter_instances(),
                                            dev_data_loader.iter_instances()),
                                      min_count={'tokens': 3})
    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(model=model,
                                     optimizer=optimizer,
                                     data_loader=train_data_loader,
                                     validation_data_loader=dev_data_loader,
                                     patience=10,
                                     num_epochs=20,
                                     cuda_device=-1)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
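
As the Seq2VecEncoder comment above notes, the LSTM wrapper is only one choice. A sketch of two drop-in alternatives with the same contract, reusing the EMBEDDING_DIM and HIDDEN_DIM constants from this example (the classifier only cares about the encoder's get_output_dim()):

from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder, CnnEncoder

# Average the embedded tokens into a single vector.
boe_encoder = BagOfEmbeddingsEncoder(embedding_dim=EMBEDDING_DIM, averaged=True)

# Convolve over the token embeddings and project down to HIDDEN_DIM.
cnn_encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=16,
                         output_dim=HIDDEN_DIM)
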
Example #7
def test_batches_per_epoch():
    loader = MultiProcessDataLoader(MockDatasetReader(),
                                    "some path",
                                    batch_size=4,
                                    batches_per_epoch=10)
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)

    # MockDatasetReader yields 100 instances -> 25 raw batches of 4, but
    # `batches_per_epoch` caps each epoch at 10 batches.
    assert len(loader) == 10
    assert len(list(loader)) == 10
Example #8
def test_load_to_cuda(options):
    reader = MockDatasetReader()
    loader = MultiProcessDataLoader(
        reader=reader,
        data_path="this doens't matter",
        cuda_device=0,
        **options,
    )
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)
    for batch in loader:
        assert batch["tensor"].device == torch.device("cuda:0")
Example #9
def build_data_loaders(
    reader,
    train_data_path: str,
    validation_data_path: str,
) -> Tuple[DataLoader, DataLoader]:
    train_loader = MultiProcessDataLoader(reader,
                                          train_data_path,
                                          batch_size=8,
                                          shuffle=True)
    dev_loader = MultiProcessDataLoader(reader,
                                        validation_data_path,
                                        batch_size=8,
                                        shuffle=False)
    return train_loader, dev_loader
Example #10
    def test_batch_count(self):
        sampler = MaxTokensBatchSampler(max_tokens=8,
                                        padding_noise=0,
                                        sorting_keys=["text"])
        data_loader = MultiProcessDataLoader(self.get_mock_reader(),
                                             "fake_path",
                                             batch_sampler=sampler)
        assert len(data_loader) == 3
Example #11
class TrainerTestBase(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()
        self.data_path = str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
        self.reader = SequenceTaggingDatasetReader()
        self.data_loader = MultiProcessDataLoader(self.reader, self.data_path, batch_size=2)
        self.data_loader_lazy = MultiProcessDataLoader(
            self.reader, self.data_path, batch_size=2, max_instances_in_memory=10
        )
        self.instances = list(self.data_loader.iter_instances())
        self.vocab = Vocabulary.from_instances(self.instances)
        self.data_loader.index_with(self.vocab)
        self.data_loader_lazy.index_with(self.vocab)
        self.model_params = Params(
            {
                "text_field_embedder": {
                    "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                },
                "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
            }
        )
        self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
        self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
        self.validation_data_loader = MultiProcessDataLoader(
            self.reader, self.data_path, batch_size=2
        )
        self.validation_data_loader.index_with(self.vocab)
Example #12
    def test_batch_count_with_drop_last(self):
        sampler = BucketBatchSampler(
            batch_size=2,
            padding_noise=0,
            sorting_keys=["text"],
            drop_last=True,
        )
        data_loader = MultiProcessDataLoader(self.get_mock_reader(),
                                             "fake_path",
                                             batch_sampler=sampler)
        # The mock reader yields an odd number of instances, so `drop_last`
        # drops the final partial batch (compare the 3 batches above).
        assert len(data_loader) == 2
Example #13
    def test_with_multi_process_loading(self, lazy):
        readers = {"a": PlainTextReader(), "b": PlainTextReader(), "c": PlainTextReader()}
        reader = InterleavingDatasetReader(readers)
        data_dir = self.FIXTURES_ROOT / "data"
        file_path = {
            "a": data_dir / "babi.txt",
            "b": data_dir / "conll2003.txt",
            "c": data_dir / "conll2003.txt",
        }
        vocab = Vocabulary.from_instances(reader.read(file_path))
        loader = MultiProcessDataLoader(
            reader,
            file_path,
            num_workers=1,
            batch_size=1,
            max_instances_in_memory=2 if lazy else None,
        )
        loader.index_with(vocab)

        list(loader.iter_instances())
        list(loader)
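
For context, `InterleavingDatasetReader` tags each instance with the key of the reader that produced it, and its interleaving scheme is configurable. A small sketch of the non-default scheme:

# "round_robin" (the default) alternates between readers instance by instance;
# "all_at_once" exhausts each underlying reader in turn.
reader = InterleavingDatasetReader(readers, scheme="all_at_once")
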
Example #14
def test_error_raised_when_text_fields_contain_token_indexers(
        max_instances_in_memory):
    """
    This tests that the MultiProcessDataLoader raises an error when num_workers > 0
    but the dataset reader doesn't implement apply_token_indexers().

    It also tests that errors raised within a worker process are propagated upwards
    to the main process, and that when that happens, all workers will be successfully
    killed.
    """

    with pytest.raises(
            WorkerError,
            match="Make sure your dataset reader's text_to_instance()"):
        loader = MultiProcessDataLoader(
            MockOldDatasetReader(),
            "this-path-doesn't-matter",
            num_workers=2,
            max_instances_in_memory=max_instances_in_memory,
            batch_size=1,
        )
        list(loader.iter_instances())
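
For reference, the fix this error message points at is to keep `token_indexers` off the `TextField` when the instance is created and attach them later in `apply_token_indexers()`, so instances can be pickled across worker processes. A minimal sketch (the `text` field name and single-id indexer are assumptions, not from this test):

from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token


class WorkerSafeReader(DatasetReader):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path):
        for line in ["a b c", "d e f"]:  # stand-in for real file parsing
            yield self.text_to_instance(line)

    def text_to_instance(self, line: str) -> Instance:
        # No token_indexers here, so the instance pickles cleanly in a worker.
        return Instance({"text": TextField([Token(t) for t in line.split()])})

    def apply_token_indexers(self, instance: Instance) -> None:
        # Called by the loader in the main process, after unpickling.
        instance.fields["text"].token_indexers = self.token_indexers
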
Example #15
def test_multiprocess_data_loader(options):
    reader = MockDatasetReader()
    data_path = "this doesn't matter"

    loader = MultiProcessDataLoader(reader=reader,
                                    data_path=data_path,
                                    **options)
    if not options.get("max_instances_in_memory"):
        # Instances should be loaded immediately if max_instances_in_memory is None.
        assert loader._instances

    instances: Iterable[Instance] = loader.iter_instances()
    # This should be a generator.
    assert not isinstance(instances, (list, tuple))
    instances = list(instances)
    assert len(instances) == MockDatasetReader.NUM_INSTANCES

    # Now build vocab.
    vocab = Vocabulary.from_instances(instances)

    # Before indexing the loader, trying to iterate through batches will raise an error.
    with pytest.raises(ValueError,
                       match="Did you forget to call DataLoader.index_with"):
        list(loader)

    loader.index_with(vocab)

    # Run through a couple epochs to make sure we collect all of the instances.
    for epoch in range(2):
        indices: List[int] = []
        for batch in loader:
            for index in batch["index"]:
                indices.append(index)  # type: ignore
        # Ensure no duplicates.
        assert len(indices) == len(set(indices)), indices
        # Ensure all collected.
        assert len(indices) == MockDatasetReader.NUM_INSTANCES, epoch
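
Condensed, the lifecycle this test exercises is: construct the loader, build a vocabulary from `iter_instances()`, call `index_with()`, and only then iterate batches. A short sketch reusing `MockDatasetReader`:

loader = MultiProcessDataLoader(MockDatasetReader(), "this doesn't matter", batch_size=4)
vocab = Vocabulary.from_instances(loader.iter_instances())  # 1. build the vocab
loader.index_with(vocab)                                    # 2. required before batching
batches = list(loader)                                      # 3. now batches can be iterated
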
Example #16
def build_data_loader(data_reader: DatasetReader,
                      data_path: Path,
                      batch_size: int,
                      shuffle: bool = True) -> DataLoader:
    """
    Build an AllenNLP DataLoader.

    :param train_data: The training dataset, torch object.
    :param dev_data: The dev dataset, torch object.
    :return train_loader, dev_loader: The train and dev data loaders as a
            tuple.
    """
    # Note that DataLoader is imported from allennlp above, *not* torch.
    # We need to get the allennlp-specific collate function, which is
    # what actually does indexing and batching.
    # log.debug("Building DataLoader.")
    loader = MultiProcessDataLoader(reader=data_reader,
                                    data_path=data_path,
                                    batch_size=batch_size,
                                    shuffle=shuffle)
    # log.debug("DataLoader built.")

    return loader
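
A hypothetical call site (the reader class and data path below are placeholders, not part of the source):

from pathlib import Path

reader = SequenceTaggingDatasetReader()
train_loader = build_data_loader(reader, Path("data/train.tsv"), batch_size=32)
vocab = Vocabulary.from_instances(train_loader.iter_instances())
train_loader.index_with(vocab)  # indexing is still the caller's responsibility
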
Example #17
def test_drop_last():
    """
    Ensures that the `drop_last` option is respected.
    """
    loader = MultiProcessDataLoader(MockDatasetReader(),
                                    "some path",
                                    batch_size=16,
                                    drop_last=True)
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)

    # Should still load all instances. `drop_last` only affects batches.
    assert len(list(loader.iter_instances())) == MockDatasetReader.NUM_INSTANCES

    # Just here because the assertions below depend on the exact value of NUM_INSTANCES.
    assert MockDatasetReader.NUM_INSTANCES == 100
    batches = list(loader)
    for batch in batches:
        assert len(batch["index"]) == 16
    assert len(batches) == 6
Example #18
def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
):
    """
    Expected results for ``test.json`` from the Open Entity dataset:
    {'micro_precision': 0.7997806072235107, 'micro_recall': 0.7657563090324402, 'micro_fscore': 0.7823987007141113}.

    Parameters
    ----------
    data_path : str
        Data path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-training model because sometimes
        the tokenizer of the downstream task is not compatible with allennlp.
    batch_size : int
    cuda_device : int
    result_save_path : str
    """
    import_module_and_submodules("examples_allennlp")

    tokenizer_kwargs = {"additional_special_tokens": [ENT]}
    reader = EntityTypingReader(
        tokenizer=PretrainedTransformerTokenizer(
            model_name=checkpoint_tokenizer_name,
            add_special_tokens=True,
            tokenizer_kwargs=tokenizer_kwargs),
        token_indexers={
            "tokens":
            PretrainedTransformerIndexer(model_name=checkpoint_tokenizer_name,
                                         tokenizer_kwargs=tokenizer_kwargs)
        },
        use_entity_feature=True,
    )

    transformers_tokenizer = LukeTokenizer.from_pretrained(
        checkpoint_model_name)
    transformers_model = LukeForEntityClassification.from_pretrained(
        checkpoint_model_name)

    vocab = Vocabulary()
    vocab.add_transformer_vocab(transformers_tokenizer, "tokens")
    num_labels = len(transformers_model.config.id2label)
    labels = [transformers_model.config.id2label[i] for i in range(num_labels)]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # read model
    params = Params.from_file(
        model_config_path,
        ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name})
    model = Model.from_params(params, vocab=vocab)
    model.classifier = transformers_model.classifier
    model.eval()

    # set the GPU device to use
    if cuda_device < 0:
        device = torch.device("cpu")
    else:
        device = torch.device(f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(reader,
                                    data_path,
                                    batch_size=batch_size,
                                    shuffle=False)
    loader.index_with(model.vocab)
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            batch = nn_util.move_to_device(batch, device)
            output_dict = model(**batch)

    metrics = model.get_metrics(reset=True)
    print(metrics)
    if result_save_path is not None:
        with open(result_save_path, "w") as f:
            json.dump(metrics, f)
Example #19
def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
    prediction_save_path: str,
):
    """
    Expected results for CoNLL-2003 NER English test set.
    {'f1': 0.9461946902654867, 'precision': 0.945859872611465, 'recall': 0.9465297450424929}

    Parameters
    ----------
    data_path : str
        Data path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-training model because sometimes
        the tokenizer of the downstream task is not compatible with allennlp.
    batch_size : int
    cuda_device : int
    result_save_path : str
    prediction_save_path : str
    """
    import_module_and_submodules("examples_allennlp")

    reader = ConllSpanReader(
        tokenizer=PretrainedTransformerTokenizer(
            model_name=checkpoint_tokenizer_name,
            add_special_tokens=False,
            tokenizer_kwargs={"add_prefix_space": True}),
        token_indexers={
            "tokens":
            PretrainedTransformerIndexer(model_name=checkpoint_tokenizer_name)
        },
        use_entity_feature=True,
    )

    transformers_tokenizer = LukeTokenizer.from_pretrained(
        checkpoint_model_name)
    transformers_model = LukeForEntitySpanClassification.from_pretrained(
        checkpoint_model_name)

    vocab = Vocabulary()
    vocab.add_transformer_vocab(transformers_tokenizer, "tokens")
    num_labels = len(transformers_model.config.id2label)
    labels = [transformers_model.config.id2label[i] for i in range(num_labels)]
    labels = ["O" if l == "NIL" else l for l in labels]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # read model
    params = Params.from_file(
        model_config_path,
        ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name})
    if prediction_save_path is not None:
        params["prediction_save_path"] = prediction_save_path
    model = Model.from_params(params, vocab=vocab)
    model.classifier = transformers_model.classifier
    model.eval()

    # set the GPU device to use
    if cuda_device < 0:
        device = torch.device("cpu")
    else:
        device = torch.device(f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(reader,
                                    data_path,
                                    batch_size=batch_size,
                                    shuffle=False)
    loader.index_with(model.vocab)
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            batch = nn_util.move_to_device(batch, device)
            output_dict = model(**batch)

    metrics = model.get_metrics(reset=True)
    print(metrics)
    if result_save_path is not None:
        with open(result_save_path, "w") as f:
            json.dump(metrics, f)