Example #1
    def test_caching_with_lazy_reader_in_multi_process_loader(self):
        data_file = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "imdb_corpus.jsonl"
        )
        reader = TextClassificationJsonReader(lazy=True, cache_directory=self.cache_directory)
        deque(
            PyTorchDataLoader(reader.read(data_file), collate_fn=lambda b: b[0], num_workers=2),
            maxlen=0,
        )

        # We shouldn't write to the cache when the data is being loaded from multiple
        # processes.
        cache_file = reader._get_cache_location_for_file_path(str(data_file))
        assert not os.path.exists(cache_file)

        # But try again from the main process and we should see the cache file.
        instances = list(reader.read(data_file))
        assert instances
        assert os.path.exists(cache_file)

        # Reading again from a multi-process loader should read from the cache.
        new_instances = list(
            PyTorchDataLoader(reader.read(data_file), collate_fn=lambda b: b[0], num_workers=2)
        )
        assert len(instances) == len(new_instances)
Example #2
    def compute_influence_values(self, training_loader: PyTorchDataLoader,
                                 validation_loader: PyTorchDataLoader):

        training_loader = PyTorchDataLoader(training_loader.dataset,
                                            batch_size=1,
                                            shuffle=False)
        validation_loader = PyTorchDataLoader(validation_loader.dataset,
                                              batch_size=1,
                                              shuffle=False)

        influence_values = []
        validation_idx = []

        for batch in tqdm(iter(validation_loader)):
            # The assert message is breakpoint(), so a failing assert drops into the debugger.
            assert len(batch["metadata"]) == 1, breakpoint()
            influence_values.append([])
            # Tuple of per-parameter tensors: H^{-1} . grad(L(z_test))
            ihvp = self.ihvp(batch, training_loader)
            validation_idx.append(batch["metadata"][0]["idx"])

            training_idx = []
            for train_ex in tqdm(iter(training_loader)):
                assert len(train_ex["metadata"]) == 1, breakpoint()
                train_grad = self.get_grad(train_ex)

                if_value = sum(
                    (x * y).sum().item()
                    for x, y in zip(ihvp, train_grad)) / len(training_loader)
                influence_values[-1].append(if_value)

                training_idx.append(train_ex["metadata"][0]["idx"])

        return np.array(influence_values), training_idx, validation_idx
Example #3
    def ihvp(self, test_example, training_loader):
        self._predictor._model.zero_grad()
        v = self.get_grad(test_example)

        if not self._use_hessian:
            return tuple(x.detach() for x in v)

        ihv_estimate = v

        training_loader = PyTorchDataLoader(training_loader.dataset,
                                            batch_size=5,
                                            shuffle=True)
        training_iter = iter(training_loader)
        for _ in tqdm(range(len(training_loader))):
            train_batch = next(training_iter)

            self._predictor._model.zero_grad()

            loss = self.get_outputs_for_batch(train_batch)
            hv = vhp_s(loss, self._valid_parameters, ihv_estimate)

            with torch.no_grad():
                ihv_estimate = tuple(
                    _v + (1 - self._damping) * _ihv - _hv / self._scale
                    for _v, _ihv, _hv in zip(v, ihv_estimate, hv))

        return tuple(x.detach() for x in ihv_estimate)
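
The recursion in ihvp above is a LiSSA-style stochastic estimate of the inverse-Hessian-vector product H^-1 . v: each step replaces the running estimate with v + (1 - damping) * estimate - H . estimate / scale, drawing a fresh mini-batch for the Hessian-vector product (vhp_s). A small self-contained check, my own toy sketch with a fixed 2x2 Hessian standing in for the mini-batch Hessian-vector products, shows that with damping 0 the fixed point of this update is scale * H^-1 . v:

# Toy numeric check of the update  ihv <- v + (1 - damping) * ihv - (H @ ihv) / scale.
# With damping = 0 and scale larger than the biggest eigenvalue of H, the iteration
# converges to scale * H^{-1} v, so ihv / scale recovers the inverse-Hessian-vector product.
import numpy as np

H = np.array([[2.0, 0.3], [0.3, 1.0]])  # toy positive-definite "Hessian"
v = np.array([1.0, -1.0])
scale, damping = 10.0, 0.0

ihv = v.copy()
for _ in range(2000):
    ihv = v + (1 - damping) * ihv - (H @ ihv) / scale

print(np.allclose(ihv / scale, np.linalg.solve(H, v)))  # True
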
Example #4
def test_multi_processing_with_lazy_dataset_warns():
    def fake_instance_generator(file_name: str) -> Iterable[Instance]:
        yield from []

    with pytest.warns(UserWarning, match=r".*deadlocks.*"):
        PyTorchDataLoader(AllennlpLazyDataset(fake_instance_generator,
                                              "nonexistent_file"),
                          num_workers=1)
Example #5
    def test_batch_of_entirely_empty_lists_works(self):
        dataset = AllennlpDataset([self.empty_instance, self.empty_instance],
                                  self.vocab)

        model = DummyModel(self.vocab)
        model.eval()
        loader = PyTorchDataLoader(dataset, batch_size=2)
        batch = next(iter(loader))
        model.forward(**batch)
Example #6
 def test_max_instances_with_multi_process_loader(self, num_workers):
     data_file = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                  "text_classification_json" / "imdb_corpus.jsonl")
     reader = TextClassificationJsonReader(max_instances=2, lazy=True)
     instances = list(
         PyTorchDataLoader(reader.read(data_file),
                           collate_fn=lambda b: b[0],
                           num_workers=num_workers))
     assert len(instances) == 2
Example #7
def main():
    args = get_args()
    # Read config.
    file_dict = json.loads(evaluate_file(args.training_config))
    model_dict = file_dict["model"]

    if args.use_bert:
        bert_name = model_dict["embedder"]["token_embedders"]["bert"][
            "model_name"]
    else:
        bert_name = None

    # Hack to replace components that we're setting in the script.
    for name in ["type", "embedder", "initializer", "module_initializer"]:
        del model_dict[name]

    # Create indexer.
    if args.use_bert:
        tok_indexers = {
            "bert":
            token_indexers.PretrainedTransformerMismatchedIndexer(
                bert_name, max_length=512)
        }
    else:
        tok_indexers = {"tokens": token_indexers.SingleIdTokenIndexer()}

    # Read input data.
    reader = DyGIEReader(max_span_width=8,
                         token_indexers=tok_indexers,
                         max_instances=args.max_instances)
    data = reader.read(file_dict["train_data_path"])
    vocab = vocabulary.Vocabulary.from_instances(data)
    data.index_with(vocab)

    # Create embedder.
    if args.use_bert:
        token_embedder = token_embedders.PretrainedTransformerMismatchedEmbedder(
            bert_name, max_length=512)
        embedder = text_field_embedders.BasicTextFieldEmbedder(
            {"bert": token_embedder})
    else:
        token_embedder = token_embedders.Embedding(
            num_embeddings=vocab.get_vocab_size("tokens"), embedding_dim=100)
        embedder = text_field_embedders.BasicTextFieldEmbedder(
            {"tokens": token_embedder})

    # Create iterator and model.
    iterator = PyTorchDataLoader(batch_size=1, dataset=data)
    if args.model_archive is None:
        model = dygie.DyGIE(vocab=vocab, embedder=embedder, **model_dict)
    else:
        model = dygie.DyGIE.from_archive(args.model_archive)

    # Run forward pass over a single entry.
    for batch in iterator:
        output_dict = model(**batch)
Example #8
def run(args):
    predictor: Predictor = get_predictor(args)

    training_file = args.training_file
    validation_file = args.validation_file

    training_data = read_data(predictor._dataset_reader, training_file)
    validation_data = read_data(predictor._dataset_reader, validation_file)

    print("Indexing with Vocabulary")
    training_data.index_with(predictor._model.vocab)
    validation_data.index_with(predictor._model.vocab)

    training_loader = PyTorchDataLoader(training_data,
                                        batch_size=args.training_batch_size,
                                        shuffle=False)
    validation_loader = PyTorchDataLoader(
        validation_data, batch_size=args.validation_batch_size, shuffle=False)

    print("Computing Influence Values")
    if args.run_all:
        influencers = get_influencer_iterable(predictor, args)
    else:
        influencers = [get_influencer(predictor, args)]

    for influencer in influencers:
        influence_values, training_idx, validation_idx = influencer.compute_influence_values(
            training_loader, validation_loader)

        output_folder = args.output_folder
        output_subfolder = influencer.get_output_subfolder().strip()
        if len(output_subfolder) > 0:
            output_folder = os.path.join(output_folder, output_subfolder)

        print(f"Dumping stuff to {output_folder}")
        dump_results(influence_values, training_idx, validation_idx,
                     output_folder)

        print("Job done. Rejoice !")
Example #9
    def test_batch_count(self):
        dataset = AllennlpDataset(self.instances, vocab=self.vocab)
        sampler = BucketBatchSampler(dataset,
                                     batch_size=2,
                                     padding_noise=0,
                                     sorting_keys=["text"])
        # We use a custom collate_fn for testing, which doesn't actually create tensors,
        # just the allennlp Batches.
        dataloader = PyTorchDataLoader(dataset,
                                       batch_sampler=sampler,
                                       collate_fn=lambda x: Batch(x))

        assert len(dataloader) == 3
Example #10
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {"character_ids": indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()
        dataset = AllennlpDataset(instances, vocab)
        # Now finally we can iterate through batches.
        loader = PyTorchDataLoader(dataset, 3)
        for i, batch in enumerate(loader):
            lm_embeddings = elmo_bilm(
                batch["elmo"]["character_ids"]["elmo_tokens"])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings["activations"][2], lm_embeddings["mask"])

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                len(sentence.split()) for sentence in batch_sentences
            ]
            assert lengths.tolist() == expected_lengths

            # get the expected embeddings and compare!
            expected_top_layer = [
                expected_lm_embeddings[k][i] for k in range(3)
            ]
            for k in range(3):
                assert numpy.allclose(
                    top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                    expected_top_layer[k],
                    atol=1.0e-6,
                )
Example #11
 def test_trainer_respects_epoch_size_smaller_than_total(self):
     batches_per_epoch = 1
     num_epochs = 2
     data_loader_smaller_epoch = PyTorchDataLoader(
         self.instances,
         batch_size=2,
         collate_fn=allennlp_collate,
         batches_per_epoch=batches_per_epoch,
     )
     trainer = GradientDescentTrainer(
         self.model,
         self.optimizer,
         data_loader_smaller_epoch,
         validation_data_loader=self.validation_data_loader,
         num_epochs=num_epochs,
         serialization_dir=self.TEST_DIR,
     )
     assert trainer._batch_num_total == 0
     metrics = trainer.train()
     epoch = metrics["epoch"]
     assert epoch == num_epochs - 1
     assert trainer._batch_num_total == num_epochs * batches_per_epoch
Example #12
 def test_data_loader_lazy_epoch_size_correct_custom_epoch_size(self):
     batches_per_epoch = 3
     num_epochs = 3
     data_loader_custom_epoch_lazy = PyTorchDataLoader(
         self.instances_lazy,
         batch_size=2,
         collate_fn=allennlp_collate,
         batches_per_epoch=batches_per_epoch,
     )
     trainer = GradientDescentTrainer(
         self.model,
         self.optimizer,
         data_loader_custom_epoch_lazy,
         validation_data_loader=self.validation_data_loader,
         num_epochs=num_epochs,
         serialization_dir=self.TEST_DIR,
     )
     assert trainer._batch_num_total == 0
     metrics = trainer.train()
     epoch = metrics["epoch"]
     assert epoch == num_epochs - 1
     assert trainer._batch_num_total == num_epochs * batches_per_epoch
Example #13
    def test_total_loss_is_average_of_batch_loss(self):

        batches_per_epoch = 3

        data_loader_custom_epoch_lazy = PyTorchDataLoader(
            self.instances_lazy,
            batch_size=2,
            collate_fn=allennlp_collate,
            batches_per_epoch=batches_per_epoch,
        )

        class FakeBatchCallback(BatchCallback):
            def __call__(
                self,
                trainer: "GradientDescentTrainer",
                batch_inputs: List[List[TensorDict]],
                batch_outputs: List[Dict[str, Any]],
                batch_metrics: Dict[str, Any],
                epoch: int,
                batch_number: int,
                is_training: bool,
                is_master: bool,
            ) -> None:
                if not hasattr(trainer, "batch_losses"):
                    trainer.batch_losses = []  # type: ignore
                trainer.batch_losses.append(
                    batch_outputs[0]["loss"].item())  # type: ignore

        trainer = GradientDescentTrainer(
            self.model,
            self.optimizer,
            data_loader_custom_epoch_lazy,
            num_epochs=1,
            batch_callbacks=[FakeBatchCallback()],
        )
        metrics = trainer.train()

        assert metrics["training_loss"] == float(
            sum(trainer.batch_losses) / batches_per_epoch)
Example #14
    def test_drop_last_works(self):
        dataset = AllennlpDataset(self.instances, vocab=self.vocab)
        sampler = BucketBatchSampler(
            dataset,
            batch_size=2,
            padding_noise=0,
            sorting_keys=["text"],
            drop_last=True,
        )
        # We use a custom collate_fn for testing, which doesn't actually create tensors,
        # just the allennlp Batches.
        dataloader = PyTorchDataLoader(dataset,
                                       batch_sampler=sampler,
                                       collate_fn=lambda x: Batch(x))
        batches = [batch for batch in iter(dataloader)]
        stats = self.get_batches_stats(batches)

        # all batches have length batch_size
        assert all(batch_len == 2 for batch_len in stats["batch_lengths"])

        # we should have lost one instance by skipping the last batch
        assert stats["total_instances"] == len(self.instances) - 1
Example #15
def test_loader_uses_all_instances_when_batches_per_epochs_set(lazy):
    NUM_INSTANCES = 20
    BATCH_SIZE = 2
    BATCHES_PER_EPOCH = 3
    EPOCHS = 4

    class FakeDatasetReader(DatasetReader):
        def _read(self, filename: str) -> Iterable[Instance]:
            for i in range(NUM_INSTANCES):
                yield Instance({"index": LabelField(i, skip_indexing=True)})

    reader = FakeDatasetReader(lazy=lazy)
    dataset = reader.read("blah")

    loader = PyTorchDataLoader(dataset,
                               batch_size=BATCH_SIZE,
                               batches_per_epoch=BATCHES_PER_EPOCH)
    epoch_batches = []
    for epoch in range(EPOCHS):
        batches = []
        for batch in loader:
            instances = []
            for index in batch["index"]:
                instances.append(index)
            batches.append(instances)
        epoch_batches.append(batches)

    assert epoch_batches == [
        # Epoch 0.
        [[0, 1], [2, 3], [4, 5]],
        # Epoch 1.
        [[6, 7], [8, 9], [10, 11]],
        # Epoch 2.
        [[12, 13], [14, 15], [16, 17]],
        # Epoch 3.
        [[18, 19], [0, 1], [2, 3]],
    ]
Example #16
def main():
    args = get_args()
    # Read config.
    conf_dict = json.loads(evaluate_file(args.training_config))
    model_dict = conf_dict["model"]

    if args.use_bert:
        bert_name = model_dict["embedder"]["token_embedders"]["bert"][
            "model_name"]
        bert_max_length = model_dict["embedder"]["token_embedders"]["bert"][
            "max_length"]
    else:
        bert_name = None

    # Hack to replace components that we're setting in the script.
    for name in ["type", "embedder", "initializer", "module_initializer"]:
        del model_dict[name]

    # Create indexer.
    if args.use_bert:
        tok_indexers = {
            "bert":
            token_indexers.PretrainedTransformerMismatchedIndexer(
                bert_name, max_length=bert_max_length)
        }
    else:
        tok_indexers = {"tokens": token_indexers.SingleIdTokenIndexer()}

    # Read input data.
    reader_dict = conf_dict["dataset_reader"]
    reader = DyGIEReader(
        reader_dict["max_span_width"],
        max_trigger_span_width=reader_dict["max_trigger_span_width"],
        token_indexers=tok_indexers,
        max_instances=500)
    # token_indexers=tok_indexers, max_instances=args.max_instances)
    data = reader.read(conf_dict["train_data_path"])
    vocab = vocabulary.Vocabulary.from_instances(data)
    data.index_with(vocab)

    # Create embedder.
    if args.use_bert:
        token_embedder = token_embedders.PretrainedTransformerMismatchedEmbedder(
            bert_name, max_length=bert_max_length)
        embedder = text_field_embedders.BasicTextFieldEmbedder(
            {"bert": token_embedder})
    else:
        token_embedder = token_embedders.Embedding(
            num_embeddings=vocab.get_vocab_size("tokens"), embedding_dim=100)
        embedder = text_field_embedders.BasicTextFieldEmbedder(
            {"tokens": token_embedder})

    # Create context layer: if it is not a pass-through encoder, always use an LSTM when testing.
    if model_dict["context_layer"]["type"] != "pass_through":
        del model_dict["context_layer"]["type"]
        model_dict["context_layer"]["input_size"] = embedder.get_output_dim()
        context_layer = seq2seq_encoders.LstmSeq2SeqEncoder(
            **model_dict["context_layer"])
    else:
        context_layer = seq2seq_encoders.PassThroughEncoder(
            embedder.get_output_dim())
    del model_dict["context_layer"]

    # Create iterator and model.
    iterator = PyTorchDataLoader(batch_size=1, dataset=data)
    if args.model_archive is None:
        model = dygie.DyGIE(vocab=vocab,
                            embedder=embedder,
                            context_layer=context_layer,
                            **model_dict)
    else:
        model = dygie.DyGIE.from_archive(args.model_archive)

    # Run forward pass over a single entry.
    for batch in iterator:
        output_dict = model(**batch)
        print(output_dict)
Example #17
model = TK(word_embedder, n_kernels=11, n_layers=2, n_tf_dim=300, n_tf_heads=10)


# todo: optimizer and loss (a minimal sketch is given after this example)

print('Model',config["model"],'total parameters:', sum(p.numel() for p in model.parameters() if p.requires_grad))
print('Network:', model)

#
# train
#

_triple_reader = IrTripleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_triple_reader = _triple_reader.read(config["train_data"])
_triple_reader.index_with(vocab)
loader = PyTorchDataLoader(_triple_reader, batch_size=32)

for epoch in range(2):

    for batch in Tqdm.tqdm(loader):
        # todo train loop
        pass


#
# eval (duplicate this block for validation inside the train loop, but rename "loader";
# otherwise it will overwrite the train loader instantiated outside the loop)
#

_tuple_reader = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_tuple_reader = _tuple_reader.read(config["test_data"])
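
The two todo markers above leave the optimizer, the loss, and the body of the train loop open. Below is a hedged sketch of one way to fill them in; it assumes (the snippet itself does not guarantee this) that batches from IrTripleDatasetReader expose "query_tokens", "doc_pos_tokens", and "doc_neg_tokens" fields and that the TK model scores a (query, document) pair:

# Sketch only: the batch keys and the TK call signature below are assumptions.
import torch

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.MarginRankingLoss(margin=1.0, reduction="mean")

for epoch in range(2):
    for batch in Tqdm.tqdm(loader):
        optimizer.zero_grad()
        score_pos = model(batch["query_tokens"], batch["doc_pos_tokens"])
        score_neg = model(batch["query_tokens"], batch["doc_neg_tokens"])
        # Pairwise ranking loss: positive documents should score above negative ones.
        loss = criterion(score_pos, score_neg, torch.ones_like(score_pos))
        loss.backward()
        optimizer.step()

# For evaluation, index the labeled-tuple dataset and build a separately named loader,
# as the comment above warns, so it does not shadow the train "loader".
_tuple_reader.index_with(vocab)
eval_loader = PyTorchDataLoader(_tuple_reader, batch_size=32)
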
Example #18
from allennlp.data.dataloader import PyTorchDataLoader
from allennlp.modules.attention import DotProductAttention
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.seq2seq_encoders import RnnSeq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.training.trainer import GradientDescentTrainer, Trainer
from allennlp.training.optimizers import AdamOptimizer

# Additional imports this snippet needs (paths assumed; CopyNet ships in the allennlp-models package):
import tempfile
from allennlp.data import Vocabulary
from allennlp_models.generation import CopyNetDatasetReader, CopyNetSeq2Seq

import torch
import torch.nn as nn
from torch.autograd import Variable


reader = CopyNetDatasetReader(target_namespace="trg")
train_dataset = reader.read('data/train.tsv')
train_loader = PyTorchDataLoader(train_dataset, batch_size=8, shuffle=True)
vocab = Vocabulary.from_instances(train_dataset)
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
TARGET_EMBEDDING_DIM = 512

token_embedding = Embedding(embedding_dim=EMBEDDING_DIM, num_embeddings=vocab.get_vocab_size(namespace="tokens"))
word_embedding = BasicTextFieldEmbedder({"tokens": token_embedding})  # key must match the reader's indexer name ("tokens" by default)

bi_rnn_encoder = RnnSeq2SeqEncoder(EMBEDDING_DIM, HIDDEN_DIM, 2, bidirectional=True)
dot_attn = DotProductAttention()
model = CopyNetSeq2Seq(vocab, word_embedding, bi_rnn_encoder, dot_attn,
                       target_namespace="trg", target_embedding_dim=TARGET_EMBEDDING_DIM)

with tempfile.TemporaryDirectory() as serialization_dir:
    parameters = [