def test_multiprocess_iterate_partial_does_not_hang(self):
    for test_instances in (self.instances, self.lazy_instances):
        base_iterator = BasicIterator(batch_size=2, max_instances_in_memory=1024)
        iterator = MultiprocessIterator(base_iterator, num_workers=4)
        iterator.index_with(self.vocab)
        generator = iterator(test_instances, num_epochs=1)
        # We only iterate through 3 of the 5 instances, causing the
        # processes generating the tensors to remain active.
        for _ in range(3):
            next(generator)
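# The fixtures used above (self.instances, self.lazy_instances, self.vocab) come from the
# surrounding test class, which is not shown in this snippet. A minimal stand-in, assuming
# five single-token instances (the count implied by the comments and asserts in these tests),
# might look like this:
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

class _LazyInstances:
    # Stands in for what a lazy DatasetReader returns: re-iterable, never materialized as a list.
    def __init__(self, instances):
        self._instances = instances

    def __iter__(self):
        return iter(self._instances)

def _make_iterator_fixtures():
    indexers = {"tokens": SingleIdTokenIndexer()}
    instances = [Instance({"text": TextField([Token(word)], indexers)})
                 for word in ["this", "is", "a", "sentence", "here"]]
    vocab = Vocabulary.from_instances(instances)
    return instances, _LazyInstances(instances), vocab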
def test_yield_one_epoch_iterates_over_the_data_once(self):
    for test_instances in (self.instances, self.lazy_instances):
        base_iterator = BasicIterator(batch_size=2, max_instances_in_memory=1024)
        iterator = MultiprocessIterator(base_iterator, num_workers=4)
        iterator.index_with(self.vocab)
        batches = list(iterator(test_instances, num_epochs=1))
        # We just want to get the single-token array for the text field in the instance.
        instances = [tuple(instance.detach().cpu().numpy())
                     for batch in batches
                     for instance in batch["text"]["tokens"]]
        assert len(instances) == 5
    def test_multiprocess_reader_with_multiprocess_iterator(self):
        # use SequenceTaggingDatasetReader as the base reader
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        base_iterator = BasicIterator(batch_size=32, max_instances_in_memory=1024)

        iterator = MultiprocessIterator(base_iterator, num_workers=2)
        iterator.index_with(self.vocab)

        instances = reader.read(self.glob)

        tensor_dicts = iterator(instances, num_epochs=1)
        sizes = [len(tensor_dict['tags']) for tensor_dict in tensor_dicts]
        assert sum(sizes) == 400
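# self.base_reader, self.glob and self.vocab are again fixtures from the test class and are
# not shown. One possible sketch, assuming four shard files of 100 sentences each (so that
# the 400-instance assert above holds), written in SequenceTaggingDatasetReader's default
# "token###tag" format:
import os
import tempfile

from allennlp.data.dataset_readers import SequenceTaggingDatasetReader
from allennlp.data.vocabulary import Vocabulary

def _make_sharded_fixture():
    tmpdir = tempfile.mkdtemp()
    for shard in range(4):
        with open(os.path.join(tmpdir, f"sequences.{shard}.tsv"), "w") as shard_file:
            for _ in range(100):
                shard_file.write("cats###N\tare###V\tanimals###N\t.###N\n")
    base_reader = SequenceTaggingDatasetReader(lazy=True)
    glob = os.path.join(tmpdir, "sequences.*.tsv")
    # The tag and token inventory is identical across shards, so one shard suffices for the vocab.
    vocab = Vocabulary.from_instances(base_reader.read(os.path.join(tmpdir, "sequences.0.tsv")))
    return base_reader, glob, vocab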
def test_yield_one_epoch_iterates_over_the_data_once(self):
    for test_instances in (self.instances, self.lazy_instances):
        base_iterator = BasicIterator(batch_size=2, max_instances_in_memory=1024)
        iterator = MultiprocessIterator(base_iterator, num_workers=4)
        iterator.index_with(self.vocab)
        batches = list(iterator(test_instances, num_epochs=1))
        # We just want to get the single-token array for the text field in the instance.
        instances = [
            tuple(instance.detach().cpu().numpy())
            for batch in batches
            for instance in batch["text"]["tokens"]["tokens"]
        ]
        assert len(instances) == 5
Example #6
    def test_model_training(self):
        training_dataset = self.sample_instances if self.sample_only else self.train_instances
        #training_dataset = training_dataset[:500]
        validation_dataset = self.sample_instances if self.sample_only else self.test_instances
        serialization_dir = self.TEST_DATA_ROOT / "serialized_sample" if self.sample_only else "serialized"
        tensorboard_dir = self.TEST_DATA_ROOT / "tensorboard.seq2seq"

        batch_size = 64

        train_iterator = BucketIterator(sorting_keys=[("source_tokens",
                                                       "num_tokens")],
                                        padding_noise=0.1,
                                        batch_size=batch_size)
        train_iterator.index_with(vocab=self.vocab)
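        # MultiprocessIterator wraps the bucket iterator so that batching and tensor creation
        # run in 4 worker processes, with up to 6000 tensor dicts buffered in the output queue.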
        multiproc_iterator = MultiprocessIterator(train_iterator,
                                                  num_workers=4,
                                                  output_queue_size=6000)

        tensorboard = TensorboardWriter(get_batch_num_total=lambda: np.ceil(
            len(training_dataset) / batch_size),
                                        serialization_dir=tensorboard_dir,
                                        summary_interval=5,
                                        histogram_interval=5,
                                        should_log_parameter_statistics=True,
                                        should_log_learning_rate=True)

        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
        scheduler = CosineWithRestarts(optimizer=optimizer, t_initial=5)

        trainer = CallbackTrainer(
            model=self.model,
            serialization_dir=serialization_dir,
            iterator=multiproc_iterator,
            training_data=self.train_instances,
            num_epochs=100,
            cuda_device=0,
            optimizer=optimizer,
            callbacks=[
                LogToTensorboard(tensorboard),
                Validate(validation_data=self.test_instances,
                         validation_iterator=multiproc_iterator),
                TrackMetrics(),
                ResetMetricsCallback(),
                UpdateLearningRate(scheduler),
                ValidationLogCallback(self.train_reader, self.test_instances)
            ])

        # trainer = Trainer(model=self.model,
        #                   serialization_dir=serialization_dir,
        #                   iterator=train_iterator,
        #                   train_dataset=training_dataset,
        #                   num_epochs=1,
        #                   cuda_device=0,
        #                   optimizer=torch.optim.Adam(self.model.parameters(), lr=1e-3),
        #                   validation_dataset=training_dataset,
        #                   validation_iterator=train_iterator,
        #                   should_log_learning_rate=True,
        #                   learning_rate_scheduler=scheduler
        #                   )

        # for i in range(50):
        #     print('Epoch: {}'.format(i))
        #     trainer.train()
        #
        #     import itertools
        #
        #     predictor = Seq2SeqPredictor(self.model, self.train_reader)
        #
        #     for instance in itertools.islice(training_dataset, 10):
        #         print('SOURCE:', instance.fields['source_tokens'].tokens)
        #         print('GOLD:', instance.fields['target_tokens'].tokens)
        #         print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
        #
        # self.val_outputs_fp.close()

        trainer.train()
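        # Sketch, not part of the original test: roughly what the commented-out prediction
        # loop above would do if re-enabled after training.
        import itertools
        from allennlp.predictors import Seq2SeqPredictor
        predictor = Seq2SeqPredictor(self.model, self.train_reader)
        for instance in itertools.islice(training_dataset, 10):
            print("SOURCE:", instance.fields["source_tokens"].tokens)
            print("GOLD:", instance.fields["target_tokens"].tokens)
            print("PRED:", predictor.predict_instance(instance)["predicted_tokens"])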
Example #7
def main():
    ###############################################################################################
    prepare_global_logging(serialization_dir=args.serialization_dir,
                           file_friendly_logging=False)
    # DATA
    reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
                               target_tokenizer=CharacterTokenizer(),
                               source_token_indexers={
                                   'tokens':
                                   SingleIdTokenIndexer(namespace='tokens')
                               },
                               target_token_indexers={
                                   'tokens':
                                   SingleIdTokenIndexer(namespace='tokens')
                               },
                               target=False,
                               label=True,
                               lazy=True)
    train_data = reader.read("../../datasets/math/label-data/train-all")
    # val_data = reader.read("../../datasets/math/label-data/interpolate")

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([
        START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-',
        '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<',
        '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
        'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y',
        'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{',
        '}'
    ],
                                  namespace='tokens')
    vocab.add_tokens_to_namespace([
        'algebra', 'arithmetic', 'calculus', 'comparison', 'measurement',
        'numbers', 'polynomials', 'probability'
    ],
                                  namespace='labels')

    # MODEL
    embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                          embedding_dim=EMBEDDING_DIM)
    source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

    if args.model == 'lstm':
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(EMBEDDING_DIM,
                          HIDDEN_DIM,
                          num_layers=NUM_LAYERS,
                          batch_first=True))
    elif args.model == 'cnn':
        encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             output_dim=HIDDEN_DIM)
    else:
        raise NotImplementedError("The classifier model should be LSTM or CNN")

    model = TextClassifier(
        vocab=vocab,
        source_text_embedder=source_embedder,
        encoder=encoder,
    )
    model.to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=1e-3,
                           betas=(0.9, 0.995),
                           eps=1e-6)

    train_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                    max_instances_in_memory=1024,
                                    sorting_keys=[("source_tokens",
                                                   "num_tokens")])
    train_iterator = MultiprocessIterator(train_iterator, num_workers=16)
    train_iterator.index_with(vocab)

    val_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                  max_instances_in_memory=1024,
                                  sorting_keys=[("source_tokens", "num_tokens")
                                                ])
    val_iterator = MultiprocessIterator(val_iterator, num_workers=16)
    val_iterator.index_with(vocab)
    #pdb.set_trace()

    LR_SCHEDULER = {"type": "exponential", "gamma": 0.5, "last_epoch": -1}
    lr_scheduler = LearningRateScheduler.from_params(optimizer,
                                                     Params(LR_SCHEDULER))
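    # The "exponential" scheduler wraps torch's ExponentialLR, multiplying the learning rate
    # by gamma=0.5 after every epoch.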

    # TRAIN
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=train_iterator,
                      validation_iterator=None,
                      train_dataset=train_data,
                      validation_dataset=None,
                      patience=None,
                      shuffle=True,
                      num_epochs=1,
                      summary_interval=100,
                      learning_rate_scheduler=lr_scheduler,
                      cuda_device=CUDA_DEVICES,
                      grad_norm=5,
                      grad_clipping=5,
                      model_save_interval=600,
                      serialization_dir=args.serialization_dir,
                      keep_serialized_model_every_num_seconds=3600,
                      should_log_parameter_statistics=True,
                      should_log_learning_rate=True)
    trainer.train()
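# main() relies on module-level names that this snippet does not show (args, EMBEDDING_DIM,
# HIDDEN_DIM, NUM_LAYERS, NUM_FILTERS, BATCH_SIZE, CUDA_DEVICES, device). The sketch below
# is an assumed, minimal setup; the real hyperparameter values and argument parser are unknown.
import argparse

import torch

EMBEDDING_DIM = 128   # assumed
HIDDEN_DIM = 256      # assumed
NUM_LAYERS = 2        # assumed
NUM_FILTERS = 100     # assumed
BATCH_SIZE = 64       # assumed
CUDA_DEVICES = 0 if torch.cuda.is_available() else -1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

parser = argparse.ArgumentParser()
parser.add_argument("--serialization_dir", required=True)
parser.add_argument("--model", choices=["lstm", "cnn"], default="lstm")
args = parser.parse_args()

if __name__ == "__main__":
    main()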