def setUp(self):
    super(SemanticRoleLabelerTest, self).setUp()
    dataset = SrlReader().read('tests/fixtures/conll_2012/')
    vocab = Vocabulary.from_dataset(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset
    params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "stacked_encoder": {
            "type": "lstm",
            # The encoder consumes the token embedding concatenated with the
            # binary verb indicator feature, hence embedding_dim + 1.
            "input_size": 6,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SemanticRoleLabeler.from_params(self.vocab, params)
def test_forward(self):
    lr = 0.5
    batch_size = 16
    embedding_dim = 50
    squad_reader = SquadReader()
    # Read SQuAD train set (use the test set, since it's smaller)
    train_dataset = squad_reader.read(self.squad_test)
    vocab = Vocabulary.from_dataset(train_dataset)
    train_dataset.index_instances(vocab)
    # Random embeddings for test
    test_embed_matrix = torch.rand(vocab.get_vocab_size(), embedding_dim)
    test_cbow = CBOW(test_embed_matrix)
    optimizer = optim.Adadelta(filter(lambda p: p.requires_grad,
                                      test_cbow.parameters()),
                               lr=lr)
    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("passage", "num_tokens"),
                                            ("question", "num_tokens")])
    for batch in iterator(train_dataset, num_epochs=1):
        passage = batch["passage"]["tokens"]
        question = batch["question"]["tokens"]
        span_start = batch["span_start"]
        span_end = batch["span_end"]
        output_dict = test_cbow(passage, question)
        softmax_start_logits = output_dict["softmax_start_logits"]
        softmax_end_logits = output_dict["softmax_end_logits"]
        # nll_loss expects log-probabilities, so the model is expected to
        # return log-softmax values despite the "softmax" naming.
        loss = nll_loss(softmax_start_logits, span_start.view(-1))
        loss += nll_loss(softmax_end_logits, span_end.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
def test_batch_predictions_are_consistent(self):
    # The CNN encoder has problems with this kind of test - it's not
    # properly masked yet, so changing the amount of padding in the batch
    # will result in small differences in the output of the encoder.
    # Because BiDAF is so deep, these differences get magnified through the
    # network and make this test impossible. So, we'll remove the CNN
    # encoder entirely from the model for this test. If/when we fix the CNN
    # encoder to work correctly with masking, we can change this back to
    # how the other models run this test, with just a single line.
    # pylint: disable=protected-access,attribute-defined-outside-init

    # Save some state.
    saved_dataset = self.dataset
    saved_model = self.model

    # Modify the state, run the test with modified state.
    params = Params.from_file(self.param_file)
    reader = DatasetReader.from_params(params['dataset_reader'])
    reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
    self.dataset = reader.read('tests/fixtures/data/squad.json')
    vocab = Vocabulary.from_dataset(self.dataset)
    self.dataset.index_instances(vocab)
    del params['model']['text_field_embedder']['token_characters']
    params['model']['phrase_layer']['input_size'] = 2
    self.model = Model.from_params(vocab, params['model'])

    self.ensure_batch_predictions_are_consistent()

    # Restore the state.
    self.model = saved_model
    self.dataset = saved_dataset
def setUp(self):
    super(TestTrainer, self).setUp()
    dataset = SequenceTaggingDatasetReader().read(
        'tests/fixtures/data/sequence_tagging.tsv')
    vocab = Vocabulary.from_dataset(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset
    self.model_params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "stacked_encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(self.vocab, self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
    self.iterator = BasicIterator(batch_size=2)
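# A minimal sketch of a test that could follow the setUp() above, assuming a
# Trainer class whose constructor accepts the model, optimizer, iterator, and
# dataset built there; the exact constructor signature and num_epochs keyword
# are assumptions and may differ between versions.
def test_trainer_can_train(self):
    trainer = Trainer(self.model, self.optimizer, self.iterator,
                      self.dataset, num_epochs=2)
    trainer.train()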
def setUp(self):
    super(SimpleTaggerTest, self).setUp()
    dataset = SequenceTaggingDatasetReader().read(
        'tests/fixtures/data/sequence_tagging.tsv')
    vocab = Vocabulary.from_dataset(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset
    params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "stacked_encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(self.vocab, params)
def read_data(squad_train_path, squad_dev_path, max_passage_length,
              max_question_length, min_token_count):
    """
    Read SQuAD data, and filter by passage and question length.
    """
    squad_reader = SquadReader()
    # Read SQuAD train set
    logger.info("Reading SQuAD train set at {}".format(squad_train_path))
    train_dataset = squad_reader.read(squad_train_path)
    logger.info("Read {} training examples".format(
        len(train_dataset.instances)))

    # Filter out examples with passage length greater than max_passage_length
    # or question length greater than max_question_length
    logger.info("Filtering out examples in train set with passage length "
                "greater than {} or question length greater than {}".format(
                    max_passage_length, max_question_length))
    train_dataset.instances = [
        instance for instance in tqdm(train_dataset.instances)
        if len(instance.fields["passage"].tokens) <= max_passage_length and
        len(instance.fields["question"].tokens) <= max_question_length
    ]
    logger.info("{} training examples remain after filtering".format(
        len(train_dataset.instances)))

    # Make a vocabulary object from the train set
    train_vocab = Vocabulary.from_dataset(train_dataset,
                                          min_count=min_token_count)

    # Index the instances with the train vocabulary.
    # This converts string tokens to numerical indices.
    train_dataset.index_instances(train_vocab)

    # Read SQuAD validation set
    logger.info("Reading SQuAD validation set at {}".format(squad_dev_path))
    validation_dataset = squad_reader.read(squad_dev_path)
    logger.info("Read {} validation examples".format(
        len(validation_dataset.instances)))

    # Filter out examples with passage length greater than max_passage_length
    # or question length greater than max_question_length
    logger.info("Filtering out examples in validation set with passage "
                "length greater than {} or question length greater "
                "than {}".format(max_passage_length, max_question_length))
    validation_dataset.instances = [
        instance for instance in tqdm(validation_dataset.instances)
        if len(instance.fields["passage"].tokens) <= max_passage_length and
        len(instance.fields["question"].tokens) <= max_question_length
    ]
    logger.info("{} validation examples remain after filtering".format(
        len(validation_dataset.instances)))

    # Index the instances with the train vocabulary.
    # This converts string tokens to numerical indices.
    validation_dataset.index_instances(train_vocab)
    return train_dataset, train_vocab, validation_dataset
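# A minimal usage sketch for read_data() above; the file paths and the
# length/count thresholds are illustrative assumptions, not values taken
# from this codebase.
train_dataset, train_vocab, validation_dataset = read_data(
    squad_train_path="data/squad/train-v1.1.json",
    squad_dev_path="data/squad/dev-v1.1.json",
    max_passage_length=400,
    max_question_length=50,
    min_token_count=3)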
def set_up_model(self, param_file, dataset_file):
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params['dataset_reader'])
    dataset = reader.read(dataset_file)
    vocab = Vocabulary.from_dataset(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset

    self.model = Model.from_params(self.vocab, params['model'])
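# A hypothetical example of how a model test might call set_up_model(); the
# class name and fixture paths here are illustrative assumptions, and the
# test case is assumed to subclass the base class defining set_up_model().
class MyModelTest(ModelTestCase):
    def setUp(self):
        super(MyModelTest, self).setUp()
        self.set_up_model('tests/fixtures/my_model/experiment.json',
                          'tests/fixtures/data/my_data.tsv')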
def setUp(self):
    super(TestDecomposableAttention, self).setUp()
    constants.GLOVE_PATH = 'tests/fixtures/glove.6B.300d.sample.txt.gz'
    dataset = SnliReader().read('tests/fixtures/data/snli.jsonl')
    vocab = Vocabulary.from_dataset(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset
    self.token_indexers = {'tokens': SingleIdTokenIndexer()}

    self.model = DecomposableAttention.from_params(self.vocab, Params({}))
    initializer = InitializerApplicator()
    initializer(self.model)
def test_forward(self):
    lr = 0.5
    batch_size = 16
    embedding_dim = 50
    hidden_size = 15
    dropout = 0.2
    squad_reader = SquadReader()
    # Read SQuAD train set (use the test set, since it's smaller)
    train_dataset = squad_reader.read(self.squad_test)
    vocab = Vocabulary.from_dataset(train_dataset)
    train_dataset.index_instances(vocab)
    # Random embeddings for test
    test_embed_matrix = torch.rand(vocab.get_vocab_size(), embedding_dim)
    test_attention_rnn = AttentionRNN(test_embed_matrix, hidden_size,
                                      dropout)
    try:
        optimizer = optim.Adadelta(filter(lambda p: p.requires_grad,
                                          test_attention_rnn.parameters()),
                                   lr=lr)
    except ValueError:
        # Likely there are no parameters to optimize, because the code is
        # not complete. Set optimizer to None so the loop below can skip
        # the update step instead of hitting a NameError.
        optimizer = None
    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("passage", "num_tokens"),
                                            ("question", "num_tokens")])
    for batch in iterator(train_dataset, num_epochs=1):
        passage = batch["passage"]["tokens"]
        question = batch["question"]["tokens"]
        span_start = batch["span_start"]
        span_end = batch["span_end"]
        try:
            output_dict = test_attention_rnn(passage, question)
            softmax_start_logits = output_dict["softmax_start_logits"]
            softmax_end_logits = output_dict["softmax_end_logits"]
            loss = nll_loss(softmax_start_logits, span_start.view(-1))
            loss += nll_loss(softmax_end_logits, span_end.view(-1))
            if optimizer is not None:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        except NotImplementedError:
            # AttentionRNN.forward() not implemented yet, don't fail tests.
            pass
def setUp(self):
    super(SimpleTaggerTest, self).setUp()
    self.write_sequence_tagging_data()
    dataset = SequenceTaggingDatasetReader().read(self.TRAIN_FILE)
    vocab = Vocabulary.from_dataset(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset
    params = Params({
        "text_field_embedder": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 5
            }
        },
        "hidden_size": 7,
        "num_layers": 2
    })
    self.model = SimpleTagger.from_params(self.vocab, params)
def setUp(self):
    super(BidirectionalAttentionFlowTest, self).setUp()
    constants.GLOVE_PATH = 'tests/fixtures/glove.6B.100d.sample.txt.gz'
    reader_params = Params({
        'token_indexers': {
            'tokens': {
                'type': 'single_id'
            },
            'token_characters': {
                'type': 'characters'
            }
        }
    })
    dataset = SquadReader.from_params(reader_params).read(
        'tests/fixtures/data/squad.json')
    vocab = Vocabulary.from_dataset(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset
    self.token_indexers = {
        'tokens': SingleIdTokenIndexer(),
        'token_characters': TokenCharactersIndexer()
    }

    self.model = BidirectionalAttentionFlow.from_params(self.vocab,
                                                        Params({}))

    small_params = Params({
        'text_field_embedder': {
            'tokens': {
                'type': 'embedding',
                'pretrained_file': constants.GLOVE_PATH,
                'trainable': False,
                'projection_dim': 4
            },
            'token_characters': {
                'type': 'character_encoding',
                'embedding': {
                    'embedding_dim': 8
                },
                'encoder': {
                    'type': 'cnn',
                    'embedding_dim': 8,
                    'num_filters': 4,
                    'ngram_filter_sizes': [5]
                }
            }
        },
        'phrase_layer': {
            'type': 'lstm',
            'bidirectional': True,
            'input_size': 8,
            'hidden_size': 4,
            'num_layers': 1,
        },
        'similarity_function': {
            'type': 'linear',
            'combination': 'x,y,x*y',
            'tensor_1_dim': 8,
            'tensor_2_dim': 8
        },
        'modeling_layer': {
            'type': 'lstm',
            'bidirectional': True,
            'input_size': 32,
            'hidden_size': 4,
            'num_layers': 1,
        },
        'span_end_encoder': {
            'type': 'lstm',
            'bidirectional': True,
            'input_size': 56,
            'hidden_size': 4,
            'num_layers': 1,
        },
    })
    self.small_model = BidirectionalAttentionFlow.from_params(self.vocab,
                                                              small_params)