def test_lstms_are_interleaved(self):
    lstm = StackedAlternatingLstm(3, 7, 8)
    for i, layer in enumerate(lstm.lstm_layers):
        if i % 2 == 0:
            assert layer.go_forward
        else:
            assert not layer.go_forward

def test_wrapper_works_with_alternating_lstm(self):
    model = PytorchSeq2VecWrapper(
        StackedAlternatingLstm(input_size=4, hidden_size=5, num_layers=3))
    input_tensor = torch.randn(2, 3, 4)
    mask = torch.ones(2, 3).bool()
    output = model(input_tensor, mask)
    assert tuple(output.size()) == (2, 5)

def test_stacked_alternating_lstm_completes_forward_pass(self):
    input_tensor = torch.rand(4, 5, 3)
    input_tensor[1, 4:, :] = 0.0
    input_tensor[2, 2:, :] = 0.0
    input_tensor[3, 1:, :] = 0.0
    input_tensor = pack_padded_sequence(input_tensor, [5, 4, 2, 1], batch_first=True)
    lstm = StackedAlternatingLstm(3, 7, 3)
    output, _ = lstm(input_tensor)
    output_sequence, _ = pad_packed_sequence(output, batch_first=True)
    numpy.testing.assert_array_equal(output_sequence.data[1, 4:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[2, 2:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[3, 1:, :].numpy(), 0.0)

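# The three test methods above are shown without their test class or imports.
# A minimal sketch of the module-level imports they appear to assume (paths as
# in AllenNLP; exact module locations may differ between versions):
import numpy
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.stacked_alternating_lstm import StackedAlternatingLstm
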
def get_models_and_inputs(batch_size, input_size, output_size, num_layers, timesteps, dropout_prob):
    # Import is here because the layer requires a GPU.
    from allennlp.modules.alternating_highway_lstm import AlternatingHighwayLSTM

    baseline = StackedAlternatingLstm(input_size, output_size, num_layers, dropout_prob,
                                      use_input_projection_bias=False).cuda()
    kernel_version = AlternatingHighwayLSTM(input_size, output_size, num_layers, dropout_prob).cuda()

    # Copy weights from the non-cuda version into the cuda version,
    # so we are starting from exactly the same place.
    weight_index = 0
    bias_index = 0
    for layer_index in range(num_layers):
        layer = getattr(baseline, 'layer_%d' % layer_index)
        input_weight = layer.input_linearity.weight
        state_weight = layer.state_linearity.weight
        bias = layer.state_linearity.bias

        kernel_version.weight.data[weight_index: weight_index + input_weight.nelement()]\
            .view_as(input_weight.t()).copy_(input_weight.data.t())
        weight_index += input_weight.nelement()

        kernel_version.weight.data[weight_index: weight_index + state_weight.nelement()]\
            .view_as(state_weight.t()).copy_(state_weight.data.t())
        weight_index += state_weight.nelement()

        kernel_version.bias.data[bias_index: bias_index + bias.nelement()].copy_(bias.data)
        bias_index += bias.nelement()

    # Create the input directly on the GPU so it stays a leaf tensor with requires_grad.
    baseline_input = torch.randn(batch_size, timesteps, input_size,
                                 device="cuda", requires_grad=True)
    # Use a detached clone so the two models are completely separate in the graph.
    kernel_version_input = baseline_input.clone().detach().requires_grad_(True)
    lengths = [timesteps - int(i / 2) for i in range(batch_size)]
    lengths = lengths[:batch_size]
    return baseline, kernel_version, baseline_input, kernel_version_input, lengths

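# A sketch of how the paired models returned above might be compared. It assumes,
# as the baseline's usage elsewhere in this section suggests, that both modules
# consume a PackedSequence and return an (output, final_state) pair; the helper
# name and tolerance below are illustrative, not taken from the original test suite.
def assert_outputs_match(baseline, kernel_version, baseline_input, kernel_version_input, lengths):
    packed_baseline = pack_padded_sequence(baseline_input, lengths, batch_first=True)
    packed_kernel = pack_padded_sequence(kernel_version_input, lengths, batch_first=True)
    baseline_output, _ = baseline(packed_baseline)
    kernel_output, _ = kernel_version(packed_kernel)
    baseline_output, _ = pad_packed_sequence(baseline_output, batch_first=True)
    kernel_output, _ = pad_packed_sequence(kernel_output, batch_first=True)
    numpy.testing.assert_array_almost_equal(baseline_output.detach().cpu().numpy(),
                                            kernel_output.detach().cpu().numpy(),
                                            decimal=4)
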
def __init__(
    self,
    input_size: int,
    hidden_size: int,
    num_layers: int,
    recurrent_dropout_probability: float = 0.0,
    use_highway: bool = True,
    use_input_projection_bias: bool = True,
) -> None:
    module = StackedAlternatingLstm(
        input_size=input_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        recurrent_dropout_probability=recurrent_dropout_probability,
        use_highway=use_highway,
        use_input_projection_bias=use_input_projection_bias,
    )
    super().__init__(module=module)

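# A minimal usage sketch for an encoder built this way. It assumes the class above
# subclasses AllenNLP's PytorchSeq2SeqWrapper (its superclass takes a `module`
# argument), so an equivalent encoder can be constructed directly:
import torch
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.stacked_alternating_lstm import StackedAlternatingLstm

encoder = PytorchSeq2SeqWrapper(
    StackedAlternatingLstm(input_size=4, hidden_size=5, num_layers=3))
inputs = torch.randn(2, 7, 4)      # (batch, timesteps, input_size)
mask = torch.ones(2, 7).bool()     # no padded positions in this toy batch
outputs = encoder(inputs, mask)
assert tuple(outputs.size()) == (2, 7, 5)  # one hidden vector per timestep
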
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             binary_feature_dim: int,
             embedding_dropout: float = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             label_smoothing: float = None,
             ignore_span_metric: bool = False,
             srl_eval_path: str = DEFAULT_SRL_EVAL_PATH) -> None:
    super(GCN_model, self).__init__(vocab, regularizer)

    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    if srl_eval_path is not None:
        # For the span based evaluation, we don't want to consider labels
        # for the verb, because the verb index is provided to the model.
        self.span_metric = SrlEvalScorer(srl_eval_path, ignore_classes=["V"])
    else:
        self.span_metric = None

    self.encoder = encoder
    self.gcn_layer = GCN(nfeat=self.encoder.get_output_dim(),
                         nhid=200,
                         nclass=64,
                         dropout=0.1)
    self.decoder = PytorchSeq2SeqWrapper(
        StackedAlternatingLstm(input_size=64,
                               hidden_size=32,
                               num_layers=2,
                               recurrent_dropout_probability=0.1,
                               use_highway=True))
    self.tag_projection_layer = TimeDistributed(Linear(32, self.num_classes))
    # self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(), self.num_classes))

    # There are exactly 2 binary features for the verb predicate embedding.
    self.binary_feature_embedding = Embedding(2, binary_feature_dim)
    self.embedding_dropout = Dropout(p=embedding_dropout)
    self._label_smoothing = label_smoothing
    self.ignore_span_metric = ignore_span_metric

    check_dimensions_match(text_field_embedder.get_output_dim() + binary_feature_dim,
                           encoder.get_input_dim(),
                           "text embedding dim + verb indicator embedding dim",
                           "encoder input dim")
    initializer(self)

def main():
    parser = create_parser()
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    model_id = create_model_id(args)

    if not path.exists(args.out_dir):
        print("# Create directory: {}".format(args.out_dir))
        os.mkdir(args.out_dir)

    # log file
    out_dir = path.join(args.out_dir, "out-" + model_id)
    print("# Create output directory: {}".format(out_dir))
    os.mkdir(out_dir)
    log = StandardLogger(path.join(out_dir, "log-" + model_id + ".txt"))
    log.write(args=args)
    write_args_log(args, path.join(out_dir, "args.json"))

    # dataset reader
    token_indexers = {
        "tokens": SingleIdTokenIndexer(),
        "elmo": ELMoTokenCharactersIndexer(),
        "bert": PretrainedBertIndexer(BERT_MODEL, use_starting_offsets=True),
        "xlnet": PretrainedTransformerIndexer(XLNET_MODEL, do_lowercase=False)
    }
    reader = SrlDatasetReader(token_indexers)

    # dataset
    train_dataset = reader.read_with_ratio(args.train, args.data_ratio)
    validation_dataset = reader.read_with_ratio(args.dev, 100)
    pseudo_dataset = reader.read_with_ratio(args.pseudo, args.data_ratio) if args.pseudo else []
    all_dataset = train_dataset + validation_dataset + pseudo_dataset
    if args.test:
        test_dataset = reader.read_with_ratio(args.test, 100)
        all_dataset += test_dataset
    vocab = Vocabulary.from_instances(all_dataset)

    # embedding
    input_size = args.binary_dim * 2 if args.multi_predicate else args.binary_dim
    if args.glove:
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=GLOVE_DIM,
                                    trainable=True,
                                    pretrained_file=GLOVE)
        input_size += GLOVE_DIM
    else:
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=args.embed_dim,
                                    trainable=True)
        input_size += args.embed_dim
    token_embedders = {"tokens": token_embedding}
    if args.elmo:
        elmo_embedding = ElmoTokenEmbedder(options_file=ELMO_OPT, weight_file=ELMO_WEIGHT)
        token_embedders["elmo"] = elmo_embedding
        input_size += ELMO_DIM
    if args.bert:
        bert_embedding = PretrainedBertEmbedder(BERT_MODEL)
        token_embedders["bert"] = bert_embedding
        input_size += BERT_DIM
    if args.xlnet:
        xlnet_embedding = PretrainedTransformerEmbedder(XLNET_MODEL)
        token_embedders["xlnet"] = xlnet_embedding
        input_size += XLNET_DIM
    word_embeddings = BasicTextFieldEmbedder(token_embedders=token_embedders,
                                             allow_unmatched_keys=True,
                                             embedder_to_indexer_map={
                                                 "bert": ["bert", "bert-offsets"],
                                                 "elmo": ["elmo"],
                                                 "tokens": ["tokens"],
                                                 "xlnet": ["xlnet"]
                                             })

    # encoder
    if args.highway:
        lstm = PytorchSeq2SeqWrapper(
            StackedAlternatingLstm(input_size=input_size,
                                   hidden_size=args.hidden_dim,
                                   num_layers=args.n_layers,
                                   recurrent_dropout_probability=args.dropout))
    else:
        pytorch_lstm = torch.nn.LSTM(input_size=input_size,
                                     hidden_size=args.hidden_dim,
                                     num_layers=int(args.n_layers / 2),
                                     batch_first=True,
                                     dropout=args.dropout,
                                     bidirectional=True)
        # initialize
        for name, param in pytorch_lstm.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                # Wii, Wif, Wic, Wio
                for n in range(4):
                    torch.nn.init.orthogonal_(
                        param.data[args.hidden_dim * n:args.hidden_dim * (n + 1)])
            elif 'bias' in name:
                param.data.fill_(0)
        lstm = PytorchSeq2SeqWrapper(pytorch_lstm)

    # model
    # torch.nn.LSTM concatenates both directions, so its output dimension is doubled.
    hidden_dim = args.hidden_dim if args.highway else args.hidden_dim * 2
    model = SemanticRoleLabelerWithAttention(vocab=vocab,
                                             text_field_embedder=word_embeddings,
                                             encoder=lstm,
                                             binary_feature_dim=args.binary_dim,
                                             embedding_dropout=args.embed_dropout,
                                             attention_dropout=0.0,
                                             use_attention=args.attention,
                                             use_multi_predicate=args.multi_predicate,
                                             hidden_dim=hidden_dim)
    if args.model:
print("# Load model parameter: {}".format(args.model)) with open(args.model, 'rb') as f: state_dict = torch.load(f, map_location='cpu') model.load_state_dict(state_dict) if torch.cuda.is_available(): cuda_device = 0 model = model.cuda(cuda_device) else: cuda_device = -1 # optimizer if args.optimizer == "Adam": optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) elif args.optimizer == "SGD": optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate) elif args.optimizer == "Adadelta": optimizer = torch.optim.Adadelta(model.parameters(), rho=0.95) else: raise ValueError("unsupported value: '{}'".format(args.optimizer)) # iterator # iterator = BucketIterator(batch_size=args.batch, sorting_keys=[("tokens", "num_tokens")]) iterator = BasicIterator(batch_size=args.batch) iterator.index_with(vocab) if not args.test_only: # Train print("# Train Method: {}".format(args.train_method)) print("# Start Train", flush=True) if args.train_method == "concat": trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_dataset + pseudo_dataset, validation_dataset=validation_dataset, validation_metric="+f1-measure-overall", patience=args.early_stopping, num_epochs=args.max_epoch, num_serialized_models_to_keep=5, grad_clipping=args.grad_clipping, serialization_dir=out_dir, cuda_device=cuda_device) trainer.train() elif args.train_method == "pre-train": pre_train_out_dir = path.join(out_dir + "pre-train") fine_tune_out_dir = path.join(out_dir + "fine-tune") os.mkdir(pre_train_out_dir) os.mkdir(fine_tune_out_dir) trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=pseudo_dataset, validation_dataset=validation_dataset, validation_metric="+f1-measure-overall", patience=args.early_stopping, num_epochs=args.max_epoch, num_serialized_models_to_keep=3, grad_clipping=args.grad_clipping, serialization_dir=pre_train_out_dir, cuda_device=cuda_device) trainer.train() if args.optimizer == "Adam": optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) elif args.optimizer == "SGD": optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate) elif args.optimizer == "Adadelta": optimizer = torch.optim.Adadelta(model.parameters(), rho=0.95) else: raise ValueError("unsupported value: '{}'".format( args.optimizer)) trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_dataset, validation_dataset=validation_dataset, validation_metric="+f1-measure-overall", patience=args.early_stopping, num_epochs=args.max_epoch, num_serialized_models_to_keep=3, grad_clipping=args.grad_clipping, serialization_dir=fine_tune_out_dir, cuda_device=cuda_device) trainer.train() else: raise ValueError("Unsupported Value '{}'".format( args.train_method)) # Test if args.test: print("# Test") result = evaluate(model=model, instances=test_dataset, data_iterator=iterator, cuda_device=cuda_device, batch_weight_key="") with open(path.join(out_dir, "test.score"), 'w') as fo: json.dump(result, fo) log.write_endtime()
if label[0] == "I": start_transitions[i] = float("-inf") return start_transitions reader = SrlReader() train_dataset = reader.read(cached_path("data/train")) validation_dataset = reader.read(cached_path("data/dev")) vocab = Vocabulary.from_instances(train_dataset + validation_dataset) token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=100, pretrained_file="https://s3-us-west-2.amazonaws.com/allennlp/" "datasets/glove/glove.6B.100d.txt.gz", trainable=True) encoder = PytorchSeq2SeqWrapper(StackedAlternatingLstm(input_size=100, hidden_size=300, num_layers=4, recurrent_dropout_probability=0.1, use_highway=True)) source_embedder = BasicTextFieldEmbedder({"tokens": token_embedding}) model = SemanticRoleLabeler(vocab, source_embedder, encoder, binary_feature_dim=100) optimizer = optim.Adadelta(model.parameters(), rho=0.95) iterator = BucketIterator(batch_size=100, sorting_keys=[("tokens", "num_tokens")]) iterator.index_with(vocab) trainer = Trainer( model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_dataset, validation_dataset=validation_dataset, validation_metric="+f1-measure-overall", grad_clipping=1.0,