def __init__(self, num_layers: int, text_encoder_out_dim: int,
             target_encoder_out_dim: int, highway: bool = True,
             dropout: float = 0.0) -> None:
    '''
    :param num_layers: Number of times to perform the CPT layer
    :param text_encoder_out_dim: The output dimension of the text encoder
    :param target_encoder_out_dim: The output dimension of the target encoder
    :param highway: highway adds the contextualised word vector (input word
                    representation to CPT) to the transformed word vector
                    (output word representation of CPT). Setting this is the
                    equivalent of using Lossless Forwarding (LF) from the
                    original paper.
    :param dropout: The amount of standard dropout to apply to the transformed
                    word vector after each CPT layer.
    '''
    super().__init__()
    target_text_enc_out = target_encoder_out_dim + text_encoder_out_dim
    self.cpt_feedforward = Linear(target_text_enc_out, text_encoder_out_dim)
    self.attention = DotProductAttention(normalize=True)
    self.num_layers = num_layers
    self._highway = highway
    self._activation = Hardtanh()
    self._naive_dropout = Dropout(dropout)
    self._output_dim = text_encoder_out_dim
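# For reference, AllenNLP's DotProductAttention takes a query vector and a matrix of
# row vectors and returns one weight per row (softmax-normalised when normalize=True).
# A minimal, hedged sketch of that call pattern with made-up shapes (not code from the
# module above):
import torch
from allennlp.modules.attention import DotProductAttention

attention = DotProductAttention(normalize=True)
batch_size, seq_len, dim = 2, 7, 4                       # illustrative shapes
query = torch.rand(batch_size, dim)                      # e.g. an encoded target representation
token_matrix = torch.rand(batch_size, seq_len, dim)      # contextualised word vectors
mask = torch.ones(batch_size, seq_len).bool()

weights = attention(query, token_matrix, mask)           # (batch_size, seq_len), rows sum to 1
context = torch.bmm(weights.unsqueeze(1), token_matrix).squeeze(1)  # weighted sum, (batch_size, dim)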
def test_lstm_cell_decoder_net_forward_without_bidirectionality(self):
    decoder_inout_dim = 10
    lstm_decoder_net = LstmCellDecoderNet(
        decoding_dim=decoder_inout_dim,
        target_embedding_dim=decoder_inout_dim,
        attention=DotProductAttention(),
        bidirectional_input=False)
    batch_size = 5
    time_steps = 10
    encoded_state = torch.rand(batch_size, time_steps, decoder_inout_dim)
    source_mask = torch.ones(batch_size, time_steps)
    source_mask[0, 7:] = 0
    source_mask[1, 5:] = 0
    encoder_out = {
        "source_mask": source_mask,
        "encoder_outputs": encoded_state
    }
    prev_step_prediction_embeded = torch.rand(batch_size, 1, decoder_inout_dim)
    prev_state = lstm_decoder_net.init_decoder_state(encoder_out)
    next_state, decoded_vec = lstm_decoder_net(prev_state, encoded_state,
                                               source_mask, prev_step_prediction_embeded)

    assert list(next_state["decoder_hidden"].shape) == [batch_size, decoder_inout_dim]
    assert list(next_state["decoder_context"].shape) == [batch_size, decoder_inout_dim]
    assert list(decoded_vec.shape) == [batch_size, decoder_inout_dim]
def __init__(self, vocab: Vocabulary, bert: TextFieldEmbedder,
             classifier: FeedForward, dropout: float = 0.1,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    self._bert = bert
    self._classifier = classifier

    if dropout:
        self.dropout = torch.nn.Dropout(dropout)
    else:
        self.dropout = None

    self.attention = DotProductAttention()
    self._pooler = FeedForward(input_dim=bert.get_output_dim(),
                               num_layers=1,
                               hidden_dims=bert.get_output_dim(),
                               activations=torch.tanh)

    check_dimensions_match(bert.get_output_dim() * 3, classifier.get_input_dim(),
                           "bert embedding dim", "classifier input dim")

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()

    initializer(self)
def setUp(self):
    self.reader = ToyReader()
    self.train_instances = self.reader.read("/home/IAIS/nchakrabor/nmt_data/toy_reverse/train/toy_train.txt")
    self.dev_instances = self.reader.read("/home/IAIS/nchakrabor/nmt_data/toy_reverse/dev/toy_dev.txt")
    self.vocab = Vocabulary.from_instances(self.train_instances + self.dev_instances)

    token_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size('tokens') + 2,
                                embedding_dim=256, padding_index=0)

    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    encoder = PytorchSeq2SeqWrapper(nn.LSTM(input_size=word_embeddings.get_output_dim(),
                                            num_layers=2,
                                            hidden_size=256,
                                            bidirectional=True,
                                            dropout=0.4,
                                            batch_first=True))

    # self.set_up_model(model_params_file_path, dataset_sample_file_path)

    self.model = SimpleSeq2Seq(vocab=self.vocab,
                               source_embedder=word_embeddings,
                               encoder=encoder,
                               target_embedding_dim=256,
                               target_namespace='target_tokens',
                               attention=DotProductAttention(),
                               max_decoding_steps=25,
                               beam_size=5,
                               use_bleu=True)
    self.model.cuda(0)
def test_lstm_cell_decoder_net_init(self):
    decoder_inout_dim = 10
    lstm_decoder_net = LstmCellDecoderNet(
        decoding_dim=decoder_inout_dim,
        target_embedding_dim=decoder_inout_dim,
        attention=DotProductAttention(),
        bidirectional_input=False,
    )
    batch_size = 5
    time_steps = 10
    encoded_state = torch.rand(batch_size, time_steps, decoder_inout_dim)
    source_mask = torch.ones(batch_size, time_steps).bool()
    source_mask[0, 7:] = 0
    source_mask[1, 5:] = 0
    encoder_out = {
        "source_mask": source_mask,
        "encoder_outputs": encoded_state
    }
    decoder_init_state = lstm_decoder_net.init_decoder_state(encoder_out)

    assert list(decoder_init_state["decoder_hidden"].shape) == [batch_size, decoder_inout_dim]
    assert list(decoder_init_state["decoder_context"].shape) == [batch_size, decoder_inout_dim]
def get_attention(st_ds_conf, attn_type, *dims):
    emb_sz = st_ds_conf['emb_sz']  # dim for both the decoder output and the encoder output
    attn_type = attn_type.lower()
    if attn_type == "bilinear":
        if len(dims) < 2:
            dims = [emb_sz, emb_sz]
        attn = BilinearAttention(vector_dim=dims[0], matrix_dim=dims[1])
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "dot_product":
        if len(dims) >= 2:
            assert dims[0] == dims[1], "encoder hidden states must be able to multiply with decoder output"
        attn = DotProductAttention()
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "multihead":
        attn = GeneralMultiHeadAttention(num_heads=st_ds_conf['num_heads'],
                                         input_dim=emb_sz,
                                         total_attention_dim=emb_sz,
                                         total_value_dim=emb_sz,
                                         attend_to_dim=emb_sz,
                                         output_dim=emb_sz,
                                         attention_dropout=st_ds_conf['attention_dropout'],
                                         use_future_blinding=False)
        attn = SingleTokenMHAttentionWrapper(attn)
    elif attn_type == "none":
        attn = None
    else:
        assert False
    return attn
def __init__(self, input_dim, dropout=0.0, use_ffnn=True, query_dim=None, activation='tanh'):
    super(Attention, self).__init__()
    self.use_ffnn = use_ffnn

    if self.use_ffnn:
        self.ffnn = FeedForward(input_dim=input_dim,
                                num_layers=1,
                                hidden_dims=query_dim,
                                activations=get_activation(activation),
                                dropout=0)
    else:
        query_dim = input_dim

    # Dot product attention
    self.attention = DotProductAttention(normalize=True)

    # Event-specific attention vector (input_dim)
    self.vector = Parameter(torch.Tensor(query_dim))
    torch.nn.init.normal_(self.vector)

    # Dropout
    self.drop_layer = nn.Dropout(p=dropout)
def build_seq2seq_model(flags,
                        data_reader,
                        vocab: Vocabulary,
                        source_namespace: str = 'source_tokens',
                        target_namespace: str = 'target_tokens') -> Model:
    source_embedding = Embedding(vocab.get_vocab_size(namespace=source_namespace),
                                 embedding_dim=flags.source_embedding_dim)
    source_embedder = BasicTextFieldEmbedder({'tokens': source_embedding})
    lstm_encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(flags.source_embedding_dim, flags.encoder_hidden_dim,
                      batch_first=True, bidirectional=flags.encoder_bidirectional))
    attention = DotProductAttention()
    model = SimpleSeq2Seq(vocab, source_embedder, lstm_encoder,
                          flags.max_decode_length,
                          target_embedding_dim=flags.decoder_hidden_dim,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=flags.beam_size,
                          use_bleu=True)
    return model
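# A hedged usage sketch for build_seq2seq_model above; the `flags` fields are
# hypothetical placeholders (any argparse-style namespace with these attributes works),
# and `vocab` is assumed to come from Vocabulary.from_instances on the reader's data.
from argparse import Namespace

flags = Namespace(source_embedding_dim=128, encoder_hidden_dim=256,
                  encoder_bidirectional=True, max_decode_length=50,
                  decoder_hidden_dim=256, beam_size=4)
# vocab = Vocabulary.from_instances(data_reader.read('train.tsv'))
# model = build_seq2seq_model(flags, data_reader, vocab)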
def get_attention(st_ds_conf, attn_type):
    emb_sz = st_ds_conf['emb_sz']  # dim for both the decoder output and the encoder output
    attn_type = attn_type.lower()
    if attn_type == "bilinear":
        attn = BilinearAttention(vector_dim=emb_sz, matrix_dim=emb_sz)
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "dot_product":
        attn = DotProductAttention()
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "multihead":
        attn = GeneralMultiHeadAttention(num_heads=st_ds_conf['num_heads'],
                                         input_dim=emb_sz,
                                         total_attention_dim=emb_sz,
                                         total_value_dim=emb_sz,
                                         attend_to_dim=emb_sz,
                                         output_dim=emb_sz,
                                         attention_dropout=st_ds_conf['attention_dropout'],
                                         use_future_blinding=False)
        attn = SingleTokenMHAttentionWrapper(attn)
    elif attn_type == "none":
        attn = None
    else:
        assert False
    return attn
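# Hedged usage sketch for the factory above; the `st_ds_conf` keys follow the code,
# the values are made up for illustration.
st_ds_conf = {'emb_sz': 256, 'attention_dropout': 0.1, 'num_heads': 8}
attn = get_attention(st_ds_conf, "dot_product")
# -> an AllenNLPAttentionWrapper around DotProductAttention; attn_type "none" returns None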
def __init__(self) -> None:
    self.turn_num = 0
    self.past_hidden_states = []
    self.past_cell_states = []
    self.encoder_outputs = []
    self.past_dec_hidden_states = []
    self.past_dec_cell_states = []
    self.attention = DotProductAttention()
def __init__(self, vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             text_encoder: Seq2SeqEncoder,
             classifier_feedforward: Optional[FeedForward] = None,
             dropout: Optional[float] = 0.0,
             code_switching_regularizer: Optional[float] = 0.0,
             bivalency_regularizer: Optional[float] = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    '''
    :param dropout: The amount of dropout to apply. Dropout is applied after
                    each non-linear layer and the word embeddings lookup. Two
                    types of dropout are applied: variational dropout if the
                    input to the dropout is a sequence of vectors (each vector
                    in the sequence representing a word), and normal dropout
                    if the input is a single vector.
    :param code_switching_regularizer: The weight given to the code switching
                                       lexicon regularisation; the lower it is
                                       the less effect it has. This requires
                                       that the dataset reader supplies the
                                       code switching arrays for the forward
                                       function of this class. If set, a good
                                       value is 0.001.
    :param bivalency_regularizer: The weight given to the bivalency
                                  regularisation; the lower it is the less
                                  effect it has. This requires that the
                                  dataset reader supplies the bivalency arrays
                                  for the forward function of this class.
    '''
    super().__init__(vocab, regularizer)

    self._naive_dropout = Dropout(dropout)
    self._variational_dropout = InputVariationalDropout(dropout)

    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.text_encoder = text_encoder
    text_encoder_dim = text_encoder.get_output_dim()
    # Attention parameters
    self.project_encoded_text = TimeDistributed(
        Linear(text_encoder_dim, text_encoder_dim))
    self.attention_vector = Parameter(torch.Tensor(text_encoder_dim))
    self.reset_parameters()
    self.attention_layer = DotProductAttention(normalize=True)

    self.classifier_feedforward = classifier_feedforward
    output_dim = text_encoder_dim
    if classifier_feedforward:
        output_dim = classifier_feedforward.get_output_dim()
    self.label_projection = Linear(output_dim, self.num_classes)
    self.metrics = {"accuracy": CategoricalAccuracy()}

    self.code_switching_regularizer = code_switching_regularizer
    self.bivalency_regularizer = bivalency_regularizer
    self.loss = torch.nn.CrossEntropyLoss()
    initializer(self)
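# Hedged sketch (not the model's actual forward pass): how a learned attention vector
# combined with DotProductAttention(normalize=True), as initialised above, is typically
# used to pool the projected encoded text into a single sentence vector.
import torch
from allennlp.modules.attention import DotProductAttention

batch_size, seq_len, dim = 2, 6, 8                       # illustrative shapes
encoded_text = torch.rand(batch_size, seq_len, dim)      # projected text encoder output
attention_vector = torch.nn.Parameter(torch.rand(dim))   # the learned query parameter
attention_layer = DotProductAttention(normalize=True)

query = attention_vector.unsqueeze(0).expand(batch_size, dim)
weights = attention_layer(query, encoded_text)                       # (batch_size, seq_len)
pooled = torch.bmm(weights.unsqueeze(1), encoded_text).squeeze(1)    # (batch_size, dim)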
def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)
    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20  # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             shared_encoder: Seq2VecEncoder,
             private_encoder: Seq2VecEncoder,
             input_dropout: float = 0.0,
             regularizer: RegularizerApplicator = None) -> None:
    super(CNNEncoder, self).__init__(vocab, regularizer)
    self._text_field_embedder = text_field_embedder
    self._shared_encoder = shared_encoder
    self._private_encoder = private_encoder
    # self._U = nn.Linear()
    self._attention = DotProductAttention()
    self._input_dropout = Dropout(input_dropout)
def __init__(self,
             input_size: int,
             hidden_size: int,
             num_layers: int = 1,
             attention: Attention = DotProductAttention(),
             input_feeding: bool = True,
             residual: Union[bool, List[bool]] = False,
             inter_layer_dropout: float = 0.1,
             weight_dropout: float = 0.0,
             rnn: str = 'LSTM'):
    super().__init__(input_size=input_size,
                     hidden_size=hidden_size,
                     num_layers=num_layers,
                     residual=residual,
                     inter_layer_dropout=inter_layer_dropout,
                     weight_dropout=weight_dropout,
                     rnn=rnn)
    self.attention = attention
    self.input_feeding = input_feeding
    self.fuse_attention = nn.Linear(2 * hidden_size, hidden_size)
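# Hedged sketch of the Luong-style fusion step that a `fuse_attention` layer like the one
# above typically implements; the tensor names and shapes here are illustrative, not taken
# from the original decoder class.
import torch
import torch.nn as nn
from allennlp.modules.attention import DotProductAttention

batch_size, src_len, hidden_size = 2, 9, 16
decoder_hidden = torch.rand(batch_size, hidden_size)          # current decoder output
encoder_outputs = torch.rand(batch_size, src_len, hidden_size)
source_mask = torch.ones(batch_size, src_len).bool()

attention = DotProductAttention()
fuse_attention = nn.Linear(2 * hidden_size, hidden_size)

weights = attention(decoder_hidden, encoder_outputs, source_mask)        # (batch_size, src_len)
context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)    # (batch_size, hidden_size)
attentional_hidden = torch.tanh(fuse_attention(torch.cat([context, decoder_hidden], dim=-1)))
# With input feeding, `attentional_hidden` is also concatenated to the next step's input.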
def __init__(self,
             input_size: int,
             hidden_size: int,
             num_layers: int = 1,
             attention: Attention = DotProductAttention(),
             residual: Union[bool, List[bool]] = False,
             inter_layer_dropout: float = 0.1,
             weight_dropout: float = 0.0,
             output_size: int = None,
             rnn: str = 'LSTM'):
    super().__init__(input_size=input_size,
                     hidden_size=hidden_size,
                     num_layers=num_layers,
                     residual=residual,
                     inter_layer_dropout=inter_layer_dropout,
                     weight_dropout=weight_dropout,
                     rnn=rnn)
    self.attention = attention
    self.output_size = output_size
    if self.output_size is not None:
        self.linear = nn.Linear(self._hidden_size + self.get_input_dim(), self.output_size)
en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=EN_EMBEDDING_DIM)
# encoder = PytorchSeq2SeqWrapper(
#     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                      hidden_dim=HIDDEN_DIM,
                                      projection_dim=128,
                                      feedforward_hidden_dim=128,
                                      num_layers=1,
                                      num_attention_heads=8)
source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

# attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
# attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
attention = DotProductAttention()

max_decoding_steps = 800
model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                      target_embedding_dim=ZH_EMBEDDING_DIM,
                      target_namespace='target_tokens',
                      attention=attention,
                      beam_size=12,
                      use_bleu=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             text_encoder: Seq2SeqEncoder,
             target_encoder: Seq2VecEncoder,
             feedforward: Optional[FeedForward] = None,
             target_field_embedder: Optional[TextFieldEmbedder] = None,
             target_concat_text_embedding: bool = False,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             word_dropout: float = 0.0,
             dropout: float = 0.0) -> None:
    '''
    :param vocab: A Vocabulary, required in order to compute sizes for
                  input/output projections.
    :param text_field_embedder: Used to embed the text, and the target text if
                                target_field_embedder is None but the
                                target_encoder is not None.
    :param text_encoder: Sequence encoder that creates the representation of
                         each token in the context sentence.
    :param target_encoder: Encoder that creates the representation of the
                           target text tokens.
    :param feedforward: An optional feed-forward layer applied after the text
                        encoder if the target encoder is None, otherwise after
                        the target and text encoded representations have been
                        concatenated.
    :param target_field_embedder: Used to embed the target text to give as
                                  input to the target_encoder. This allows a
                                  separate embedding for text and target text.
    :param target_concat_text_embedding: Whether or not the target should be
                                         concatenated to each word embedding
                                         within the text before being encoded.
    :param initializer: Used to initialize the model parameters.
    :param regularizer: If provided, will be used to calculate the
                        regularization penalty during training.
    :param word_dropout: Dropout applied after the embedding of the
                         tokens/words. It drops entire words with this
                         probability.
    :param dropout: Dropout applied after each layer apart from the last
                    layer. All dropout applied to time-based data is
                    `variational dropout`_; all else is standard dropout.

    This class is based around the following paper `Attention-based LSTM for
    Aspect-level Sentiment Classification
    <https://www.aclweb.org/anthology/D16-1058>`_. The default model here is
    the equivalent of the AT-LSTM within this paper (Figure 2). If the
    `target_concat_text_embedding` argument is `True` then the model becomes
    the ATAE-LSTM within the cited paper (Figure 3). The only difference
    between this model and the attention-based models in the paper is that the
    final sentence representation is `r` rather than `h* = tanh(Wpr + WxhN)`,
    as we found this projection not to help the performance.

    .. _variational dropout:
       https://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks.pdf
    '''
    super().__init__(vocab, regularizer)

    self.text_field_embedder = text_field_embedder
    self.target_field_embedder = target_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.text_encoder = text_encoder
    self.target_encoder = target_encoder
    self.feedforward = feedforward

    target_text_encoder_dim = (target_encoder.get_output_dim() +
                               text_encoder.get_output_dim())
    self.encoded_target_text_fusion = TimeDistributed(
        Linear(target_text_encoder_dim, target_text_encoder_dim))
    self.attention_vector = Parameter(torch.Tensor(target_text_encoder_dim))
    self.attention_layer = DotProductAttention(normalize=True)

    if feedforward is not None:
        output_dim = self.feedforward.get_output_dim()
    else:
        output_dim = text_encoder.get_output_dim()
    self.label_projection = Linear(output_dim, self.num_classes)
    self.metrics = {"accuracy": CategoricalAccuracy()}
    self.f1_metrics = {}
    # F1 Scores
    label_index_name = self.vocab.get_index_to_token_vocabulary('labels')
    for label_index, label_name in label_index_name.items():
        label_name = f'F1_{label_name.capitalize()}'
        self.f1_metrics[label_name] = F1Measure(label_index)

    self._word_dropout = WordDrouput(word_dropout)
    self._variational_dropout = InputVariationalDropout(dropout)
    self._naive_dropout = Dropout(dropout)

    self.target_concat_text_embedding = target_concat_text_embedding
    self.loss = torch.nn.CrossEntropyLoss()

    # Ensure the text encoder has the correct input dimension
    if target_concat_text_embedding:
        text_encoder_expected_in = (text_field_embedder.get_output_dim() +
                                    target_encoder.get_output_dim())
        check_dimensions_match(text_encoder_expected_in,
                               text_encoder.get_input_dim(),
                               "text field embedding dim + target encoder output dim",
                               "text encoder input dim")
    else:
        check_dimensions_match(text_field_embedder.get_output_dim(),
                               text_encoder.get_input_dim(),
                               "text field embedding dim",
                               "text encoder input dim")
    # Ensure that the dimensions of the target or text field embedder and
    # the target encoder match
    target_field_embedder_dim = text_field_embedder.get_output_dim()
    target_field_error = "text field embedding dim"
    if self.target_field_embedder:
        target_field_embedder_dim = target_field_embedder.get_output_dim()
        target_field_error = "target field embedding dim"
    check_dimensions_match(target_field_embedder_dim,
                           target_encoder.get_input_dim(),
                           target_field_error,
                           "target encoder input dim")
    self.reset_parameters()
    initializer(self)
def run(trainp="overnight/calendar_train_delex.tsv",
        testp="overnight/calendar_test_delex.tsv",
        batsize=8,
        embdim=50,
        encdim=50,
        maxtime=100,
        lr=.001,
        gpu=0,
        cuda=False,
        epochs=20):
    tt = q.ticktock("script")
    tt.tick("loading data")

    def tokenizer(x: str, splitter: WordSplitter = None) -> List[str]:
        return [xe.text for xe in splitter.split_words(x)]

    reader = OvernightReader(partial(tokenizer, splitter=JustSpacesWordSplitter()),
                             partial(tokenizer, splitter=JustSpacesWordSplitter()),
                             SingleIdTokenIndexer(namespace="nl_tokens"),
                             SingleIdTokenIndexer(namespace="fl_tokens"))
    trainds = reader.read(trainp)
    testds = reader.read(testp)
    tt.tock("data loaded")

    tt.tick("building vocabulary")
    vocab = Vocabulary.from_instances(trainds)
    tt.tock("vocabulary built")

    tt.tick("making iterator")
    iterator = BucketIterator(sorting_keys=[("nl", "num_tokens"), ("fl", "num_tokens")],
                              batch_size=batsize,
                              biggest_batch_first=True)
    iterator.index_with(vocab)
    batch = next(iter(iterator(trainds)))
    # print(batch["id"])
    # print(batch["nl"])
    tt.tock("made iterator")

    # region model
    nl_emb = Embedding(vocab.get_vocab_size(namespace="nl_tokens"), embdim, padding_index=0)
    fl_emb = Embedding(vocab.get_vocab_size(namespace="fl_tokens"), embdim, padding_index=0)
    nl_field_emb = BasicTextFieldEmbedder({"tokens": nl_emb})
    fl_field_emb = BasicTextFieldEmbedder({"tokens": fl_emb})

    encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(embdim, encdim, bidirectional=True, batch_first=True))
    attention = DotProductAttention()

    smodel = Seq2Seq(vocab, nl_field_emb, encoder, maxtime,
                     target_embedding_dim=embdim,
                     attention=attention,
                     target_namespace='fl_tokens',
                     beam_size=1,
                     use_bleu=True)
    smodel_out = smodel(batch["nl"], batch["fl"])

    optim = torch.optim.Adam(smodel.parameters(), lr=lr)
    trainer = Trainer(model=smodel,
                      optimizer=optim,
                      iterator=iterator,
                      train_dataset=trainds,
                      validation_dataset=testds,
                      num_epochs=epochs,
                      cuda_device=gpu if cuda else -1)
    metrics = trainer.train()

    sys.exit()

    class MModel(Model):
        def __init__(self, nlemb: Embedding, flemb: Embedding, vocab: Vocabulary, **kwargs):
            super(MModel, self).__init__(vocab, **kwargs)
            self.nlemb, self.flemb = nlemb, flemb

        @overrides
        def forward(self, nl: Dict[str, torch.Tensor], fl: Dict[str, torch.Tensor], id: Any):
            nlemb = self.nlemb(nl["tokens"])
            flemb = self.flemb(fl["tokens"])
            print(nlemb.size())
            pass

    m = MModel(nl_emb, fl_emb, vocab)
    batch = next(iter(iterator(trainds)))
    out = m(**batch)
def main():
    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"
    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)

    # TokenIndexer determines how string tokens get represented as arrays of indexes in a model
    # SingleIdTokenIndexer = Tokens are single integers
    # TokenCharactersIndexer = Tokens as a list of integers
    # Read a tsv file with paired instances (source, target)
    reader = CopyNetDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),  # Defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_namespace='tokens'  # Defaults to source_token_indexers
    )

    # Each of the datasets is a list of tokens (source_tokens, target_tokens)
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)

    """
    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(set(trainExtraVocab + validExtraVocab + testExtraVocab))
    print("length:", len(finalExtraVocab))
    # input()
    """

    # vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
    #                                   min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset + test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099
    print("Vocab SIze :", vocab.get_vocab_size('tokens'))

    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)
    # Embedding for 'tokens' since that namespace was used at dataset creation time
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(ENC_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True, dropout=0.2))

    Attention = DotProductAttention()
    print(Attention)

    max_decoding_steps = 4  # TODO: make this variable
    model = CopyNetSeq2Seq(vocab,
                           source_embedder,
                           encoder,
                           max_decoding_steps=max_decoding_steps,
                           target_embedding_dim=TGT_EMBEDDING_DIM,
                           # target_namespace='target_tokens',
                           beam_size=beamSize,
                           attention=Attention)

    # Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # Data iterator that specifies how to batch our dataset
    # Takes data, shuffles it and creates fixed sized batches
    # iterator = BasicIterator(batch_size=2)
    # iterator.index_with(vocab)
    # Pads batches wrt max input lengths per batch, sorts the dataset wrt the fieldnames
    # and padding keys provided for efficient computations
    iterator = BucketIterator(batch_size=50, sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      # patience=3,
                      num_epochs=numEpochs,
                      cuda_device=CUDA_DEVICE)
    trainer.train()
    """
def get_model_fn(model_name,
                 embeddings,
                 vocab,
                 input_dims=128,
                 hidden_dims=128,
                 dataset_name='debug',
                 max_len=40,
                 **kwargs):
    is_seq2seq = seq2seq_models.get(model_name, False)
    wrapped = None
    # TODO: Factory or something else.
    encoder_args = {
        'lstm': [
            input_dims,
            hidden_dims,
        ],
        'transformer': [
            input_dims,
            hidden_dims,
            input_dims,
            input_dims,
            1,
            4,
            # TODO: add more if you need.
        ]
    }
    encoder_arg = encoder_args[model_name]
    if is_seq2seq:
        # default use LSTM
        wrapped_fn = seq2seq_wrapped[model_name]
        seq_model_fn = seq2seq_model_fn[model_name]
        if wrapped_fn is not None:
            wrapped = wrapped_fn(seq_model_fn(*encoder_arg, **kwargs))
        else:
            wrapped = seq_model_fn(*encoder_arg)

    # TODO: get a factory or something
    if 'nc_zhen' not in dataset_name:
        model_args = {
            'word_embeddings': embeddings,
            'encoder': wrapped,
            'vocab': vocab,
            'output_feature_key': output_feature_keys[dataset_name],
            'max_len': max_len,
            'hidden_size': hidden_dims
        }
    else:
        model_args = {
            'vocab': vocab,
            'source_embedder': embeddings,
            'encoder': wrapped,
            'max_decoding_steps': 20,  # arbitrary
            'attention': DotProductAttention(),  # arbitrary
            'beam_size': 8,  # arbitrary
        }
    model = dataset_model[dataset_name](**model_args)
    return output_feature_keys[dataset_name], model
def setUp(self):
    self.sample_only = False
    # self.setupstubexecutor()

    model_params_file_path = self.TEST_DATA_ROOT / "experiment.json"
    self.dataset_sample_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v3.deurified.simple.sample.json"
    self.dataset_train_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v3.train.json"
    self.dataset_test_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v3.test.json"
    predicates_file_path = self.TEST_DATA_ROOT / "properties.txt"
    with codecs.open(predicates_file_path) as fp:
        self.predicates = [i.strip() for i in fp]

    dbo_classes = set([dbo for dbo in self.predicates if dbo.split("/")[-1][0].isupper()])
    binary_predicates = set(self.predicates) - dbo_classes

    if self.sample_only:
        self.sample_reader = LCQuADReaderSimple(predicates=binary_predicates,
                                                ontology_types=dbo_classes)
    else:
        self.train_reader = LCQuADReaderSimple(predicates=binary_predicates,
                                               ontology_types=dbo_classes)
        # self.test_reader = LCQuADReaderSimple(predicates=binary_predicates,
        #                                       ontology_types=dbo_classes)

    # sample_reader.cache_data("sample_dataset")
    # train_reader.cache_data("train_dataset")
    # test_reader.cache_data("test_dataset")

    if self.sample_only:
        self.sample_instances = list(self.sample_reader.read(str(self.dataset_sample_file_path)))
    else:
        self.train_instances = list(self.train_reader.read(str(self.dataset_train_file_path)))
        self.test_instances = list(self.train_reader.read(str(self.dataset_test_file_path)))

    if self.sample_only:
        self.vocab = Vocabulary.from_instances(self.sample_instances)
    else:
        self.vocab = Vocabulary.from_instances(self.train_instances + self.test_instances,
                                               min_count={'tokens': 3, 'target_tokens': 3})
        # min_count={'tokens': 3, 'target_tokens': 3})
    # self.vocab = Vocabulary()

    token_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size('tokens') + 2,
                                embedding_dim=512,
                                padding_index=0)

    # options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    # weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

    # the embedder maps the input tokens to the appropriate embedding matrix
    # elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    encoder = PytorchSeq2SeqWrapper(nn.LSTM(input_size=word_embeddings.get_output_dim(),
                                            num_layers=2,
                                            hidden_size=256,
                                            bidirectional=True,
                                            dropout=0.5,
                                            batch_first=True))

    val_outputs = self.TEST_DATA_ROOT / "val_outputs.seq2seq.json"
    self.val_outputs_fp = codecs.open(val_outputs, 'w')

    # self.set_up_model(model_params_file_path, dataset_sample_file_path)

    self.model = SimpleSeq2Seq(vocab=self.vocab,
                               source_embedder=word_embeddings,
                               encoder=encoder,
                               target_embedding_dim=128,
                               target_namespace='target_tokens',
                               attention=DotProductAttention(),
                               max_decoding_steps=25,
                               beam_size=5,
                               use_bleu=True,
                               scheduled_sampling_ratio=0.3)
    self.model.cuda(0)
def main():
    target_namespace = "target_tokens"
    if not USE_COPY:
        reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={'tokens': SingleIdTokenIndexer(namespace=target_namespace)})
    else:
        reader = CopyNetDatasetReader(
            source_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            target_namespace=target_namespace)
    train_dataset = reader.read('./data/data_train.tsv')
    validation_dataset = reader.read('./data/data_val.tsv')
    vocab = Vocabulary.from_instances(train_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=SRC_EMBEDDING_DIM,
                             pretrained_file="../opennmt/glove_dir/glove.840B.300d.txt")
    assert en_embedding.weight.requires_grad
    datas = _read_pretrained_embeddings_file(en_embedding._pretrained_file,
                                             SRC_EMBEDDING_DIM, vocab)
    datas.requires_grad = True
    en_embedding.weight.data = datas
    print(en_embedding.weight.data)
    assert en_embedding.weight.requires_grad

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(SRC_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True,
                      bidirectional=True, dropout=0.3, num_layers=1))
    # encoder = StackedSelfAttentionEncoder(input_dim=SRC_EMBEDDING_DIM,
    #                                       hidden_dim=HIDDEN_DIM,
    #                                       projection_dim=128, feedforward_hidden_dim=128,
    #                                       num_layers=1, num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
    attention = DotProductAttention()

    if not USE_COPY:
        model = SimpleSeq2Seq(vocab, source_embedder, encoder, MAX_DECODING_STEPS,
                              target_embedding_dim=TGT_EMBEDDING_DIM,
                              target_namespace='target_tokens',
                              attention=attention,
                              beam_size=8,
                              use_bleu=True)
    else:
        model = MyCopyNet(vocab, source_embedder, encoder,
                          max_decoding_steps=MAX_DECODING_STEPS,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=8,
                          tgt_embedder_pretrain_file="../opennmt/glove_dir/glove.840B.300d.txt")
    model.to(torch.device('cuda'))

    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[("source_tokens", "num_tokens")],
                              padding_noise=0.2)
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=22,
                      patience=4,
                      serialization_dir="./checkpoints",
                      cuda_device=CUDA_DEVICE,
                      summary_interval=100)
    trainer.train()
    print(en_embedding.weight.data)

    predictor = Seq2SeqPredictor(model, reader)

    # Dump all predictions to a file
    # TODO (DNGros): Is there an automatic way in allennlp to do this??
    pred_toks = []
    with open("pred.txt", "w") as outfile:
        for instance in tqdm(validation_dataset):
            pred = predictor.predict_instance(instance)
            toks = pred['predicted_tokens']
            if toks:
                outfile.write(" ".join(toks[0]) + "\n")
            else:
                outfile.write("" + "\n")
from torch.autograd import Variable

reader = CopyNetDatasetReader(target_namespace="trg")
train_dataset = reader.read('data/train.tsv')
train_loader = PyTorchDataLoader(train_dataset, batch_size=8, shuffle=True)

vocab = Vocabulary.from_instances(train_dataset)

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
TARGET_EMBEDDING_DIM = 512

token_embedding = Embedding(embedding_dim=EMBEDDING_DIM,
                            num_embeddings=vocab.get_vocab_size(namespace="tokens"))
word_embedding = BasicTextFieldEmbedder({"token": token_embedding})
bi_rnn_encoder = RnnSeq2SeqEncoder(EMBEDDING_DIM, HIDDEN_DIM, 2, bidirectional=True)
dot_attn = DotProductAttention()

model = CopyNetSeq2Seq(vocab, word_embedding, bi_rnn_encoder, dot_attn,
                       target_namespace="trg",
                       target_embedding_dim=TARGET_EMBEDDING_DIM)

with tempfile.TemporaryDirectory() as serialization_dir:
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters)
    trainer = GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=None,
        num_epochs=5,
def setUp(self):
    self.sample_only = False
    self.setUpExecutor()
    # self.setupstubexecutor()

    model_params_file_path = self.TEST_DATA_ROOT / "experiment.json"
    self.dataset_sample_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v2.deurified.sample.json"
    self.dataset_train_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v2.deurified.train.json"
    self.dataset_test_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v2.deurified.test.json"
    predicates_file_path = self.TEST_DATA_ROOT / "properties.txt"
    with codecs.open(predicates_file_path) as fp:
        self.predicates = [i.strip() for i in fp]

    dbo_classes = set([dbo for dbo in self.predicates if dbo.split("/")[-1][0].isupper()])
    binary_predicates = set(self.predicates) - dbo_classes

    token_indexer = None  # {'tokens': ELMoTokenCharactersIndexer()}

    if self.sample_only:
        sample_reader = LCQuADReader(executor=self.executor,
                                     predicates=binary_predicates,
                                     token_indexers=token_indexer,
                                     ontology_types=dbo_classes)
    else:
        train_reader = LCQuADReader(executor=self.executor,
                                    predicates=binary_predicates,
                                    token_indexers=token_indexer,
                                    ontology_types=dbo_classes)
        test_reader = LCQuADReader(executor=self.executor,
                                   predicates=binary_predicates,
                                   token_indexers=token_indexer,
                                   ontology_types=dbo_classes)

    # sample_reader.cache_data("sample_dataset")
    # train_reader.cache_data("train_dataset")
    # test_reader.cache_data("test_dataset")

    if self.sample_only:
        self.sample_instances = list(sample_reader.read(str(self.dataset_sample_file_path)))
    else:
        self.train_instances = list(train_reader.read(str(self.dataset_train_file_path)))
        self.test_instances = list(test_reader.read(str(self.dataset_test_file_path)))

    if self.sample_only:
        self.vocab = Vocabulary.from_instances(self.sample_instances)
    else:
        self.vocab = Vocabulary.from_instances(self.train_instances + self.test_instances)
    # self.vocab = Vocabulary()

    token_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size() + 2,
                                embedding_dim=256,
                                padding_index=0)

    # options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    # weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

    # the embedder maps the input tokens to the appropriate embedding matrix
    # elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    encoder = PytorchSeq2SeqWrapper(nn.LSTM(input_size=word_embeddings.get_output_dim(),
                                            num_layers=1,
                                            hidden_size=128,
                                            bidirectional=True,
                                            # dropout=0.4,
                                            batch_first=True))

    val_outputs = self.TEST_DATA_ROOT / "val_outputs.json"
    self.val_outputs_fp = codecs.open(val_outputs, 'w')

    # self.set_up_model(model_params_file_path, dataset_sample_file_path)

    self.model = LCQuADMmlSemanticParser(vocab=self.vocab,
                                         sentence_embedder=word_embeddings,
                                         action_embedding_dim=256,
                                         encoder=encoder,
                                         attention=DotProductAttention(),
                                         decoder_beam_search=BeamSearch(beam_size=1),
                                         max_decoding_steps=50,
                                         dropout=0.5,
                                         val_outputs=self.val_outputs_fp)
    self.model.cuda(0)
def __init__(self,
             vocab: Vocabulary,
             model_name: str,
             decoder: DecoderNet,
             decoder_type: str = "lstm",  # `lstm` / `transformer`
             decoder_num_layers: int = 1,
             share_decoder_params: bool = True,  # valid for `transformer`
             text_field_embedder: TextFieldEmbedder = None,
             start_token: str = "[CLS]",
             end_token: str = "[SEP]",
             index_name: str = "bert",
             beam_size: int = 4,
             min_dec_len: int = 4,
             max_dec_len: int = 30,
             coverage_factor: float = 0.0,
             device: Union[int, str, List[int]] = -1,
             trainable: bool = True,  # whether the BERT parameters are trainable
             metrics: Optional[List[Metric]] = None,
             valid_metric_keys: List[str] = None,
             seed: int = 42,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: RegularizerApplicator = None):
    super().__init__(vocab, regularizer)

    # ---------- define the encoder and get its output dimension -------------
    if model_name is None and text_field_embedder is None:
        raise ValueError("`model_name` and `text_field_embedder` cannot both be None.")
    # For a pretrained model this component acts as the encoder
    self._text_field_embedder = text_field_embedder or BasicTextFieldEmbedder({
        index_name: PretrainedChineseBertEmbedder(model_name,
                                                  train_parameters=trainable,
                                                  return_all=False,
                                                  output_hidden_states=False)
    })
    # Save the output dimension of the BERT encoder
    self.encoder_output_dim = self._text_field_embedder.get_output_dim()

    # ---------- common initialisation -------------
    self.common_init(self.encoder_output_dim, decoder, decoder_type, decoder_num_layers,
                     share_decoder_params, start_token, end_token, index_name, beam_size,
                     min_dec_len, max_dec_len, coverage_factor, device, metrics,
                     valid_metric_keys, seed, initializer)

    # ---------- encoder-specific initialisation -------------
    # Since the encoder is BERT, keep its embedding part;
    # for ALBERT there is also the embedding-to-hidden mapping
    bert_token_embedder = self._text_field_embedder._token_embedders[index_name]
    self.bert_type = model_name or bert_token_embedder.model_name  # name of the model
    self.word_embeddings = bert_token_embedder.transformer_model.get_input_embeddings()
    if "albert" in self.bert_type:
        # mapping from the embedding layer to the hidden layer
        self.embedding_to_hidden = bert_token_embedder.transformer_model.encoder.embedding_hidden_mapping_in

    # If the decoder is an LSTM, attention is used to initialise its initial state
    # (this is not needed if the encoder is also an LSTM)
    if self.params["decoder_type"] == "lstm":
        self.h_query = torch.nn.Parameter(torch.randn([self.encoder_output_dim]),
                                          requires_grad=True)
        self.c_query = torch.nn.Parameter(torch.randn([self.encoder_output_dim]),
                                          requires_grad=True)
        # When the encoder is a transformer and the decoder is an LSTM,
        # the LSTM's initial state has to be computed
        self.init_attention = DotProductAttention()
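# Hedged sketch (not the repository's actual code) of how a query parameter such as
# `h_query` above can be combined with DotProductAttention to build an LSTM initial
# state from BERT encoder outputs; all names and shapes here are illustrative.
import torch
from allennlp.modules.attention import DotProductAttention

batch_size, seq_len, encoder_output_dim = 2, 12, 768
encoder_outputs = torch.rand(batch_size, seq_len, encoder_output_dim)
source_mask = torch.ones(batch_size, seq_len).bool()
h_query = torch.nn.Parameter(torch.randn([encoder_output_dim]))
init_attention = DotProductAttention()

weights = init_attention(h_query.unsqueeze(0).expand(batch_size, -1),
                         encoder_outputs, source_mask)                        # (batch_size, seq_len)
decoder_hidden = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)  # initial h_0
# The same pattern with a `c_query` parameter would give the initial cell state c_0.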
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': elmo_token_indexer},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname)
        for fname in ["train_all_seq.txt", "test_all_seq.txt", "val_all_seq.txt"])

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset + test_dataset,
                                      min_count={'tokens': 1, 'target_tokens': 1})

    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                          embedding_dim=256)
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                          embedding_dim=elmo_embedding_dim)
    # elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                          embedding_dim=256)
    source_embedder = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Initializing the model
    max_decoding_steps = 20
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True))
    # encoder = StackedSelfAttentionEncoder(input_dim=elmo_embedding_dim,
    #                                       hidden_dim=hidden_dim,
    #                                       projection_dim=128,
    #                                       feedforward_hidden_dim=128,
    #                                       num_layers=1,
    #                                       num_attention_heads=8)
    attention = DotProductAttention()

    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=elmo_embedding_dim,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    if USE_GPU:
        model.cuda()
    else:
        model

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=1,
                      cuda_device=0 if USE_GPU else -1)

    for i in range(20):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(dev_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])

    # Saving the model
    with open("model_seq2seq.th", 'wb') as f:
        torch.save(model.state_dict(), f)
    vocab.save_to_files("vocabulary_seq2seq")

    predictor = SimpleSeq2SeqPredictor(model, reader)
    with open('predict_seq2seq.txt', 'w+') as f:
        for instance in itertools.islice(test_dataset, 10):
            preds = predictor.predict_instance(instance)['predicted_tokens']
            f.write(" ".join(preds) + "\n")
def __init__(self,
             vocab: Vocabulary,
             encoder: Seq2SeqEncoder,
             entity_encoder: Seq2VecEncoder,
             decoder_beam_search: BeamSearch,
             question_embedder: TextFieldEmbedder,
             input_attention: Attention,
             past_attention: Attention,
             graph_attention: Attention,
             max_decoding_steps: int,
             action_embedding_dim: int,
             enable_gating: bool = False,
             ablation_mode: str = None,
             gnn: bool = True,
             graph_loss_lambda: float = 0.5,
             decoder_use_graph_entities: bool = True,
             decoder_self_attend: bool = True,
             gnn_timesteps: int = 2,
             pruning_gnn_timesteps: int = 2,
             parse_sql_on_decoding: bool = True,
             add_action_bias: bool = False,
             use_neighbor_similarity_for_linking: bool = True,
             dataset_path: str = 'dataset',
             log_path: str = '',
             training_beam_size: int = None,
             decoder_num_layers: int = 1,
             dropout: float = 0.0,
             rule_namespace: str = 'rule_labels') -> None:
    super().__init__(vocab, encoder, entity_encoder, question_embedder,
                     gnn_timesteps, dropout, rule_namespace)

    self.enable_gating = enable_gating
    self.ablation_mode = ablation_mode
    self._log_path = log_path
    self._max_decoding_steps = max_decoding_steps
    self._add_action_bias = add_action_bias

    self._parse_sql_on_decoding = parse_sql_on_decoding
    self._self_attend = decoder_self_attend
    self._decoder_use_graph_entities = decoder_use_graph_entities
    self._use_neighbor_similarity_for_linking = use_neighbor_similarity_for_linking

    self._action_padding_index = -1  # the padding value used by IndexField

    self._exact_match = Average()
    self._sql_evaluator_match = Average()
    self._action_similarity = Average()
    self._beam_hit = Average()

    self._action_embedding_dim = action_embedding_dim
    self._graph_loss_lambda = graph_loss_lambda

    num_actions = vocab.get_vocab_size(self._rule_namespace)
    if self._add_action_bias:
        input_action_dim = action_embedding_dim + 1
    else:
        input_action_dim = action_embedding_dim
    self._action_embedder = Embedding(num_embeddings=num_actions,
                                      embedding_dim=input_action_dim)
    self._output_action_embedder = Embedding(num_embeddings=num_actions,
                                             embedding_dim=action_embedding_dim)

    self._embedding_projector = torch.nn.Linear(question_embedder.get_output_dim(),
                                                self._embedding_dim, bias=False)
    self._bert_embedding_dim = question_embedder.get_output_dim()

    encoder_output_dim = self._encoder.get_output_dim() + self._embedding_dim

    self._neighbor_encoder = TimeDistributed(
        BagOfEmbeddingsEncoder(self._embedding_dim, averaged=True))

    self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
    self._first_attended_utterance = torch.nn.Parameter(torch.FloatTensor(encoder_output_dim))
    self._first_attended_output = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
    torch.nn.init.normal_(self._first_action_embedding)
    torch.nn.init.normal_(self._first_attended_utterance)
    torch.nn.init.normal_(self._first_attended_output)

    self._entity_type_decoder_embedding = Embedding(self._num_entity_types, action_embedding_dim)

    self._decoder_num_layers = decoder_num_layers

    self._beam_search = decoder_beam_search
    self._decoder_trainer = MaximumMarginalLikelihood(training_beam_size)

    self._graph_pruning = GraphPruning(3, self._embedding_dim, encoder.get_output_dim(),
                                       dropout, timesteps=pruning_gnn_timesteps)

    if decoder_self_attend:
        self._transition_function = AttendPastSchemaItemsTransitionFunction(
            encoder_output_dim=encoder_output_dim,
            action_embedding_dim=action_embedding_dim,
            input_attention=input_attention,
            past_attention=past_attention,
            enable_gating=self.enable_gating,
            ablation_mode=self.ablation_mode,
            predict_start_type_separately=False,
            add_action_bias=self._add_action_bias,
            dropout=dropout,
            num_layers=self._decoder_num_layers)
    else:
        self._transition_function = LinkingTransitionFunction(
            encoder_output_dim=encoder_output_dim,
            action_embedding_dim=action_embedding_dim,
            input_attention=input_attention,
            predict_start_type_separately=False,
            add_action_bias=self._add_action_bias,
            dropout=dropout,
            num_layers=self._decoder_num_layers)

    if self.enable_gating:
        self._graph_attention = graph_attention
    else:
        self._graph_attention = DotProductAttention()

    self._embedding_sim_attn = CosineMatrixAttention()

    # TODO: Remove hard-coded dirs
    self._evaluate_func = partial(evaluate,
                                  db_dir=os.path.join(dataset_path, 'database'),
                                  table=os.path.join(dataset_path, 'tables.json'),
                                  check_valid=False)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             label_namespace: str = "labels",
             encoder: Optional[Seq2VecEncoder] = None,
             seq_encoder: Optional[Seq2SeqEncoder] = None,
             feedforward: Optional[FeedForward] = None,
             dropout: Optional[float] = None,
             incl_neutral: Optional[bool] = False,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)
    self.label_namespace = label_namespace
    self.text_field_embedder = text_field_embedder
    self.num_labels = self.vocab.get_vocab_size(label_namespace)
    self.encoder = encoder
    self.seq_encoder = seq_encoder

    if self.seq_encoder is not None:
        self.attention_vector = Parameter(torch.Tensor(self.seq_encoder.get_output_dim()))
        self.attention_layer = DotProductAttention(normalize=True)

    embedding_output_dim = self.text_field_embedder.get_output_dim()

    if dropout is not None:
        self.dropout = torch.nn.Dropout(dropout)
        self.variational_dropout = InputVariationalDropout(dropout)
    else:
        self.dropout = None

    self._feedforward = feedforward
    if feedforward is not None:
        output_dim = feedforward.get_output_dim()
    elif encoder is not None:
        output_dim = self.encoder.get_output_dim()
    elif seq_encoder is not None:
        output_dim = self.seq_encoder.get_output_dim()
    else:
        output_dim = embedding_output_dim

    # Have to create a tag projection layer for each label in the
    # multi label classifier
    self._tag_projection_layers: Any = []
    for k in range(self.num_labels):
        tag_projection_layer = Linear(output_dim, 1)
        self.add_module(f'tag_projection_layer_{k}', tag_projection_layer)
        self._tag_projection_layers.append(tag_projection_layer)
    self.output_activation = torch.nn.Sigmoid()

    self.loss_criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')

    self.incl_neutral = incl_neutral
    self.metrics = {"jaccard_index": JaccardIndex(self.incl_neutral)}

    if encoder is not None:
        check_dimensions_match(embedding_output_dim, encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
    if feedforward is not None and encoder is not None:
        check_dimensions_match(encoder.get_output_dim(), feedforward.get_input_dim(),
                               "encoder output dim", "feedforward input dim")
    elif feedforward is not None and encoder is None:
        check_dimensions_match(embedding_output_dim, feedforward.get_input_dim(),
                               "text field output dim", "feedforward input dim")
    if self.seq_encoder is not None:
        self.reset_parameters()
    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             context_field_embedder: TextFieldEmbedder,
             context_encoder: Seq2SeqEncoder,
             target_encoder: Seq2VecEncoder,
             feedforward: Optional[FeedForward] = None,
             context_attention_activation_function: str = 'tanh',
             target_field_embedder: Optional[TextFieldEmbedder] = None,
             AE: bool = True,
             AttentionAE: bool = True,
             inter_target_encoding: Optional[InterTarget] = None,
             target_position_weight: Optional[TargetPositionWeight] = None,
             target_position_embedding: Optional[TextFieldEmbedder] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             dropout: float = 0.0,
             label_name: str = 'target-sentiment-labels',
             loss_weights: Optional[List[float]] = None,
             use_target_sequences: bool = False) -> None:
    super().__init__(vocab, regularizer)
    '''
    :param vocab: A Vocabulary, required in order to compute sizes for
                  input/output projections.
    :param context_field_embedder: Used to embed the context/sentence and
                                   target text if target_field_embedder is
                                   None but the target_encoder is NOT None.
    :param context_encoder: Encoder that will create the representation for
                            the sentence/context that the target appears in.
    :param target_encoder: Encoder that will create the representation of
                           target text tokens.
    :param feedforward: An optional feed forward layer(s) to apply before
                        the final softmax layer.
    :param context_attention_activation_function: The activation function to
                                                  be used after the projection
                                                  of the encoded context
                                                  (Equation 7 in the original
                                                  paper).
    :param target_field_embedder: Used to embed the target text to give as
                                  input to the target_encoder. This allows a
                                  separate embedding for context and target
                                  text.
    :param AE: Whether to concatenate the target representations to each
               word's word embedding.
    :param AttentionAE: Whether to concatenate the target representations to
                        each contextualised word representation, i.e. to each
                        word's vector after the `context_encoder`.
    :param inter_target_encoding: Whether to model the relationship between
                                  targets/aspects.
    :param target_position_weight: Whether to weight the output of the context
                                   encoding based on the position of the
                                   tokens relative to the target tokens. This
                                   weighting is applied before any attention.
    :param target_position_embedding: Whether or not to concatenate a position
                                      embedding on to the input embeddings
                                      before they are input to the
                                      `context_encoder`.
    :param initializer: Used to initialize the model parameters.
    :param regularizer: If provided, will be used to calculate the
                        regularization penalty during training.
    :param dropout: To apply dropout after each layer apart from the last
                    layer. All dropout applied to time-based data is
                    `variational dropout <https://arxiv.org/abs/1512.05287>`_;
                    all else is standard dropout. Variational dropout is
                    applied to the target vectors after they have been
                    processed by the `inter_target_encoding` if this is set.
    :param label_name: Name of the label namespace.
    :param loss_weights: The amount of weight to give the negative, neutral,
                         and positive classes respectively, e.g. [0.2, 0.5, 0.3]
                         would weight the negative class by a factor of 0.2,
                         neutral by 0.5 and positive by 0.3. NOTE: it assumes
                         the sentiment labels are [negative, neutral, positive].
    :param use_target_sequences: Whether or not to use target tokens within
                                 the context as the target's contextualized
                                 word representation (CWR). This only makes
                                 sense if the word representation, i.e. the
                                 field embedder, is a contextualized embedder,
                                 e.g. ELMo. It also requires that the dataset
                                 reader has the `target_sequences` argument
                                 set to True. ANOTHER reason you might want to
                                 use this, even when not using CWRs, is to get
                                 contextualised POS/Dep tags etc.

    This is based around the models in `Attention-based LSTM for Aspect-level
    Sentiment Classification <https://aclweb.org/anthology/D16-1058>`_.
    The models re-created are:

    1. AE-LSTM, which instead of just encoding with an LSTM also applies an
       attention network after the LSTM, as in the model within `Modeling
       Inter-Aspect Dependencies for Aspect-Based Sentiment Analysis
       <https://www.aclweb.org/anthology/N18-2043>`_
    2. AT-LSTM
    3. ATAE

    For the 1st model ensure `AE` is True and `AttentionAE` is False. For the
    2nd ensure that `AE` is False and `AttentionAE` is True. For the 3rd
    ensure both `AE` and `AttentionAE` are True.

    This can also be used to re-create the model from `Modeling Inter-Aspect
    Dependencies for Aspect-Based Sentiment Analysis
    <https://www.aclweb.org/anthology/N18-2043>`_ with the fusion part being
    `concat`. To do so, the `inter_target_encoding` argument must be an LSTM.

    .. _variational dropout:
       https://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks.pdf
    '''
    if not AE and not AttentionAE:
        raise ConfigurationError('Either `AE` or `AttentionAE` have to be True')

    self.label_name = label_name
    self.context_field_embedder = context_field_embedder
    self.target_field_embedder = target_field_embedder
    self.num_classes = self.vocab.get_vocab_size(self.label_name)
    self.context_encoder = context_encoder
    self.target_encoder = target_encoder
    self.feedforward = feedforward
    self._use_target_sequences = use_target_sequences
    if self._use_target_sequences and self.target_field_embedder:
        raise ConfigurationError('`use_target_sequences` cannot be True at '
                                 'the same time as a value for '
                                 '`target_field_embedder` as the embeddings '
                                 'come from the context and not a separate embedder')

    target_encoder_out = self.target_encoder.get_output_dim()
    context_encoder_out = self.context_encoder.get_output_dim()
    self.context_encoder_bidirectional = self.context_encoder.is_bidirectional()

    # Applied after the contextualisation layer and before the attention layer
    attention_projection_layer_dim = context_encoder_out
    if AttentionAE:
        attention_projection_layer_dim = context_encoder_out + target_encoder_out
    self.attention_project_layer = Linear(attention_projection_layer_dim,
                                          attention_projection_layer_dim, bias=False)
    self.attention_project_layer = TimeDistributed(self.attention_project_layer)

    # Activation function to be applied after projection and before attention
    context_attention_activation_function = Activation.by_name(
        f'{context_attention_activation_function}')()
    self._context_attention_activation_function = context_attention_activation_function
    attention_vector_dim = context_encoder_out
    if AttentionAE:
        attention_vector_dim = context_encoder_out + target_encoder_out
    self.attention_vector = Parameter(torch.Tensor(attention_vector_dim))
    self.context_attention_layer = DotProductAttention(normalize=True)

    # Final projection layers, these are applied after the attention layer
    self.final_attention_projection_layer = Linear(context_encoder_out,
                                                   context_encoder_out, bias=False)
    self.final_hidden_state_projection_layer = Linear(context_encoder_out,
                                                      context_encoder_out, bias=False)

    # Set the loss weights (have to sort them by order of label index in
    # the vocab)
    self.loss_weights = target_sentiment.util.loss_weight_order(self, loss_weights,
                                                                self.label_name)

    # Inter target modelling
    self.inter_target_encoding = inter_target_encoding

    if feedforward is not None:
        output_dim = self.feedforward.get_output_dim()
    elif self.inter_target_encoding is not None:
        output_dim = self.inter_target_encoding.get_output_dim()
    else:
        output_dim = context_encoder_out
    self.label_projection = Linear(output_dim, self.num_classes)

    self.metrics = {"accuracy": CategoricalAccuracy()}
    self.f1_metrics = {}
    # F1 Scores
    label_index_name = self.vocab.get_index_to_token_vocabulary(self.label_name)
    for label_index, _label_name in label_index_name.items():
        _label_name = f'F1_{_label_name.capitalize()}'
        self.f1_metrics[_label_name] = F1Measure(label_index)
    # Dropout
    self._variational_dropout = InputVariationalDropout(dropout)
    self._naive_dropout = Dropout(dropout)

    # Ensure that the dimensions of the target or text field embedder and
    # the target encoder match
    target_field_embedder_dim = context_field_embedder.get_output_dim()
    target_field_error = "context field embedding dim"
    if self.target_field_embedder:
        target_field_embedder_dim = target_field_embedder.get_output_dim()
        target_field_error = "target field embedding dim"
    check_dimensions_match(target_field_embedder_dim,
                           target_encoder.get_input_dim(),
                           target_field_error, "target encoder input dim")

    # If AE is True ensure that the context encoder input is equal to
    # the output of the target encoder plus the context field embedder
    context_field_embedder_out = context_field_embedder.get_output_dim()

    # position embeddings
    self.target_position_embedding = target_position_embedding
    if self.target_position_embedding is not None:
        context_field_embedder_out += self.target_position_embedding.get_output_dim()
    if AE:
        check_dimensions_match(context_field_embedder_out + target_encoder_out,
                               context_encoder.get_input_dim(),
                               "context field embedding dim + Target Encoder out",
                               "text encoder input dim")
    else:
        check_dimensions_match(context_field_embedder_out,
                               context_encoder.get_input_dim(),
                               "context field embedding dim",
                               "text encoder input dim")
    if self.inter_target_encoding is not None:
        check_dimensions_match(context_encoder_out,
                               self.inter_target_encoding.get_input_dim(),
                               'Context field encoder output',
                               'Inter target encoder input')
    if self.feedforward is not None:
        if self.inter_target_encoding is not None:
            check_dimensions_match(self.inter_target_encoding.get_output_dim(),
                                   self.feedforward.get_input_dim(),
                                   'Inter target encoder output',
                                   'FeedForward input dim')
        else:
            check_dimensions_match(context_encoder_out,
                                   self.feedforward.get_input_dim(),
                                   'Context encoder output',
                                   'FeedForward input dim')

    self.target_position_weight = target_position_weight
    # TimeDistributed anything that is related to the targets.
    if self.feedforward is not None:
        self.feedforward = TimeDistributed(self.feedforward)
    self.label_projection = TimeDistributed(self.label_projection)
    self._time_variational_dropout = TimeDistributed(self._variational_dropout)

    self._AE = AE
    self._AttentionAE = AttentionAE

    self.reset_parameters()
    initializer(self)
def main():
    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"
    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)

    # TokenIndexer determines how string tokens get represented as arrays of indexes in a model
    # SingleIdTokenIndexer = Tokens are single integers
    # TokenCharactersIndexer = Tokens as a list of integers
    # Read a tsv file with paired instances (source, target)
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),  # Defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer()}  # Defaults to source_token_indexers
    )

    # Each of the datasets is a list of tokens (source_tokens, target_tokens)
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)

    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(set(trainExtraVocab + validExtraVocab + testExtraVocab))
    print("length:", len(finalExtraVocab))
    # input()

    # vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
    #                                   min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset + test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099
    print("Vocab SIze :", vocab.get_vocab_size('tokens'))

    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)
    # Embedding for 'tokens' since that namespace was used at dataset creation time
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(ENC_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True, dropout=0.2))

    attention = DotProductAttention()

    max_decoding_steps = 4  # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          # target_namespace='target_tokens',
                          attention=attention,
                          beam_size=beamSize,
                          use_bleu=True,
                          extra_vocab=finalExtraVocab)
    # Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # Data iterator that specifies how to batch our dataset
    # Takes data, shuffles it and creates fixed sized batches
    # iterator = BasicIterator(batch_size=2)
    # iterator.index_with(vocab)
    # Pads batches wrt max input lengths per batch, sorts the dataset wrt the fieldnames
    # and padding keys provided for efficient computations
    iterator = BucketIterator(batch_size=50, sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      # patience=3,
                      num_epochs=numEpochs,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    predictor = SimpleSeq2SeqPredictor(model, reader)

    '''for i in range(2):
        print("Epoch: {}".format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
            """{'predictions': [[1, 4, 5, 92, 8, 6, 1, 8, 6, 26, 3]],
            'loss': 5.9835076332092285,
            'class_log_probabilities': [-20.10894012451172],
            'predicted_tokens': ['@@UNKNOWN@@', 'is', 'a', 'type', 'of', 'the', '@@UNKNOWN@@', 'of', 'the', 'sun']}
            """
            print(predictor.predict_instance(instance))
    '''

    outFile = open("output_" + str(HIDDEN_DIM) + "_" + str(numEpochs) + "_" +
                   str(beamSize) + ".csv", "w")
    writer = csv.writer(outFile, delimiter="\t")
    for instance in itertools.islice(test_dataset, 500):
        src = instance.fields['source_tokens'].tokens
        gold = instance.fields['target_tokens'].tokens
        pred = predictor.predict_instance(instance)['predicted_tokens']
        writer.writerow([src, gold, pred])

    outFile.close()