Example #1
 def __init__(self,
              num_layers: int,
              text_encoder_out_dim: int,
              target_encoder_out_dim: int,
              highway: bool = True,
              dropout: float = 0.0) -> None:
     '''
     :param num_layers: Number of times to apply the CPT layer
     :param text_encoder_out_dim: The output dimension of the text encoder
     :param target_encoder_out_dim: The output dimension of the target
                                    encoder
     :param highway: highway adds the contextualised word vector (the input
                     word representation to the CPT layer) to the transformed
                     word vector (the output word representation of the CPT
                     layer). Setting this is equivalent to using Lossless
                     Forwarding (LF) from the original paper.
     :param dropout: The amount of standard dropout to apply to the
                     transformed word vector after each CPT layer.
     '''
     super().__init__()
     target_text_enc_out = target_encoder_out_dim + text_encoder_out_dim
     self.cpt_feedforward = Linear(target_text_enc_out,
                                   text_encoder_out_dim)
     self.attention = DotProductAttention(normalize=True)
     self.num_layers = num_layers
     self._highway = highway
     self._activation = Hardtanh()
     self._naive_dropout = Dropout(dropout)
     self._output_dim = text_encoder_out_dim
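
The layer above relies on AllenNLP's DotProductAttention with normalize=True, which returns softmax-normalised scores over the encoded sequence. A minimal stand-alone sketch of that call (shapes and values are illustrative, not taken from the source):

import torch
from allennlp.modules.attention import DotProductAttention

attention = DotProductAttention(normalize=True)   # scores come back softmax-normalised
query = torch.rand(4, 300)            # (batch, dim), e.g. a target representation
matrix = torch.rand(4, 20, 300)       # (batch, seq_len, dim), e.g. the encoded text
mask = torch.ones(4, 20).bool()       # True for real tokens, False for padding
weights = attention(query, matrix, mask)   # (batch, seq_len); each row sums to 1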
Example #2
    def test_lstm_cell_decoder_net_forward_without_bidirectionality(self):
        decoder_inout_dim = 10
        lstm_decoder_net = LstmCellDecoderNet(
                decoding_dim=decoder_inout_dim,
                target_embedding_dim=decoder_inout_dim,
                attention=DotProductAttention(),
                bidirectional_input=False)
        batch_size = 5
        time_steps = 10
        encoded_state = torch.rand(batch_size, time_steps, decoder_inout_dim)
        source_mask = torch.ones(batch_size, time_steps)
        source_mask[0, 7:] = 0
        source_mask[1, 5:] = 0
        encoder_out = {
                "source_mask": source_mask,
                "encoder_outputs": encoded_state
        }
        prev_step_prediction_embedded = torch.rand(batch_size, 1, decoder_inout_dim)
        prev_state = lstm_decoder_net.init_decoder_state(encoder_out)

        next_state, decoded_vec = lstm_decoder_net(prev_state, encoded_state,
                                                   source_mask, prev_step_prediction_embedded)
        assert list(next_state["decoder_hidden"].shape) == [batch_size, decoder_inout_dim]
        assert list(next_state["decoder_context"].shape) == [batch_size, decoder_inout_dim]
        assert list(decoded_vec.shape) == [batch_size, decoder_inout_dim]
Example #3
    def __init__(self,
                 vocab: Vocabulary,
                 bert: TextFieldEmbedder,
                 classifier: FeedForward,
                 dropout: float = 0.1,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self._bert = bert
        self._classifier = classifier

        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None

        self.attention = DotProductAttention()

        self._pooler = FeedForward(input_dim=bert.get_output_dim(),
                                   num_layers=1,
                                   hidden_dims=bert.get_output_dim(),
                                   activations=torch.tanh)

        check_dimensions_match(bert.get_output_dim() * 3,
                               classifier.get_input_dim(),
                               "bert embedding dim", "classifier input dim")

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)
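
The check_dimensions_match call above requires the classifier's input dimension to be three times the BERT output dimension. A hedged sketch of a compatible classifier head (768 is an assumed BERT hidden size and the two-class output is illustrative; neither is taken from the source):

from allennlp.modules import FeedForward
from allennlp.nn import Activation

bert_dim = 768                                    # assumed BERT hidden size
classifier = FeedForward(input_dim=bert_dim * 3,  # must equal 3 * bert embedding dim
                         num_layers=1,
                         hidden_dims=2,           # e.g. a two-class output
                         activations=Activation.by_name('linear')())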
Example #4
    def setUp(self):
        self.reader = ToyReader()
        self.train_instances = self.reader.read("/home/IAIS/nchakrabor/nmt_data/toy_reverse/train/toy_train.txt")
        self.dev_instances = self.reader.read("/home/IAIS/nchakrabor/nmt_data/toy_reverse/dev/toy_dev.txt")
        self.vocab = Vocabulary.from_instances(self.train_instances + self.dev_instances)

        token_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size('tokens') + 2,
                                    embedding_dim=256, padding_index=0)

        word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

        encoder = PytorchSeq2SeqWrapper(nn.LSTM(input_size=word_embeddings.get_output_dim(),
                                                num_layers=2,
                                                hidden_size=256,
                                                bidirectional=True,
                                                dropout=0.4,
                                                batch_first=True))

        # self.set_up_model(model_params_file_path, dataset_sample_file_path)
        self.model = SimpleSeq2Seq(vocab=self.vocab,
                                   source_embedder=word_embeddings,
                                   encoder=encoder,
                                   target_embedding_dim=256,
                                   target_namespace='target_tokens',
                                   attention=DotProductAttention(),
                                   max_decoding_steps=25,
                                   beam_size=5,
                                   use_bleu=True
                                   )

        self.model.cuda(0)
Example #5
 def test_lstm_cell_decoder_net_init(self):
     decoder_inout_dim = 10
     lstm_decoder_net = LstmCellDecoderNet(
         decoding_dim=decoder_inout_dim,
         target_embedding_dim=decoder_inout_dim,
         attention=DotProductAttention(),
         bidirectional_input=False,
     )
     batch_size = 5
     time_steps = 10
     encoded_state = torch.rand(batch_size, time_steps, decoder_inout_dim)
     source_mask = torch.ones(batch_size, time_steps).bool()
     source_mask[0, 7:] = 0
     source_mask[1, 5:] = 0
     encoder_out = {
         "source_mask": source_mask,
         "encoder_outputs": encoded_state
     }
     decoder_init_state = lstm_decoder_net.init_decoder_state(encoder_out)
     assert list(decoder_init_state["decoder_hidden"].shape) == [
         batch_size, decoder_inout_dim
     ]
     assert list(decoder_init_state["decoder_context"].shape) == [
         batch_size, decoder_inout_dim
     ]
Example #6
def get_attention(st_ds_conf, attn_type, *dims):
    emb_sz = st_ds_conf['emb_sz']  # dim for both the decoder output and the encoder output
    attn_type = attn_type.lower()
    if attn_type == "bilinear":
        if len(dims) < 2:
            dims = [emb_sz, emb_sz]
        attn = BilinearAttention(vector_dim=dims[0], matrix_dim=dims[1])
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "dot_product":
        if len(dims) >= 2:
            assert dims[0] == dims[1], \
                "encoder hidden states must be able to multiply with decoder output"
        attn = DotProductAttention()
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "multihead":
        attn = GeneralMultiHeadAttention(
            num_heads=st_ds_conf['num_heads'],
            input_dim=emb_sz,
            total_attention_dim=emb_sz,
            total_value_dim=emb_sz,
            attend_to_dim=emb_sz,
            output_dim=emb_sz,
            attention_dropout=st_ds_conf['attention_dropout'],
            use_future_blinding=False,
        )
        attn = SingleTokenMHAttentionWrapper(attn)
    elif attn_type == "none":
        attn = None
    else:
        assert False

    return attn
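
A minimal usage sketch for the factory above; the config keys are exactly the ones the function reads, the values are illustrative, and the wrapper classes (AllenNLPAttentionWrapper and friends) are project-specific:

st_ds_conf = {
    'emb_sz': 256,              # shared encoder/decoder output size
    'attention_dropout': 0.1,
    'num_heads': 4,             # only read by the "multihead" branch
}
attn = get_attention(st_ds_conf, "dot_product")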
Example #7
    def __init__(self,
                 input_dim,
                 dropout=0.0,
                 use_ffnn=True,
                 query_dim=None,
                 activation='tanh'):
        super(Attention, self).__init__()

        self.use_ffnn = use_ffnn

        if self.use_ffnn:
            self.ffnn = FeedForward(input_dim=input_dim,
                                    num_layers=1,
                                    hidden_dims=query_dim,
                                    activations=get_activation(activation),
                                    dropout=0)
        else:
            query_dim = input_dim

        # Dot product attention
        self.attention = DotProductAttention(normalize=True)

        # Event-specific attention vector
        # (input_dim)
        self.vector = Parameter(torch.Tensor(query_dim))
        torch.nn.init.normal_(self.vector)

        # Dropout
        self.drop_layer = nn.Dropout(p=dropout)
Example #8
def build_seq2seq_model(flags,
                        data_reader,
                        vocab: Vocabulary,
                        source_namespace: str = 'source_tokens',
                        target_namespace: str = 'target_tokens') -> Model:
    source_embedding = Embedding(
        vocab.get_vocab_size(namespace=source_namespace),
        embedding_dim=flags.source_embedding_dim)
    source_embedder = BasicTextFieldEmbedder({'tokens': source_embedding})
    lstm_encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(flags.source_embedding_dim,
                      flags.encoder_hidden_dim,
                      batch_first=True,
                      bidirectional=flags.encoder_bidirectional))
    attention = DotProductAttention()
    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          lstm_encoder,
                          flags.max_decode_length,
                          target_embedding_dim=flags.decoder_hidden_dim,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=flags.beam_size,
                          use_bleu=True)
    return model
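
flags only needs the attributes the builder reads above; a hedged sketch of calling it with a plain namespace (all values are illustrative, and vocab is assumed to be an AllenNLP Vocabulary built elsewhere):

from argparse import Namespace

flags = Namespace(source_embedding_dim=128,
                  encoder_hidden_dim=256,
                  encoder_bidirectional=True,
                  max_decode_length=50,
                  decoder_hidden_dim=256,
                  beam_size=4)
model = build_seq2seq_model(flags, data_reader=None, vocab=vocab)  # data_reader is unused in the snippet above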
Example #9
def get_attention(st_ds_conf, attn_type):
    emb_sz = st_ds_conf['emb_sz']  # dim for both the decoder output and the encoder output
    attn_type = attn_type.lower()
    if attn_type == "bilinear":
        attn = BilinearAttention(vector_dim=emb_sz, matrix_dim=emb_sz)
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "dot_product":
        attn = DotProductAttention()
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "multihead":
        attn = GeneralMultiHeadAttention(
            num_heads=st_ds_conf['num_heads'],
            input_dim=emb_sz,
            total_attention_dim=emb_sz,
            total_value_dim=emb_sz,
            attend_to_dim=emb_sz,
            output_dim=emb_sz,
            attention_dropout=st_ds_conf['attention_dropout'],
            use_future_blinding=False,
        )
        attn = SingleTokenMHAttentionWrapper(attn)
    elif attn_type == "none":
        attn = None
    else:
        assert False

    return attn
Example #10
 def __init__(self) -> None:
     self.turn_num = 0
     self.past_hidden_states = []
     self.past_cell_states = []
     self.encoder_outputs = []
     self.past_dec_hidden_states = []
     self.past_dec_cell_states = []
     self.attention = DotProductAttention()
Example #11
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 text_encoder: Seq2SeqEncoder,
                 classifier_feedforward: Optional[FeedForward] = None,
                 dropout: Optional[float] = 0.0,
                 code_switching_regularizer: Optional[float] = 0.0,
                 bivalency_regularizer: Optional[float] = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        '''
        :param dropout: The amount of dropout to apply. Dropout is applied
                        after each non-linear layer and the word embedding
                        lookup. Two types of dropout are used: variational
                        dropout is applied if the input to the dropout layer
                        is a sequence of vectors (each vector in the sequence
                        representing a word), and standard dropout if the
                        input is a single vector.
        :param code_switching_regularizer: The weight associated with the code
                                           switching lexicon regularisation;
                                           the lower the weight, the less
                                           effect it has. This requires the
                                           dataset reader to supply the code
                                           switching arrays to the forward
                                           function of this class. If set, a
                                           good value is 0.001.
        :param bivalency_regularizer: The weight associated with the bivalency
                                      regularisation; the lower the weight,
                                      the less effect it has. This requires
                                      the dataset reader to supply the
                                      bivalency arrays to the forward function
                                      of this class.
        '''
        super().__init__(vocab, regularizer)
        self._naive_dropout = Dropout(dropout)
        self._variational_dropout = InputVariationalDropout(dropout)

        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.text_encoder = text_encoder
        text_encoder_dim = text_encoder.get_output_dim()
        # Attention parameters
        self.project_encoded_text = TimeDistributed(
            Linear(text_encoder_dim, text_encoder_dim))
        self.attention_vector = Parameter(torch.Tensor(text_encoder_dim))
        self.reset_parameters()
        self.attention_layer = DotProductAttention(normalize=True)

        self.classifier_feedforward = classifier_feedforward
        output_dim = text_encoder_dim
        if classifier_feedforward:
            output_dim = classifier_feedforward.get_output_dim()
        self.label_projection = Linear(output_dim, self.num_classes)
        self.metrics = {"accuracy": CategoricalAccuracy()}
        self.code_switching_regularizer = code_switching_regularizer
        self.bivalency_regularizer = bivalency_regularizer
        self.loss = torch.nn.CrossEntropyLoss()
        initializer(self)
Example #12
def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, projection_dim=128, feedforward_hidden_dim=128, num_layers=1, num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
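
As a side note, SimpleSeq2SeqPredictor also accepts a raw source sentence through its predict method; a small hedged sketch (the sentence is illustrative):

predictor = SimpleSeq2SeqPredictor(model, reader)
print(predictor.predict("I like apples .")['predicted_tokens'])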
Example #13
 def __init__(self,
              vocab: Vocabulary,
              text_field_embedder: TextFieldEmbedder,
              shared_encoder: Seq2VecEncoder,
              private_encoder: Seq2VecEncoder,
              input_dropout: float = 0.0,
              regularizer: RegularizerApplicator = None) -> None:
     super(CNNEncoder, self).__init__(vocab, regularizer)
     self._text_field_embedder = text_field_embedder
     self._shared_encoder = shared_encoder
     self._private_encoder = private_encoder
     # self._U = nn.Linear()
     self._attention = DotProductAttention()
     self._input_dropout = Dropout(input_dropout)
Example #14
    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 num_layers: int = 1,
                 attention: Attention = DotProductAttention(),
                 input_feeding: bool = True,
                 residual: Union[bool, List[bool]] = False,
                 inter_layer_dropout: float = 0.1,
                 weight_dropout: float = 0.0,
                 rnn: str = 'LSTM'):
        super().__init__(input_size=input_size,
                         hidden_size=hidden_size,
                         num_layers=num_layers,
                         residual=residual,
                         inter_layer_dropout=inter_layer_dropout,
                         weight_dropout=weight_dropout,
                         rnn=rnn)

        self.attention = attention
        self.input_feeding = input_feeding
        self.fuse_attention = nn.Linear(2 * hidden_size, hidden_size)
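
With input_feeding=True, the usual Luong-style pattern is to fuse the attention context with the decoder output through fuse_attention before feeding it into the next step. A sketch of that standard pattern (not the author's forward code; sizes are illustrative):

import torch
import torch.nn as nn

hidden_size = 512
fuse_attention = nn.Linear(2 * hidden_size, hidden_size)
dec_out = torch.rand(8, hidden_size)     # decoder RNN output for one step
context = torch.rand(8, hidden_size)     # attention-weighted encoder states
fused = torch.tanh(fuse_attention(torch.cat([dec_out, context], dim=-1)))  # (8, hidden_size)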
Example #15
    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 num_layers: int = 1,
                 attention: Attention = DotProductAttention(),
                 residual: Union[bool, List[bool]] = False,
                 inter_layer_dropout: float = 0.1,
                 weight_dropout: float = 0.0,
                 output_size: int = None,
                 rnn: str = 'LSTM'):
        super().__init__(input_size=input_size,
                         hidden_size=hidden_size,
                         num_layers=num_layers,
                         residual=residual,
                         inter_layer_dropout=inter_layer_dropout,
                         weight_dropout=weight_dropout,
                         rnn=rnn)

        self.attention = attention
        self.output_size = output_size
        if self.output_size is not None:
            self.linear = nn.Linear(self._hidden_size + self.get_input_dim(),
                                    self.output_size)
Example #16
en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=EN_EMBEDDING_DIM)
# encoder = PytorchSeq2SeqWrapper(
#     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                      hidden_dim=HIDDEN_DIM,
                                      projection_dim=128,
                                      feedforward_hidden_dim=128,
                                      num_layers=1,
                                      num_attention_heads=8)

source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

# attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
# attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
attention = DotProductAttention()

max_decoding_steps = 800
model = SimpleSeq2Seq(vocab,
                      source_embedder,
                      encoder,
                      max_decoding_steps,
                      target_embedding_dim=ZH_EMBEDDING_DIM,
                      target_namespace='target_tokens',
                      attention=attention,
                      beam_size=12,
                      use_bleu=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
Example #17
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 text_encoder: Seq2SeqEncoder,
                 target_encoder: Seq2VecEncoder,
                 feedforward: Optional[FeedForward] = None,
                 target_field_embedder: Optional[TextFieldEmbedder] = None,
                 target_concat_text_embedding: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 word_dropout: float = 0.0,
                 dropout: float = 0.0) -> None:
        '''
        :param vocab: A Vocabulary, required in order to compute sizes
                      for input/output projections.
        :param text_field_embedder: Used to embed the text and target text if
                                    target_field_embedder is None but the 
                                    target_encoder is not None.
        :param text_encoder: Sequence Encoder that will create the 
                             representation of each token in the context 
                             sentence.
        :param target_encoder: Encoder that will create the representation of 
                               target text tokens.
        :param feedforward: An optional feed forward layer. It is applied after
                            the text encoder if the target encoder is None;
                            otherwise it is applied after the target and text
                            encoded representations have been concatenated.
        :param target_field_embedder: Used to embed the target text to give as
                                      input to the target_encoder. This allows
                                      a separate embedding for the text and the
                                      target text.
        :param target_concat_text_embedding: Whether or not the target should be
                                             concatenated to each word
                                             embedding within the text before
                                             being encoded.
        :param initializer: Used to initialize the model parameters.
        :param regularizer: If provided, will be used to calculate the 
                            regularization penalty during training.
        :param word_dropout: Dropout that is applied after the embedding of the
                             tokens/words. It will drop entire words with this
                             probability.
        :param dropout: The amount of dropout to apply after each layer apart
                        from the last layer. Dropout applied to time-based data
                        is `variational dropout`_; all other dropout is
                        standard dropout.
        
        This class is based on the following paper: `Attention-based
        LSTM for Aspect-level Sentiment Classification
        <https://www.aclweb.org/anthology/D16-1058>`_. The default model here
        is equivalent to the AT-LSTM in that paper (Figure 2). If the
        `target_concat_text_embedding` argument is `True` then the model becomes
        the ATAE-LSTM from the cited paper (Figure 3).

        The only difference between this model and the attention-based models
        in the paper is that the final sentence representation is `r` rather
        than `h* = tanh(Wpr + WxhN)`, as we found this projection did not help
        performance.

        .. _variational dropout:
           https://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks.pdf
        '''
        super().__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.target_field_embedder = target_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.text_encoder = text_encoder
        self.target_encoder = target_encoder
        self.feedforward = feedforward

        target_text_encoder_dim = (target_encoder.get_output_dim() +
                                   text_encoder.get_output_dim())
        self.encoded_target_text_fusion = TimeDistributed(
            Linear(target_text_encoder_dim, target_text_encoder_dim))
        self.attention_vector = Parameter(
            torch.Tensor(target_text_encoder_dim))
        self.attention_layer = DotProductAttention(normalize=True)

        if feedforward is not None:
            output_dim = self.feedforward.get_output_dim()
        else:
            output_dim = text_encoder.get_output_dim()
        self.label_projection = Linear(output_dim, self.num_classes)
        self.metrics = {"accuracy": CategoricalAccuracy()}
        self.f1_metrics = {}
        # F1 Scores
        label_index_name = self.vocab.get_index_to_token_vocabulary('labels')
        for label_index, label_name in label_index_name.items():
            label_name = f'F1_{label_name.capitalize()}'
            self.f1_metrics[label_name] = F1Measure(label_index)

        self._word_dropout = WordDrouput(word_dropout)
        self._variational_dropout = InputVariationalDropout(dropout)
        self._naive_dropout = Dropout(dropout)

        self.target_concat_text_embedding = target_concat_text_embedding
        self.loss = torch.nn.CrossEntropyLoss()

        # Ensure the text encoder has the correct input dimension
        if target_concat_text_embedding:
            text_encoder_expected_in = (text_field_embedder.get_output_dim() +
                                        target_encoder.get_output_dim())
            check_dimensions_match(
                text_encoder_expected_in, text_encoder.get_input_dim(),
                "text field embedding dim + target encoder output dim",
                "text encoder input dim")
        else:
            check_dimensions_match(text_field_embedder.get_output_dim(),
                                   text_encoder.get_input_dim(),
                                   "text field embedding dim",
                                   "text encoder input dim")
        # Ensure that the dimensions of the target or text field embedder and
        # the target encoder match
        target_field_embedder_dim = text_field_embedder.get_output_dim()
        target_field_error = "text field embedding dim"
        if self.target_field_embedder:
            target_field_embedder_dim = target_field_embedder.get_output_dim()
            target_field_error = "target field embedding dim"

        check_dimensions_match(target_field_embedder_dim,
                               target_encoder.get_input_dim(),
                               target_field_error, "target encoder input dim")
        self.reset_parameters()
        initializer(self)
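
The docstring above says the final sentence representation is the attention-weighted sum r over the encoded text, with weights computed from the fused target-and-text vectors. A hedged sketch of that pooling step using the components defined above (shapes are illustrative; this is the standard pattern, not the exact forward code):

import torch
from allennlp.modules.attention import DotProductAttention

batch, seq_len = 8, 30
text_dim, target_dim = 400, 200                      # illustrative encoder output sizes
encoded_text = torch.rand(batch, seq_len, text_dim)  # text_encoder output
fused = torch.rand(batch, seq_len, text_dim + target_dim)  # fusion of encoded text and target
mask = torch.ones(batch, seq_len).bool()

attention_vector = torch.rand(text_dim + target_dim)  # the learned query parameter
attention_layer = DotProductAttention(normalize=True)

weights = attention_layer(attention_vector.expand(batch, -1), fused, mask)  # (batch, seq_len)
r = weights.unsqueeze(1).bmm(encoded_text).squeeze(1)  # (batch, text_dim) sentence representation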
Example #18
def run(trainp="overnight/calendar_train_delex.tsv",
        testp="overnight/calendar_test_delex.tsv",
        batsize=8,
        embdim=50,
        encdim=50,
        maxtime=100,
        lr=.001,
        gpu=0,
        cuda=False, epochs=20):
    tt = q.ticktock("script")
    tt.tick("loading data")
    def tokenizer(x: str, splitter: WordSplitter = None) -> List[str]:
        return [xe.text for xe in splitter.split_words(x)]

    reader = OvernightReader(partial(tokenizer, splitter=JustSpacesWordSplitter()),
                             partial(tokenizer, splitter=JustSpacesWordSplitter()),
                             SingleIdTokenIndexer(namespace="nl_tokens"),
                             SingleIdTokenIndexer(namespace="fl_tokens"))
    trainds = reader.read(trainp)
    testds = reader.read(testp)
    tt.tock("data loaded")

    tt.tick("building vocabulary")
    vocab = Vocabulary.from_instances(trainds)
    tt.tock("vocabulary built")

    tt.tick("making iterator")
    iterator = BucketIterator(sorting_keys=[("nl", "num_tokens"), ("fl", "num_tokens")],
                              batch_size=batsize,
                              biggest_batch_first=True)
    iterator.index_with(vocab)
    batch = next(iter(iterator(trainds)))
    #print(batch["id"])
    #print(batch["nl"])
    tt.tock("made iterator")

    # region model
    nl_emb = Embedding(vocab.get_vocab_size(namespace="nl_tokens"),
                       embdim, padding_index=0)
    fl_emb = Embedding(vocab.get_vocab_size(namespace="fl_tokens"),
                       embdim, padding_index=0)
    nl_field_emb = BasicTextFieldEmbedder({"tokens": nl_emb})
    fl_field_emb = BasicTextFieldEmbedder({"tokens": fl_emb})

    encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(embdim, encdim, bidirectional=True, batch_first=True))
    attention = DotProductAttention()

    smodel = Seq2Seq(vocab, nl_field_emb, encoder, maxtime,
                  target_embedding_dim=embdim,
                  attention=attention,
                  target_namespace='fl_tokens',
                  beam_size=1,
                  use_bleu=True)

    smodel_out = smodel(batch["nl"], batch["fl"])

    optim = torch.optim.Adam(smodel.parameters(), lr=lr)
    trainer = Trainer(model=smodel,
                      optimizer=optim,
                      iterator=iterator,
                      train_dataset=trainds,
                      validation_dataset=testds,
                      num_epochs=epochs,
                      cuda_device=gpu if cuda else -1)

    metrics = trainer.train()

    sys.exit()
    class MModel(Model):
        def __init__(self, nlemb:Embedding,
                           flemb:Embedding,
                            vocab:Vocabulary,
                     **kwargs):
            super(MModel, self).__init__(vocab, **kwargs)
            self.nlemb, self.flemb = nlemb, flemb

        @overrides
        def forward(self,
                    nl:Dict[str, torch.Tensor],
                    fl:Dict[str, torch.Tensor],
                    id:Any):
            nlemb = self.nlemb(nl["tokens"])
            flemb = self.flemb(fl["tokens"])
            print(nlemb.size())
            pass

    m = MModel(nl_emb, fl_emb, vocab)
    batch = next(iter(iterator(trainds)))
    out = m(**batch)
Example #19
def main():
    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"
    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)
    # TokenIndexer Determines how string tokens gets represented as arrays of indexes in a model
    # SingleIdTokenIndexer = Tokens are single integers
    # TokenCharactersIndexer = Tokens as a list of integers
    # Read a tsvfile with paired instances (source, target)
    reader = CopyNetDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),  # Defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_namespace='tokens'  # Defaults to source_token_indexers
    )

    # Each of the dataset is a list of each tokens (source_tokens, target_tokens)
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)
    """
    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(set(trainExtraVocab + validExtraVocab + testExtraVocab))
    print("length:", len(finalExtraVocab))
    # input()
    """
    # vocab = Vocabulary.from_instances(train_dataset + validation_dataset, min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset +
                                      test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099

    print("Vocab SIze :", vocab.get_vocab_size('tokens'))

    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)

    # Embedding for tokens since in the dataset creation time it is mentioned tokens
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(ENC_EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      dropout=0.2))

    Attention = DotProductAttention()
    print(Attention)

    max_decoding_steps = 4  # TODO: make this variable

    model = CopyNetSeq2Seq(
        vocab,
        source_embedder,
        encoder,
        max_decoding_steps=max_decoding_steps,
        target_embedding_dim=TGT_EMBEDDING_DIM,
        # target_namespace = 'target_tokens',
        beam_size=beamSize,
        attention=Attention)
    # Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # Data Iterator that specify how to batch our dataset
    # Takes data shuffles it and creates fixed sized batches
    # iterator = BasicIterator(batch_size=2)
    # iterator.index_with(vocab)
    # Pads batches wrt max input lengths per batch, sorts dataset wrt the fieldnames and padding keys provided for efficient computations
    iterator = BucketIterator(batch_size=50,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        # patience = 3,
        num_epochs=numEpochs,
        cuda_device=CUDA_DEVICE)

    trainer.train()
    """
Example #20
def get_model_fn(model_name,
                 embeddings,
                 vocab,
                 input_dims=128,
                 hidden_dims=128,
                 dataset_name='debug',
                 max_len=40,
                 **kwargs):
    is_seq2seq = seq2seq_models.get(model_name, False)

    wrapped = None

    # TODO: Factory or something else.
    encoder_args = {
        'lstm': [
            input_dims,
            hidden_dims,
        ],
        'transformer': [
            input_dims,
            hidden_dims,
            input_dims,
            input_dims,
            1,
            4,
            # TODO: add more if you need.
        ]
    }

    encoder_arg = encoder_args[model_name]

    if is_seq2seq:
        # default use LSTM
        wrapped_fn = seq2seq_wrapped[model_name]
        seq_model_fn = seq2seq_model_fn[model_name]

        if wrapped_fn is not None:
            wrapped = wrapped_fn(seq_model_fn(*encoder_arg, **kwargs))
        else:
            wrapped = seq_model_fn(*encoder_arg)

    # TODO: get a factory or something
    if 'nc_zhen' not in dataset_name:
        model_args = {
            'word_embeddings': embeddings,
            'encoder': wrapped,
            'vocab': vocab,
            'output_feature_key': output_feature_keys[dataset_name],
            'max_len': max_len,
            'hidden_size': hidden_dims
        }
    else:
        model_args = {
            'vocab': vocab,
            'source_embedder': embeddings,
            'encoder': wrapped,
            'max_decoding_steps': 20,  # arbitrary
            'attention': DotProductAttention(),  # arbitrary
            'beam_size': 8,  #arbitrary
        }

    model = dataset_model[dataset_name](**model_args)
    return output_feature_keys[dataset_name], model
Example #21
    def setUp(self):
        self.sample_only = False
        # self.setupstubexecutor()

        model_params_file_path = self.TEST_DATA_ROOT / "experiment.json"
        self.dataset_sample_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v3.deurified.simple.sample.json"
        self.dataset_train_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v3.train.json"
        self.dataset_test_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v3.test.json"
        predicates_file_path = self.TEST_DATA_ROOT / "properties.txt"
        with codecs.open(predicates_file_path) as fp:
            self.predicates = [i.strip() for i in fp]

        dbo_classes = set([
            dbo for dbo in self.predicates if dbo.split("/")[-1][0].isupper()
        ])
        binary_predicates = set(self.predicates) - dbo_classes

        if self.sample_only:
            self.sample_reader = LCQuADReaderSimple(
                predicates=binary_predicates, ontology_types=dbo_classes)
        else:
            self.train_reader = LCQuADReaderSimple(
                predicates=binary_predicates, ontology_types=dbo_classes)
            # self.test_reader = LCQuADReaderSimple(predicates=binary_predicates, ontology_types=dbo_classes)

        # sample_reader.cache_data("sample_dataset")
        # train_reader.cache_data("train_dataset")
        # test_reader.cache_data("test_dataset")

        if self.sample_only:
            self.sample_instances = list(
                self.sample_reader.read(str(self.dataset_sample_file_path)))
        else:
            self.train_instances = list(
                self.train_reader.read(str(self.dataset_train_file_path)))
            self.test_instances = list(
                self.train_reader.read(str(self.dataset_test_file_path)))

        if self.sample_only:
            self.vocab = Vocabulary.from_instances(self.sample_instances)
        else:
            self.vocab = Vocabulary.from_instances(self.train_instances +
                                                   self.test_instances,
                                                   min_count={
                                                       'tokens': 3,
                                                       'target_tokens': 3
                                                   })
            #min_count={'tokens': 3, 'target_tokens': 3})

        #self.vocab = Vocabulary()

        token_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size('tokens') + 2,
            embedding_dim=512,
            padding_index=0)

        #options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
        #weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

        # the embedder maps the input tokens to the appropriate embedding matrix
        #elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
        #word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

        word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        encoder = PytorchSeq2SeqWrapper(
            nn.LSTM(input_size=word_embeddings.get_output_dim(),
                    num_layers=2,
                    hidden_size=256,
                    bidirectional=True,
                    dropout=0.5,
                    batch_first=True))

        val_outputs = self.TEST_DATA_ROOT / "val_outputs.seq2seq.json"

        self.val_outputs_fp = codecs.open(val_outputs, 'w')

        # self.set_up_model(model_params_file_path, dataset_sample_file_path)
        self.model = SimpleSeq2Seq(vocab=self.vocab,
                                   source_embedder=word_embeddings,
                                   encoder=encoder,
                                   target_embedding_dim=128,
                                   target_namespace='target_tokens',
                                   attention=DotProductAttention(),
                                   max_decoding_steps=25,
                                   beam_size=5,
                                   use_bleu=True,
                                   scheduled_sampling_ratio=0.3)

        self.model.cuda(0)
Example #22
def main():
    target_namespace = "target_tokens"
    if not USE_COPY:
        reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={
                'tokens': SingleIdTokenIndexer(namespace=target_namespace)
            })
    else:
        reader = CopyNetDatasetReader(
            source_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_namespace=target_namespace)
    train_dataset = reader.read('./data/data_train.tsv')
    validation_dataset = reader.read('./data/data_val.tsv')

    vocab = Vocabulary.from_instances(train_dataset,
                                      min_count={
                                          'tokens': 3,
                                          'target_tokens': 3
                                      })

    en_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=SRC_EMBEDDING_DIM,
        pretrained_file="../opennmt/glove_dir/glove.840B.300d.txt")
    assert en_embedding.weight.requires_grad
    datas = _read_pretrained_embeddings_file(en_embedding._pretrained_file,
                                             SRC_EMBEDDING_DIM, vocab)
    datas.requires_grad = True
    en_embedding.weight.data = datas
    print(en_embedding.weight.data)
    assert en_embedding.weight.requires_grad
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(SRC_EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      bidirectional=True,
                      dropout=0.3,
                      num_layers=1))
    #encoder = StackedSelfAttentionEncoder(input_dim=SRC_EMBEDDING_DIM,
    #                                      hidden_dim=HIDDEN_DIM,
    #                                      projection_dim=128, feedforward_hidden_dim=128,
    #                                      num_layers=1, num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
    attention = DotProductAttention()

    if not USE_COPY:
        model = SimpleSeq2Seq(vocab,
                              source_embedder,
                              encoder,
                              MAX_DECODING_STEPS,
                              target_embedding_dim=TGT_EMBEDDING_DIM,
                              target_namespace='target_tokens',
                              attention=attention,
                              beam_size=8,
                              use_bleu=True)
    else:
        model = MyCopyNet(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps=MAX_DECODING_STEPS,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=8,
                          tgt_embedder_pretrain_file=
                          "../opennmt/glove_dir/glove.840B.300d.txt")
    model.to(torch.device('cuda'))
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[("source_tokens", "num_tokens")],
                              padding_noise=0.2)

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=22,
                      patience=4,
                      serialization_dir="./checkpoints",
                      cuda_device=CUDA_DEVICE,
                      summary_interval=100)
    trainer.train()
    print(en_embedding.weight.data)
    predictor = Seq2SeqPredictor(model, reader)

    # Dump all predictions to a file
    # TODO (DNGros): Is there an automatic way in allennlp to do this??
    pred_toks = []
    with open("pred.txt", "w") as outfile:
        for instance in tqdm(validation_dataset):
            pred = predictor.predict_instance(instance)
            toks = pred['predicted_tokens']
            if toks:
                outfile.write(" ".join(toks[0]) + "\n")
            else:
                outfile.write("" + "\n")
Example #23
from torch.autograd import Variable


reader = CopyNetDatasetReader(target_namespace="trg")
train_dataset = reader.read('data/train.tsv')
train_loader = PyTorchDataLoader(train_dataset, batch_size=8, shuffle=True)
vocab = Vocabulary.from_instances(train_dataset)
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
TARGET_EMBEDDING_DIM = 512

token_embedding = Embedding(embedding_dim=EMBEDDING_DIM, num_embeddings=vocab.get_vocab_size(namespace="tokens"))
word_embedding = BasicTextFieldEmbedder({"token": token_embedding})

bi_rnn_encoder = RnnSeq2SeqEncoder(EMBEDDING_DIM, HIDDEN_DIM, 2, bidirectional=True)
dot_attn = DotProductAttention()
model = CopyNetSeq2Seq(vocab, word_embedding, bi_rnn_encoder, dot_attn,
                       target_namespace="trg", target_embedding_dim=TARGET_EMBEDDING_DIM)

with tempfile.TemporaryDirectory() as serialization_dir:
    parameters = [
        [n, p]
        for n, p in model.named_parameters() if p.requires_grad
    ]
    optimizer = AdamOptimizer(parameters)
    trainer = GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=None,
        num_epochs=5,
    )
Example #24
    def setUp(self):
        self.sample_only = False
        self.setUpExecutor()
        # self.setupstubexecutor()

        model_params_file_path = self.TEST_DATA_ROOT / "experiment.json"
        self.dataset_sample_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v2.deurified.sample.json"
        self.dataset_train_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v2.deurified.train.json"
        self.dataset_test_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v2.deurified.test.json"
        predicates_file_path = self.TEST_DATA_ROOT / "properties.txt"
        with codecs.open(predicates_file_path) as fp:
            self.predicates = [i.strip() for i in fp]

        dbo_classes = set([
            dbo for dbo in self.predicates if dbo.split("/")[-1][0].isupper()
        ])
        binary_predicates = set(self.predicates) - dbo_classes

        token_indexer = None  #{'tokens': ELMoTokenCharactersIndexer()}

        if self.sample_only:
            sample_reader = LCQuADReader(executor=self.executor,
                                         predicates=binary_predicates,
                                         token_indexers=token_indexer,
                                         ontology_types=dbo_classes)
        else:
            train_reader = LCQuADReader(executor=self.executor,
                                        predicates=binary_predicates,
                                        token_indexers=token_indexer,
                                        ontology_types=dbo_classes)
            test_reader = LCQuADReader(executor=self.executor,
                                       predicates=binary_predicates,
                                       token_indexers=token_indexer,
                                       ontology_types=dbo_classes)

        # sample_reader.cache_data("sample_dataset")
        # train_reader.cache_data("train_dataset")
        # test_reader.cache_data("test_dataset")

        if self.sample_only:
            self.sample_instances = list(
                sample_reader.read(str(self.dataset_sample_file_path)))
        else:
            self.train_instances = list(
                train_reader.read(str(self.dataset_train_file_path)))
            self.test_instances = list(
                test_reader.read(str(self.dataset_test_file_path)))

        if self.sample_only:
            self.vocab = Vocabulary.from_instances(self.sample_instances)
        else:
            self.vocab = Vocabulary.from_instances(self.train_instances +
                                                   self.test_instances)

        #self.vocab = Vocabulary()

        token_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size() + 2,
            embedding_dim=256,
            padding_index=0)

        #options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
        #weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

        # the embedder maps the input tokens to the appropriate embedding matrix
        #elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
        #word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

        word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        encoder = PytorchSeq2SeqWrapper(
            nn.LSTM(
                input_size=word_embeddings.get_output_dim(),
                num_layers=1,
                hidden_size=128,
                bidirectional=True,
                # dropout=0.4,
                batch_first=True))

        val_outputs = self.TEST_DATA_ROOT / "val_outputs.json"

        self.val_outputs_fp = codecs.open(val_outputs, 'w')

        # self.set_up_model(model_params_file_path, dataset_sample_file_path)
        self.model = LCQuADMmlSemanticParser(
            vocab=self.vocab,
            sentence_embedder=word_embeddings,
            action_embedding_dim=256,
            encoder=encoder,
            attention=DotProductAttention(),
            decoder_beam_search=BeamSearch(beam_size=1),
            max_decoding_steps=50,
            dropout=0.5,
            val_outputs=self.val_outputs_fp)
        self.model.cuda(0)
Example #25
    def __init__(
            self,
            vocab: Vocabulary,
            model_name: str,
            decoder: DecoderNet,
            decoder_type: str = "lstm",  # `lstm` / `transformer`
            decoder_num_layers: int = 1,
            share_decoder_params: bool = True,  # valid for `transformer`
            text_field_embedder: TextFieldEmbedder = None,
            start_token: str = "[CLS]",
            end_token: str = "[SEP]",
            index_name: str = "bert",
            beam_size: int = 4,
            min_dec_len: int = 4,
            max_dec_len: int = 30,
            coverage_factor: float = 0.0,
            device: Union[int, str, List[int]] = -1,
            trainable: bool = True,  # whether the BERT parameters are trainable
            metrics: Optional[List[Metric]] = None,
            valid_metric_keys: List[str] = None,
            seed: int = 42,
            initializer: InitializerApplicator = InitializerApplicator(),
            regularizer: RegularizerApplicator = None):
        super().__init__(vocab, regularizer)

        # ---------- define the encoder and get its output dimension -------------
        if model_name is None and text_field_embedder is None:
            raise ValueError(
                "`model_name` and `text_field_embedder` cannot both be None."
            )

        # for a pretrained model, this component acts as the encoder
        self._text_field_embedder = text_field_embedder or BasicTextFieldEmbedder(
            {
                index_name:
                PretrainedChineseBertEmbedder(model_name,
                                              train_parameters=trainable,
                                              return_all=False,
                                              output_hidden_states=False)
            })

        # store the output dimension of the BERT encoder
        self.encoder_output_dim = self._text_field_embedder.get_output_dim()

        # ---------- common initialisation -------------
        self.common_init(self.encoder_output_dim, decoder, decoder_type,
                         decoder_num_layers, share_decoder_params, start_token,
                         end_token, index_name, beam_size, min_dec_len,
                         max_dec_len, coverage_factor, device, metrics,
                         valid_metric_keys, seed, initializer)

        # ---------- encoder-specific initialisation -------------
        # since the encoder is BERT, keep a reference to its embedding layer;
        # for ALBERT there is also a mapping from the embedding size to the hidden size
        bert_token_embedder = self._text_field_embedder._token_embedders[
            index_name]
        self.bert_type = model_name or bert_token_embedder.model_name  # get the model name
        self.word_embeddings = bert_token_embedder.transformer_model.get_input_embeddings(
        )
        if "albert" in self.bert_type:
            # mapping from the embedding layer to the hidden layer
            self.embedding_to_hidden = bert_token_embedder.transformer_model.encoder.embedding_hidden_mapping_in

        # if the decoder is an LSTM, attention is used to compute its initial state
        # (this is not needed when the encoder is also an LSTM)
        if self.params["decoder_type"] == "lstm":
            self.h_query = torch.nn.Parameter(torch.randn(
                [self.encoder_output_dim]),
                                              requires_grad=True)
            self.c_query = torch.nn.Parameter(torch.randn(
                [self.encoder_output_dim]),
                                              requires_grad=True)
            # when the encoder is a transformer and the decoder is an LSTM, the LSTM's initial state must be computed
            self.init_attention = DotProductAttention()
Example #26
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': elmo_token_indexer},
        target_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='target_tokens')
        })

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname) for fname in
        ["train_all_seq.txt", "test_all_seq.txt", "val_all_seq.txt"])

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset +
                                      test_dataset,
                                      min_count={
                                          'tokens': 1,
                                          'target_tokens': 1
                                      })

    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                              embedding_dim=256)
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    # embedding_dim=elmo_embedding_dim)
    #elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    # embedding_dim=256)
    source_embedder = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    #Initializing the model
    max_decoding_steps = 20
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True))

    # encoder = StackedSelfAttentionEncoder(input_dim=elmo_embedding_dim, hidden_dim=hidden_dim, projection_dim=128, feedforward_hidden_dim=128, num_layers=1, num_attention_heads=8)
    attention = DotProductAttention()

    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps,
                          target_embedding_dim=elmo_embedding_dim,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)

    if USE_GPU:
        model.cuda()

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=1,
                      cuda_device=0 if USE_GPU else -1)

    for i in range(20):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(dev_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:',
                  predictor.predict_instance(instance)['predicted_tokens'])

    #Saving the model
    with open("model_seq2seq.th", 'wb') as f:
        torch.save(model.state_dict(), f)

    vocab.save_to_files("vocabulary_seq2seq")
    predictor = SimpleSeq2SeqPredictor(model, reader)
    with open('predict_seq2seq.txt', 'w+') as f:
        for instance in itertools.islice(test_dataset, 10):
            preds = predictor.predict_instance(instance)['predicted_tokens']
            f.write(" ".join(preds) + "\n")
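
Since the script above saves the model weights and vocabulary, here is a minimal sketch of how they could be reloaded for offline prediction; `build_model` is a hypothetical helper that would re-create the same SimpleSeq2Seq architecture as in main():

# Sketch only: reload the artefacts written by main() above.
import torch
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary.from_files("vocabulary_seq2seq")
model = build_model(vocab)  # hypothetical helper re-creating the SimpleSeq2Seq model
with open("model_seq2seq.th", "rb") as f:
    model.load_state_dict(torch.load(f))
model.eval()
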
Exemple #27
0
    def __init__(self,
                 vocab: Vocabulary,
                 encoder: Seq2SeqEncoder,
                 entity_encoder: Seq2VecEncoder,
                 decoder_beam_search: BeamSearch,
                 question_embedder: TextFieldEmbedder,
                 input_attention: Attention,
                 past_attention: Attention,
                 graph_attention: Attention,
                 max_decoding_steps: int,
                 action_embedding_dim: int,
                 enable_gating: bool = False,
                 ablation_mode: str = None,
                 gnn: bool = True,
                 graph_loss_lambda: float = 0.5,
                 decoder_use_graph_entities: bool = True,
                 decoder_self_attend: bool = True,
                 gnn_timesteps: int = 2,
                 pruning_gnn_timesteps: int = 2,
                 parse_sql_on_decoding: bool = True,
                 add_action_bias: bool = False,
                 use_neighbor_similarity_for_linking: bool = True,
                 dataset_path: str = 'dataset',
                 log_path: str = '',
                 training_beam_size: int = None,
                 decoder_num_layers: int = 1,
                 dropout: float = 0.0,
                 rule_namespace: str = 'rule_labels') -> None:
        super().__init__(vocab, encoder, entity_encoder, question_embedder, gnn_timesteps, dropout, rule_namespace)

        self.enable_gating = enable_gating
        self.ablation_mode = ablation_mode
        self._log_path = log_path
        self._max_decoding_steps = max_decoding_steps
        self._add_action_bias = add_action_bias

        self._parse_sql_on_decoding = parse_sql_on_decoding
        self._self_attend = decoder_self_attend
        self._decoder_use_graph_entities = decoder_use_graph_entities
        self._use_neighbor_similarity_for_linking = use_neighbor_similarity_for_linking

        self._action_padding_index = -1  # the padding value used by IndexField

        self._exact_match = Average()
        self._sql_evaluator_match = Average()
        self._action_similarity = Average()
        self._beam_hit = Average()

        self._action_embedding_dim = action_embedding_dim

        self._graph_loss_lambda = graph_loss_lambda

        num_actions = vocab.get_vocab_size(self._rule_namespace)
        if self._add_action_bias:
            input_action_dim = action_embedding_dim + 1
        else:
            input_action_dim = action_embedding_dim
        self._action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=input_action_dim)
        self._output_action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim)

        self._embedding_projector = torch.nn.Linear(question_embedder.get_output_dim(), self._embedding_dim, bias=False)
        self._bert_embedding_dim = question_embedder.get_output_dim()
        encoder_output_dim = self._encoder.get_output_dim() + self._embedding_dim

        self._neighbor_encoder = TimeDistributed(BagOfEmbeddingsEncoder(self._embedding_dim, averaged=True))

        self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
        self._first_attended_utterance = torch.nn.Parameter(torch.FloatTensor(encoder_output_dim))
        self._first_attended_output = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
        torch.nn.init.normal_(self._first_action_embedding)
        torch.nn.init.normal_(self._first_attended_utterance)
        torch.nn.init.normal_(self._first_attended_output)

        self._entity_type_decoder_embedding = Embedding(self._num_entity_types, action_embedding_dim)

        self._decoder_num_layers = decoder_num_layers

        self._beam_search = decoder_beam_search
        self._decoder_trainer = MaximumMarginalLikelihood(training_beam_size)

        self._graph_pruning = GraphPruning(3, self._embedding_dim, encoder.get_output_dim(), dropout,
                                           timesteps=pruning_gnn_timesteps)

        if decoder_self_attend:
            self._transition_function = AttendPastSchemaItemsTransitionFunction(encoder_output_dim=encoder_output_dim,
                                                                                action_embedding_dim=action_embedding_dim,
                                                                                input_attention=input_attention,
                                                                                past_attention=past_attention,
                                                                                enable_gating=self.enable_gating,
                                                                                ablation_mode=self.ablation_mode,
                                                                                predict_start_type_separately=False,
                                                                                add_action_bias=self._add_action_bias,
                                                                                dropout=dropout,
                                                                                num_layers=self._decoder_num_layers)
        else:
            self._transition_function = LinkingTransitionFunction(encoder_output_dim=encoder_output_dim,
                                                                  action_embedding_dim=action_embedding_dim,
                                                                  input_attention=input_attention,
                                                                  predict_start_type_separately=False,
                                                                  add_action_bias=self._add_action_bias,
                                                                  dropout=dropout,
                                                                  num_layers=self._decoder_num_layers)

        if self.enable_gating:
            self._graph_attention = graph_attention
        else:
            self._graph_attention = DotProductAttention()

        self._embedding_sim_attn = CosineMatrixAttention()

        # TODO: Remove hard-coded dirs
        self._evaluate_func = partial(evaluate,
                                      db_dir=os.path.join(dataset_path, 'database'),
                                      table=os.path.join(dataset_path, 'tables.json'),
                                      check_valid=False)
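
A small sketch of what the CosineMatrixAttention defined above computes: a cosine-similarity score between every question token embedding and every schema-entity embedding (all names and shapes here are illustrative assumptions):

import torch
from allennlp.modules.matrix_attention import CosineMatrixAttention

batch, num_tokens, num_entities, dim = 2, 12, 5, 32
question_embeddings = torch.randn(batch, num_tokens, dim)
entity_embeddings = torch.randn(batch, num_entities, dim)

# similarity: (batch, num_tokens, num_entities), values in [-1, 1]
similarity = CosineMatrixAttention()(question_embeddings, entity_embeddings)
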
Exemple #28
0
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 label_namespace: str = "labels",
                 encoder: Optional[Seq2VecEncoder] = None,
                 seq_encoder: Optional[Seq2SeqEncoder] = None,
                 feedforward: Optional[FeedForward] = None,
                 dropout: Optional[float] = None,
                 incl_neutral: Optional[bool] = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)
        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_labels = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder

        self.seq_encoder = seq_encoder
        if self.seq_encoder is not None:
            self.attention_vector = Parameter(torch.Tensor(self.seq_encoder.get_output_dim()))
            self.attention_layer = DotProductAttention(normalize=True)
    
        embedding_output_dim = self.text_field_embedder.get_output_dim()
        
        if dropout is not None:
            self.dropout = torch.nn.Dropout(dropout)
            self.variational_dropout = InputVariationalDropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        elif encoder is not None:
            output_dim = self.encoder.get_output_dim()
        elif seq_encoder is not None:
            output_dim = self.seq_encoder.get_output_dim()
        else:
            output_dim = embedding_output_dim
        # A separate tag projection layer has to be created for each label in
        # the multi-label classifier
        self._tag_projection_layers: Any = []
        for k in range(self.num_labels):
            tag_projection_layer = Linear(output_dim, 1)
            self.add_module(f'tag_projection_layer_{k}', tag_projection_layer)
            self._tag_projection_layers.append(tag_projection_layer)
        self.output_activation = torch.nn.Sigmoid()
        self.loss_criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
        
        self.incl_neutral = incl_neutral
        self.metrics = {"jaccard_index": JaccardIndex(self.incl_neutral)}
        if encoder is not None:
            check_dimensions_match(embedding_output_dim, encoder.get_input_dim(),
                                   "text field embedding dim", "encoder input dim")
        if feedforward is not None and encoder is not None:
            check_dimensions_match(encoder.get_output_dim(), feedforward.get_input_dim(),
                                   "encoder output dim", "feedforward input dim")
        elif feedforward is not None and encoder is None:
            check_dimensions_match(embedding_output_dim, feedforward.get_input_dim(),
                                   "text field output dim", "feedforward input dim")
        if self.seq_encoder is not None:
            self.reset_parameters()
        initializer(self)
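
A minimal sketch of how the per-label projection layers created above could be used in forward(): one logit per label, trained against multi-hot targets with BCEWithLogitsLoss (shapes and target values are assumptions for illustration):

import torch

batch, output_dim, num_labels = 4, 8, 3
pooled = torch.randn(batch, output_dim)               # encoder / attention output

projection_layers = [torch.nn.Linear(output_dim, 1) for _ in range(num_labels)]
logits = torch.cat([layer(pooled) for layer in projection_layers], dim=-1)

targets = torch.tensor([[1., 0., 1.]] * batch)        # multi-hot gold labels
loss = torch.nn.BCEWithLogitsLoss(reduction='mean')(logits, targets)
probabilities = torch.sigmoid(logits)                 # (batch, num_labels)
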
Exemple #29
0
    def __init__(self,
                 vocab: Vocabulary,
                 context_field_embedder: TextFieldEmbedder,
                 context_encoder: Seq2SeqEncoder,
                 target_encoder: Seq2VecEncoder,
                 feedforward: Optional[FeedForward] = None,
                 context_attention_activation_function: str = 'tanh',
                 target_field_embedder: Optional[TextFieldEmbedder] = None,
                 AE: bool = True,
                 AttentionAE: bool = True,
                 inter_target_encoding: Optional[InterTarget] = None,
                 target_position_weight: Optional[TargetPositionWeight] = None,
                 target_position_embedding: Optional[TextFieldEmbedder] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 dropout: float = 0.0,
                 label_name: str = 'target-sentiment-labels',
                 loss_weights: Optional[List[float]] = None,
                 use_target_sequences: bool = False) -> None:
        super().__init__(vocab, regularizer)
        '''
        :param vocab: A Vocabulary, required in order to compute sizes 
                      for input/output projections.
        :param context_field_embedder: Used to embed the context/sentence and 
                                       target text if target_field_embedder is 
                                       None but the target_encoder is NOT None.
        :param context_encoder: Encoder that will create the representation 
                                for the sentence/context that the target 
                                appears in.
        :param target_encoder: Encoder that will create the representation of 
                               target text tokens.
        :param feedforward: An optional feed forward layer(s) to apply before 
                            the final softmax layer.
        :param context_attention_activation_function: The activation function 
                                                      to be used after the 
                                                      projection of the encoded
                                                      context. (Equation 7)
                                                      in the original paper.
        :param target_field_embedder: Used to embed the target text to give as 
                                      input to the target_encoder. Thus this 
                                      allows a separate embedding for context 
                                      and target text.
        :param AE: Whether to concatenate the target representations to each 
                   word's word embedding.
        :param AttentionAE: Whether to concatenate the target representations 
                            to each contextualised word representation i.e. 
                            to each word's vector after the `context_encoder`.
        :param inter_target_encoding: Whether to model the relationship between 
                                      targets/aspect.
        :param target_position_weight: Whether to weight the output of the 
                                       context encoding based on the position 
                                       of the tokens to the target tokens. This 
                                       weighting is applied before any attention 
                                       is applied.
        :param target_position_embedding: Whether or not to concatenate a position
                                          embedding on to the input embeddings 
                                          before being an input to the 
                                          `context_encoder`.
        :param initializer: Used to initialize the model parameters.
        :param regularizer: If provided, will be used to calculate the 
                            regularization penalty during training.
        :param dropout: Dropout to apply after each layer apart from the last 
                        layer. All dropout that is applied to time-based data 
                        will be `variational dropout 
                        <https://arxiv.org/abs/1512.05287>`_; all else will be 
                        standard dropout. Variational dropout is applied to the 
                        target vectors after they have been processed by the 
                        `inter_target_encoding` if this is set.
        :param label_name: Name of the label namespace.
        :param loss_weights: The amount of weight to give the negative, neutral,
                             positive classes respectively. e.g. [0.2, 0.5, 0.3]
                             would weight the negative class by a factor of 
                             0.2, neutral by 0.5 and positive by 0.3. NOTE It 
                             assumes the sentiment labels are the following:
                             [negative, neutral, positive].
        :param use_target_sequences: Whether or not to use target tokens within 
                                     the context as the targets contextualized 
                                     word representation (CWR). This would only
                                     make sense to use if the word representation 
                                     i.e. field embedder is a contextualized 
                                     embedder e.g. ELMO etc. This also requires 
                                     that the dataset reader has the following 
                                     argument set to True `target_sequences`.
                                     ANOTHER reason why you would want to use 
                                     this even when not using CWR is that you 
                                     want to get contextualised POS/Dep tags 
                                     etc.
        
        This is based around the models in `Attention-based LSTM for Aspect-level 
        Sentiment Classification <https://aclweb.org/anthology/D16-1058>`_. 
        The models re-created are:
        
        1. AE-LSTM, where instead of just encoding with an LSTM it also applies 
           an attention network after the LSTM, as in the model within 
           `Modeling Inter-Aspect Dependencies for Aspect-Based Sentiment Analysis 
           <https://www.aclweb.org/anthology/N18-2043>`_
        2. AT-LSTM
        3. ATAE

        For the 1st model ensure `AE` is True and `AttentionAE` is False. For 
        the 2nd ensure that `AE` is False and `AttentionAE` is True. For 
        the 3rd ensure both `AE` and `AttentionAE` are True.

        This can also be used to re-create the model from `Modeling Inter-Aspect Dependencies for 
        Aspect-Based Sentiment Analysis <https://www.aclweb.org/anthology/N18-2043>`_
        with the fusion part being `concat`. To do so the `inter_target_encoding`
        argument must be an LSTM.

         .. _variational dropout:
           https://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks.pdf
        '''
        if not AE and not AttentionAE:
            raise ConfigurationError('Either `AE` or `AttentionAE` have to '
                                     'be True')

        self.label_name = label_name
        self.context_field_embedder = context_field_embedder
        self.target_field_embedder = target_field_embedder
        self.num_classes = self.vocab.get_vocab_size(self.label_name)
        self.context_encoder = context_encoder
        self.target_encoder = target_encoder
        self.feedforward = feedforward
        self._use_target_sequences = use_target_sequences
        if self._use_target_sequences and self.target_field_embedder:
            raise ConfigurationError(
                '`use_target_sequences` cannot be True at'
                ' the same time as a value for '
                '`target_field_embedder` as the embeddings'
                ' come from the context and not a separate embedder')

        target_encoder_out = self.target_encoder.get_output_dim()
        context_encoder_out = self.context_encoder.get_output_dim()
        self.context_encoder_bidirectional = self.context_encoder.is_bidirectional()

        # Applied after the contextualisation layer and before the attention layer
        attention_projection_layer_dim = context_encoder_out
        if AttentionAE:
            attention_projection_layer_dim = context_encoder_out + target_encoder_out
        self.attention_project_layer = Linear(attention_projection_layer_dim,
                                              attention_projection_layer_dim,
                                              bias=False)
        self.attention_project_layer = TimeDistributed(
            self.attention_project_layer)

        # Activation function to be applied after projection and before attention
        context_attention_activation_function = Activation.by_name(
            f'{context_attention_activation_function}')()
        self._context_attention_activation_function = context_attention_activation_function
        attention_vector_dim = context_encoder_out
        if AttentionAE:
            attention_vector_dim = context_encoder_out + target_encoder_out
        self.attention_vector = Parameter(torch.Tensor(attention_vector_dim))
        self.context_attention_layer = DotProductAttention(normalize=True)

        # Final projection layers, these are applied after the attention layer
        self.final_attention_projection_layer = Linear(context_encoder_out,
                                                       context_encoder_out,
                                                       bias=False)
        self.final_hidden_state_projection_layer = Linear(context_encoder_out,
                                                          context_encoder_out,
                                                          bias=False)

        # Set the loss weights (have to sort them by order of label index in
        # the vocab)
        self.loss_weights = target_sentiment.util.loss_weight_order(
            self, loss_weights, self.label_name)

        # Inter target modelling
        self.inter_target_encoding = inter_target_encoding

        if feedforward is not None:
            output_dim = self.feedforward.get_output_dim()
        elif self.inter_target_encoding is not None:
            output_dim = self.inter_target_encoding.get_output_dim()
        else:
            output_dim = context_encoder_out
        self.label_projection = Linear(output_dim, self.num_classes)

        self.metrics = {"accuracy": CategoricalAccuracy()}
        self.f1_metrics = {}
        # F1 Scores
        label_index_name = self.vocab.get_index_to_token_vocabulary(
            self.label_name)
        for label_index, _label_name in label_index_name.items():
            _label_name = f'F1_{_label_name.capitalize()}'
            self.f1_metrics[_label_name] = F1Measure(label_index)
        # Dropout
        self._variational_dropout = InputVariationalDropout(dropout)
        self._naive_dropout = Dropout(dropout)

        # Ensure that the dimensions of the target or text field embedder and
        # the target encoder match
        target_field_embedder_dim = context_field_embedder.get_output_dim()
        target_field_error = "context field embedding dim"
        if self.target_field_embedder:
            target_field_embedder_dim = target_field_embedder.get_output_dim()
            target_field_error = "target field embedding dim"

        check_dimensions_match(target_field_embedder_dim,
                               target_encoder.get_input_dim(),
                               target_field_error, "target encoder input dim")
        # If AE is True ensure that the context encoder input is equal to the
        # the output of the target encoder plus the context field embedder
        context_field_embedder_out = context_field_embedder.get_output_dim()

        # Position embeddings
        self.target_position_embedding = target_position_embedding
        if self.target_position_embedding is not None:
            context_field_embedder_out += (
                self.target_position_embedding.get_output_dim())
        if AE:
            check_dimensions_match(
                context_field_embedder_out + target_encoder_out,
                context_encoder.get_input_dim(),
                "context field embedding dim + Target Encoder out",
                "text encoder input dim")
        else:
            check_dimensions_match(context_field_embedder_out,
                                   context_encoder.get_input_dim(),
                                   "context field embedding dim",
                                   "text encoder input dim")
        if self.inter_target_encoding is not None:
            check_dimensions_match(context_encoder_out,
                                   self.inter_target_encoding.get_input_dim(),
                                   'Context field encoder output',
                                   'Inter target encoder input')
        if self.feedforward is not None:
            if self.inter_target_encoding is not None:
                check_dimensions_match(
                    self.inter_target_encoding.get_output_dim(),
                    self.feedforward.get_input_dim(),
                    'Inter target encoder output', 'FeedForward input dim')
            else:
                check_dimensions_match(context_encoder_out,
                                       self.feedforward.get_input_dim(),
                                       'Context encoder output',
                                       'FeedForward input dim')

        self.target_position_weight = target_position_weight
        # TimeDistributed anything that is related to the targets.
        if self.feedforward is not None:
            self.feedforward = TimeDistributed(self.feedforward)
        self.label_projection = TimeDistributed(self.label_projection)
        self._time_variational_dropout = TimeDistributed(
            self._variational_dropout)

        self._AE = AE
        self._AttentionAE = AttentionAE

        self.reset_parameters()
        initializer(self)
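
A minimal sketch of the AT/ATAE-style attention step described in the docstring above: optionally concatenate the target representation to each contextualised word vector, project without a bias, apply tanh, score against a learned attention vector and take a weighted sum of the context. The shapes are illustrative assumptions, not the model's real dimensions:

import torch
from allennlp.modules.attention import DotProductAttention

batch, seq_len, ctx_dim, tgt_dim = 2, 6, 10, 4
encoded_context = torch.randn(batch, seq_len, ctx_dim)
target_vector = torch.randn(batch, tgt_dim)
mask = torch.ones(batch, seq_len)

# AttentionAE=True: concatenate the target representation to every time step.
target_expanded = target_vector.unsqueeze(1).expand(-1, seq_len, -1)
attention_input = torch.cat([encoded_context, target_expanded], dim=-1)

projection = torch.nn.Linear(ctx_dim + tgt_dim, ctx_dim + tgt_dim, bias=False)
attention_vector = torch.nn.Parameter(torch.randn(ctx_dim + tgt_dim))

projected = torch.tanh(projection(attention_input))
weights = DotProductAttention(normalize=True)(
    attention_vector.expand(batch, -1), projected, mask)        # (batch, seq_len)
weighted_context = torch.bmm(weights.unsqueeze(1),
                             encoded_context).squeeze(1)        # (batch, ctx_dim)
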
Exemple #30
0
def main():

    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"
    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)
    # A TokenIndexer determines how string tokens get represented as arrays of indices in a model:
    #   SingleIdTokenIndexer   = each token is a single integer
    #   TokenCharactersIndexer = each token is a list of (character) integers
    # Read a TSV file with paired instances (source, target)
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),  # Defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer()
                               }  # Defaults to source_token_indexers
    )

    # Each dataset is a list of instances with (source_tokens, target_tokens) fields
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)

    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(
        set(trainExtraVocab + validExtraVocab + testExtraVocab))
    print("length:", len(finalExtraVocab))
    #input()

    #vocab = Vocabulary.from_instances(train_dataset + validation_dataset, min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset +
                                      test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099

    print("Vocab SIze :", vocab.get_vocab_size('tokens'))

    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)

    # Embedding for the 'tokens' namespace, since that is the indexer name used when the dataset was created
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(ENC_EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      dropout=0.2))

    attention = DotProductAttention()

    max_decoding_steps = 4  # TODO: make this variable
    model = SimpleSeq2Seq(
        vocab,
        source_embedder,
        encoder,
        max_decoding_steps,
        target_embedding_dim=TGT_EMBEDDING_DIM,
        #target_namespace = 'target_tokens',
        attention=attention,
        beam_size=beamSize,
        use_bleu=True,
        extra_vocab=finalExtraVocab)
    #Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # Data iterator that specifies how to batch our dataset:
    # it takes the data, shuffles it and creates fixed-size batches.
    #iterator = BasicIterator(batch_size=2)
    #iterator.index_with(vocab)
    # BucketIterator pads batches w.r.t. the max input length per batch and sorts the
    # dataset by the provided field names and padding keys, for efficient computation.
    iterator = BucketIterator(batch_size=50,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        #patience = 3,
        num_epochs=numEpochs,
        cuda_device=CUDA_DEVICE)

    trainer.train()
    predictor = SimpleSeq2SeqPredictor(model, reader)
    '''for i in range(2):
        print ("Epoch: {}".format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)


        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
            """'{'predictions': [[1, 4, 5, 92, 8, 6, 1, 8, 6, 26, 3]], 
             'loss': 5.9835076332092285,
             'class_log_probabilities': [-20.10894012451172],
             'predicted_tokens': ['@@UNKNOWN@@', 'is', 'a', 'type', 'of', 'the', '@@UNKNOWN@@', 'of', 'the', 'sun']}
             """
            print (predictor.predict_instance(instance))
    '''

    outFile = open(
        "output_" + str(HIDDEN_DIM) + "_" + str(numEpochs) + "_" +
        str(beamSize) + ".csv", "w")
    writer = csv.writer(outFile, delimiter="\t")
    for instance in itertools.islice(test_dataset, 500):
        src = instance.fields['source_tokens'].tokens
        gold = instance.fields['target_tokens'].tokens
        pred = predictor.predict_instance(instance)['predicted_tokens']
        writer.writerow([src, gold, pred])

    outFile.close()
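
For reference, a minimal sketch of the input format this script assumes data.dataPreparation produces: Seq2SeqDatasetReader reads one tab-separated "<source>\t<target>" pair per line (the file name below is hypothetical):

from allennlp.data.dataset_readers import Seq2SeqDatasetReader
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer

# Write a single tab-separated (source, target) pair and read it back.
with open("tiny_seq2seq.tsv", "w") as f:
    f.write("what is the sun\tthe sun is a star\n")

reader = Seq2SeqDatasetReader(
    source_tokenizer=WordTokenizer(),
    target_tokenizer=WordTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={'tokens': SingleIdTokenIndexer()})
instances = reader.read("tiny_seq2seq.tsv")
print(instances[0].fields['source_tokens'].tokens)
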