Example #1
def build_seq2seq_model(flags,
                        data_reader,
                        vocab: Vocabulary,
                        source_namespace: str = 'source_tokens',
                        target_namespace: str = 'target_tokens') -> Model:
    source_embedding = Embedding(
        vocab.get_vocab_size(namespace=source_namespace),
        embedding_dim=flags.source_embedding_dim)
    source_embedder = BasicTextFieldEmbedder({'tokens': source_embedding})
    lstm_encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(flags.source_embedding_dim,
                      flags.encoder_hidden_dim,
                      batch_first=True,
                      bidirectional=flags.encoder_bidirectional))
    attention = DotProductAttention()
    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          lstm_encoder,
                          flags.max_decode_length,
                          target_embedding_dim=flags.decoder_hidden_dim,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=flags.beam_size,
                          use_bleu=True)
    return model
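These snippets come from projects built on the AllenNLP 0.x API and omit their import headers. A minimal, hypothetical import block that would cover Example #1, assuming the allennlp 0.9 module layout, is sketched below:

# Hypothetical import block for Example #1 (assumes the AllenNLP 0.9 module layout).
import torch
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.models.encoder_decoders import SimpleSeq2Seq
from allennlp.modules.attention import DotProductAttention
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding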
Example #2
def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

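    # The trainer runs a single epoch per call; the loop below trains repeatedly and
    # prints sample predictions after each pass over the data.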
    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
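The predictor built inside the loop can also decode raw text once training is done. A minimal sketch, assuming the AllenNLP 0.x SimpleSeq2SeqPredictor.predict API and a made-up input sentence:

# Hypothetical usage sketch; `model` and `reader` are the objects built in main() above.
from allennlp.predictors import SimpleSeq2SeqPredictor

predictor = SimpleSeq2SeqPredictor(model, reader)
output = predictor.predict(source="I have to go to sleep.")
# The target tokenizer is a CharacterTokenizer, so the prediction is a list of characters.
print(''.join(output['predicted_tokens']))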
Example #3
encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                      hidden_dim=HIDDEN_DIM,
                                      projection_dim=128,
                                      feedforward_hidden_dim=128,
                                      num_layers=1,
                                      num_attention_heads=8)

source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

# attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
# attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
attention = DotProductAttention()

max_decoding_steps = 800
model = SimpleSeq2Seq(vocab,
                      source_embedder,
                      encoder,
                      max_decoding_steps,
                      target_embedding_dim=ZH_EMBEDDING_DIM,
                      target_namespace='target_tokens',
                      attention=attention,
                      beam_size=12,
                      use_bleu=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # without this the code still runs, but on the CPU instead of the GPU

optimizer = optim.Adam(model.parameters())
iterator = BucketIterator(batch_size=128,
                          sorting_keys=[("source_tokens", "num_tokens")])

iterator.index_with(vocab)
Example #4
src_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                          embedding_dim=src_embedding_dim)

encoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

attention = LinearAttention(hidden_dim,
                            hidden_dim,
                            activation=Activation.by_name('tanh')())

model = SimpleSeq2Seq(
    vocab,
    source_embedder,
    encoder,
    max_decoding_steps=20,
    target_embedding_dim=trg_embedding_dim,
    target_namespace='target_tokens',
    attention=attention,  # pass attention
    beam_size=8,
    use_bleu=True)

optimizer = optim.Adam(model.parameters())
iterator = BucketIterator(batch_size=32,
                          sorting_keys=[("source_tokens", "num_tokens")])
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  patience=10,
                  train_dataset=train_dataset,
Example #5
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': elmo_token_indexer},
        target_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='target_tokens')
        })

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname) for fname in
        ["train_all_seq.txt", "test_all_seq.txt", "val_all_seq.txt"])

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset +
                                      test_dataset,
                                      min_count={
                                          'tokens': 1,
                                          'target_tokens': 1
                                      })

    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                          embedding_dim=256)
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                          embedding_dim=elmo_embedding_dim)
    # elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})
    source_embedder = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Initializing the model
    max_decoding_steps = 20
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True))

    # encoder = StackedSelfAttentionEncoder(input_dim=elmo_embedding_dim, hidden_dim=hidden_dim, projection_dim=128, feedforward_hidden_dim=128, num_layers=1, num_attention_heads=8)
    attention = DotProductAttention()

    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps,
                          target_embedding_dim=elmo_embedding_dim,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)

    if USE_GPU:
        model.cuda()

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=1,
                      cuda_device=0 if USE_GPU else -1)

    for i in range(20):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(dev_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:',
                  predictor.predict_instance(instance)['predicted_tokens'])

    #Saving the model
    with open("model_seq2seq.th", 'wb') as f:
        torch.save(model.state_dict(), f)

    vocab.save_to_files("vocabulary_seq2seq")
    predictor = SimpleSeq2SeqPredictor(model, reader)
    with open('predict_seq2seq.txt', 'w+') as f:
        for instance in itertools.islice(test_dataset, 10):
            preds = predictor.predict_instance(instance)['predicted_tokens']
            f.write(" ".join(preds) + "\n")
Example #6
    def __init__(self,
                 vocab: Vocabulary,
                 tasks: str,
                 domains: str,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 upos_namespace: str = "upos_tags",
                 ner_namespace: str = "ner_tags",
                 chunk_namespace: str = "chunk_tags",
                 target_embedding_dim: int = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(SimpleSeq2MultiSeq, self).__init__(vocab, regularizer)
        # print(len(tasks), len(domains))
        self._num_tasks = len(tasks)
        self._tasks = tasks
        self._domains = domains
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._upos_namespace = upos_namespace
        self._ner_namespace = ner_namespace
        self._chunk_namespace = chunk_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        self._upos_seq2seq = SimpleSeq2Seq(
            vocab=vocab,
            source_embedder=source_embedder,
            encoder=encoder,
            max_decoding_steps=max_decoding_steps,
            target_namespace=upos_namespace,
            target_embedding_dim=target_embedding_dim,
            attention_function=attention_function,
            scheduled_sampling_ratio=scheduled_sampling_ratio,
            initializer=initializer,
            regularizer=regularizer)
        self._ner_seq2seq = SimpleSeq2Seq(
            vocab=vocab,
            source_embedder=source_embedder,
            encoder=encoder,
            max_decoding_steps=max_decoding_steps,
            target_namespace=ner_namespace,
            target_embedding_dim=target_embedding_dim,
            attention_function=attention_function,
            scheduled_sampling_ratio=scheduled_sampling_ratio,
            initializer=initializer,
            regularizer=regularizer)
        self._chunk_seq2seq = SimpleSeq2Seq(
            vocab=vocab,
            source_embedder=source_embedder,
            encoder=encoder,
            max_decoding_steps=max_decoding_steps,
            target_namespace=chunk_namespace,
            target_embedding_dim=target_embedding_dim,
            attention_function=attention_function,
            scheduled_sampling_ratio=scheduled_sampling_ratio,
            initializer=initializer,
            regularizer=regularizer)
        initializer(self)
Example #7
    def __init__(self, training=False):
        self.training = training
        config = conf['seq2seq_allen']
        self.model_path = config['model_path']
        self.vocab_path = config['vocab_path']
        prefix = config['processed_data_prefix']
        train_file = config['train_data']
        valid_file = config['test_data']
        src_embedding_dim = config['src_embedding_dim']
        trg_embedding_dim = config['trg_embedding_dim']
        hidden_dim = config['hidden_dim']
        epoch = config['epoch']
        patience = config['patience']

        if torch.cuda.is_available():
            self.cuda_device = 0
        else:
            self.cuda_device = -1

        self.reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(),
            target_tokenizer=WordTokenizer(),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={'tokens': SingleIdTokenIndexer()})

        if self.training:
            self.train_dataset = self.reader.read(
                os.path.join(prefix, train_file))
            self.valid_dataset = self.reader.read(
                os.path.join(prefix, valid_file))

            self.vocab = Vocabulary.from_instances(self.train_dataset +
                                                   self.valid_dataset,
                                                   min_count={'tokens': 3})
        else:
            self.vocab = Vocabulary.from_files(self.vocab_path)

        src_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size('tokens'),
            embedding_dim=src_embedding_dim)

        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

        source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

        self.model = SimpleSeq2Seq(vocab=self.vocab,
                                   source_embedder=source_embedder,
                                   encoder=encoder,
                                   max_decoding_steps=20,
                                   target_embedding_dim=trg_embedding_dim,
                                   use_bleu=True)

        optimizer = optim.Adam(self.model.parameters())
        iterator = BucketIterator(batch_size=32,
                                  sorting_keys=[("source_tokens", "num_tokens")])
        # The iterator needs the vocab so it can index the data during training.
        iterator.index_with(self.vocab)

        if self.cuda_device >= 0:
            self.model.cuda(self.cuda_device)

        if training:
            self.trainer = Trainer(model=self.model,
                                   optimizer=optimizer,
                                   iterator=iterator,
                                   patience=patience,
                                   train_dataset=self.train_dataset,
                                   validation_dataset=self.valid_dataset,
                                   serialization_dir=self.model_path,
                                   num_epochs=epoch,
                                   cuda_device=self.cuda_device)

        if not self.training:
            with open(os.path.join(self.model_path, 'best.th'), 'rb') as f:
                self.model.load_state_dict(torch.load(f))
            if self.cuda_device >= 0:
                self.model.cuda(self.cuda_device)
            self.model.eval()  # eval() propagates to all submodules, unlike setting .training directly
            self.predictor = Seq2SeqPredictor(self.model,
                                              dataset_reader=self.reader)
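The constructor above only builds self.predictor in the non-training branch. A hypothetical helper method for decoding a single sentence could look like the sketch below; the method name translate and the space-joined return value are assumptions, and the call relies only on the base Predictor.predict_json API:

    def translate(self, sentence: str) -> str:
        # Hypothetical helper: decode one raw source sentence with the loaded model.
        output = self.predictor.predict_json({"source": sentence})
        return ' '.join(output['predicted_tokens'])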
Example #8
def main():
    target_namespace = "target_tokens"
    if not USE_COPY:
        reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={
                'tokens': SingleIdTokenIndexer(namespace=target_namespace)
            })
    else:
        reader = CopyNetDatasetReader(
            source_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_namespace=target_namespace)
    train_dataset = reader.read('./data/data_train.tsv')
    validation_dataset = reader.read('./data/data_val.tsv')

    vocab = Vocabulary.from_instances(train_dataset,
                                      min_count={
                                          'tokens': 3,
                                          'target_tokens': 3
                                      })

    en_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=SRC_EMBEDDING_DIM,
        pretrained_file="../opennmt/glove_dir/glove.840B.300d.txt")
    assert en_embedding.weight.requires_grad
    datas = _read_pretrained_embeddings_file(en_embedding._pretrained_file,
                                             SRC_EMBEDDING_DIM, vocab)
    datas.requires_grad = True
    en_embedding.weight.data = datas
    print(en_embedding.weight.data)
    assert en_embedding.weight.requires_grad
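    # Note: torch.nn.LSTM only applies dropout between stacked layers, so dropout=0.3
    # has no effect here with num_layers=1.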
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(SRC_EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      bidirectional=True,
                      dropout=0.3,
                      num_layers=1))
    #encoder = StackedSelfAttentionEncoder(input_dim=SRC_EMBEDDING_DIM,
    #                                      hidden_dim=HIDDEN_DIM,
    #                                      projection_dim=128, feedforward_hidden_dim=128,
    #                                      num_layers=1, num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
    attention = DotProductAttention()

    if not USE_COPY:
        model = SimpleSeq2Seq(vocab,
                              source_embedder,
                              encoder,
                              MAX_DECODING_STEPS,
                              target_embedding_dim=TGT_EMBEDDING_DIM,
                              target_namespace='target_tokens',
                              attention=attention,
                              beam_size=8,
                              use_bleu=True)
    else:
        model = MyCopyNet(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps=MAX_DECODING_STEPS,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=8,
                          tgt_embedder_pretrain_file=
                          "../opennmt/glove_dir/glove.840B.300d.txt")
    model.to(torch.device('cuda'))
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[("source_tokens", "num_tokens")],
                              padding_noise=0.2)

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=22,
                      patience=4,
                      serialization_dir="./checkpoints",
                      cuda_device=CUDA_DEVICE,
                      summary_interval=100)
    trainer.train()
    print(en_embedding.weight.data)
    predictor = Seq2SeqPredictor(model, reader)

    # Dump all predictions to a file
    # TODO (DNGros): Is there an automatic way in allennlp to do this??
    pred_toks = []
    with open("pred.txt", "w") as outfile:
        for instance in tqdm(validation_dataset):
            pred = predictor.predict_instance(instance)
            toks = pred['predicted_tokens']
            if toks:
                outfile.write(" ".join(toks[0]) + "\n")
            else:
                outfile.write("" + "\n")
Example #9
    def __init__(self):
        config = conf['seq2seq_allen']
        prefix = config['processed_data_prefix']
        train_file = config['train_data']
        valid_file = config['valid_data']
        src_embedding_dim = config['src_embedding_dim']
        trg_embedding_dim = config['trg_embedding_dim']
        hidden_dim = config['hidden_dim']

        if torch.cuda.is_available():
            cuda_device = 0
        else:
            cuda_device = -1

        self.reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(),
            target_tokenizer=WordTokenizer(),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={
                'tokens': SingleIdTokenIndexer(namespace='target_tokens')
            })

        self.train_dataset = self.reader.read(os.path.join(prefix, train_file))
        self.valid_dataset = self.reader.read(os.path.join(prefix, valid_file))

        vocab = Vocabulary.from_instances(self.train_dataset +
                                          self.valid_dataset,
                                          min_count={
                                              'tokens': 3,
                                              'target_tokens': 3
                                          })

        src_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=src_embedding_dim)

        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

        source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

        attention = LinearAttention(hidden_dim,
                                    hidden_dim,
                                    activation=Activation.by_name('tanh')())

        self.model = SimpleSeq2Seq(
            vocab=vocab,
            source_embedder=source_embedder,
            encoder=encoder,
            max_decoding_steps=20,
            target_embedding_dim=trg_embedding_dim,
            target_namespace='target_tokens',
            attention=attention,  # pass attention
            use_bleu=True)

        optimizer = optim.Adam(self.model.parameters())
        iterator = BucketIterator(batch_size=32,
                                  sorting_keys=[("source_tokens", "num_tokens")])
        # The iterator needs the vocab so it can index the data during training.
        iterator.index_with(vocab)

        if cuda_device >= 0:
            self.model.cuda(cuda_device)

        self.trainer = Trainer(model=self.model,
                               optimizer=optimizer,
                               iterator=iterator,
                               patience=10,
                               validation_metric="+accuracy",
                               train_dataset=self.train_dataset,
                               validation_dataset=self.valid_dataset,
                               num_epochs=1,
                               cuda_device=cuda_device)
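The class stores self.trainer and self.model, so only a thin training entry point is missing. The following is a hypothetical sketch; the method name train is an assumption, and it simply runs the configured Trainer:

    def train(self):
        # Hypothetical entry point: run the configured Trainer and return its metrics dict.
        metrics = self.trainer.train()
        return metrics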