Example #1
# Imports assume AllenNLP 0.x, where this seq2seq API lived (it was removed in 2.x).
import itertools

import torch
import torch.optim as optim
from allennlp.data.dataset_readers import Seq2SeqDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer, WordTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.models.encoder_decoders import SimpleSeq2Seq
from allennlp.modules.attention import DotProductAttention
from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor
from allennlp.training.trainer import Trainer

EN_EMBEDDING_DIM = 256  # example values; the original defines these elsewhere
ZH_EMBEDDING_DIM = 256
HIDDEN_DIM = 256
CUDA_DEVICE = -1  # set to a GPU id (e.g. 0) to train on GPU


def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
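After training, the predictor can also translate raw text directly; a minimal sketch (the sample sentence is arbitrary, and predict() wraps predict_json() in AllenNLP 0.x's SimpleSeq2SeqPredictor):

predictor = SimpleSeq2SeqPredictor(model, reader)
output = predictor.predict('I have to go to school today.')
print(' '.join(output['predicted_tokens']))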
Example #2
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  patience=2,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  num_epochs=1,
                  cuda_device=CUDA_DEVICE)

for i in range(20):
    print('Epoch: {}'.format(i))
    trainer.train()

    predictor = SimpleSeq2SeqPredictor(model, reader)

    for instance in itertools.islice(validation_dataset, 70):
        print('SOURCE:', instance.fields['source_tokens'].tokens)
        print('GOLD:', instance.fields['target_tokens'].tokens)
        print('PRED:',
              predictor.predict_instance(instance)['predicted_tokens'])

# Here's how to save the model.
with open("/.../model.th", 'wb') as f:
    torch.save(model.state_dict(), f)
vocab.save_to_files("/.../vocab")
print("Model saved. DONE")
#if __name__ == '__main__':
#    main()
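To load the model back later, a minimal sketch reversing the save above (the elided paths must match the ones used when saving):

# Rebuild the vocabulary and reload the trained weights.
vocab = Vocabulary.from_files("/.../vocab")
with open("/.../model.th", 'rb') as f:
    model.load_state_dict(torch.load(f))
model.eval()  # switch to inference mode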
Example #3
# In addition to the imports from Example #1, this example needs:
#   from allennlp.data.token_indexers import ELMoTokenCharactersIndexer
#   from allennlp.modules.token_embedders import ElmoTokenEmbedder
# DATA_ROOT, options_file, weight_file, elmo_embedding_dim, hidden_dim and
# USE_GPU are defined elsewhere in the original script (see the sketch after
# this example for plausible ELMo file values).
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': elmo_token_indexer},
        target_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='target_tokens')
        })

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname) for fname in
        ["train_all_seq.txt", "test_all_seq.txt", "val_all_seq.txt"])

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset +
                                      test_dataset,
                                      min_count={
                                          'tokens': 1,
                                          'target_tokens': 1
                                      })

    # Alternative: a plain trainable embedding instead of ELMo
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                          embedding_dim=elmo_embedding_dim)
    # Alternative: the full Elmo module with two output representations
    # elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    source_embedder = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    #Initializing the model
    max_decoding_steps = 20
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True))

    # encoder = StackedSelfAttentionEncoder(input_dim=elmo_embedding_dim, hidden_dim=hidden_dim, projection_dim=128, feedforward_hidden_dim=128, num_layers=1, num_attention_heads=8)
    attention = DotProductAttention()

    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps,
                          target_embedding_dim=elmo_embedding_dim,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)

    if USE_GPU:
        model.cuda()

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=1,
                      cuda_device=0 if USE_GPU else -1)

    for i in range(20):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(dev_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:',
                  predictor.predict_instance(instance)['predicted_tokens'])

    #Saving the model
    with open("model_seq2seq.th", 'wb') as f:
        torch.save(model.state_dict(), f)

    vocab.save_to_files("vocabulary_seq2seq")
    predictor = SimpleSeq2SeqPredictor(model, reader)
    with open('predict_seq2seq.txt', 'w+') as f:
        for instance in itertools.islice(test_dataset, 10):
            preds = predictor.predict_instance(instance)['predicted_tokens']
            f.write(" ".join(preds) + "\n")
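The options_file and weight_file above are undefined in the snippet; a sketch pointing them at the small published ELMo checkpoint, whose 256-dim output matches elmo_embedding_dim (URLs as listed in the AllenNLP 0.x ELMo tutorial; treat them as assumptions if they have moved):

options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/'
                '2x1024_128_2048cnn_1xhighway/'
                'elmo_2x1024_128_2048cnn_1xhighway_options.json')
weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/'
               '2x1024_128_2048cnn_1xhighway/'
               'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')
elmo_embedding_dim = 256  # output dimension of this ELMo model
hidden_dim = 256          # example value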
Example #4
# In addition to the AllenNLP imports from Example #1, this example needs
# `import csv`. The `data` module, `findExtraVocab` and the upper-case
# constants (ENC_EMBEDDING_DIM, TGT_EMBEDDING_DIM, HIDDEN_DIM, numEpochs,
# beamSize, CUDA_DEVICE) come from the surrounding project.
def main():

    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"
    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)
    # A TokenIndexer determines how string tokens get represented as arrays of
    # indices in a model:
    #   SingleIdTokenIndexer   = each token becomes a single integer
    #   TokenCharactersIndexer = each token becomes a list of character ids
    # Read a tsv file with paired instances (source, target)
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),  # Defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer()
                               }  # Defaults to source_token_indexers
    )

    # Each dataset is a list of instances with source_tokens and target_tokens fields
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)

    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(
        set(trainExtraVocab + validExtraVocab + testExtraVocab))
    print("length:", len(finalExtraVocab))
    #input()

    #vocab = Vocabulary.from_instances(train_dataset + validation_dataset, min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset +
                                      test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099

    print("Vocab SIze :", vocab.get_vocab_size('tokens'))

    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)

    # Embed the "tokens" namespace, the indexer name used at dataset creation time
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(ENC_EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      dropout=0.2))  # note: with a single LSTM layer this dropout has no effect

    attention = DotProductAttention()

    max_decoding_steps = 4  # TODO: make this variable
    model = SimpleSeq2Seq(
        vocab,
        source_embedder,
        encoder,
        max_decoding_steps,
        target_embedding_dim=TGT_EMBEDDING_DIM,
        #target_namespace = 'target_tokens',
        attention=attention,
        beam_size=beamSize,
        use_bleu=True,
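        # note: extra_vocab is not a parameter of stock AllenNLP SimpleSeq2Seq,
        # so this snippet appears to rely on a customized model class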
        extra_vocab=finalExtraVocab)
    # Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # The data iterator specifies how to batch the dataset.
    # BasicIterator shuffles the data and creates fixed-size batches:
    #iterator = BasicIterator(batch_size=2)
    #iterator.index_with(vocab)
    # BucketIterator pads each batch to its max input length and sorts instances
    # by the given field names and padding keys for efficient computation.
    iterator = BucketIterator(batch_size=50,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        #patience = 3,
        num_epochs=numEpochs,
        cuda_device=CUDA_DEVICE)

    trainer.train()
    predictor = SimpleSeq2SeqPredictor(model, reader)
    '''for i in range(2):
        print ("Epoch: {}".format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)


        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
            """'{'predictions': [[1, 4, 5, 92, 8, 6, 1, 8, 6, 26, 3]], 
             'loss': 5.9835076332092285,
             'class_log_probabilities': [-20.10894012451172],
             'predicted_tokens': ['@@UNKNOWN@@', 'is', 'a', 'type', 'of', 'the', '@@UNKNOWN@@', 'of', 'the', 'sun']}
             """
            print (predictor.predict_instance(instance))
    '''

    outFile = open(
        "output_" + str(HIDDEN_DIM) + "_" + str(numEpochs) + "_" +
        str(beamSize) + ".csv", "w")
    writer = csv.writer(outFile, delimiter="\t")
    for instance in itertools.islice(test_dataset, 500):
        src = instance.fields['source_tokens'].tokens
        gold = instance.fields['target_tokens'].tokens
        pred = predictor.predict_instance(instance)['predicted_tokens']
        writer.writerow([src, gold, pred])

    outFile.close()
Example #5
# Rebuild the model for prediction. vocab, source_embedder, encoder, attention
# and reader are assumed to be constructed as in the earlier examples.
max_decoding_steps = 300
model_pred = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                           target_embedding_dim=ZH_EMBEDDING_DIM,
                           target_namespace='target_tokens',
                           attention=attention,
                           beam_size=8,
                           use_bleu=True)

# Reload the trained model.
with open('/home/earendil/NLP/neural_machine_translation/checkpoint_model_epoch_13', 'rb') as f:
    model_pred.load_state_dict(torch.load(f, map_location=torch.device('cpu')))
    model_pred.eval()

# Predict on new text using loaded model
predictor = SimpleSeq2SeqPredictor(model_pred, dataset_reader=reader)

import speech_recognition as sr
import os
# Import the required module for text-to-speech conversion
from gtts import gTTS

# initialize the recognizer
r = sr.Recognizer()

with sr.Microphone() as source:
    # read the audio data from the default microphone
    print("Start speaking...")