import itertools

import torch
from torch import optim

from allennlp.data.dataset_readers import Seq2SeqDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer, WordTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.models.encoder_decoders import SimpleSeq2Seq
from allennlp.modules.attention import DotProductAttention
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper, StackedSelfAttentionEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor
from allennlp.training.trainer import Trainer

# NOTE: these hyperparameters are assumed to be module-level constants in the original script;
# the values shown here are only illustrative placeholders.
EN_EMBEDDING_DIM = 256
ZH_EMBEDDING_DIM = 256
HIDDEN_DIM = 256
CUDA_DEVICE = -1  # set to a GPU id such as 0 to train on GPU


def main():
    # English source sentences are tokenized into words, Chinese targets into characters.
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)
    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    # Run one epoch at a time so sample predictions can be printed between epochs.
    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
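# A minimal usage sketch (not part of the original script): after main() has trained the model,
# SimpleSeq2SeqPredictor.predict can translate a raw English string directly; it builds the
# instance through the same reader. The sentence below is only an illustrative example.
def translate_example(model, reader):
    predictor = SimpleSeq2SeqPredictor(model, reader)
    output = predictor.predict("I have to go to school today.")
    # 'predicted_tokens' holds the decoded target-side (character) tokens.
    print(''.join(output['predicted_tokens']))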
# Variant of the training loop above: early stopping with patience=2, 20 outer epochs, and
# model/vocabulary saving at the end. The reader, model, optimizer, iterator, and datasets are
# assumed to be set up as in the preceding example.
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  patience=2,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  num_epochs=1,
                  cuda_device=CUDA_DEVICE)

for i in range(20):
    print('Epoch: {}'.format(i))
    trainer.train()

    predictor = SimpleSeq2SeqPredictor(model, reader)

    for instance in itertools.islice(validation_dataset, 70):
        print('SOURCE:', instance.fields['source_tokens'].tokens)
        print('GOLD:', instance.fields['target_tokens'].tokens)
        print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])

# Here's how to save the model.
with open("/.../model.th", 'wb') as f:
    torch.save(model.state_dict(), f)
vocab.save_to_files("/.../vocab")
print("Model saved. DONE")

# if __name__ == '__main__':
#     main()
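# A minimal reload sketch (not part of the original): restore the weights and vocabulary saved
# above into a model built with the identical architecture. The "/.../" paths are the same elided
# placeholders used in the save step.
def load_trained_model(model):
    vocab = Vocabulary.from_files("/.../vocab")            # saved with vocab.save_to_files()
    with open("/.../model.th", 'rb') as f:                 # saved with torch.save()
        model.load_state_dict(torch.load(f, map_location='cpu'))
    model.eval()                                           # inference mode (no dropout)
    return model, vocab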
# This example embeds the source side with ELMo. DATA_ROOT, options_file, weight_file,
# elmo_embedding_dim, hidden_dim, and USE_GPU are assumed to be defined elsewhere in the
# original script; the two ELMo-specific imports are shown here for completeness.
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer
from allennlp.modules.token_embedders import ElmoTokenEmbedder


def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': elmo_token_indexer},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname)
        for fname in ["train_all_seq.txt", "test_all_seq.txt", "val_all_seq.txt"])

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset + test_dataset,
                                      min_count={'tokens': 1, 'target_tokens': 1})

    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                          embedding_dim=256)
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                          embedding_dim=elmo_embedding_dim)
    # elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})
    source_embedder = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Initializing the model
    max_decoding_steps = 20
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True))
    # encoder = StackedSelfAttentionEncoder(input_dim=elmo_embedding_dim,
    #                                       hidden_dim=hidden_dim,
    #                                       projection_dim=128,
    #                                       feedforward_hidden_dim=128,
    #                                       num_layers=1,
    #                                       num_attention_heads=8)
    attention = DotProductAttention()

    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=elmo_embedding_dim,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    if USE_GPU:
        model.cuda()

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=1,
                      cuda_device=0 if USE_GPU else -1)

    for i in range(20):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(dev_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])

    # Saving the model
    with open("model_seq2seq.th", 'wb') as f:
        torch.save(model.state_dict(), f)
    vocab.save_to_files("vocabulary_seq2seq")

    # Write predictions for a slice of the test set.
    predictor = SimpleSeq2SeqPredictor(model, reader)
    with open('predict_seq2seq.txt', 'w+') as f:
        for instance in itertools.islice(test_dataset, 10):
            preds = predictor.predict_instance(instance)['predicted_tokens']
            f.write(" ".join(preds) + "\n")
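# A rough evaluation sketch (not part of the original): corpus-level BLEU on the dev set using
# NLTK, assuming the trained `model`, `reader`, and `dev_dataset` from main() above. The
# Seq2SeqDatasetReader wraps targets in @start@/@end@ markers, which are stripped here.
from nltk.translate.bleu_score import corpus_bleu

def dev_corpus_bleu(model, reader, dev_dataset):
    predictor = SimpleSeq2SeqPredictor(model, reader)
    references, hypotheses = [], []
    for instance in dev_dataset:
        gold = [str(t) for t in instance.fields['target_tokens'].tokens][1:-1]  # drop @start@/@end@
        pred = predictor.predict_instance(instance)['predicted_tokens']
        references.append([gold])   # corpus_bleu expects a list of reference lists per sentence
        hypotheses.append(pred)
    return corpus_bleu(references, hypotheses)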
# This example relies on pieces assumed to be defined elsewhere in the original project: a local
# `data` module providing dataPreparation(), a findExtraVocab() helper, the constants
# ENC_EMBEDDING_DIM, TGT_EMBEDDING_DIM, HIDDEN_DIM, beamSize, numEpochs, CUDA_DEVICE, and
# `import csv` / `import itertools`. Note that `extra_vocab` is not an argument of the stock
# AllenNLP SimpleSeq2Seq, so a customized copy of that model appears to be assumed as well.
def main():
    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"
    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)

    # A TokenIndexer determines how string tokens get represented as arrays of indices in a model:
    #   SingleIdTokenIndexer     = each token is a single integer
    #   TokenCharactersIndexer   = each token is a list of integers
    # Read a tsv file with paired instances (source, target).
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),  # defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer()})  # defaults to source_token_indexers

    # Each dataset is a list of instances (source_tokens, target_tokens).
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)

    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(set(trainExtraVocab + validExtraVocab + testExtraVocab))
    print("length:", len(finalExtraVocab))
    # input()

    # vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
    #                                   min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset + test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099
    print("Vocab Size:", vocab.get_vocab_size('tokens'))

    # Embedding for the 'tokens' namespace, matching the indexer name used at dataset creation time.
    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(ENC_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True, dropout=0.2))
    attention = DotProductAttention()
    max_decoding_steps = 4   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          # target_namespace='target_tokens',
                          attention=attention,
                          beam_size=beamSize,
                          use_bleu=True,
                          extra_vocab=finalExtraVocab)
    # Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # Data iterator that specifies how to batch the dataset: takes the data, shuffles it,
    # and creates fixed-sized batches.
    # iterator = BasicIterator(batch_size=2)
    # iterator.index_with(vocab)
    # BucketIterator pads batches w.r.t. the max input length per batch and sorts the dataset by
    # the field names and padding keys provided, for efficient computation.
    iterator = BucketIterator(batch_size=50,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      # patience=3,
                      num_epochs=numEpochs,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    predictor = SimpleSeq2SeqPredictor(model, reader)

    '''for i in range(2):
        print("Epoch: {}".format(i))
        trainer.train()
        predictor = SimpleSeq2SeqPredictor(model, reader)
        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
            """Sample output:
            {'predictions': [[1, 4, 5, 92, 8, 6, 1, 8, 6, 26, 3]],
             'loss': 5.9835076332092285,
             'class_log_probabilities': [-20.10894012451172],
             'predicted_tokens': ['@@UNKNOWN@@', 'is', 'a', 'type', 'of', 'the',
                                  '@@UNKNOWN@@', 'of', 'the', 'sun']}
            """
            print(predictor.predict_instance(instance))
    '''

    # Write test-set predictions to a tab-separated file.
    outFile = open("output_" + str(HIDDEN_DIM) + "_" + str(numEpochs) + "_" +
                   str(beamSize) + ".csv", "w")
    writer = csv.writer(outFile, delimiter="\t")
    for instance in itertools.islice(test_dataset, 500):
        src = instance.fields['source_tokens'].tokens
        gold = instance.fields['target_tokens'].tokens
        pred = predictor.predict_instance(instance)['predicted_tokens']
        writer.writerow([src, gold, pred])
    outFile.close()
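# A small follow-up sketch (not in the original): read back the tab-separated file written above
# and print a few rows for manual inspection. `path` would be the
# "output_<HIDDEN_DIM>_<numEpochs>_<beamSize>.csv" name built above.
def show_predictions(path, n=5):
    with open(path) as f:
        for src, gold, pred in itertools.islice(csv.reader(f, delimiter="\t"), n):
            print('SOURCE:', src)
            print('GOLD  :', gold)
            print('PRED  :', pred)
            print()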
# Inference-only setup: rebuild the model with a longer decoding budget and load a trained
# checkpoint. vocab, source_embedder, encoder, attention, reader, and ZH_EMBEDDING_DIM are
# assumed to come from the training script above.
max_decoding_steps = 300
model_pred = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                           target_embedding_dim=ZH_EMBEDDING_DIM,
                           target_namespace='target_tokens',
                           attention=attention,
                           beam_size=8,
                           use_bleu=True)

# Reload the trained model.
with open('/home/earendil/NLP/neural_machine_translation/checkpoint_model_epoch_13', 'rb') as f:
    model_pred.load_state_dict(torch.load(f, map_location=torch.device('cpu')))
model_pred.eval()

# Predict on new text using the loaded model.
predictor = SimpleSeq2SeqPredictor(model_pred, dataset_reader=reader)

import speech_recognition as sr
import os
# Required module for text-to-speech conversion
from gtts import gTTS

# Initialize the recognizer.
r = sr.Recognizer()
with sr.Microphone() as source:
    # Read the audio data from the default microphone.
    print("Start speaking...")
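    # A hedged sketch of how this cell might continue (not in the original snippet): capture one
    # utterance, transcribe it with the Google Web Speech API, translate it with the predictor
    # loaded above, and speak the result with gTTS. The 'zh-cn' language code and the mpg123
    # playback call are assumptions.
    audio = r.listen(source)                               # record a single phrase

try:
    english_text = r.recognize_google(audio)               # speech -> English text
    print("You said:", english_text)
    predicted_tokens = predictor.predict(english_text)['predicted_tokens']
    chinese_text = ''.join(predicted_tokens)               # character-level target tokens
    print("Translation:", chinese_text)
    gTTS(text=chinese_text, lang='zh-cn').save("translation.mp3")
    os.system("mpg123 translation.mp3")                    # assumes mpg123 is installed
except sr.UnknownValueError:
    print("Could not understand the audio.")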