-
Notifications
You must be signed in to change notification settings - Fork 0
/
inference_translator.py
151 lines (135 loc) · 5.69 KB
/
inference_translator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from __future__ import print_function
from annoy import AnnoyIndex
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import ModelCheckpoint
import numpy as np
import io
def show_loop_progress(counter, length):
    """Print the completion percentage of a loop.

    Rewrites the same console line (via a carriage return) until the final
    iteration, then prints a newline-terminated percentage.

    Args:
        counter: zero-based index of the current iteration.
        length: total number of iterations.
    """
    if length <= 0:
        return  # nothing to report; also avoids ZeroDivisionError
    # 100.0 forces float division so the percentage is correct under
    # Python 2 as well (the file imports print_function for py2 compat;
    # with plain ints, 100*counter/length would truncate to 0 there).
    pct = 100.0 * counter / length
    line_end = '\r' if counter < length - 1 else '\n'
    print("%.2f%%" % pct, end=line_end)
# Special token markers: start-of-sentence, end-of-sentence, out-of-vocabulary.
sos = "<SOS>"
eos = "<EOS>"
oov = "<OOV>"

# Pretrained fastText embedding files: Filipino (source) and English (target).
input_words_file = "cc.tl.300.vec"
output_words_file = "wiki-news-300d-1M-subword.vec"

# Number of word vectors to read from each file; any value <= 0 loads them all.
words_to_load = -1
print("Words to load: %s"%(words_to_load) if words_to_load > 0
      else "Words to load: all of them")
print("Loading filipino word vectors...")
# token -> 300-d embedding (np.ndarray) for the source language
filipino = {}
# `with` guarantees the vector file is closed (the original leaked the handle).
with io.open(input_words_file, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
    # fastText .vec header line: "<vocab_size> <vector_dimension>"
    n, word_dim = map(int, fin.readline().split())
    # Hoist the progress denominator out of the loop instead of re-testing
    # words_to_load on every iteration.
    total = words_to_load if words_to_load > 0 else n
    for x, line in enumerate(fin):
        tokens = line.rstrip().split(' ')
        filipino[tokens[0]] = np.array(list(map(float, tokens[1:])))
        show_loop_progress(x, total)
        # x+1 words have been loaded at this point (enumerate replaces the
        # original's manual post-increment counter).
        if words_to_load > 0 and x + 1 >= words_to_load:
            break
# <SOS>/<EOS>/<OOV> do not exist in the fastText vocabulary; assign them
# fixed, arbitrary vectors (must match the vectors used during training).
filipino[sos] = np.zeros(word_dim)
filipino[eos] = 0.5*np.ones(word_dim)
filipino[oov] = 0.2*np.ones(word_dim)
print("Loading english word vectors...")
# token -> 300-d embedding (np.ndarray) for the target language
english = {}
# english_list maps Annoy item id -> word, so nearest-neighbour results can
# be decoded back to text; english_loaded is the next free Annoy id.
english_list = []
english_loaded = 0
# `with` guarantees the vector file is closed (the original leaked the handle).
with io.open(output_words_file, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
    # fastText .vec header line: "<vocab_size> <vector_dimension>"
    n, word_dim = map(int, fin.readline().split())
    # Annoy index for fast approximate nearest-neighbour lookup of predicted
    # output vectors at decode time.
    english_vector = AnnoyIndex(word_dim)
    total = words_to_load if words_to_load > 0 else n
    for line in fin:
        tokens = line.rstrip().split(' ')
        # Parse the embedding once (the original re-parsed the same floats
        # twice per line: once for the dict, once for Annoy).
        vec = list(map(float, tokens[1:]))
        english[tokens[0]] = np.array(vec)
        english_list.append(tokens[0])
        english_vector.add_item(english_loaded, vec)
        show_loop_progress(english_loaded, total)
        english_loaded += 1
        if words_to_load > 0 and english_loaded >= words_to_load:
            break
# <SOS>/<EOS>/<OOV> get fixed, arbitrary vectors (must match training); each
# is registered in the dict, the id->word list and the Annoy index.  The loop
# also increments english_loaded after <OOV> — the original skipped that last
# increment, leaving the counter inconsistent (harmless, but a trap if more
# items were ever added).
for token, vec in ((sos, np.zeros(word_dim)),
                   (eos, 0.5*np.ones(word_dim)),
                   (oov, 0.2*np.ones(word_dim))):
    english[token] = vec
    english_list.append(token)
    english_vector.add_item(english_loaded, vec)
    english_loaded += 1
#building translator model
# The full training graph must be rebuilt exactly as it was trained so the
# saved weights in checkpoint.h5 load into matching layer shapes.
batch_size = 64 # Batch size for training. NOTE(review): unused at inference time.
epochs = 100 # Number of epochs to train for. NOTE(review): unused at inference time.
context_dim = 512 # Latent dimensionality of the encoding space.
# Encoder: three stacked LSTMs over (timesteps, word_dim) sequences; only the
# final layer's hidden/cell states are kept as the sentence context.
encoder_input_layer = Input( shape = (None,word_dim) )
encoder_first_layer = LSTM(context_dim,return_sequences=True)(encoder_input_layer)
encoder_middle_layer = LSTM(context_dim,return_sequences=True)(encoder_first_layer)
__, h_state, c_state = LSTM(context_dim,return_state=True)(encoder_middle_layer)
#discard outputs and keep states
encoder_final_state = [ h_state , c_state ]
# Translation decoder: one LSTM seeded with the encoder's final states.
# decoder_first_layer and decoder_dense are kept as layer objects (not just
# their outputs) because they are reused below to build the inference decoder.
decoder_input_layer = Input( shape = (None,word_dim) )
decoder_first_layer = LSTM(context_dim,return_sequences=True,return_state=True)
decoder_outputs,__,__ = decoder_first_layer(decoder_input_layer,initial_state=encoder_final_state)
# NOTE(review): softmax over a 300-d word-embedding target is unusual —
# confirm this matches how checkpoint.h5 was trained.
decoder_dense = Dense(word_dim,activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)
# Auto-encoder decoder branch (reconstructs the source sentence in training);
# present here only so the checkpoint's weights load into an identical graph.
auto_decoder_input_layer = Input( shape = (None,word_dim) )
auto_decoder_first_layer = LSTM(context_dim,return_sequences=True,return_state=True)
auto_decoder_outputs,__,__ = auto_decoder_first_layer(auto_decoder_input_layer,initial_state=encoder_final_state)
auto_decoder_dense = Dense(word_dim,activation="softmax")
auto_decoder_outputs = auto_decoder_dense(auto_decoder_outputs)
training_model = Model([encoder_input_layer,decoder_input_layer,auto_decoder_input_layer],[decoder_outputs,auto_decoder_outputs])
# Load the trained weights; this populates the shared encoder/decoder layers
# used by the two inference models built next.
training_model.load_weights('checkpoint.h5')
# Inference encoder: source sentence -> [h, c] context states.
encoder_model = Model(encoder_input_layer,encoder_final_state)
encoder_model.summary()
# Inference decoder: reuses the trained decoder_first_layer/decoder_dense but
# takes the previous step's [h, c] as explicit inputs, so decoding can be
# stepped one word at a time in the loop below.
decoder_input_h_state = Input( shape = (context_dim,))
decoder_input_c_state = Input( shape = (context_dim,))
decoder_initial_states = [ decoder_input_h_state , decoder_input_c_state ]
decoder_outputs, state_h, state_c = decoder_first_layer( decoder_input_layer , initial_state=decoder_initial_states )
decoder_current_states = [ state_h , state_c ]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model( [decoder_input_layer]+decoder_initial_states , [decoder_outputs]+decoder_current_states )
decoder_model.summary()
# Interactive translation loop: read a Filipino sentence, encode it, then
# greedily decode English words until <EOS> or a length cap.
while True:
    # Normalize ONLY the user's text (pad punctuation with spaces, lowercase)
    # BEFORE attaching the <SOS>/<EOS> markers.  The original lowercased the
    # whole string afterwards, turning "<SOS>"/"<EOS>" into "<sos>"/"<eos>" so
    # their embedding lookups missed and both markers silently fell back to
    # the <OOV> vector.
    text = input("Enter filipino sentence to be translated to english:").strip()
    for punct in (",", ".", "!", "?", "\""):
        text = text.replace(punct, " %s " % punct)
    input_sentence = sos + " " + text.lower() + " " + eos
    # Map each token to its embedding; unknown words use the <OOV> vector.
    encoder_input = np.array([[filipino.get(w, filipino[oov]) for w in input_sentence.split()]])
    flip = True
    if flip:
        # Feed the source sentence reversed (a common seq2seq trick); must
        # match whatever was done at training time.
        encoder_input = np.flip(encoder_input, 1)
    understanding = encoder_model.predict(encoder_input)  # [h, c] context states
    # Seed decoding with the <SOS> vector.
    output_vectors = np.zeros((1, 1, word_dim))
    output_vectors[0][0] = english[sos]
    decoded_sentence = ""
    max_length = 20  # hard cap so a decode that never emits <EOS> still stops
    while True:
        output_tokens, h, c = decoder_model.predict([output_vectors] + understanding)
        # Nearest English word (approximate NN via Annoy) to the predicted vector.
        output_word = english_list[english_vector.get_nns_by_vector(output_tokens[0][0], 1)[0]]
        decoded_sentence += output_word + " "
        if len(decoded_sentence.split()) >= max_length or output_word == eos:
            break
        # Feed the prediction and updated states back in for the next step.
        output_vectors = output_tokens
        understanding = [h, c]
    print(decoded_sentence)