-
Notifications
You must be signed in to change notification settings - Fork 0
/
mainprime.py
362 lines (273 loc) · 11.4 KB
/
mainprime.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from os import listdir
from PIL import Image
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.applications import VGG16
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
def load_imagesdict(directory):
    """Load and preprocess every image in *directory* for VGG16.

    Each file is resized to 224x224, converted to an array, given a leading
    batch dimension, and run through VGG16's ``preprocess_input``.

    Parameters
    ----------
    directory : str
        Folder containing the image files (e.g. the Flickr8k dataset dir).

    Returns
    -------
    dict
        Maps image id (filename without extension) to a preprocessed
        (1, 224, 224, 3) float array ready for ``model.predict``.
    """
    images = dict()
    for name in listdir(directory):
        # os.path.join instead of manual '/'-concatenation (portable paths).
        filename = os.path.join(directory, name)
        image = load_img(filename, target_size=(224, 224))
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        # add the batch dimension the model expects
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # prepare the image for the VGG model (mean subtraction / channel order)
        image = preprocess_input(image)
        # image id = filename up to the first dot
        image_id = name.split('.')[0]
        images[image_id] = image
    return images
def load_doc(filename):
    """Return the entire contents of the text file *filename* as one string.

    Uses a ``with`` block so the file handle is closed even if the read
    raises (the original manual open/close leaked on exceptions).
    """
    with open(filename, 'r') as file:
        return file.read()
def load_descriptions(doc):
    """Parse a Flickr8k token document into {image_id: first description}.

    Each non-empty line is ``<image_file> <word> <word> ...``; only the first
    description seen for an image id is kept.

    Parameters
    ----------
    doc : str
        Full text of the token file (one caption per line).

    Returns
    -------
    dict
        Maps image id (filename without extension) to its first caption string.
    """
    mapping = dict()
    # process lines
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # BUG FIX: the original tested len(line) < 2 (character count), so a
        # whitespace-only line passed the guard and tokens[0] raised IndexError,
        # and a line with a single long token produced an empty description.
        if len(tokens) < 2:
            continue
        # take the first token as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]
        # remove filename extension from image id
        image_id = image_id.split('.')[0]
        # convert description tokens back to string
        image_desc = ' '.join(image_desc)
        # store only the first description for each image
        if image_id not in mapping:
            mapping[image_id] = image_desc
    return mapping
def loadcaptionsdict(filename):
    """Read the caption token file at *filename* and return the parsed
    {image_id: first description} mapping."""
    return load_descriptions(load_doc(filename))
def getimgandcaptionlist(images, descriptions):
    """Align images and their captions into two parallel lists.

    Parameters
    ----------
    images : dict
        Maps image id to a preprocessed image array.
    descriptions : dict
        Maps image id to its caption string; must contain every key of *images*.

    Returns
    -------
    (list, list)
        ``(Images, captions)`` in the iteration order of *images*.
    """
    # Direct iteration replaces the original `for _ in range(len(keys))`
    # index loop; same order, same results.
    image_ids = list(images.keys())
    Images = [images[image_id] for image_id in image_ids]
    captions = [descriptions[image_id] for image_id in image_ids]
    return Images, captions
def mark_captions(captions_list):
    """Wrap every caption with the start ('ssss ') and end (' eeee')
    marker tokens used by the decoder for sequence boundaries."""
    start_marker = 'ssss '
    end_marker = ' eeee'
    wrapped = []
    for caption in captions_list:
        wrapped.append(start_marker + caption + end_marker)
    return wrapped
def sentences_to_listt(caps):
    """Tokenize each caption on whitespace, dropping the first standalone
    '.' token if present (only the first occurrence is removed, matching
    list.remove semantics)."""
    tokenized = []
    for sentence in caps:
        tokens = sentence.split()
        try:
            tokens.remove(".")
        except ValueError:
            # no standalone '.' token in this caption
            pass
        tokenized.append(tokens)
    return tokenized
def get_unique_wordsdict(captions_marked_list):
    """Build the integer -> word vocabulary from tokenized captions.

    Parameters
    ----------
    captions_marked_list : list[list[str]]
        Tokenized, marker-wrapped captions.

    Returns
    -------
    dict
        Maps consecutive integers (0..V-1) to the unique words, in sorted
        order. Sorting fixes the original ``list(set(...))`` construction,
        whose order changes between interpreter runs (string hash
        randomization) — a run loading saved weights would otherwise get a
        different word<->int mapping than the run that trained them.
    """
    vocabulary = sorted({word for sentence in captions_marked_list
                         for word in sentence})
    return dict(enumerate(vocabulary))
def invert_dict(d):
    """Return a new dict with keys and values of *d* swapped
    (values must be hashable; duplicate values keep the last key seen)."""
    return {value: key for key, value in d.items()}
def word2int(caps, unique_words_dict):
    """Replace every word in the tokenized captions with its integer id,
    mutating *caps* in place, and return the same (mutated) list.

    Raises KeyError if a word is missing from *unique_words_dict*.
    """
    for sentence in caps:
        for position, word in enumerate(sentence):
            sentence[position] = unique_words_dict[word]
    return caps
def VGGModel():
    """Build the image-encoder: ImageNet-pretrained VGG16 truncated at 'fc2'.

    Returns
    -------
    (Model, tensor)
        The transfer model (input image -> fc2 activations) and the fc2
        output tensor, whose static shape is used to size the decoder input.
    """
    base_model = VGG16(include_top=True, weights='imagenet')
    fc2_layer = base_model.get_layer('fc2')
    transfer_model = Model(inputs=base_model.input, outputs=fc2_layer.output)
    return transfer_model, fc2_layer.output
def get_imgsignature(image_model_transfer, Img, num):
    """Compute transfer values for the first *num* images.

    Parameters
    ----------
    image_model_transfer : object with a ``.predict`` method
        The truncated VGG16 model (or any compatible encoder).
    Img : sequence
        Preprocessed image arrays, each already batched for ``predict``.
    num : int
        Number of leading images to encode.

    Returns
    -------
    list
        Per-image prediction results, in order.
    """
    # Comprehension replaces the manual index-and-append loop; same order,
    # same results.
    return [image_model_transfer.predict(Img[i]) for i in range(num)]
def decoder(statesize, embeddingsize, maxnumwords, transfer_layer_output):
    """Build the GRU caption decoder conditioned on VGG16 transfer values.

    The image's transfer values are mapped through a tanh Dense layer to the
    GRU state size and used as the initial state of all three stacked GRUs;
    token ids flow through an embedding, the GRU stack, and a final linear
    Dense producing per-word logits (linear because the loss applies softmax).

    BUG FIX: the original body ignored its parameters and read the module
    globals ``state_size`` / ``embedding_size`` / ``num_words`` instead.
    The function now uses its arguments; the call site passes those same
    globals, so behavior there is unchanged.

    Parameters
    ----------
    statesize : int
        GRU hidden-state size.
    embeddingsize : int
        Word-embedding dimensionality.
    maxnumwords : int
        Vocabulary size (embedding input_dim and output logits width).
    transfer_layer_output : tensor
        Output tensor of the image model's transfer layer; only its static
        shape is used to size the transfer-values input.

    Returns
    -------
    Model
        Keras model with inputs [transfer_values_input, decoder_input] and
        output decoder_output.
    """
    transfer_values_size = K.int_shape(transfer_layer_output)[1]
    transfer_values_input = Input(shape=(transfer_values_size,),
                                  name='transfer_values_input')
    decoder_transfer_map = Dense(statesize, activation='tanh',
                                 name='decoder_transfer_map')
    decoder_input = Input(shape=(None,), name='decoder_input')
    decoder_embedding = Embedding(input_dim=maxnumwords,
                                  output_dim=embeddingsize,
                                  name='decoder_embedding')
    decoder_gru1 = GRU(statesize, name='decoder_gru1', return_sequences=True)
    decoder_gru2 = GRU(statesize, name='decoder_gru2', return_sequences=True)
    decoder_gru3 = GRU(statesize, name='decoder_gru3', return_sequences=True)
    decoder_dense = Dense(maxnumwords, activation='linear',
                          name='decoder_output')
    # Image signature -> initial hidden state shared by every GRU layer.
    initial_state = decoder_transfer_map(transfer_values_input)
    net = decoder_input
    net = decoder_embedding(net)
    net = decoder_gru1(net, initial_state=initial_state)
    net = decoder_gru2(net, initial_state=initial_state)
    net = decoder_gru3(net, initial_state=initial_state)
    decoder_output = decoder_dense(net)
    decoder_model = Model(inputs=[transfer_values_input, decoder_input],
                          outputs=[decoder_output])
    return decoder_model
def sparse_cross_entropy(y_true, y_pred):
    """Mean sparse softmax cross-entropy between integer targets *y_true*
    and the decoder's linear logits *y_pred* (softmax applied inside the op,
    so the model's output layer stays linear)."""
    per_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y_true, logits=y_pred)
    return tf.reduce_mean(per_token_loss)
def batch_generator(batch_size):
    """
    Generator function for creating random batches of training-data.

    Samples the training set with replacement: the same item may appear in
    several batches within an epoch and some items may not be sampled at all.

    NOTE(review): reads the module-level globals ``transfer_values`` and
    ``caps_markedwords``; both must be populated before the first ``next()``.

    Parameters
    ----------
    batch_size : int
        Number of (image, caption) pairs per yielded batch.

    Yields
    ------
    (dict, dict)
        ``x_data`` with keys 'decoder_input' / 'transfer_values_input' and
        ``y_data`` with key 'decoder_output', matching the decoder model's
        named inputs/outputs.
    """
    # Infinite loop, as expected by fit_generator.
    while True:
        # Random indices over the whole training set.
        # BUG FIX: the upper bound was hard-coded to 100, silently training
        # on only the first 100 images regardless of dataset size.
        idx = np.random.randint(len(transfer_values), size=batch_size)
        # Pre-computed transfer-values (outputs of the image model) for
        # the chosen images.
        transf_values = np.array([transfer_values[i] for i in idx])
        # Integer-token caption sequences for the same images.
        tokens = [caps_markedwords[i] for i in idx]
        # Pad every sequence to the longest one in this batch so they can
        # form a single numpy array.
        num_tokens = [len(t) for t in tokens]
        max_tokens = np.max(num_tokens)
        tokens_padded = pad_sequences(tokens,
                                      maxlen=max_tokens,
                                      padding='post',
                                      truncating='post')
        # The decoder learns to map each token to the NEXT token, so the
        # target is the input shifted one step left.
        decoder_input_data = tokens_padded[:, 0:-1]
        decoder_output_data = tokens_padded[:, 1:]
        # Named dicts so each array is routed to the right model input/output.
        x_data = \
        {
            'decoder_input': decoder_input_data,
            'transfer_values_input': transf_values
        }
        y_data = \
        {
            'decoder_output': decoder_output_data
        }
        yield (x_data, y_data)
def generate_caption(image_model_transfer, image_path, unique_words_dict, max_tokens=30):
    """Greedy-decode a caption for the image at *image_path*.

    Feeds the image through the transfer model, then repeatedly runs the
    decoder, appending the argmax word each step until the end token 'eeee'
    is produced or *max_tokens* is reached.

    NOTE(review): reads the module-level globals ``decoder_model`` and
    ``inttoword``; both must exist before calling.

    Parameters
    ----------
    image_model_transfer : Model
        The truncated VGG16 image encoder.
    image_path : str
        Path to the image file.
    unique_words_dict : dict
        word -> int mapping; must contain 'ssss' and 'eeee'.
    max_tokens : int
        Maximum caption length in tokens.

    Returns
    -------
    str
        The generated caption (leading space before each word).
    """
    token_end = unique_words_dict["eeee"]
    image = load_img(image_path, target_size=(224, 224))
    a = image  # kept for the optional plt.imshow(a) preview below
    image = img_to_array(image)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    transfer_values = image_model_transfer.predict(image)
    shape = (1, max_tokens)
    # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the supported spelling and is what np.int aliased.
    decoder_input_data = np.zeros(shape=shape, dtype=int)
    token_int = unique_words_dict["ssss"]
    output_text = ''
    count_tokens = 0
    while token_int != token_end and count_tokens < max_tokens:
        # Place the last produced token and re-run the decoder over the
        # whole (padded) prefix.
        decoder_input_data[0, count_tokens] = token_int
        x_data = \
        {
            'transfer_values_input': transfer_values,
            'decoder_input': decoder_input_data
        }
        decoder_output = decoder_model.predict(x_data)
        token_onehot = decoder_output[0, count_tokens, :]
        # Greedy choice: most likely next token.
        token_int = np.argmax(token_onehot)
        sampled_word = inttoword[token_int]
        # Append the word to the output-text.
        output_text += " " + sampled_word
        # Increment the token-counter.
        count_tokens += 1
    #plt.imshow(a)
    #plt.show()
    print("Image Path" + image_path)
    print("Predicted caption:")
    print(output_text)
    return output_text
print()
# ---- configuration -------------------------------------------------------
directory = 'Flicker8k_Dataset'
filename = 'Flickr8k.token.txt'
state_size = 512
embedding_size = 128
batch_size = 32

# ---- load data and build vocabulary --------------------------------------
imagesdict = load_imagesdict(directory)
descriptions = loadcaptionsdict(filename)
Images, captions = getimgandcaptionlist(imagesdict, descriptions)
captions_marked = mark_captions(captions)
caps_markedwords = sentences_to_listt(captions_marked)
inttoword = get_unique_wordsdict(caps_markedwords)
wordtoint = invert_dict(inttoword)
num_words = len(inttoword)
capsintencoded = word2int(caps_markedwords, wordtoint)
num_images = len(Images)
steps_per_epoch = num_images // batch_size

# ---- build models --------------------------------------------------------
model, transfer_layer_output = VGGModel()
decoder_model = decoder(state_size, embedding_size, num_words, transfer_layer_output)
optimizer = RMSprop(lr=1e-3)
decoder_target = tf.placeholder(dtype='int32', shape=(None, None))
decoder_model.compile(optimizer=optimizer, loss=sparse_cross_entropy,
                      target_tensors=[decoder_target])

# Transfer values are precomputed and cached on disk; uncomment to rebuild.
#transfer_values = get_imgsignature(model,Images,num_images)
#transfer_values = np.array(transfer_values).reshape(num_images,4096)
transfer_values = np.load("new.npz.npy")

# ---- training setup (training itself is commented out) -------------------
path_checkpoint = '22.test.keras'
callback_checkpoint = ModelCheckpoint(filepath=path_checkpoint, verbose=1,
                                      save_weights_only=True)
callback_tensorboard = TensorBoard(log_dir='./22_logs/', histogram_freq=0,
                                   write_graph=False)
callbacks = [callback_checkpoint, callback_tensorboard]
generator = batch_generator(batch_size)
#decoder_model.fit_generator(generator=generator,steps_per_epoch=steps_per_epoch,epochs=50,callbacks=callbacks)
decoder_model.load_weights('savedweights.hdf5')

# ---- caption images 500..599 and save results ----------------------------
# BUG FIX: the original `while j < 600:` never incremented j (infinite loop)
# and indexed the key list with an undefined name `_`.
cps = []
imids = []
image_ids = list(imagesdict.keys())
for j in range(500, 600):
    aa = 'Flicker8k_Dataset/' + image_ids[j] + '.jpg'
    bb = generate_caption(model, aa, wordtoint, max_tokens=30)
    cps.append(bb)
    imids.append(aa)

# BUG FIX: open(..., w) referenced an undefined name `w` (NameError); the
# mode must be the string 'w'. `with` also guarantees both files are closed
# (test1.txt was never closed in the original).
with open('test.txt', 'w') as thefile:
    for item in cps:
        thefile.write("%s\n" % item)
with open('test1.txt', 'w') as thefile:
    for item in imids:
        thefile.write("%s\n" % item)