def _attention_mask(att, side, has_sentinel, sigma, threshold):
    """Convert one decoding step's attention scores into a display mask.

    Args:
        att: attention scores for a single sample; flattened to 1-D here.
        side: target side length (pixels) the square grid is zoomed to.
        has_sentinel: when true, the last score is treated as the sentinel
            weight and stripped before the grid is built.
        sigma: standard deviation passed to ``gaussian_filter``.
        threshold: cut-off applied to the min-max normalised map.

    Returns:
        A 2-D array holding 1 where the normalised attention exceeds
        ``threshold`` and 0.3 elsewhere, or ``None`` when the sentinel
        weight is above 0.5 (the caller then keeps its all-zero mask).
    """
    att = np.reshape(att, (np.shape(att)[-1],))
    if has_sentinel:
        sentinel_weight = att[-1]
        att = att[:-1]
        if sentinel_weight > 0.5:
            return None

    # Derive the grid side from the scores that remain once the sentinel
    # has been removed, not from the raw length.
    grid = int(np.sqrt(att.size))
    att = np.reshape(att, (grid, grid))
    # order=1: linear spline interpolation up to the image resolution.
    att = zoom(att, float(side) / att.shape[-1], order=1)
    att = gaussian_filter(att, sigma=sigma)

    low = np.min(att)
    span = np.max(att) - low
    if span > 0:
        att = (att - low) / span
    else:
        # A constant map cannot be min-max normalised (0/0 would yield NaN
        # and poison the displayed image); treat it as "nothing stands out".
        att = np.zeros_like(att)

    att[att > threshold] = 1
    att[att <= threshold] = 0.3
    return att


def do_proc():
    """Generate captions for test images and plot per-word attention masks.

    Parses the command-line arguments, overrides them with a fixed
    single-image test configuration, loads the trained weights, rebuilds
    the CNN and language model with batch size 1 / sequence length 1, and
    then, for the first 11 batches of the test generator, predicts a
    caption word by word, shows the image masked by each word's attention
    map and prints the true and generated captions.

    Side effects: writes ``cnn.h5`` and ``lang.h5`` under
    ``<data_folder>/tmp``, clears the Keras session, opens matplotlib
    windows and prints to stdout.
    """
    parser = get_parser()
    args_dict = parser.parse_args()

    # Fixed test-time configuration; overrides whatever the CLI supplied.
    args_dict.mode = 'test'
    args_dict.bs = 1
    args_dict.cnn_train = False
    args_dict.dr = True
    args_dict.bn = True
    args_dict.sgate = True
    args_dict.temperature = -1
    args_dict.model_file = 'h5-models/model-ep008-loss2.863-val_loss3.476.h5'

    model = get_model(args_dict)
    weights = args_dict.model_file
    model.load_weights(weights)
    # summary() prints by itself and returns None, so it must not be wrapped
    # in a print statement (that printed a stray "None" and is Python 2 only).
    model.summary()
    model.compile(optimizer=None, loss='categorical_crossentropy',
                  sample_weight_mode="temporal")

    dataloader = DataLoader(args_dict)
    N = args_dict.bs
    val_gen = dataloader.generator('test', batch_size=args_dict.bs,
                                   train_flag=False)  # N samples

    # Dump the sub-model weights so they can be reloaded into the
    # re-built models after the session is cleared.
    tmp_dir = os.path.join(args_dict.data_folder, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)
    cnn = model.layers[1]
    cnn.save_weights(os.path.join(tmp_dir, 'cnn.h5'), overwrite=True)
    lang_model = model.layers[3]
    lang_model.save_weights(os.path.join(tmp_dir, 'lang.h5'), overwrite=True)

    K.clear_session()

    wh = args_dict.convsize  # spatial dim of conv features
    dim = args_dict.nfilters  # number of channels
    seqlen = 1  # seqlen is 1 in test mode

    im_ph = Input(batch_shape=(args_dict.bs, args_dict.imsize,
                               args_dict.imsize, 3))
    cf_ph = Input(batch_shape=(args_dict.bs, wh, wh, dim))
    pw_ph = Input(batch_shape=(args_dict.bs, seqlen), name='prev_words')

    cnn = image_model(args_dict, im_ph)
    cnn.load_weights(os.path.join(tmp_dir, 'cnn.h5'))
    lang_model = language_model(args_dict, wh, dim, cf_ph, pw_ph)
    lang_model.load_weights(os.path.join(tmp_dir, 'lang.h5'))

    # Expose the attention scores as a second output next to the word
    # distribution.
    # NOTE(review): `input=`/`output=` are the legacy Keras keyword names;
    # kept as-is because the installed Keras version is not visible here.
    att_layer = 'att_scores'
    lang_model_att = Model(input=lang_model.input,
                           output=[
                               lang_model.get_layer('out').output,
                               lang_model.get_layer(att_layer).output
                           ])
    cnn.compile(optimizer=None, loss='categorical_crossentropy',
                sample_weight_mode="temporal")
    lang_model_att.compile(optimizer=None, loss='categorical_crossentropy',
                           sample_weight_mode="temporal")

    vocab_file = os.path.join(args_dict.data_folder, 'data', args_dict.vfile)
    # NOTE: pickle must only be used on trusted files; the handle is closed
    # deterministically by the context manager.
    with open(vocab_file, 'rb') as vocab_fh:
        vocab = pickle.load(vocab_fh)
    inv_vocab = {v: k for k, v in vocab.items()}

    figsize = (30, 30)
    # parameters to manipulate attention weights
    sig = 5
    th = 0.3

    IMPATH = os.path.join(args_dict.coco_path, 'images',
                          'val' + args_dict.year)

    for count, ((batch_im, _), cap, _, imids) in enumerate(val_gen, 1):
        conv_feats = cnn.predict_on_batch(batch_im)

        # store all attention maps here; a row left at zero blacks the image
        masks = np.zeros(
            (args_dict.seqlen, args_dict.imsize, args_dict.imsize))

        # Seed for the "previous word" input of the first step.
        # NOTE(review): the original comment said <start> is idx 1 in the
        # vocab, but index 0 is what is fed here - confirm which is intended.
        prevs = np.zeros((N, 1))
        # store all predicted words in sequence here
        word_idxs = np.zeros((N, args_dict.seqlen))

        imname = imids[0]['file_name']
        img = read_image(os.path.join(IMPATH, imname),
                         (args_dict.imsize, args_dict.imsize))

        # loop to get sequence of predicted words
        for i in range(args_dict.seqlen):
            preds, att = lang_model_att.predict_on_batch(
                [conv_feats, prevs])  # (N,1,vocab_size)

            # store predicted word and set previous word for next step
            preds = preds.squeeze()
            if args_dict.temperature > 0:
                preds = sample(preds, temperature=args_dict.temperature)
            next_word = np.argmax(preds, axis=-1)
            word_idxs[:, i] = next_word
            prevs = np.reshape(next_word, (N, 1))

            # attention map manipulation for display; None means the
            # sentinel dominated, so the mask for this word stays black
            mask = _attention_mask(att, img.shape[0], args_dict.sgate,
                                   sig, th)
            if mask is not None:
                masks[i] = mask

        # find words for predicted word idxs
        pred_caps = idx2word(word_idxs, inv_vocab)
        true_caps = idx2word(np.argmax(cap, axis=-1), inv_vocab)

        # display predictions with attention maps
        n_words = len(pred_caps[0])
        if n_words:
            # squeeze=False always yields a 2-D axes array, so a one-word
            # caption no longer breaks the indexing below.
            _fig, axarr = plt.subplots(1, n_words, figsize=figsize,
                                       squeeze=False)
            for i in range(n_words):
                im = copy.deepcopy(img)
                for c in range(3):
                    im[:, :, c] = im[:, :, c] * masks[i]
                ax = axarr[0][i]
                ax.imshow(im)
                ax.axis('off')
                ax.set_title(pred_caps[0][i])
            plt.show()

        pred_cap = ' '.join(pred_caps[0])
        true_cap = ' '.join(true_caps[0])  # true captions

        print("ID:", imids[0]['file_name'], imids[0]['id'])
        print("True:", true_cap)
        print("Gen:", pred_cap)

        lang_model_att.reset_states()

        # Stops after the 11th batch (same cut-off as the original counter).
        if count > 10:
            break
# Prepend an axis of length 1 so the single image forms a batch for predict().
im = np.expand_dims(im, axis=0)

# Obtain the classifier: construct and save it on the first run, otherwise
# reload the copy saved by an earlier run.
# Weights: https://gist.github.com/baraldilorenzo/07d7802847aaad0a35d3
saved_model_path = 'test_modelVG.h5'
if not os.path.exists(saved_model_path):
    model = VG(
        include_top=True,
        weights='imagenet',
        input_tensor=None,
        input_shape=None,
        pooling=None,
        classes=1000
    )
    model.save(saved_model_path)
else:
    model = load_model(saved_model_path)
# Alternative left disabled by the original author:
# VGG_16(weights_path='weights.h5')

optimizer = SGD()
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

out = model.predict(im)
index = np.argmax(out)  # position of the highest score in the flattened output
i = np.argsort(out)     # ascending order, so the best classes sit at the end

print("Max Prediction: " + item_dict[int(index)])
print("Other predictions in order:")
# Walk the five highest-scoring classes from best to worst.
# NOTE(review): rank 0 repeats the max prediction printed above - confirm
# whether the listing was meant to start at the runner-up.
best_first = i[0][::-1]
for ind in range(5):
    name = item_dict[int(best_first[ind])]
    print(str(ind) + ". " + name)