import argparse
import gc
import json
import os
import shutil
import time
from random import shuffle

import numpy as np
import tensorflow as tf
from scipy import misc

import data_loader
import utils
import VQA_model_attention


def evaluate_model(model, qa_data, args, model_options, sess, conv_features,
                   image_id_map):
    # Reopen the conv features to avoid h5py slowing down over long runs.
    # Evaluation runs on the validation split, so the 'val' features are
    # reloaded here.
    conv_features = data_loader.load_conv_features(
        'val', args.cnn_model, args.feature_layer, load_image_list=False)
    prediction_check = []
    ans_vocab_rev = qa_data['index_to_ans']
    print "loading conv feats"

    batch_no = 0
    while (batch_no * args.batch_size) < len(qa_data['validation']):
        question, answer, image_features, image_ids, ans_freq_batch = get_batch(
            batch_no, args.batch_size, qa_data['validation'], conv_features,
            image_id_map, 'val', model_options)
        [predicted] = sess.run(
            [model.g_predictions],
            feed_dict={
                model.g_question: question,
                model.g_image_features: image_features
            })
        pred_ans_text = utils.answer_indices_to_text(predicted, ans_vocab_rev)
        for bi, pred_ans in enumerate(pred_ans_text):
            if pred_ans in ans_freq_batch[bi]:
                # Standard VQA accuracy: an answer is fully correct if at
                # least 3 annotators gave it, i.e. min(#matches / 3, 1).
                prediction_check.append(
                    min(1.0, ans_freq_batch[bi][pred_ans] / 3.0))
            else:
                prediction_check.append(0.0)
        accuracy = np.sum(prediction_check,
                          dtype="float32") / len(prediction_check)
        print "Evaluating", batch_no, len(
            qa_data['validation']) / args.batch_size, accuracy
        batch_no += 1
    return accuracy
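# `get_batch` is defined elsewhere in this repo. The sketch below is an
# assumption reconstructed from its call sites here: it slices one batch of
# QA examples, pads question indices, looks up conv features through
# image_id_map, and returns the per-example answer-frequency dicts used for
# the VQA accuracy metric. The dict fields 'question_indices',
# 'answer_indices', 'image_id', and 'answers_freq' are hypothetical names.
def get_batch(batch_no, batch_size, split_data, conv_features, image_id_map,
              split, model_options):
    start = batch_no * batch_size
    end = min(start + batch_size, len(split_data))
    n = end - start
    question = np.zeros((n, model_options['max_question_length']),
                        dtype='int32')
    answer = np.zeros((n, model_options['num_answers']), dtype='int32')
    image_features = np.zeros(
        (n, model_options['img_dim'], model_options['img_dim'],
         model_options['img_channels']), dtype='float32')
    image_ids = []
    ans_freq_batch = []
    for i, qa in enumerate(split_data[start:end]):
        question[i] = qa['question_indices']       # hypothetical field name
        answer[i] = qa['answer_indices']           # hypothetical field name
        image_features[i] = conv_features[image_id_map[qa['image_id']]]
        image_ids.append(qa['image_id'])
        ans_freq_batch.append(qa['answers_freq'])  # hypothetical field name
    return question, answer, image_features, image_ids, ans_freq_batch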
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--residual_channels', type=int, default=512,
                        help='residual_channels')
    parser.add_argument('--data_dir', type=str, default='Data',
                        help='Data directory')
    parser.add_argument('--version', type=int, default=1,
                        help='VQA data version')
    parser.add_argument('--model_path', type=str, default=None,
                        help='Trained Model Path')
    parser.add_argument('--feature_layer', type=str, default="block4",
                        help='CONV FEATURE LAYER, fc7, pool5 or block4')
    parser.add_argument('--cnn_model', type=str, default="resnet",
                        help='CNN model')
    parser.add_argument('--text_model', type=str, default="bytenet",
                        help='bytenet/lstm')
    parser.add_argument('--question', type=str,
                        default="What animal is shown in the picture",
                        help='question about the image')
    parser.add_argument('--image_file', type=str, default=None,
                        help='Image file path for the question')
    args = parser.parse_args()

    conv_features_batch = get_conv_features(args.image_file, args.cnn_model,
                                            args.feature_layer)
    tf.reset_default_graph()

    meta_data = data_loader.load_meta_data(args.version, args.data_dir)
    ans_vocab_rev = meta_data['index_to_ans']
    ques_vocab_rev = meta_data['index_to_qw']
    qw_to_index = meta_data['qw_to_index']

    # Tokenise the question, map each word to its index (falling back to
    # UNK), and zero-pad up to the maximum question length.
    question_words = data_loader.tokenize_mcb(args.question)
    question_indices = [
        qw_to_index[qw] if qw in qw_to_index else qw_to_index['UNK']
        for qw in question_words
    ]
    question_indices += [
        0 for i in range(len(question_indices),
                         meta_data['max_question_length'])
    ]
    sentence_batch = np.ndarray((1, meta_data['max_question_length']),
                                dtype='int32')
    sentence_batch[0] = question_indices

    model_options = {
        'question_vocab_size': len(meta_data['index_to_qw']),
        'residual_channels': args.residual_channels,
        'ans_vocab_size': len(meta_data['index_to_ans']),
        'filter_width': 3,
        'img_dim': 14,
        'img_channels': 2048,
        'dilations': [1, 2, 4, 8,
                      1, 2, 4, 8],
        'text_model': args.text_model,
        'dropout_keep_prob': 0.6,
        'max_question_length': meta_data['max_question_length'],
        'num_answers': 10
    }

    model = VQA_model_attention.VQA_model(model_options)
    model.build_generator()

    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()
    saver = tf.train.Saver()
    if args.model_path:
        saver.restore(sess, args.model_path)

    try:
        shutil.rmtree('Data/gen_samples')
    except OSError:
        pass
    os.makedirs('Data/gen_samples')

    pred_answer, prob1, prob2 = sess.run(
        [model.g_predictions, model.g_prob1, model.g_prob2],
        feed_dict={
            model.g_question: sentence_batch,
            model.g_image_features: conv_features_batch
        })
    pred_ans_text = utils.answer_indices_to_text(pred_answer, ans_vocab_rev)

    sample_data = []
    print "Prediction"
    for sample_i in range(len(pred_ans_text)):
        print pred_ans_text[sample_i]
        image_array = utils.load_image_array(args.image_file, 224)
        blend1 = utils.get_blend_map(image_array, prob1[sample_i],
                                     overlap=True)
        blend2 = utils.get_blend_map(image_array, prob2[sample_i],
                                     overlap=True)
        sample_data.append({
            'question': args.question,
            'predicted_answer': pred_ans_text[sample_i],
            'batch_index': sample_i
        })
        misc.imsave('Data/gen_samples/{}_actual_image.jpg'.format(sample_i),
                    image_array)
        misc.imsave('Data/gen_samples/{}_blend1.jpg'.format(sample_i), blend1)
        misc.imsave('Data/gen_samples/{}_blend2.jpg'.format(sample_i), blend2)

    f = open('Data/gen_samples/sample.json', 'w')
    f.write(json.dumps(sample_data))
    f.close()
    shutil.make_archive('Data/gen_samples', 'zip', 'Data/gen_samples')
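# Example invocation of this prediction script (the script name `predict.py`
# and the checkpoint path are assumptions for illustration):
#
#   python predict.py --model_path Data/Models1/model10.ckpt \
#       --image_file Data/sample_image.jpg \
#       --question "What animal is shown in the picture"
#
# The predicted answer is printed, and the image plus both attention-map
# blends and a sample.json summary are written to Data/gen_samples/.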
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--residual_channels', type=int, default=512,
                        help='residual_channels')
    parser.add_argument('--data_dir', type=str, default='Data',
                        help='Data directory')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Learning rate')
    parser.add_argument('--epochs', type=int, default=25,
                        help='Number of epochs')
    parser.add_argument('--max_steps', type=int, default=50000,
                        help='max steps, set 1 for evaluating the model')
    parser.add_argument('--version', type=int, default=1,
                        help='VQA data version')
    parser.add_argument('--sample_every', type=int, default=200,
                        help='Debug every x iterations')
    parser.add_argument('--evaluate_every', type=int, default=6000,
                        help='Evaluate every x steps')
    parser.add_argument('--resume_model', type=str, default=None,
                        help='Trained Model Path')
    parser.add_argument('--training_log_file', type=str,
                        default='Data/training_log.json',
                        help='Log file for accuracy')
    parser.add_argument('--feature_layer', type=str, default="block4",
                        help='CONV FEATURE LAYER, fc7, pool5 or block4')
    parser.add_argument('--cnn_model', type=str, default="resnet",
                        help='CNN model')
    parser.add_argument('--text_model', type=str, default="bytenet",
                        help='bytenet/lstm')
    args = parser.parse_args()

    print "Reading QA DATA", args.version
    qa_data = data_loader.load_questions_answers(args.version, args.data_dir)
    shuffle(qa_data['training'])
    shuffle(qa_data['validation'])
    ans_vocab_rev = qa_data['index_to_ans']
    ques_vocab_rev = qa_data['index_to_qw']

    print "Reading conv features"
    conv_features, image_id_list = data_loader.load_conv_features(
        'train', args.cnn_model, args.feature_layer)
    image_id_map = {image_id_list[i]: i for i in xrange(len(image_id_list))}

    conv_features_val, image_id_list_val = data_loader.load_conv_features(
        'val', args.cnn_model, args.feature_layer)
    image_id_map_val = {
        image_id_list_val[i]: i for i in xrange(len(image_id_list_val))
    }

    # Reopen the training features with a fresh handle (no image list) to
    # avoid h5py slowing down over long runs.
    conv_features = data_loader.load_conv_features(
        'train', args.cnn_model, args.feature_layer, load_image_list=False)

    model_options = {
        'question_vocab_size': len(qa_data['index_to_qw']),
        'residual_channels': args.residual_channels,
        'ans_vocab_size': len(qa_data['index_to_ans']),
        'filter_width': 3,
        'img_dim': 14,
        'img_channels': 2048,
        'dilations': [1, 2, 4, 8,
                      1, 2, 4, 8],
        'text_model': args.text_model,
        'dropout_keep_prob': 0.6,
        'max_question_length': qa_data['max_question_length'],
        'num_answers': 10
    }

    print "MODEL OPTIONS"
    print model_options

    model = VQA_model_attention.VQA_model(model_options)
    model.build_model()
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(model.loss)
    model.build_generator(reuse=True)

    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()
    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)

    step = 0
    training_log = []
    for epoch in xrange(args.epochs):
        batch_no = 0
        while (batch_no * args.batch_size) < len(qa_data['training']):
            start = time.time()
            question, answer, image_features, image_ids, _ = get_batch(
                batch_no, args.batch_size, qa_data['training'], conv_features,
                image_id_map, 'train', model_options)
            _, loss_value = sess.run(
                [train_op, model.loss],
                feed_dict={
                    model.question: question,
                    model.image_features: image_features,
                    model.answers: answer
                })
            end = time.time()
            print "Time for batch of photos", end - start
            print "Time for one epoch (mins)", len(
                qa_data['training']) / args.batch_size * (end - start) / 60.0
            batch_no += 1
            step += 1
            print "LOSS", loss_value, batch_no, len(
                qa_data['training']) / args.batch_size, step, epoch
            print "****"

            if step % args.sample_every == 0:
                try:
                    shutil.rmtree('Data/samples')
                except OSError:
                    pass
                os.makedirs('Data/samples')
                pred_answer, prob1, prob2 = sess.run(
                    [model.g_predictions, model.g_prob1, model.g_prob2],
                    feed_dict={
                        model.g_question: question,
                        model.g_image_features: image_features
                    })
                # Just a sample: compare the first ground-truth answer with
                # the prediction for each example in the current batch.
                pred_ans_text = utils.answer_indices_to_text(
                    pred_answer, ans_vocab_rev)
                actual_ans_text = utils.answer_indices_to_text(
                    answer[:, 0], ans_vocab_rev)
                sample_data = []
                print "Actual vs Prediction"
                for sample_i in range(len(pred_ans_text)):
                    print actual_ans_text[sample_i], pred_ans_text[sample_i]
                    question_text = utils.question_indices_to_text(
                        question[sample_i], ques_vocab_rev)
                    image_array = utils.image_array_from_image_id(
                        image_ids[sample_i], 'train')
                    blend1 = utils.get_blend_map(image_array,
                                                 prob1[sample_i],
                                                 overlap=True)
                    blend2 = utils.get_blend_map(image_array,
                                                 prob2[sample_i],
                                                 overlap=True)
                    sample_data.append({
                        'question': question_text,
                        'actual_answer': actual_ans_text[sample_i],
                        'predicted_answer': pred_ans_text[sample_i],
                        'image_id': image_ids[sample_i],
                        'batch_index': sample_i
                    })
                    misc.imsave(
                        'Data/samples/{}_actual_image.jpg'.format(sample_i),
                        image_array)
                    misc.imsave('Data/samples/{}_blend1.jpg'.format(sample_i),
                                blend1)
                    misc.imsave('Data/samples/{}_blend2.jpg'.format(sample_i),
                                blend2)

                f = open('Data/samples/sample.json', 'w')
                f.write(json.dumps(sample_data))
                f.close()
                shutil.make_archive('Data/samples', 'zip', 'Data/samples')
                gc.collect()

            if step % args.evaluate_every == 0:
                accuracy = evaluate_model(model, qa_data, args, model_options,
                                          sess, conv_features_val,
                                          image_id_map_val)
                print "ACCURACY>> ", accuracy, step, epoch
                training_log.append({
                    'step': step,
                    'epoch': epoch,
                    'accuracy': accuracy,
                })
                f = open(args.training_log_file, 'w')
                f.write(json.dumps(training_log))
                f.close()

                model_dir = "Data/Models{}".format(args.version)
                if not os.path.exists(model_dir):
                    os.makedirs(model_dir)
                save_path = saver.save(
                    sess, "{}/model{}.ckpt".format(model_dir, epoch))
                gc.collect()
                # Reload the conv features to avoid h5py slowing down.
                conv_features = data_loader.load_conv_features(
                    'train', args.cnn_model, args.feature_layer,
                    load_image_list=False)

            if step >= args.max_steps:
                break
        if step >= args.max_steps:
            break
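# `utils.get_blend_map` is defined elsewhere in the repo. The sketch below
# is an assumption about what it does, given the 14x14 attention grid used
# by the model: upsample the attention probabilities to the image size and
# alpha-blend them over the image so attended regions stand out. The name
# `get_blend_map_sketch` and the `alpha` parameter are hypothetical.
def get_blend_map_sketch(image_array, att_probs, overlap=True, alpha=0.6):
    att_map = np.reshape(att_probs, (14, 14))
    att_map = att_map / (np.max(att_map) + 1e-8)   # normalise to [0, 1]
    att_map = (att_map * 255.0).astype('uint8')
    # Upsample to image resolution (scipy.misc.imresize, consistent with
    # the misc.imsave calls used above).
    att_map = misc.imresize(
        att_map, (image_array.shape[0], image_array.shape[1]),
        interp='bicubic') / 255.0
    heat = np.stack([att_map] * 3, axis=-1)        # grey-scale heat map
    if overlap:
        return alpha * heat * 255.0 + (1 - alpha) * image_array
    return heat * 255.0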