def update_output(n_clicks, value):
    vocab_data = data_loader.get_question_answer_vocab("2")
    qvocab = vocab_data['question_vocab']
    q_map = {vocab_data['question_vocab'][qw]: qw for qw in vocab_data['question_vocab']}
    print('filename::', filen)
    fc7_features = utils.extract_fc7_features(filen, 'Data/vgg16.tfmodel')

    model_options = {
        'num_lstm_layers': 2,
        'rnn_size': 512,
        'embedding_size': 512,
        'word_emb_dropout': 0.5,
        'image_dropout': 0.5,
        'fc7_feature_length': 4096,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']), dtype='int32')
    print('qst', value)
    question_words = re.findall(word_regex, value)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    ans_map = {vocab_data['answer_vocab'][ans]: ans for ans in vocab_data['answer_vocab']}
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, 'Data/Models/modelnew99.ckpt')

    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: fc7_features,
            input_tensors['sentence']: question_ids,
        })
    print("answerprediction", pred[0])
    # model.summary()
    # plot_model(model, to_file='predictmodel.png')
    print("Ans:", ans_map[pred[0]])

    answer_probab_tuples = [(-answer_probab[0][idx], idx) for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    for i in range(1):
        print(ans_map[answer_probab_tuples[0][1]])
        # ans = (ans_map[answer_probab_tuples[i][1]])

    lang = "en"
    text = "This is a " + ans_map[answer_probab_tuples[0][1]]
    speech = Speech(text, lang)
    sox_effects = ("speed", "0.8")
    speech.play(sox_effects)
    return ans_map[answer_probab_tuples[0][1]]
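# update_output reads like a Dash callback: (n_clicks, value) inputs, a global
# `filen` set by an upload handler, and a string return rendered in the page.
# A minimal wiring sketch under those assumptions follows; the layout, the
# component ids, and the google_speech import are hypothetical, not this
# repository's actual code.
import dash
from dash import dcc, html, Input, Output, State
from google_speech import Speech  # assumed source of the Speech class used above

app = dash.Dash(__name__)
app.layout = html.Div([
    dcc.Input(id='question-box', type='text', placeholder='Ask about the image'),
    html.Button('Ask', id='ask-button'),
    html.Div(id='answer-out'),
])

# Register the callback defined above: fires on button clicks, reads the
# question text, and renders the returned answer string.
app.callback(
    Output('answer-out', 'children'),
    Input('ask-button', 'n_clicks'),
    State('question-box', 'value'),
)(update_output)

if __name__ == '__main__':
    app.run_server(debug=True)  # Dash 2.x; newer versions use app.run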
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path', type=str, default='Data/cat.jpeg', help='Image Path')
    parser.add_argument('--model_path', type=str, default='Data/Models/model2.ckpt', help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--question', type=str, default='Which animal is this?', help='Question')
    args = parser.parse_args()

    print("Image:", args.image_path)
    print("Question:", args.question)

    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {vocab_data['question_vocab'][qw]: qw for qw in vocab_data['question_vocab']}
    fc7_features = utils.extract_fc7_features(args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']), dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    ans_map = {vocab_data['answer_vocab'][ans]: ans for ans in vocab_data['answer_vocab']}
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: fc7_features,
            input_tensors['sentence']: question_ids,
        })

    print("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx) for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
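# Every predictor in this section repeats the same encoding step: token ids
# are written starting at `base = max_len - len(words)`, i.e. the row is
# left-padded with zeros so the question ends exactly on the last LSTM step.
# A self-contained sketch of that logic; the truncation to max_len is an
# added safeguard, mirroring the 20-word cap used in the batch script below.
import re
import numpy as np

def encode_question(question, question_vocab, max_len):
    """Map a question string to a left-padded row of vocabulary ids."""
    words = re.findall(r'\w+', question)[:max_len]
    ids = np.zeros((1, max_len), dtype='int32')
    base = max_len - len(words)
    for i, w in enumerate(words):
        ids[0][base + i] = question_vocab.get(w, question_vocab['UNK'])
    return ids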
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=200, help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=200, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--model_path', type=str, default='Data/Models/model21.ckpt', help='Model Path')
    args = parser.parse_args()

    print("Reading QA DATA")
    qa_data = data_loader.load_questions_answers(args)
    print("Reading fc7 features")
    fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'val')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i
    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()

    avg_accuracy = 0.0
    total = 0
    saver.restore(sess, args.model_path)

    batch_no = 0
    while (batch_no * args.batch_size) < len(qa_data['validation']):
        sentence, answer, fc7 = get_batch(batch_no, args.batch_size, fc7_features, image_id_map, qa_data, 'val')
        pred, ans_prob = sess.run(
            [t_prediction, t_ans_probab],
            feed_dict={
                input_tensors['fc7']: fc7,
                input_tensors['sentence']: sentence,
            })
        batch_no += 1
        if args.debug:
            for idx, p in enumerate(pred):
                print(ans_map[p], ans_map[np.argmax(answer[idx])])
        correct_predictions = np.equal(pred, np.argmax(answer, 1))
        correct_predictions = correct_predictions.astype('float32')
        accuracy = correct_predictions.mean()
        print("Acc", accuracy)
        avg_accuracy += accuracy
        total += 1
    print("Acc", avg_accuracy / total)
def main():
    parser = argparse.ArgumentParser()
    # argparse is Python's built-in module for parsing command-line options and
    # arguments: declare the parameters the program needs, and argparse parses
    # them from sys.argv and auto-generates help and usage messages.
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=200, help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=200, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--resume_model', type=str, default=None, help='Trained Model Path')
    parser.add_argument('--version', type=int, default=2, help='VQA data version')
    args = parser.parse_args()  # Parse the declared arguments from the command line.

    print("Reading QA DATA")
    # qa_data's structure is whatever load_questions_answers returns: the
    # question/answer vocabularies plus the encoded training and validation examples.
    qa_data = data_loader.load_questions_answers(args.version, args.data_dir)

    print("Reading fc7 features")
    # The loader below reads fc7 features and image ids that were extracted and
    # saved to disk beforehand; no training has happened at this point.
    fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'train')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)

    # Map each image id to its row index in the feature matrix (a dict).
    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i

    # ans_map inverts answer_vocab (index -> answer string) so that predicted
    # class indices can be decoded back into answer text.
    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    # Hyperparameters used to build the TensorFlow graph.
    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    # Build and initialize the TensorFlow graph.
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()  # loss, accuracy and prediction ops
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(t_loss)  # Adam adapts per-parameter step sizes
    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()

    # Checkpoint handling: if resuming, restore variables from the previous run.
    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)

    for i in range(args.epochs):
        batch_no = 0
        # batch_no * args.batch_size is the number of training examples consumed
        # so far; keep fetching batches until the training set is exhausted.
        while (batch_no * args.batch_size) < len(qa_data['training']):
            sentence, answer, fc7 = get_training_batch(batch_no, args.batch_size, fc7_features, image_id_map, qa_data, 'train')
            # One optimization step: feed the batch and update the weights.
            _, loss_value, accuracy, pred = sess.run(
                [train_op, t_loss, t_accuracy, t_p],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: sentence,
                    input_tensors['answer']: answer
                })
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print(ans_map[p], ans_map[np.argmax(answer[idx])])
                print("Loss", loss_value, batch_no, i)
                print("Accuracy", accuracy)
                print("---------------")
            else:
                print("Loss", loss_value, batch_no, i)
                print("Training Accuracy", accuracy)
        save_path = saver.save(sess, "Data/Models/model{}.ckpt".format(i))
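# get_training_batch (and the get_batch used by the evaluation scripts) is
# called throughout this section but not defined in it. Based on its call
# signature and the tensors fed to the model, a plausible sketch follows; the
# 'question', 'answer', and 'image_id' field names inside qa_data are assumptions.
import numpy as np

def get_training_batch(batch_no, batch_size, fc7_features, image_id_map, qa_data, split):
    """Slice one batch: padded question ids, one-hot answers, fc7 features."""
    qa = qa_data['training'] if split == 'train' else qa_data['validation']
    si = batch_no * batch_size
    ei = min(si + batch_size, len(qa))
    n = ei - si
    max_len = qa_data['max_question_length']
    sentence = np.zeros((n, max_len), dtype='int32')
    answer = np.zeros((n, len(qa_data['answer_vocab'])), dtype='float32')
    fc7 = np.zeros((n, fc7_features.shape[1]), dtype='float32')
    for i in range(si, ei):
        sentence[i - si, :] = qa[i]['question'][0:max_len]
        answer[i - si, qa[i]['answer']] = 1.0
        fc7[i - si, :] = fc7_features[image_id_map[qa[i]['image_id']]]
    return sentence, answer, fc7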
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=200, help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=200, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--resume_model', type=str, default=None, help='Trained Model Path')
    args = parser.parse_args()

    print("Reading QA DATA")
    qa_data = data_loader.load_questions_answers(args)
    print("Reading fc7 features")
    fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'train')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i
    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(t_loss)
    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()

    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)

    for i in range(args.epochs):
        batch_no = 0
        while (batch_no * args.batch_size) < len(qa_data['training']):
            sentence, answer, fc7 = get_training_batch(batch_no, args.batch_size, fc7_features, image_id_map, qa_data, 'train')
            _, loss_value, accuracy, pred = sess.run(
                [train_op, t_loss, t_accuracy, t_p],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: sentence,
                    input_tensors['answer']: answer
                })
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print(ans_map[p], ans_map[np.argmax(answer[idx])])
                print("Loss", loss_value, batch_no, i)
                print("Accuracy", accuracy)
                print("---------------")
            else:
                print("Loss", loss_value, batch_no, i)
                print("Training Accuracy", accuracy)
        save_path = saver.save(sess, "Data/Models/model{}.ckpt".format(i))
def main():
    config = json.load(open('config.json'))
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--qa_dir', type=str, default=config['qa_dir'], help='QA Data directory')
    parser.add_argument('--data_dir', type=str, default=config['data_dir'], help='Common Data directory')
    parser.add_argument('--batch_size', type=int, default=200, help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=2, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--resume_model', type=str, default=None, help='Trained Model Path')
    parser.add_argument('--version', type=int, default=1, help='VQA data version')
    args = parser.parse_args()

    print("Reading QA DATA")
    qa_data = data_loader.load_questions_answers(args.qa_dir)
    print("Reading fc7 features")
    fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'train')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i
    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(t_loss)
    sess = tf.InteractiveSession()
    # tf.initialize_all_variables().run()
    # tf.initialize_all_variables() is deprecated since 2017-03-02
    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)

    acc_file = open('train_acc.txt', 'w', encoding='utf-8')
    acc_file.write('epoch avg_acc\n')
    for i in range(args.epochs):
        batch_no = 0
        epochs_acc_sum = 0
        while (batch_no * args.batch_size) < len(qa_data['training']):
            sentence, answer, fc7 = get_training_batch(batch_no, args.batch_size, fc7_features, image_id_map, qa_data, 'train')
            _, loss_value, accuracy, pred = sess.run(
                [train_op, t_loss, t_accuracy, t_p],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: sentence,
                    input_tensors['answer']: answer
                })
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print(ans_map[p], ans_map[np.argmax(answer[idx])])
                print("Loss", loss_value, batch_no, i)
                print("Accuracy", accuracy)
                print("---------------")
                epochs_acc_sum += accuracy
            else:
                print("Loss", loss_value, batch_no, i)
                print("Training Accuracy", accuracy)
                epochs_acc_sum += accuracy
        acc_file.write(str(i) + ' ' + str(epochs_acc_sum / batch_no) + '\n')
        print()
        save_path = saver.save(sess, "Data/Models/model{}.ckpt".format(i))
    acc_file.close()
def serve(img_features, question):
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, default='Data/model2.ckpt', help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--question', type=str, default=question, help='Question')
    args = parser.parse_args()

    tf.reset_default_graph()
    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {vocab_data['question_vocab'][qw]: qw for qw in vocab_data['question_vocab']}

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']), dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    ans_map = {vocab_data['answer_vocab'][ans]: ans for ans in vocab_data['answer_vocab']}
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: img_features,
            input_tensors['sentence']: question_ids,
        })
    sess.close()

    ans_list = []
    ans_list.append(ans_map[pred[0]])
    return ans_list
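# serve() expects pre-extracted fc7 features plus a question string and
# returns a one-element answer list. A hypothetical call, reusing the feature
# extractor seen in the other scripts in this section:
#
#   img_features = utils.extract_fc7_features('Data/cat.jpeg', 'Data/vgg16.tfmodel')
#   answers = serve(img_features, 'Which animal is this?')
#   print(answers[0])
#
# Note that serve() also runs argparse, so stray command-line flags in the
# host process would be parsed here too; the tf.reset_default_graph() call is
# what lets it be invoked repeatedly within one process.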
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--cnn7_feature_length', type=int, default=512, help='cnn7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int)
    parser.add_argument('--word_emb_dropout', type=float)
    parser.add_argument('--image_dropout', type=float)
    parser.add_argument('--data_dir', type=str)
    parser.add_argument('--batch_size', type=int, default=100, help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=400, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--resume_model', type=str, default=None, help='Trained Model Path')
    parser.add_argument('--version', type=int, default=2, help='VQA data version')
    args = parser.parse_args()

    print("Reading QA DATA")
    qa_data = data_loader.load_questions_answers(args.version, args.data_dir)
    print("Reading cnn7 features")
    cnn7_features, image_id_list = data_loader.load_cnn7_features(args.data_dir, 'train')
    print("cnn7 features", cnn7_features.shape)
    print("image_id_list", image_id_list.shape)

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i
    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'cnn7_feature_length': args.cnn7_feature_length,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(t_loss)
    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()

    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)
        # Recover the epoch number from the checkpoint filename, e.g. 'model07.ckpt' -> 7.
        last_epoch = int(args.resume_model[-7:-5])
        print(f'I resume Epoch {last_epoch}')
    else:
        last_epoch = -1

    for i in range(args.epochs):
        batch_no = 0
        batch_acc_record = []
        while batch_no < 220:  # fixed number of batches per epoch
            # time.clock() was removed in Python 3.8; perf_counter() is the
            # equivalent timer here.
            start = time.perf_counter()
            sentence, answer, cnn7 = get_training_batch(batch_no, args.batch_size, cnn7_features, image_id_map, qa_data, 'train')
            _, loss_value, accuracy, pred = sess.run(
                [train_op, t_loss, t_accuracy, t_p],
                feed_dict={
                    input_tensors['cnn7']: cnn7,
                    input_tensors['sentence']: sentence,
                    input_tensors['answer']: answer
                })
            batch_acc_record.append(accuracy)
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print(ans_map[p], ans_map[np.argmax(answer[idx])])
                print("Loss", loss_value, batch_no, i + 1 + last_epoch)
                print("Accuracy", accuracy)
                print("---------------")
            else:
                print("Loss", loss_value, batch_no, i + 1 + last_epoch)
                print("Training Accuracy", accuracy)
            end = time.perf_counter()
            print("Time for one batch", end - start)
            print("Hours for one epoch", (291 * 1.0) * (end - start) / 60.0 / 60.0)
        save_path = saver.save(sess, "Data/Models/model{}.ckpt".format(i + 1 + last_epoch))
        if np.mean(batch_acc_record) >= 0.9:  # early stop once the mean epoch accuracy reaches 0.9
            break
    sess.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, default=MODEL_PATH, help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--feature_length', type=int, default=4096, help='feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=1.0, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=1.0, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default=DATA_PATH, help='Data directory')
    parser.add_argument('--image_features', type=str, default='vgg16', help='Image features')
    args = parser.parse_args()

    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'feature_length': args.feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }
    ans_map = {vocab_data['answer_vocab'][ans]: ans for ans in vocab_data['answer_vocab']}
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')

    print("Reading QA DATA")
    test_data = data_loader.load_test_questions()
    print(len(test_data['question_vocab']))
    features, image_id_list = data_loader.load_features(args.data_dir, 'test', args.image_features)
    print("Features", features.shape)
    print("Image_id_list", image_id_list.shape)

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    stop_vocab = ['a', 'an', 'the']
    for i, now_image in enumerate(test_data['testing']):
        now_image_path = 'Data/test2015/COCO_test2015_%.12d.jpg' % (now_image['image_id'])
        img = Image.open(now_image_path)
        img.show()

        question_ids = np.zeros((1, vocab_data['max_question_length']), dtype='int32')
        print('Question:', now_image['question'])
        question_words = re.findall(word_regex, now_image['question'])
        # Drop stop words before encoding.
        question_words = list(filter(lambda x: data_loader.vocab_handle(x) not in stop_vocab, question_words))
        base = vocab_data['max_question_length'] - len(question_words)
        for j in range(0, len(question_words)):
            now_question_words = data_loader.vocab_handle(question_words[j])
            if now_question_words in question_vocab:
                question_ids[0][base + j] = question_vocab[now_question_words]
            else:
                question_ids[0][base + j] = question_vocab['UNK']

        now_index = image_id_map[test_data['testing'][i]['image_id']]
        pred, answer_probab = sess.run(
            [t_prediction, t_ans_probab],
            feed_dict={
                input_tensors['features']: features[now_index].reshape(1, args.feature_length),
                input_tensors['sentence']: question_ids,
            })

        print("Ans:", ans_map[pred[0]])
        answer_probab_tuples = [(-answer_probab[0][idx], idx) for idx in range(len(answer_probab[0]))]
        answer_probab_tuples.sort()
        print("Top Answers:")
        # Fresh loop variable so the outer enumerate index is not shadowed.
        for k in range(5):
            print(ans_map[answer_probab_tuples[k][1]])
        input()  # wait for a keypress before showing the next image
    sess.close()
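# The negate-and-sort tuple idiom used for ranking answers recurs throughout
# this section. An equivalent and arguably clearer formulation based on
# np.argsort, shown as a small standalone helper:
import numpy as np

def top_k_answers(probab_row, ans_map, k=5):
    """Decode the k most probable answers from one row of softmax outputs."""
    return [ans_map[idx] for idx in np.argsort(-probab_row)[:k]]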
def main(): print("Reading QA DATA") qa_data = utils.load_questions_answers(FLAGS.data_dir) print("Reading image features") img_features, image_id_list = utils.load_image_features(FLAGS.data_dir, "train") print("img features", img_features.shape) print("image_id_list", image_id_list.shape) image_id_map = {} for i in range(len(image_id_list)): image_id_map[ image_id_list[i] ] = i ans_map = { qa_data['answer_vocab'][ans] : ans for ans in qa_data['answer_vocab']} model_options = { "num_lstm_layers" : FLAGS.num_lstm_layers, "rnn_size" : FLAGS.rnn_size, "embedding_size" : FLAGS.que_feat_len, "word_emb_dropout" : FLAGS.word_dropout, "image_dropout" : FLAGS.img_dropout, "img_feature_length" : FLAGS.img_feat_len, "lstm_steps" : qa_data["max_question_length"] + 1, "q_vocab_size" : len(qa_data["question_vocab"]), "ans_vocab_size" : len(qa_data["answer_vocab"]) } model = vis_lstm_model.Vis_lstm_model(model_options) input_tensors, t_loss, t_accuracy, t_p = model.build_model() train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(t_loss) sess = tf.InteractiveSession() tf.initialize_all_variables().run() saver = tf.train.Saver() if FLAGS.checkpoint_path: saver.restore(sess, tf.train.latest_checkpoint(FLAGS.checkpoint_path)) for i in range(FLAGS.epochs): batch_no = 0 while (batch_no*FLAGS.batch_size) < len(qa_data["training"]): sentence, answer, img = get_training_batch(batch_no, FLAGS.batch_size, img_features, image_id_map, qa_data, "train") _, loss_value, accuracy, pred = sess.run([train_op, t_loss, t_accuracy, t_p], feed_dict={ input_tensors["img"]:img, input_tensors["sentence"]:sentence, input_tensors["answer"]:answer } ) batch_no += 1 if FLAGS.debug: for idx, p in enumerate(pred): print(ans_map[p], ans_map[ np.argmax(answer[idx])]) print("Loss", loss_value, batch_no, i) print("Accuracy", accuracy) print("---------------") else: print("Loss", loss_value, batch_no, i) print("Training Accuracy", accuracy) save_path = saver.save(sess, "./data/pretrain/model/model{}.ckpt".format(i))
def main(image_path="test.jpg", question="what is in the image?"): slim = tf.contrib.slim resnet = nets.resnet_v2 """ tf.app.flags.DEFINE_string("image_path", image_path, "directory of image") tf.app.flags.DEFINE_string("question", question, "question") tf.app.flags.DEFINE_string("img_checkpoint_path", "./data/pretrain/resnet152/resnet_v2_152.ckpt", "directory of checkpoint files for image feature extraction") tf.app.flags.DEFINE_string("checkpoint_path", "./data/pretrain/model", "directory of checkpoint files for overall model") tf.app.flags.DEFINE_integer("num_lstm_layers", 2, "number of lstm layers") tf.app.flags.DEFINE_integer( "img_feat_len", 1001, "length of image feature vector") tf.app.flags.DEFINE_integer("rnn_size", 300, "size of rnn") tf.app.flags.DEFINE_integer( "que_feat_len", 300, "length of question feature vector") tf.app.flags.DEFINE_float("word_dropout", 0.5, "dropout rate of word nodes") tf.app.flags.DEFINE_float("img_dropout", 0.5, "dropout rate of image nodes") tf.app.flags.DEFINE_string("data_dir", "./data", "directory of data") FLAGS = tf.app.flags.FLAGS print ("Image:", FLAGS.image_path) print ("Question:", FLAGS.question) """ #FLAGS = object() flags_image_path = image_path flags_question = question flags_img_checkpoint_path = "./data/pretrain/resnet152/resnet_v2_152.ckpt" flags_checkpoint_path = "./data/pretrain/model" flags_num_lstm_layers = 2 flags_img_feat_len = 1001 flags_rnn_size = 300 flags_que_feat_len = 300 flags_word_dropout = 0.5 flags_img_dropout = 0.5 flags_data_dir = "./data" vocab_data = utils.get_question_answer_vocab(flags_data_dir) qvocab = vocab_data['question_vocab'] q_map = {vocab_data['question_vocab'][qw] : qw for qw in vocab_data['question_vocab']} with tf.Graph().as_default(): images = tf.placeholder("float32", [None, 224, 224, 3]) with slim.arg_scope(resnet.resnet_arg_scope()): net, _ = resnet.resnet_v2_152(images, 1001, is_training=False) restorer = tf.train.Saver() with tf.Session() as sess:#config=tf.ConfigProto(log_device_placement=True)) as sess: start = time.clock() image_array = utils.load_image_array(flags_image_path) image_feed = np.ndarray((1, 224, 224, 3)) image_feed[0:, :, :] = image_array # checkpoint = tf.train.latest_checkpoint(flags_img_checkpoint_path) checkpoint = flags_img_checkpoint_path restorer.restore(sess, checkpoint) print("Image Model loaded") feed_dict = {images: image_feed} img_feature = sess.run(net, feed_dict=feed_dict) img_feature = np.squeeze(img_feature) end = time.clock() print("Time elapsed", end - start) print("Image processed") model_options = { 'num_lstm_layers': flags_num_lstm_layers, 'rnn_size': flags_rnn_size, 'embedding_size': flags_que_feat_len, 'word_emb_dropout': flags_word_dropout, 'image_dropout': flags_img_dropout, 'img_feature_length': flags_img_feat_len, 'lstm_steps': vocab_data['max_question_length'] + 1, 'q_vocab_size': len(vocab_data['question_vocab']), 'ans_vocab_size': len(vocab_data['answer_vocab']) } question_vocab = vocab_data['question_vocab'] word_regex = re.compile(r'\w+') question_ids = np.zeros( (1, vocab_data['max_question_length']), dtype='int32') question_words = re.findall(word_regex, flags_question) base = vocab_data['max_question_length'] - len(question_words) for i in range(0, len(question_words)): if question_words[i] in question_vocab: question_ids[0][base + i] = question_vocab[question_words[i]] else: question_ids[0][base + i] = question_vocab['UNK'] ans_map = {vocab_data['answer_vocab'][ans] : ans for ans in vocab_data['answer_vocab']} with tf.Graph().as_default(): 
model = vis_lstm_model.Vis_lstm_model(model_options) input_tensors, t_prediction, t_ans_probab = model.build_generator() restorer = tf.train.Saver() with tf.Session() as sess:#config=tf.ConfigProto(log_device_placement=True)) as sess: checkpoint = tf.train.latest_checkpoint(flags_checkpoint_path) restorer.restore(sess, checkpoint) pred, answer_probab = sess.run([t_prediction, t_ans_probab], feed_dict={ input_tensors['img']: np.reshape(img_feature, [1,1001]), input_tensors['sentence']: question_ids, }) print("Ans:", ans_map[pred[0]]) answer_probab_tuples = [(-answer_probab[0][idx], idx) for idx in range(len(answer_probab[0]))] answer_probab_tuples.sort() print("Top Answers") for i in range(5): print(ans_map[answer_probab_tuples[i][1]]) return (ans_map, answer_probab_tuples)
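# Hypothetical invocation of the two-graph pipeline above; the image path and
# question are just the defaults from the function signature.
if __name__ == '__main__':
    ans_map, ranked = main(image_path="test.jpg", question="what is in the image?")
    # ranked holds (negative probability, answer index) pairs, best first.
    print("Best answer:", ans_map[ranked[0][1]])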
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path', type=str,
                        default='Data/train_2014/COCO_train2014_000000581922.jpg', help='Image Path')
    parser.add_argument('--model_path', type=str,
                        default='Data/train2014/Tri Training 3/Models/model49.ckpt', help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data/train2014/Tri Training 3/', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=200, help='Batch Size')
    parser.add_argument('--question', type=str, default='What is this product?', help='Question')
    args = parser.parse_args()
    # vizwiz_file_path = 'Data/Test'

    solution = dict()
    solution["model"] = "model_3"
    solution["predictions"] = []

    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {vocab_data['question_vocab'][qw]: qw for qw in vocab_data['question_vocab']}

    vizwiz_questions_path = 'VizWiz_to_VQA_Questions.json'
    with open(vizwiz_questions_path, 'r') as input_file:
        vizwiz_questions = json.loads(input_file.read())
    '''with open('questions_temp.txt','w') as temp_file:
        for i in range(20000):
            #print(vizwiz_questions['questions'][i]['question'])
            temp_file.write(vizwiz_questions['questions'][i]['question'])
            temp_file.write('\n')'''

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    # question_ids = np.zeros((20000, vocab_data['max_question_length']), dtype='int32')
    # fc7_features = np.zeros((2, args.fc7_feature_length))

    print("Reading fc7 features")
    fc7_features, image_id_list = data_loader.load_fc7_features('Data/', 'train')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)
    # print(0/0)

    '''i = 0
    for file in os.listdir(vizwiz_file_path):
        if file.endswith(".jpg"):
            args.image_path = join(vizwiz_file_path, file)
            #args.question = vizwiz_questions['questions'][i]['question']
            print("Image:", args.image_path)
            print("Question:", args.question)
            fc7_features[i] = utils.extract_fc7_features(args.image_path, 'Data/vgg16-20160129.tfmodel')
            i += 1'''

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    # question_words = re.findall(word_regex, args.question)
    # base = vocab_data['max_question_length'] - len(question_words)
    '''for no_questions in range(question_ids.shape[0]):
        for i in range(0, len(question_words)):
            if question_words[i] in question_vocab:
                question_ids[no_questions][base + i] = question_vocab[question_words[i]]
            else:
                question_ids[no_questions][base + i] = question_vocab['UNK']'''

    ans_map = {vocab_data['answer_vocab'][ans]: ans for ans in vocab_data['answer_vocab']}
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    batch_no = 0
    with open('result3.txt', 'w') as output_file:
        while (batch_no * args.batch_size) < 20000:
            question_ids = np.zeros((args.batch_size, vocab_data['max_question_length']), dtype='int32')
            # vizwiz_questions['questions'][i]['question']
            for no_questions in range(question_ids.shape[0]):
                question_formatted = vizwiz_questions['questions'][batch_no * args.batch_size + no_questions]['question']
                # Cap questions at 20 words before encoding.
                question_list = question_formatted.split()
                question_list = question_list[0:20]
                question_formatted = ' '.join(question_list)
                question_words = re.findall(word_regex, question_formatted)
                base = vocab_data['max_question_length'] - len(question_words)
                for i in range(0, len(question_words)):
                    if question_words[i] in question_vocab:
                        question_ids[no_questions][base + i] = question_vocab[question_words[i]]
                    else:
                        question_ids[no_questions][base + i] = question_vocab['UNK']

            fc7 = get_batch(batch_no, args.batch_size, fc7_features)
            pred, ans_prob = sess.run(
                [t_prediction, t_ans_probab],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: question_ids,
                })

            for i in range(len(pred)):
                current_prediction = dict()
                current_prediction["image_id"] = "VizWiz_train_%.12d.jpg" % (batch_no * args.batch_size + i)
                current_prediction["question"] = vizwiz_questions['questions'][batch_no * args.batch_size + i]['question']
                # output_file.write("Ques:" + vizwiz_questions['questions'][batch_no*args.batch_size + i]['question'])
                answer_list = []
                answer_probab_tuples = [(-ans_prob[i][idx], idx) for idx in range(len(ans_prob[0]))]
                answer_probab_tuples.sort()
                for j in range(5):
                    answer_list.append(ans_map[answer_probab_tuples[j][1]])
                # output_file.write("Ans:" + ans_map[pred[i]])
                current_prediction["predicted_answer"] = answer_list
                # output_file.write("Ans:" + str(answer_list))
                # output_file.write('\n')
                solution["predictions"].append(current_prediction)
                # print("Ans:", ans_map[pred[i]])
                # print('\n')
            batch_no += 1
        output_file.write(json.dumps(solution))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=200, help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=100, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--resume_model', type=str, default=None, help='Trained Model Path')
    parser.add_argument('--version', type=int, default=2, help='VQA data version')
    args = parser.parse_args()

    print("Creating QuestionAnswer data")
    prepare_training_data('trainquestions.json', 'trainannotations.json', 'valquestions.json', 'valannotations.json')
    print("Prepared given data")
    print("Reading QuestionAnswer data")
    qa_data = load_questions_answers('newqadata.pkl', 'Data')
    print(qa_data['answer_vocab'])

    print("Creating Image features")
    ################################################
    split = 'train'
    vgg_file = open('Data/vgg16.tfmodel', 'rb')
    vgg16raw = vgg_file.read()
    vgg_file.close()
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(vgg16raw)
    images = tf.placeholder("float", [None, 224, 224, 3])
    tf.import_graph_def(graph_def, input_map={"images": images})
    graph = tf.get_default_graph()
    for opn in graph.get_operations():
        print("Name", opn.name, opn.values())

    all_data = load_questions_answers()
    if split == "train":
        qa_data = all_data['training']
    else:
        qa_data = all_data['validation']
    image_ids = {}
    for qa in qa_data:
        image_ids[qa['image_id']] = 1
    image_id_list = [img_id for img_id in image_ids]
    print("Total Images", len(image_id_list))

    sess = tf.Session()
    fc7 = np.ndarray((len(image_id_list), 4096))
    idx = 0
    while idx < len(image_id_list):
        # time.clock() was removed in Python 3.8; perf_counter() replaces it.
        start = time.perf_counter()
        image_batch = np.ndarray((10, 224, 224, 3))
        count = 0
        # NOTE: the buffer above holds 10 images, so iterate over its size; the
        # original looped over args.batch_size (200), which would overflow it.
        for i in range(0, image_batch.shape[0]):
            if idx >= len(image_id_list):
                break
            image_file = join('Data', '%snew/%.1d.jpg' % (split, image_id_list[idx]))
            image_batch[i, :, :, :] = utils.load_image_array(image_file)
            idx += 1
            count += 1
        feed_dict = {images: image_batch[0:count, :, :, :]}
        fc7_tensor = graph.get_tensor_by_name("import/Relu_1:0")
        fc7_batch = sess.run(fc7_tensor, feed_dict=feed_dict)
        fc7[(idx - count):idx, :] = fc7_batch[0:count, :]
        end = time.perf_counter()
        print("Time for batch 10 photos", end - start)
        print("Hours For Whole Dataset", (len(image_id_list) * 1.0) * (end - start) / 60.0 / 60.0 / 10.0)
        print("Images Processed", idx)

    print("Saving fc7 features")
    h5f_fc7 = h5py.File(join('Data', 'fc7new.h5'), 'w')
    h5f_fc7.create_dataset('fc7_features', data=fc7)
    h5f_fc7.close()
    print("Saving image id list")
    h5f_image_id_list = h5py.File(join('Data', 'image_id_listnew.h5'), 'w')
    h5f_image_id_list.create_dataset('image_id_list', data=image_id_list)
    h5f_image_id_list.close()
    print("Done!")
    ################################################

    print("Reading image features")
    fc7_features, image_id_list = load_fc7_features('Data', 'train')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)
    qa_data = load_questions_answers('newqadata.pkl', 'Data')
    print(qa_data['answer_vocab'])

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i
    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    model_options = {
        'num_lstm_layers': 2,
        'rnn_size': 512,
        'embedding_size': 512,
        'word_emb_dropout': 0.5,
        'image_dropout': 0.5,
        'fc7_feature_length': 4096,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()
    train_op = tf.train.AdamOptimizer(0.001).minimize(t_loss)
    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()
    saver = tf.train.Saver()
    # model.summary()
    # plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
    if args.resume_model:
        saver.restore(sess, args.resume_model)

    for i in range(100):
        batch_no = 0
        while (batch_no * 10) < len(qa_data['training']):
            sentence, answer, fc7 = get_training_batch(batch_no, 10, fc7_features, image_id_map, qa_data, 'train')
            _, loss_value, accuracy, pred = sess.run(
                [train_op, t_loss, t_accuracy, t_p],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: sentence,
                    input_tensors['answer']: answer
                })
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print(ans_map[p], ans_map[np.argmax(answer[idx])])
                print("Loss", loss_value, batch_no, i)
                print("Accuracy", accuracy)
                print("---------------")
                # NOTE: scikit-plot's plot_roc_curve expects (y_true, y_probas)
                # arrays; the string argument here looks wrong and will raise.
                skplt.metrics.plot_roc_curve(answer[idx], ans_map[p])
                plt.show()
            else:
                print("Loss", loss_value, batch_no, i)
                print("Training Accuracy", accuracy)
                # skplt.metrics.plot_roc_curve(answer[0], pred[0])
                # plt.show()
        save_path = saver.save(sess, "Data/Models/modelnew{}.ckpt".format(i))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path', type=str,
                        default='Data/train_2014/COCO_train2014_000000581922.jpg', help='Image Path')
    parser.add_argument('--model_path', type=str,
                        default='Data/train2014/Tri Training 1/Models/model11.ckpt', help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data/train2014/Tri Training 1/', help='Data directory')
    parser.add_argument('--question', type=str, default='What is this product?', help='Question')
    args = parser.parse_args()

    vizwiz_file_path = 'Data/Images'
    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)
    qvocab = vocab_data['question_vocab']
    # print(qvocab)
    # print(0/0)
    q_map = {vocab_data['question_vocab'][qw]: qw for qw in vocab_data['question_vocab']}

    vizwiz_questions_path = 'VizWiz_to_VQA_Questions.json'
    with open(vizwiz_questions_path, 'r') as input_file:
        vizwiz_questions = json.loads(input_file.read())

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']), dtype='int32')

    i = 0
    for file in os.listdir(vizwiz_file_path):
        if file.endswith(".jpg"):
            args.image_path = join(vizwiz_file_path, file)
            args.question = vizwiz_questions['questions'][i]['question']
            i += 1
            print("Image:", args.image_path)
            print("Question:", args.question)
            # fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'val')
            fc7_features = utils.extract_fc7_features(
                args.image_path, 'Data/train2014/Tri Training 1/vgg16-20160129.tfmodel')

            model_options = {
                'num_lstm_layers': args.num_lstm_layers,
                'rnn_size': args.rnn_size,
                'embedding_size': args.embedding_size,
                'word_emb_dropout': args.word_emb_dropout,
                'image_dropout': args.image_dropout,
                'fc7_feature_length': args.fc7_feature_length,
                'lstm_steps': vocab_data['max_question_length'] + 1,
                'q_vocab_size': len(vocab_data['question_vocab']),
                'ans_vocab_size': len(vocab_data['answer_vocab'])
            }

            question_words = re.findall(word_regex, args.question)
            base = vocab_data['max_question_length'] - len(question_words)
            # Use j here; the original reused i, clobbering the file counter above.
            for j in range(0, len(question_words)):
                if question_words[j] in question_vocab:
                    question_ids[0][base + j] = question_vocab[question_words[j]]
                else:
                    question_ids[0][base + j] = question_vocab['UNK']

            ans_map = {vocab_data['answer_vocab'][ans]: ans for ans in vocab_data['answer_vocab']}
            model = vis_lstm_model.Vis_lstm_model(model_options)
            input_tensors, t_prediction, t_ans_probab = model.build_generator()
            sess = tf.InteractiveSession()
            saver = tf.train.Saver()
            saver.restore(sess, args.model_path)

            print(question_ids.shape)
            print(fc7_features.shape)
            print(0 / 0)  # deliberate crash left in the original: halts after inspecting shapes

            pred, answer_probab = sess.run(
                [t_prediction, t_ans_probab],
                feed_dict={
                    input_tensors['fc7']: fc7_features,
                    input_tensors['sentence']: question_ids,
                })
            print("Ans:", ans_map[pred[0]])
            answer_probab_tuples = [(-answer_probab[0][idx], idx) for idx in range(len(answer_probab[0]))]
            answer_probab_tuples.sort()
            print("Top Answers")
            for j in range(5):
                # print(ans_map[answer_probab_tuples[j]])
                print(ans_map[answer_probab_tuples[j][1]])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path', type=str,
                        default='Data/val2015/abstract_v002_val2015_000000022100.png', help='Image Path')
    parser.add_argument('--model_path', type=str, default='Data/Models/model19.ckpt', help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--cnn7_feature_length', type=int, default=512, help='cnn7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--question', type=str, default='What is the man sitting on?', help='Question')
    parser.add_argument('--lstm_direc', type=str, default='uni', help='LSTM Direction')
    args = parser.parse_args()

    # Extract the question and answer vocabularies.
    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)
    # Build q_map (apparently unused here) and ans_map (used to decode predictions).
    q_map = {vocab_data['question_vocab'][qw]: qw for qw in vocab_data['question_vocab']}
    ans_map = {vocab_data['answer_vocab'][ans]: ans for ans in vocab_data['answer_vocab']}

    cnn7_features = utils.extract_cnn7_features(args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    # Tokenize the question.
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_words = re.findall(word_regex, args.question)

    # Look up each question word in the vocab and record its id; unknown words
    # map to 'UNK', the highest index in the vocab.
    question_ids = np.zeros((1, vocab_data['max_question_length']), dtype='int32')
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    # Prepare the model options.
    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'cnn7_feature_length': args.cnn7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab']),
    }

    # Restore the trained model.
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator(batch=1)
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    # Predict.
    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['cnn7']: cnn7_features,
            input_tensors['sentence']: question_ids,
        })

    # Show the image path, question, and answers.
    print("Image:", args.image_path)
    print("Question:", args.question)
    print("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx) for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    sess.close()
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
def main():
    tf.app.flags.DEFINE_integer("num_lstm_layers", 2, "number of lstm layers")
    tf.app.flags.DEFINE_integer("img_feat_len", 1001, "length of image feature vector")
    tf.app.flags.DEFINE_integer("rnn_size", 300, "size of rnn")
    tf.app.flags.DEFINE_integer("que_feat_len", 300, "length of question feature vector")
    tf.app.flags.DEFINE_float("word_dropout", 0.5, "dropout rate of word nodes")
    tf.app.flags.DEFINE_float("img_dropout", 0.5, "dropout rate of image nodes")
    tf.app.flags.DEFINE_string("data_dir", "./data", "directory of data")
    tf.app.flags.DEFINE_integer("batch_size", 200, "size of batches")
    tf.app.flags.DEFINE_float("learning_rate", 0.001, "learning rate")
    tf.app.flags.DEFINE_integer("epochs", 200, "number of epochs")
    tf.app.flags.DEFINE_string("checkpoint_path", './data/pretrain/model', "directory of checkpoint files")
    tf.app.flags.DEFINE_bool("debug", True, "debug subroutine")
    FLAGS = tf.app.flags.FLAGS

    print("Reading QA DATA")
    qa_data = utils.load_questions_answers(FLAGS.data_dir)
    vocab_data = utils.get_question_answer_vocab(FLAGS.data_dir)
    print("Reading image features")
    img_features, image_id_list = utils.load_image_features(FLAGS.data_dir, "val")
    print("img features", img_features.shape)
    print("image_id_list", image_id_list.shape)

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i
    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    model_options = {
        'num_lstm_layers': FLAGS.num_lstm_layers,
        'rnn_size': FLAGS.rnn_size,
        'embedding_size': FLAGS.que_feat_len,
        'word_emb_dropout': FLAGS.word_dropout,
        'image_dropout': FLAGS.img_dropout,
        'img_feature_length': FLAGS.img_feat_len,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()

    with tf.Session() as sess:
        restorer = tf.train.Saver()
        avg_accuracy = 0.0
        total = 0
        checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        restorer.restore(sess, checkpoint)

        batch_no = 0
        while (batch_no * FLAGS.batch_size) < len(qa_data['validation']):
            sentence, answer, img = get_batch(batch_no, FLAGS.batch_size, img_features, image_id_map, qa_data)
            pred, ans_prob = sess.run(
                [t_prediction, t_ans_probab],
                feed_dict={
                    input_tensors['img']: img,
                    input_tensors['sentence']: sentence,
                })
            batch_no += 1
            if FLAGS.debug:
                for idx, p in enumerate(pred):
                    print(ans_map[p], ans_map[np.argmax(answer[idx])])
            correct_predictions = np.equal(pred, np.argmax(answer, 1))
            correct_predictions = correct_predictions.astype('float32')
            accuracy = correct_predictions.mean()
            print("Acc", accuracy)
            avg_accuracy += accuracy
            total += 1
        print("Acc", avg_accuracy / total)
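# A caveat on the evaluation loop above (and the similar one earlier in this
# section): averaging per-batch accuracies gives a short final batch the same
# weight as a full one. A sketch of an exact, example-weighted alternative:
def weighted_accuracy(correct_counts, batch_sizes):
    """Exact accuracy from per-batch correct-prediction counts and sizes."""
    return float(sum(correct_counts)) / float(sum(batch_sizes))

# Usage inside the loop: accumulate correct_predictions.sum() and len(pred)
# per batch, then call weighted_accuracy on the two lists at the end.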