def visualizeNetwork(sess, net, C):
    # Get handle for vgg model
    vgg, images = data_loader.getVGGhandle()
    # Parse all the VQA question information
    qa_data = data_loader.load_questions_answers(C.datapath)
    data_validation = qa_data['validation']
    data_training = qa_data['training']
    question_vocab = qa_data['question_vocab']
    answer_vocab = qa_data['answer_vocab']
    reverse_answer_vocab = data_loader.get_reverse_vocab(answer_vocab)
    reverse_quest_vocab = data_loader.get_reverse_vocab(question_vocab)

    train_data_path = os.path.join(C.image_base_path, 'train2014')
    val_data_path = os.path.join(C.image_base_path, 'val2014')
    train_data_generator = data_loader.getNextBatch(
        sess, vgg, images, data_training, question_vocab, answer_vocab,
        train_data_path, batchSize=1, purpose='train')
    valid_data_generator = data_loader.getNextBatch(
        sess, vgg, images, data_validation, question_vocab, answer_vocab,
        val_data_path, batchSize=1, purpose='val')

    save_path = '../vizQnA/'
    for i in range(C.max_visualize):
        batch_question, batch_answer, batch_image_id, batch_features = train_data_generator.next()
        image_path = train_data_path
        image_save_dir = os.path.join(save_path, batch_image_id[0])
        utils.make_dir(image_save_dir)
        # Run the network to get the answer distribution and the attention
        # maps at the selected timesteps
        [predicted_prob, attn_map_t0, attn_map_t8, attn_map_t17,
         attn_map_t19, attn_map_t21] = sess.run(
            [net.ans_op_prob, net.attn_map_t0, net.attn_map_t8,
             net.attn_map_t17, net.attn_map_t19, net.attn_map_t21],
            feed_dict={net.qs_ip: batch_question,
                       net.cnn_ip: batch_features})
        [top_predicted_answer, predicted_answer_prob] = utils.parse_predicted_probabilities(
            predicted_prob[0], C.numAnswer)
        attn_map = [attn_map_t0[0], attn_map_t8[0], attn_map_t17[0],
                    attn_map_t19[0], attn_map_t21[0]]
        utils.process_results(
            top_predicted_answer, predicted_answer_prob, attn_map, image_path,
            batch_question[0], batch_answer[0], batch_image_id[0],
            image_save_dir, reverse_quest_vocab, reverse_answer_vocab,
            purpose='train')
def preprocess_question():
    glove_source = 'data/glove.6B.50d.txt'
    glove_pkl = 'data/glove_6B_50.pkl'
    missing_pkl = 'data/missing_glove_ques.pkl'

    # Prepare GloVe dictionary
    if os.path.isfile(glove_pkl):
        print "Glove dictionary already exists!"
    else:
        if w2g.build_glove_dict(glove_source, glove_pkl) == 0:
            print "COMPLETED: Glove dictionary parsing"

    # Identify question words missing from the GloVe dictionary
    if os.path.isfile(missing_pkl):
        print "Missing question vectors already processed!"
    else:
        # Load GloVe dictionary
        glove_dict = w2g.get_glove_dict(glove_pkl)
        # Load VQA training data
        vqa_data = dl.load_questions_answers('data')
        print "COMPLETED: VQA data retrieval"
        ques_vocab = vqa_data['question_vocab']
        if w2g.build_missing_w2g(ques_vocab, glove_dict, missing_pkl) == 0:
            print "COMPLETED: Missing question words identification"
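# The w2g helpers used above are not shown anywhere in this file. Below is a
# minimal sketch of what build_glove_dict presumably does, assuming the
# standard GloVe text format (one token followed by its float components per
# line) and the return-0-on-success convention the caller checks; the real
# w2g implementation may differ.
import pickle

import numpy as np


def build_glove_dict(glove_source, glove_pkl):
    # Parse a GloVe .txt file into {word: vector} and pickle the result
    glove_dict = {}
    with open(glove_source, 'r') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            # The first field is the token; the rest are the embedding values
            glove_dict[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    with open(glove_pkl, 'wb') as f:
        pickle.dump(glove_dict, f)
    return 0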
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=200, help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=200, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--resume_model', type=str, default=None, help='Trained Model Path')
    args = parser.parse_args()

    print "Reading QA DATA"
    qa_data = data_loader.load_questions_answers(args)

    print "Reading fc7 features"
    fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'train')
    print "FC7 features", fc7_features.shape
    print "image_id_list", image_id_list.shape

    image_id_map = {}
    for i in xrange(len(image_id_list)):
        image_id_map[image_id_list[i]] = i

    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(t_loss)

    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()

    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)

    for i in xrange(args.epochs):
        batch_no = 0
        while (batch_no * args.batch_size) < len(qa_data['training']):
            sentence, answer, fc7 = get_training_batch(batch_no, args.batch_size,
                                                       fc7_features, image_id_map, qa_data, 'train')
            _, loss_value, accuracy, pred = sess.run(
                [train_op, t_loss, t_accuracy, t_p],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: sentence,
                    input_tensors['answer']: answer
                })
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print ans_map[p], ans_map[np.argmax(answer[idx])]
                print "Loss", loss_value, batch_no, i
                print "Accuracy", accuracy
                print "---------------"
            else:
                print "Loss", loss_value, batch_no, i
                print "Training Accuracy", accuracy
        save_path = saver.save(sess, "Data/Models/model{}.ckpt".format(i))
def main():
    config = json.load(open('config.json'))
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--qa_dir', type=str, default=config['qa_dir'], help='QA Data directory')
    parser.add_argument('--data_dir', type=str, default=config['data_dir'], help='Common Data directory')
    parser.add_argument('--batch_size', type=int, default=200, help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=2, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--resume_model', type=str, default=None, help='Trained Model Path')
    parser.add_argument('--version', type=int, default=1, help='VQA data version')
    args = parser.parse_args()

    print("Reading QA DATA")
    qa_data = data_loader.load_questions_answers(args.qa_dir)

    print("Reading fc7 features")
    fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'train')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i

    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(t_loss)

    sess = tf.InteractiveSession()
    # tf.initialize_all_variables() has been deprecated since 2017-03-02
    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)

    acc_file = open('train_acc.txt', 'w', encoding='utf-8')
    acc_file.write('epoch avg_acc\n')
    for i in range(args.epochs):
        batch_no = 0
        epochs_acc_sum = 0
        while (batch_no * args.batch_size) < len(qa_data['training']):
            sentence, answer, fc7 = get_training_batch(batch_no, args.batch_size,
                                                       fc7_features, image_id_map, qa_data, 'train')
            _, loss_value, accuracy, pred = sess.run(
                [train_op, t_loss, t_accuracy, t_p],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: sentence,
                    input_tensors['answer']: answer
                })
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print(ans_map[p], ans_map[np.argmax(answer[idx])])
                print("Loss", loss_value, batch_no, i)
                print("Accuracy", accuracy)
                print("---------------")
                epochs_acc_sum += accuracy
            else:
                print("Loss", loss_value, batch_no, i)
                print("Training Accuracy", accuracy)
                epochs_acc_sum += accuracy
        acc_file.write(str(i) + ' ' + str(epochs_acc_sum / batch_no) + '\n')
        print()
        save_path = saver.save(sess, "Data/Models/model{}.ckpt".format(i))
    acc_file.close()
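# get_training_batch is called by the train loops above but never defined in
# this file. The sketch below is inferred from how its return values are used
# (token-index sentences, one-hot answers, fc7 rows looked up via
# image_id_map); field names such as entry['question'] and entry['answer'] are
# assumptions, and the real helper may pad or tokenize differently.
import numpy as np


def get_training_batch(batch_no, batch_size, fc7_features, image_id_map, qa_data, split):
    qa = qa_data['training'] if split == 'train' else qa_data['validation']
    si = (batch_no * batch_size) % len(qa)
    ei = min(len(qa), si + batch_size)
    n = ei - si
    max_len = qa_data['max_question_length']
    sentence = np.zeros((n, max_len), dtype=np.int32)
    answer = np.zeros((n, len(qa_data['answer_vocab'])), dtype=np.float32)
    fc7 = np.zeros((n, fc7_features.shape[1]), dtype=np.float32)
    for i, entry in enumerate(qa[si:ei]):
        q = entry['question'][:max_len]
        sentence[i, :len(q)] = q                   # left-aligned, zero-padded
        answer[i, entry['answer']] = 1.0           # one-hot ground-truth answer
        fc7[i] = fc7_features[image_id_map[entry['image_id']]]
    return sentence, answer, fc7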
def main():
    config = json.load(open('config.json'))
    parser = argparse.ArgumentParser()
    parser.add_argument('--split', type=str, default=config['split'], help='train/val')
    parser.add_argument('--model_path', type=str, default=config['model_path'], help='Pretrained VGG16 Model')
    parser.add_argument('--qa_dir', type=str, default=config['qa_dir'], help='QA Data directory')
    parser.add_argument('--data_dir', type=str, default=config['data_dir'], help='Common Data directory')
    parser.add_argument('--batch_size', type=int, default=10, help='Batch Size')
    args = parser.parse_args()

    vgg_file = open(args.model_path, 'rb')
    vgg16raw = vgg_file.read()
    vgg_file.close()

    graph_def = tf.GraphDef()
    graph_def.ParseFromString(vgg16raw)
    images = tf.placeholder("float", [None, 224, 224, 3])
    tf.import_graph_def(graph_def, input_map={"images": images})

    graph = tf.get_default_graph()
    for opn in graph.get_operations():
        print("Name", opn.name, list(opn.values()))

    all_data = data_loader.load_questions_answers(args.qa_dir)
    if args.split == "train":
        qa_data = all_data['training']
    else:
        qa_data = all_data['validation']

    image_ids = {}
    for qa in qa_data:
        image_ids[qa['image_id']] = 1
    image_id_list = [img_id for img_id in image_ids]
    print("Total Images", len(image_id_list))

    sess = tf.Session()
    fc7 = np.ndarray((len(image_id_list), 4096))
    idx = 0
    err_file = open('err.txt', 'w', encoding='utf-8')
    while idx < len(image_id_list):
        start = time.clock()
        image_batch = np.ndarray((args.batch_size, 224, 224, 3))
        count = 0
        for i in range(0, args.batch_size):
            if idx >= len(image_id_list):
                break
            filename = 'COCO_%s2014_%.12d.jpg' % (args.split, image_id_list[idx])
            image_file = join(args.data_dir, '%s2014' % args.split, filename)
            try:
                image_batch[i, :, :, :] = utils.load_image_array(image_file)
            except (ValueError, FileNotFoundError, OSError) as e:
                # Log images that fail to load so they can be re-downloaded
                print("http://images.cocodataset.org/%s2014/%s" % (args.split, filename))
                err_file.write(str(image_id_list[idx]) + '\n')
            idx += 1
            count += 1
        err_file.flush()

        feed_dict = {images: image_batch[0:count, :, :, :]}
        fc7_tensor = graph.get_tensor_by_name("import/Relu_1:0")
        fc7_batch = sess.run(fc7_tensor, feed_dict=feed_dict)
        fc7[(idx - count):idx, :] = fc7_batch[0:count, :]
        end = time.clock()
        print("Time for batch 10 photos", end - start)
        print("Hours For Whole Dataset", (len(image_id_list) * 1.0) * (end - start) / 60.0 / 60.0 / 10.0)
        print("Images Processed", idx)

    print("Saving fc7 features")
    h5f_fc7 = h5py.File(join(args.data_dir, args.split + '_fc7.h5'), 'w')
    h5f_fc7.create_dataset('fc7_features', data=fc7)
    h5f_fc7.close()

    print("Saving image id list")
    h5f_image_id_list = h5py.File(join(args.data_dir, args.split + '_image_id_list.h5'), 'w')
    h5f_image_id_list.create_dataset('image_id_list', data=image_id_list)
    h5f_image_id_list.close()
    print("Done!")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--split', type=str, default='train', help='train/val')
    parser.add_argument('--model_path', type=str, help='VGGNet')  # VGGNet version
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=100)
    args = parser.parse_args()

    # Read the pretrained vgg16 network
    vgg_file = open(args.model_path, 'rb')
    vgg16raw = vgg_file.read()
    vgg_file.close()

    # Load the pretrained network into a tf graph
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(vgg16raw)
    images = tf.placeholder("float", [None, 224, 224, 3])
    tf.import_graph_def(graph_def, input_map={"images": images})

    graph = tf.get_default_graph()
    for opn in graph.get_operations():
        print("[VGG16] Name", opn.name, list(opn.values()))

    # Loading data
    all_data = data_loader.load_questions_answers()
    print(args)
    if args.split == "train":
        qa_data = all_data['training']
    else:
        qa_data = all_data['validation']

    image_ids = {}
    for qa in qa_data:
        image_ids[qa['image_id']] = 1
    image_id_list = [img_id for img_id in image_ids]
    print("Total Images", len(image_id_list))
    print(image_id_list[0:10])

    # Begin extracting
    sess = tf.Session()
    idx = 0
    cnn7 = np.ndarray((len(image_id_list), 512, 49))
    while idx < len(image_id_list):
        start = time.clock()
        # Load images into a batch
        image_batch = np.ndarray((args.batch_size, 224, 224, 3))
        count = 0
        for i in range(0, args.batch_size):
            if idx >= len(image_id_list):
                break
            image_file = join(args.data_dir,
                              '%s2015/abstract_v002_%s2015_%.12d.png' % (args.split, args.split, image_id_list[idx]))
            # Abstract-scene PNGs carry an alpha channel; keep RGB only
            image_batch[i, :, :, :] = utils.load_image_array(image_file)[:, :, :3]
            idx += 1
            count += 1

        feed_dict = {images: image_batch[0:count, :, :, :]}
        cnn7_tensor = graph.get_tensor_by_name("import/pool5:0")
        cnn7_batch = sess.run(cnn7_tensor, feed_dict=feed_dict)
        # pool5 is [count, 7, 7, 512]; rearrange to [count, 512, 49]
        cnn7_batch = np.transpose(cnn7_batch, [0, 3, 1, 2])
        cnn7_batch = cnn7_batch.reshape(count, 512, -1)
        # L2-normalize each spatial location; iterate over `count`, not
        # `args.batch_size`, so a partial final batch does not index past
        # the rows that were actually computed
        for i in range(count):
            cnn7_batch[i, :, :] = cnn7_batch[i, :, :] / np.linalg.norm(
                cnn7_batch[i, :, :], axis=0, keepdims=True)
        cnn7[(idx - count):idx, ...] = cnn7_batch[0:count, ...]
        end = time.clock()
        print("Time for batch 10 photos", end - start)
        print("Hours For Whole Dataset", (len(image_id_list) * 1.0) * (end - start) / 60.0 / 60.0 / 10.0)
        print("Images Processed", idx)

    print("Saving cnn7 features")
    h5f_cnn7 = h5py.File(join(args.data_dir, args.split + '_cnn7.h5'), 'w')
    h5f_cnn7.create_dataset('cnn7_features', data=cnn7)
    h5f_cnn7.close()

    print("Saving image id list")
    h5f_image_id_list = h5py.File(join(args.data_dir, args.split + '_image_id_list.h5'), 'w')
    h5f_image_id_list.create_dataset('image_id_list', data=image_id_list)
    h5f_image_id_list.close()
    print("Done!")
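# utils.load_image_array is shared by every extraction script here but never
# shown. A minimal sketch, assuming scipy<1.2 (matching the time.clock()-era
# code above) and the `size` keyword the ResNet script below passes; the real
# helper's interpolation and normalization details may differ.
import numpy as np
from scipy import misc


def load_image_array(image_file, size=224):
    img = misc.imread(image_file)
    if len(img.shape) == 2:
        # Grayscale image: replicate the single channel into RGB
        img = np.stack([img] * 3, axis=-1)
    img = misc.imresize(img, (size, size))
    # Scale to [0, 1] floats
    return img.astype(np.float32) / 255.0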
def train():
    batch_size = 10
    print "Starting ABC-CNN training"
    vqa = dl.load_questions_answers('data')

    # Create subset of data for over-fitting
    sub_vqa = {}
    sub_vqa['training'] = vqa['training'][:10]
    sub_vqa['validation'] = vqa['validation'][:10]
    sub_vqa['answer_vocab'] = vqa['answer_vocab']
    sub_vqa['question_vocab'] = vqa['question_vocab']
    sub_vqa['max_question_length'] = vqa['max_question_length']

    train_size = len(vqa['training'])
    max_itr = (train_size // batch_size) * 10

    with tf.Session() as sess:
        image, ques, ans, optimizer, loss, accuracy = abc.model(sess, batch_size)
        print "Defined ABC model"

        train_loader = util.get_batch(sess, vqa, batch_size, 'training')
        print "Created train dataset generator"
        valid_loader = util.get_batch(sess, vqa, batch_size, 'validation')
        print "Created validation dataset generator"

        writer = abc.write_tensorboard(sess)
        init = tf.global_variables_initializer()
        merged = tf.summary.merge_all()
        sess.run(init)
        print "Initialized Tensor variables"

        itr = 1
        while itr < max_itr:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()

            _, vgg_batch, ques_batch, answer_batch = train_loader.next()
            _, valid_vgg_batch, valid_ques_batch, valid_answer_batch = valid_loader.next()

            sess.run(optimizer,
                     feed_dict={image: vgg_batch, ques: ques_batch, ans: answer_batch})
            [train_summary, train_loss, train_accuracy] = sess.run(
                [merged, loss, accuracy],
                feed_dict={image: vgg_batch, ques: ques_batch, ans: answer_batch},
                options=run_options, run_metadata=run_metadata)
            [valid_loss, valid_accuracy] = sess.run(
                [loss, accuracy],
                feed_dict={image: valid_vgg_batch, ques: valid_ques_batch, ans: valid_answer_batch})

            writer.add_run_metadata(run_metadata, 'step%03d' % itr)
            writer.add_summary(train_summary, itr)
            writer.flush()
            print "Iteration:%d\tTraining Loss:%f\tTraining Accuracy:%f\tValidation Loss:%f\tValidation Accuracy:%f" % (
                itr, train_loss, 100. * train_accuracy, valid_loss, 100. * valid_accuracy)
            itr += 1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--split', type=str, default='train', help='train/val/test')
    parser.add_argument('--model_path', type=str, default='./Data/ResNet/resnet_v2_101.ckpt', help='Pretrained RESNET Model')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=10, help='Batch Size')
    args = parser.parse_args()

    slim = tf.contrib.slim
    resnet = nets.resnet_v2

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.9
    sess = tf.InteractiveSession(config=config)
    sess.run(tf.global_variables_initializer())

    if args.split == 'test':
        all_data = data_loader.load_test_questions()
        qa_data = all_data['testing']
    else:
        all_data = data_loader.load_questions_answers(args)
        if args.split == "train":
            qa_data = all_data['training']
        else:
            qa_data = all_data['validation']

    image_ids = {}
    for qa in qa_data:
        image_ids[qa['image_id']] = 1
    image_id_list = [img_id for img_id in image_ids]
    print("Total Images", len(image_id_list))

    length = 100 if args.split == 'test' else len(image_id_list)
    res5c = np.ndarray((length, 2048))
    idx = 0
    SIZE = 299
    flag = 0
    while idx < length:
        if idx % 500 == 0:
            # Periodically rebuild the graph so it does not grow without bound
            flag = 0
            tf.reset_default_graph()
        with tf.Graph().as_default():
            with tf.Session() as sess:
                while idx < length:
                    start = time.clock()
                    image_batch = np.ndarray((args.batch_size, SIZE, SIZE, 3), dtype=np.float32)
                    count = 0
                    for i in range(0, args.batch_size):
                        if idx >= len(image_id_list):
                            break
                        if args.split == 'test':
                            image_file = join(args.data_dir,
                                              '%s2015/COCO_%s2015_%.12d.jpg' % (args.split, args.split, image_id_list[idx]))
                        else:
                            image_file = join(args.data_dir,
                                              '%s2014/COCO_%s2014_%.12d.jpg' % (args.split, args.split, image_id_list[idx]))
                        image_batch[i, :, :, :] = utils.load_image_array(image_file, size=SIZE)
                        idx += 1
                        count += 1

                    with slim.arg_scope(resnet.resnet_arg_scope()):
                        logits, end_points = resnet.resnet_v2_101(
                            image_batch[0:count, :, :, :], num_classes=None,
                            is_training=False, reuse=tf.AUTO_REUSE)
                    if not flag:
                        # Restore the checkpoint once per rebuilt graph
                        vals = slim.get_model_variables('resnet_v2_101')
                        init_fn = slim.assign_from_checkpoint_fn(args.model_path, vals)
                        init_fn(sess)
                        flag = 1

                    res5c_batch = sess.run([logits])
                    # logits has shape [count, 1, 1, 2048]; flatten to [count, 2048]
                    res5c_batch = res5c_batch[0].reshape((count, 2048))
                    res5c[(idx - count):idx, :] = res5c_batch[0:count, :]
                    end = time.clock()
                    print("Time for batch 10 photos", end - start)
                    print("Hours For Whole Dataset", (len(image_id_list) * 1.0) * (end - start) / 60.0 / 60.0 / 10.0)
                    print("Images Processed", idx)
                    if idx % 500 == 0:
                        break

    print("Saving res5c features")
    h5f_res5c = h5py.File(join(args.data_dir, args.split + '_res5c.h5'), 'w')
    h5f_res5c.create_dataset('res5c_features', data=res5c)
    h5f_res5c.close()

    print("Saving image id list")
    h5f_image_id_list = h5py.File(join(args.data_dir, args.split + '_image_id_list.h5'), 'w')
    h5f_image_id_list.create_dataset('image_id_list', data=image_id_list)
    h5f_image_id_list.close()
    print("Done!")
                        help='network type iBOWIMG|HieCoAtten')
    parser.add_argument('--use_soft', action='store_true', default=False,
                        help='using soft cross entropy')
    args = parser.parse_args()

    data_dir = args.data_dir
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    network = args.network
    use_soft = args.use_soft

    # Load QA Data
    print("Reading QA DATA")
    qa_data = load_questions_answers(data_dir=data_dir)
    print("train questions", len(qa_data['train']))
    print("val questions", len(qa_data['val']))
    print("answer vocab", len(qa_data['answer_vocab']))
    print("question vocab", len(qa_data['question_vocab']))
    print("max question length", qa_data['max_question_length'])

    # Define Data Loader
    data_splits = ('train', 'val')
    if network == 'iBOWIMG':
        feature_type = 'vgg19Fc'
    elif network == 'HieCoAtten':
        feature_type = 'vgg19TwoBlocks'
    datasets = {
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--cnn7_feature_length', type=int, default=512, help='cnn7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int)
    parser.add_argument('--word_emb_dropout', type=float)
    parser.add_argument('--image_dropout', type=float)
    parser.add_argument('--data_dir', type=str)
    parser.add_argument('--batch_size', type=int, default=100, help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=400, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--resume_model', type=str, default=None, help='Trained Model Path')
    parser.add_argument('--version', type=int, default=2, help='VQA data version')
    args = parser.parse_args()

    print("Reading QA DATA")
    qa_data = data_loader.load_questions_answers(args.version, args.data_dir)

    print("Reading cnn7 features")
    cnn7_features, image_id_list = data_loader.load_cnn7_features(args.data_dir, 'train')
    print("cnn7 features", cnn7_features.shape)
    print("image_id_list", image_id_list.shape)

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i

    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'cnn7_feature_length': args.cnn7_feature_length,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(t_loss)

    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()

    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)
        # Recover the epoch number from the checkpoint filename,
        # e.g. "Data/Models/model07.ckpt" -> 7
        last_epoch = int(args.resume_model[-7:-5])
        print(f'Resuming from epoch {last_epoch}')
    else:
        last_epoch = -1

    for i in range(args.epochs):
        batch_no = 0
        batch_acc_record = []
        while batch_no < 220:  # fixed number of batches per epoch
            start = time.clock()
            sentence, answer, cnn7 = get_training_batch(batch_no, args.batch_size,
                                                        cnn7_features, image_id_map, qa_data, 'train')
            _, loss_value, accuracy, pred = sess.run(
                [train_op, t_loss, t_accuracy, t_p],
                feed_dict={
                    input_tensors['cnn7']: cnn7,
                    input_tensors['sentence']: sentence,
                    input_tensors['answer']: answer
                })
            batch_acc_record.append(accuracy)
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print(ans_map[p], ans_map[np.argmax(answer[idx])])
                print("Loss", loss_value, batch_no, i + 1 + last_epoch)
                print("Accuracy", accuracy)
                print("---------------")
            else:
                print("Loss", loss_value, batch_no, i + 1 + last_epoch)
                print("Training Accuracy", accuracy)
            end = time.clock()
            print("Time for one batch", end - start)
            print("Hours For one epoch", (291 * 1.0) * (end - start) / 60.0 / 60.0)
        save_path = saver.save(sess, "Data/Models/model{}.ckpt".format(i + 1 + last_epoch))
        if np.mean(batch_acc_record) >= 0.9:
            # Stop once the model has fit the training batches
            break
    sess.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--split', type=str, default='train', help='train/val')
    parser.add_argument('--model_path', type=str, default='Data/vgg16.tfmodel', help='Pretrained VGG16 Model')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=10, help='Batch Size')
    args = parser.parse_args()

    vgg_file = open(args.model_path, 'rb')  # the serialized graph is binary
    vgg16raw = vgg_file.read()
    vgg_file.close()

    graph_def = tf.GraphDef()
    graph_def.ParseFromString(vgg16raw)
    images = tf.placeholder("float", [None, 224, 224, 3])
    tf.import_graph_def(graph_def, input_map={"images": images})

    graph = tf.get_default_graph()
    for opn in graph.get_operations():
        print "Name", opn.name, opn.values()

    all_data = data_loader.load_questions_answers(args)
    if args.split == "train":
        qa_data = all_data['training']
    else:
        qa_data = all_data['validation']

    image_ids = {}
    for qa in qa_data:
        image_ids[qa['image_id']] = 1
    image_id_list = [img_id for img_id in image_ids]
    print "Total Images", len(image_id_list)

    sess = tf.Session()
    fc7 = np.ndarray((len(image_id_list), 4096))
    idx = 0
    while idx < len(image_id_list):
        start = time.clock()
        image_batch = np.ndarray((args.batch_size, 224, 224, 3))
        count = 0
        for i in range(0, args.batch_size):
            if idx >= len(image_id_list):
                break
            image_file = join(args.data_dir,
                              '%s2014/COCO_%s2014_%.12d.jpg' % (args.split, args.split, image_id_list[idx]))
            image_batch[i, :, :, :] = utils.load_image_array(image_file)
            idx += 1
            count += 1
        feed_dict = {images: image_batch[0:count, :, :, :]}
        fc7_tensor = graph.get_tensor_by_name("import/Relu_1:0")
        fc7_batch = sess.run(fc7_tensor, feed_dict=feed_dict)
        fc7[(idx - count):idx, :] = fc7_batch[0:count, :]
        end = time.clock()
        print "Time for batch 10 photos", end - start
        print "Hours For Whole Dataset", (len(image_id_list) * 1.0) * (end - start) / 60.0 / 60.0 / 10.0
        print "Images Processed", idx

    print "Saving fc7 features"
    h5f_fc7 = h5py.File(join(args.data_dir, args.split + '_fc7.h5'), 'w')
    h5f_fc7.create_dataset('fc7_features', data=fc7)
    h5f_fc7.close()

    print "Saving image id list"
    h5f_image_id_list = h5py.File(join(args.data_dir, args.split + '_image_id_list.h5'), 'w')
    h5f_image_id_list.create_dataset('image_id_list', data=image_id_list)
    h5f_image_id_list.close()
    print "Done!"
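# data_loader.load_fc7_features, used by the training and evaluation scripts,
# presumably just reads back the two HDF5 files written by the extraction
# script above; a minimal sketch under that assumption (dataset names match
# the create_dataset calls exactly):
import h5py
import numpy as np
from os.path import join


def load_fc7_features(data_dir, split):
    with h5py.File(join(data_dir, split + '_fc7.h5'), 'r') as hf:
        fc7_features = np.array(hf['fc7_features'])
    with h5py.File(join(data_dir, split + '_image_id_list.h5'), 'r') as hf:
        image_id_list = np.array(hf['image_id_list'])
    return fc7_features, image_id_list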
def main():
    # Set arguments to get files from directories
    parser = argparse.ArgumentParser()
    parser.add_argument('--split', type=str, default='train', help='train/val')
    parser.add_argument('--model_path', type=str,
                        default='/home/vmhatre/vqa_supervised/Data/vgg16.tfmodel')
    parser.add_argument('--data_dir', type=str,
                        default='/home/vmhatre/vqa_supervised/Data')
    parser.add_argument('--batch_size', type=int, default=10)
    args = parser.parse_args()

    vgg_file = open(args.model_path, 'rb')
    vgg16raw = vgg_file.read()
    vgg_file.close()

    # Deserialize the pretrained VGG16 network into a tf GraphDef
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(vgg16raw)

    # Create the image input placeholder; the serialized VGG16 graph's fully
    # connected layers expect 224x224 inputs
    images = tf.placeholder("float", [None, 224, 224, 3])
    tf.import_graph_def(graph_def, input_map={"images": images})

    # Get the default graph and list its operations by name
    graph = tf.get_default_graph()
    for opn in graph.get_operations():
        print "Name", opn.name, opn.values()

    # Get data from training and validation files
    all_data = data_loader.load_questions_answers(args)
    if args.split == "train":
        qa_data = all_data['training']
    else:
        qa_data = all_data['validation']

    # Collect the set of image ids before extracting features
    image_ids = {}
    for qa in qa_data:
        image_ids[qa['image_id']] = 1
    image_id_list = [img_id for img_id in image_ids]
    # print "Total Images", len(image_id_list)

    sess = tf.Session()
    fc7 = np.ndarray((len(image_id_list), 4096))
    idx = 0
    while idx < len(image_id_list):
        image_batch = np.ndarray((args.batch_size, 224, 224, 3))
        # For every image in the dataset, load it into the batch with load_image_array
        count = 0
        for i in range(0, args.batch_size):
            if idx >= len(image_id_list):
                break
            image_file = join(args.data_dir,
                              '%s2014/COCO_%s2014_%.12d.jpg' % (args.split, args.split, image_id_list[idx]))
            image_batch[i, :, :, :] = utils.load_image_array(image_file)
            idx += 1
            count += 1
        feed_dict = {images: image_batch[0:count, :, :, :]}
        # Grab the second fully connected ReLU activation (fc7) for the fed images
        fc7_tensor = graph.get_tensor_by_name("import/Relu_1:0")
        fc7_batch = sess.run(fc7_tensor, feed_dict=feed_dict)
        fc7[(idx - count):idx, :] = fc7_batch[0:count, :]

    # Save the fc7 features extracted from the ReLU layer
    h5f_fc7 = h5py.File(join(args.data_dir, args.split + '_fc7.h5'), 'w')
    h5f_fc7.create_dataset('fc7_features', data=fc7)
    h5f_fc7.close()

    print "Saving image id list"
    h5f_image_id_list = h5py.File(join(args.data_dir, args.split + '_image_id_list.h5'), 'w')
    h5f_image_id_list.create_dataset('image_id_list', data=image_id_list)
    h5f_image_id_list.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--residual_channels', type=int, default=512, help='residual_channels')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=64, help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=25, help='Epochs')
    parser.add_argument('--max_steps', type=int, default=50000, help='max steps, set 1 for evaluating the model')
    parser.add_argument('--version', type=int, default=1, help='VQA data version')
    parser.add_argument('--sample_every', type=int, default=200, help='Debug every x iterations')
    parser.add_argument('--evaluate_every', type=int, default=6000, help='Evaluate every x steps')
    parser.add_argument('--resume_model', type=str, default=None, help='Trained Model Path')
    parser.add_argument('--training_log_file', type=str, default='Data/training_log.json', help='Log file for accuracy')
    parser.add_argument('--feature_layer', type=str, default="block4", help='CONV FEATURE LAYER, fc7, pool5 or block4')
    parser.add_argument('--cnn_model', type=str, default="resnet", help='CNN model')
    parser.add_argument('--text_model', type=str, default="bytenet", help='bytenet/lstm')
    # evaluation_steps = [6000, 12000, 18000, 25000, 30000, 35000, 50000]
    # evaluation_steps = [400, 800, 1200, 1600, 2000, 2400, 2800]
    args = parser.parse_args()

    print "Reading QA DATA", args.version
    qa_data = data_loader.load_questions_answers(args.version, args.data_dir)
    shuffle(qa_data['training'])
    shuffle(qa_data['validation'])
    ans_vocab_rev = qa_data['index_to_ans']
    ques_vocab_rev = qa_data['index_to_qw']

    print "Reading conv features"
    conv_features, image_id_list = data_loader.load_conv_features('train', args.cnn_model, args.feature_layer)
    image_id_map = {image_id_list[i]: i for i in xrange(len(image_id_list))}

    conv_features_val, image_id_list_val = data_loader.load_conv_features('val', args.cnn_model, args.feature_layer)
    image_id_map_val = {image_id_list_val[i]: i for i in xrange(len(image_id_list_val))}

    conv_features = data_loader.load_conv_features('train', args.cnn_model, args.feature_layer,
                                                   load_image_list=False)

    model_options = {
        'question_vocab_size': len(qa_data['index_to_qw']),
        'residual_channels': args.residual_channels,
        'ans_vocab_size': len(qa_data['index_to_ans']),
        'filter_width': 3,
        'img_dim': 14,
        'img_channels': 2048,
        'dilations': [1, 2, 4, 8,
                      1, 2, 4, 8],
        'text_model': args.text_model,
        'dropout_keep_prob': 0.6,
        'max_question_length': qa_data['max_question_length'],
        'num_answers': 10
    }

    print "MODEL OPTIONS"
    print model_options

    model = VQA_model_attention.VQA_model(model_options)
    model.build_model()
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(model.loss)
    model.build_generator(reuse=True)

    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()

    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)

    step = 0
    training_log = []
    for epoch in xrange(args.epochs):
        batch_no = 0
        while (batch_no * args.batch_size) < len(qa_data['training']):
            start = time.clock()
            question, answer, image_features, image_ids, _ = get_batch(
                batch_no, args.batch_size, qa_data['training'], conv_features,
                image_id_map, 'train', model_options)
            _, loss_value = sess.run(
                [train_op, model.loss],
                feed_dict={
                    model.question: question,
                    model.image_features: image_features,
                    model.answers: answer
                })
            end = time.clock()
            print "Time for batch of photos", end - start
            print "Time for one epoch (mins)", len(qa_data['training']) / args.batch_size * (end - start) / 60.0
            batch_no += 1
            step += 1
            print "LOSS", loss_value, batch_no, len(qa_data['training']) / args.batch_size, step, epoch
            print "****"
            if step % args.sample_every == 0:
                # Dump a few qualitative samples with attention blends
                try:
                    shutil.rmtree('Data/samples')
                except:
                    pass
                os.makedirs('Data/samples')
                pred_answer, prob1, prob2 = sess.run(
                    [model.g_predictions, model.g_prob1, model.g_prob2],
                    feed_dict={
                        model.g_question: question,
                        model.g_image_features: image_features
                    })
                pred_ans_text = utils.answer_indices_to_text(pred_answer, ans_vocab_rev)
                # just a sample
                actual_ans_text = utils.answer_indices_to_text(answer[:, 0], ans_vocab_rev)
                sample_data = []
                print "Actual vs Prediction"
                for sample_i in range(len(pred_ans_text)):
                    print actual_ans_text[sample_i], pred_ans_text[sample_i]
                    question_text = utils.question_indices_to_text(question[sample_i], ques_vocab_rev)
                    image_array = utils.image_array_from_image_id(image_ids[sample_i], 'train')
                    blend1 = utils.get_blend_map(image_array, prob1[sample_i], overlap=True)
                    blend2 = utils.get_blend_map(image_array, prob2[sample_i], overlap=True)
                    sample_data.append({
                        'question': question_text,
                        'actual_answer': actual_ans_text[sample_i],
                        'predicted_answer': pred_ans_text[sample_i],
                        'image_id': image_ids[sample_i],
                        'batch_index': sample_i
                    })
                    misc.imsave('Data/samples/{}_actual_image.jpg'.format(sample_i), image_array)
                    misc.imsave('Data/samples/{}_blend1.jpg'.format(sample_i), blend1)
                    misc.imsave('Data/samples/{}_blend2.jpg'.format(sample_i), blend2)
                f = open('Data/samples/sample.json', 'wb')
                f.write(json.dumps(sample_data))
                f.close()
                shutil.make_archive('Data/samples', 'zip', 'Data/samples')
                gc.collect()
            if step % args.evaluate_every == 0:
                accuracy = evaluate_model(model, qa_data, args, model_options, sess,
                                          conv_features_val, image_id_map_val)
                print "ACCURACY>> ", accuracy, step, epoch
                training_log.append({
                    'step': step,
                    'epoch': epoch,
                    'accuracy': accuracy,
                })
                f = open(args.training_log_file, 'wb')
                f.write(json.dumps(training_log))
                f.close()
                save_path = saver.save(sess, "Data/Models{}/model{}.ckpt".format(args.version, epoch))
                # Reload conv features to keep h5py from slowing down
                gc.collect()
                conv_features = data_loader.load_conv_features('train', args.cnn_model,
                                                               args.feature_layer, load_image_list=False)
            if step >= args.max_steps:
                break
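# utils.get_blend_map overlays the generator's attention probabilities on the
# source image. A rough sketch, assuming the probabilities arrive as a
# flattened 14x14 map (matching img_dim in model_options above) and using the
# same scipy-era image helpers as the rest of this script; the blending
# details are guesses and the real helper may smooth or color-map differently.
import numpy as np
from scipy import misc


def get_blend_map(image_array, att_prob, overlap=True):
    h, w = image_array.shape[0], image_array.shape[1]
    att_map = att_prob.reshape(14, 14)
    att_map = att_map / (att_map.max() + 1e-8)  # normalize to [0, 1]
    att_map = misc.imresize(att_map, (h, w), interp='bilinear') / 255.0
    if overlap:
        # Brighten attended regions; leave unattended regions dark
        return image_array * att_map[:, :, np.newaxis]
    return att_map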
def main():
    # argparse is Python's built-in module for command-line option and argument
    # parsing: define the parameters the program needs, and argparse parses
    # them out of sys.argv and auto-generates help and usage messages
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=200, help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=200, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--resume_model', type=str, default=None, help='Trained Model Path')
    parser.add_argument('--version', type=int, default=2, help='VQA data version')
    args = parser.parse_args()

    print("Reading QA DATA")
    # The structure returned here is the structure of qa_data
    qa_data = data_loader.load_questions_answers(args.version, args.data_dir)

    print("Reading fc7 features")
    # data_loader returns precomputed features and ids; these come from the
    # separate pretrained-CNN extraction step, not from training this model
    fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'train')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)

    # Map each image id to its row index in fc7_features; stored as a dict
    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i

    # ans_map inverts answer_vocab (index -> answer string), used to print
    # readable predictions in debug mode
    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    # Model configuration
    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    # Build the model and the training op
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()  # outputs of the LSTM model
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(t_loss)  # Adam optimizer

    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()

    # Checkpointing: if resuming from a previous run, restore its weights
    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)

    for i in range(args.epochs):
        batch_no = 0
        # batch_no * args.batch_size = number of training examples seen so far
        while (batch_no * args.batch_size) < len(qa_data['training']):
            # Get the next batch of the training set
            sentence, answer, fc7 = get_training_batch(batch_no, args.batch_size,
                                                       fc7_features, image_id_map, qa_data, 'train')
            # One optimization step: feed the batch, fetch loss/accuracy/predictions
            _, loss_value, accuracy, pred = sess.run(
                [train_op, t_loss, t_accuracy, t_p],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: sentence,
                    input_tensors['answer']: answer
                })
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print(ans_map[p], ans_map[np.argmax(answer[idx])])
                print("Loss", loss_value, batch_no, i)
                print("Accuracy", accuracy)
                print("---------------")
            else:
                print("Loss", loss_value, batch_no, i)
                print("Training Accuracy", accuracy)
        save_path = saver.save(sess, "Data/Models/model{}.ckpt".format(i))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers', type=int, default=2, help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096, help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512, help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5, help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5, help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=200, help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=200, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--model_path', type=str, default='Data/Models/model21.ckpt', help='Model Path')
    args = parser.parse_args()

    print "Reading QA DATA"
    qa_data = data_loader.load_questions_answers(args)

    print "Reading fc7 features"
    fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'val')
    print "FC7 features", fc7_features.shape
    print "image_id_list", image_id_list.shape

    image_id_map = {}
    for i in xrange(len(image_id_list)):
        image_id_map[image_id_list[i]] = i

    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()

    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    avg_accuracy = 0.0
    total = 0
    saver.restore(sess, args.model_path)

    batch_no = 0
    while (batch_no * args.batch_size) < len(qa_data['validation']):
        sentence, answer, fc7 = get_batch(batch_no, args.batch_size,
                                          fc7_features, image_id_map, qa_data, 'val')
        pred, ans_prob = sess.run(
            [t_prediction, t_ans_probab],
            feed_dict={
                input_tensors['fc7']: fc7,
                input_tensors['sentence']: sentence,
            })
        batch_no += 1
        if args.debug:
            for idx, p in enumerate(pred):
                print ans_map[p], ans_map[np.argmax(answer[idx])]
        correct_predictions = np.equal(pred, np.argmax(answer, 1))
        correct_predictions = correct_predictions.astype('float32')
        accuracy = correct_predictions.mean()
        print "Acc", accuracy
        avg_accuracy += accuracy
        total += 1
    print "Acc", avg_accuracy / total
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--split', type=str, default='train', help='train/val')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=10, help='Batch Size')
    args = parser.parse_args()

    data_loader.prepare_training_data(version=1)
    all_data = data_loader.load_questions_answers(version=1)
    if args.split == "train":
        qa_data = all_data['training']
    else:
        qa_data = all_data['validation']

    image_ids = {}
    for qa in qa_data:
        image_ids[qa['image_id']] = 1
    image_id_list = [img_id for img_id in image_ids]
    print "Total Images", len(image_id_list)

    # Build a Keras VGG16 feature extractor that outputs the 4096-d fc7
    # activations: include_top must be True so the fully connected layers
    # exist, and 'fc2' is the second fc layer in keras.applications VGG16
    # (assumes `from keras.models import Model` at module level)
    base_model = VGG16(weights='imagenet', include_top=True)
    model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)

    fc7 = np.ndarray((len(image_id_list), 4096))
    idx = 0
    while idx < len(image_id_list):
        start = time.clock()
        image_batch = np.ndarray((args.batch_size, 224, 224, 3))
        fc7_batch = np.ndarray((args.batch_size, 4096))
        count = 0
        for i in range(0, args.batch_size):
            if idx >= len(image_id_list):
                break
            image_file = join(args.data_dir,
                              '%s2014/COCO_%s2014_%.12d.jpg' % (args.split, args.split, image_id_list[idx]))
            image_batch[i, :, :, :] = utils.load_image_array(image_file)
            x = np.expand_dims(image_batch[i, :, :, :], axis=0)
            x = preprocess_input(x)
            features = model.predict(x)  # shape (1, 4096)
            fc7_batch[i, :] = features[0]
            idx += 1
            count += 1
        fc7[(idx - count):idx, :] = fc7_batch[0:count, :]
        end = time.clock()
        print "Time for batch 10 photos", end - start
        print "Hours For Whole Dataset", (len(image_id_list) * 1.0) * (end - start) / 60.0 / 60.0 / 10.0
        print "Images Processed", idx

    print "Saving fc7 features"
    h5f_fc7 = h5py.File(join(args.data_dir, args.split + '_fc7.h5'), 'w')
    h5f_fc7.create_dataset('fc7_features', data=fc7)
    h5f_fc7.close()

    print "Saving image id list"
    h5f_image_id_list = h5py.File(join(args.data_dir, args.split + '_image_id_list.h5'), 'w')
    h5f_image_id_list.create_dataset('image_id_list', data=image_id_list)
    h5f_image_id_list.close()
    print "Done!"
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--split', type=str, default='train', help='train/val')
    parser.add_argument('--model_path', type=str, default='Data/vgg16.tfmodel', help='Pretrained VGG16 Model')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=10, help='Batch Size')
    args = parser.parse_args()

    vgg_file = open(args.model_path, 'rb')
    vgg16raw = vgg_file.read()
    vgg_file.close()

    graph_def = tf.GraphDef()
    graph_def.ParseFromString(vgg16raw)
    images = tf.placeholder("float", [None, 224, 224, 3])
    tf.import_graph_def(graph_def, input_map={"images": images})

    graph = tf.get_default_graph()
    for opn in graph.get_operations():
        print("Name", opn.name, list(opn.values()))

    # Loading data
    # data_loader.prepare_training_data(version = 2, data_dir = 'Data')
    all_data = data_loader.load_questions_answers()
    print(args)
    if args.split == "train":
        qa_data = all_data['training']
    else:
        qa_data = all_data['validation']

    image_ids = {}
    for qa in qa_data:
        image_ids[qa['image_id']] = 1
    image_id_list = [img_id for img_id in image_ids]
    print("Total Images", len(image_id_list))

    sess = tf.Session()
    fc7 = np.ndarray((len(image_id_list), 4096))
    idx = 0
    while idx < len(image_id_list):
        start = time.clock()
        image_batch = np.ndarray((args.batch_size, 224, 224, 3))
        count = 0
        for i in range(0, args.batch_size):
            if idx >= len(image_id_list):
                break
            image_file = join(args.data_dir,
                              '%s2015/abstract_v002_%s2015_%.12d.png' % (args.split, args.split, image_id_list[idx]))
            image_batch[i, :, :, :] = utils.load_image_array(image_file)[:, :, :3]
            idx += 1
            count += 1
        feed_dict = {images: image_batch[0:count, :, :, :]}
        fc7_tensor = graph.get_tensor_by_name("import/Relu_1:0")
        fc7_batch = sess.run(fc7_tensor, feed_dict=feed_dict)
        fc7[(idx - count):idx, :] = fc7_batch[0:count, :]
        end = time.clock()
        print("Time for batch 10 photos", end - start)
        print("Hours For Whole Dataset", (len(image_id_list) * 1.0) * (end - start) / 60.0 / 60.0 / 10.0)
        print("Images Processed", idx)

    print("Saving fc7 features")
    h5f_fc7 = h5py.File(join(args.data_dir, args.split + '_fc7.h5'), 'w')
    h5f_fc7.create_dataset('fc7_features', data=fc7)
    h5f_fc7.close()

    print("Saving image id list")
    h5f_image_id_list = h5py.File(join(args.data_dir, args.split + '_image_id_list.h5'), 'w')
    h5f_image_id_list.create_dataset('image_id_list', data=image_id_list)
    h5f_image_id_list.close()
    print("Done!")
def trainNetwork(sess, net, num_epochs, C, saver_all):
    # Get handle for vgg model
    vgg, images = data_loader.getVGGhandle()
    # Parse all the VQA question information
    qa_data = data_loader.load_questions_answers(C.datapath)
    data_validation = qa_data['validation']
    data_training = qa_data['training']
    question_vocab = qa_data['question_vocab']
    answer_vocab = qa_data['answer_vocab']

    question_input_dim = len(question_vocab)
    answer_out_dim = len(answer_vocab)
    num_training_data = len(data_training)
    nIter = num_training_data // net.batchSize

    # Prepare the data generators used for training the network
    train_data_generator = data_loader.getNextBatch(
        sess, vgg, images, data_training, question_vocab, answer_vocab,
        os.path.join(C.image_base_path, 'train2014'),
        batchSize=C.batchSize, purpose='train')
    valid_data_generator = data_loader.getNextBatch(
        sess, vgg, images, data_validation, question_vocab, answer_vocab,
        os.path.join(C.image_base_path, 'val2014'),
        batchSize=C.batchSize, purpose='val')

    # The generator yields data in batches:
    # batch_question : [batchSize=32, maxQuestionLength=22, questionVocabDim=15xxx]
    # batch_answer   : [batchSize=32, answer_vocab=1000]
    # batch_image_id : [batchSize=32] filenames of the images in the batch, e.g. ['487025', '487025', '78077', ...]
    # batch_features : [batchSize=32, cnnHeight=14, cnnWidth=14, featureDim=512]
    batch_question, batch_answer, batch_image_id, batch_features = train_data_generator.next()
    prev_loss = sess.run(net.cross_entropy,
                         feed_dict={net.qs_ip: batch_question,
                                    net.ans_ip: batch_answer,
                                    net.cnn_ip: batch_features})
    # global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
    # sess.run(tf.initialize_variables([global_step]))

    batchCount = -1
    log_filename = './log_dir/train_' + datetime.now().strftime("%Y%m%d-%H%M%S") + '.log'
    fHandle = open(log_filename, 'w')
    print "Writing log to file: ", log_filename
    print "Training network\n"
    print "Initial Loss: ", prev_loss
    print "Number of epochs:%d , \t Iteration per epoch:%d" % (num_epochs, nIter)
    fHandle.write("Training Network\n")
    fHandle.write("Initial Loss: %f\n" % prev_loss)

    start_time = time.time()
    for epoch in range(num_epochs):
        for iter in range(nIter):
            batchCount += 1
            batch_question, batch_answer, batch_image_id, batch_features = train_data_generator.next()
            if (batchCount % 1 == 0):
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                [curr_train_loss, curr_train_acc, train_summary,
                 true_answer, predicted_answer] = sess.run(
                    [net.cross_entropy, net.accuracy, net.summary_op,
                     net.true_answer, net.predicted_answer],
                    feed_dict={net.qs_ip: batch_question,
                               net.ans_ip: batch_answer,
                               net.cnn_ip: batch_features},
                    options=run_options, run_metadata=run_metadata)
                print "True labels"
                print true_answer
                print "Predicted labels"
                print predicted_answer
                net.writer.add_run_metadata(run_metadata, 'step%03d' % batchCount)
                net.writer.add_summary(train_summary)

                valid_batch_question, valid_batch_answer, valid_batch_image_id, valid_batch_features = valid_data_generator.next()
                [curr_valid_loss, curr_valid_acc, valid_summary] = sess.run(
                    [net.cross_entropy, net.accuracy, net.summary_op],
                    feed_dict={net.qs_ip: valid_batch_question,
                               net.ans_ip: valid_batch_answer,
                               net.cnn_ip: valid_batch_features})

                if (curr_train_loss < prev_loss):
                    print "Loss decreased from %.4f to %.4f" % (prev_loss, curr_train_loss)
                    print "Saving session"
                    fHandle.write("Loss decreased from %.4f to %.4f" % (prev_loss, curr_train_loss))
                    saver_all.save(sess, 'checkpoints/vqa', global_step=net.global_step)
                    prev_loss = curr_train_loss

                print "Epoch:%d/%d_Iter:%d/%d, TrainLoss:%.2f TrainAccuracy:%.2f, ValidLoss:%.2f ValidAccuracy:%.2f Elapsed time: %d" % (
                    epoch, num_epochs, iter, nIter, curr_train_loss, curr_train_acc * 100,
                    curr_valid_loss, curr_valid_acc * 100, time.time() - start_time)
                fHandle.write(
                    "Epoch:%d/%d_Iter:%d/%d \t, TrainLoss: %.2f \t TrainAccuracy: %.2f \t, ValidLoss:%.2f \t ValidAccuracy:%.2f \t Elapsed time: %d\n"
                    % (epoch, num_epochs, iter, nIter, curr_train_loss, curr_train_acc * 100,
                       curr_valid_loss, curr_valid_acc * 100, time.time() - start_time))
                start_time = time.time()

            # Train on the batch
            sess.run(net.train_step,
                     feed_dict={net.qs_ip: batch_question,
                                net.ans_ip: batch_answer,
                                net.cnn_ip: batch_features})
    # net.print_variables()
    net.writer.close()
    fHandle.close()
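# data_loader.getNextBatch is the generator both trainNetwork and
# visualizeNetwork consume. Below is a skeletal sketch consistent with the
# batch shapes documented in trainNetwork's comment (one-hot questions,
# one-hot answers, 14x14x512 conv features); everything about its internals,
# including how the vgg handle is evaluated and the QA entry field names, is
# an assumption.
import os

import numpy as np


def getNextBatch(sess, vgg, images, data, question_vocab, answer_vocab,
                 image_path, batchSize=32, purpose='train'):
    maxQuestionLength = 22  # as documented in trainNetwork above
    while True:  # endless generator; callers pull batches with .next()
        for start in range(0, len(data) - batchSize + 1, batchSize):
            batch = data[start:start + batchSize]
            batch_question = np.zeros((batchSize, maxQuestionLength, len(question_vocab)), dtype=np.float32)
            batch_answer = np.zeros((batchSize, len(answer_vocab)), dtype=np.float32)
            batch_image_id = []
            img_batch = np.zeros((batchSize, 224, 224, 3), dtype=np.float32)
            for n, entry in enumerate(batch):
                for t, w in enumerate(entry['question'][:maxQuestionLength]):
                    batch_question[n, t, w] = 1.0       # one-hot question words
                batch_answer[n, entry['answer']] = 1.0  # one-hot answer
                batch_image_id.append(str(entry['image_id']))
                img_batch[n] = utils.load_image_array(os.path.join(
                    image_path, 'COCO_%s2014_%.12d.jpg' % (purpose, entry['image_id'])))
            # Evaluate the VGG feature tensor to get [batchSize, 14, 14, 512]
            batch_features = sess.run(vgg, feed_dict={images: img_batch})
            yield batch_question, batch_answer, batch_image_id, batch_features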