def main():
    """Train the LSTM-MLP model and track its accuracy on the dev split.

    Parses the command-line hyper-parameters, loads the train/dev data from
    a hard-coded data directory, builds and compiles a merged
    (LSTM language model + image features) -> MLP network, then trains it
    for ``--num-epochs`` epochs.  After every epoch each dev question is
    answered by picking the choice (1-5) whose vector has the highest cosine
    similarity to the network output.  Whenever the dev accuracy improves,
    the weights are saved to ``<model_filename>_best.hdf5``.

    Side effects: writes the model JSON and the best weights under
    ``models/``, writes a summary to ``--dev-accuracy-path`` and installs
    SIGINT/SIGTERM handlers that print the accuracies collected so far and
    then call ``sys.exit(-1)``.
    """
    start_time = time.time()
    parser = argparse.ArgumentParser(prog='trainLSTM_MLP.py',
            description='Train LSTM-MLP model for visual question answering')
    parser.add_argument('--mlp-hidden-units', type=int, default=1024, metavar='<mlp-hidden-units>')
    parser.add_argument('--lstm-hidden-units', type=int, default=512, metavar='<lstm-hidden-units>')
    parser.add_argument('--mlp-hidden-layers', type=int, default=3, metavar='<mlp-hidden-layers>')
    parser.add_argument('--lstm-hidden-layers', type=int, default=1, metavar='<lstm-hidden-layers>')
    parser.add_argument('--dropout', type=float, default=0.5, metavar='<dropout-rate>')
    parser.add_argument('--mlp-activation', type=str, default='tanh', metavar='<activation-function>')
    parser.add_argument('--num-epochs', type=int, default=100, metavar='<num-epochs>')
    parser.add_argument('--batch-size', type=int, default=128, metavar='<batch-size>')
    parser.add_argument('--learning-rate', type=float, default=0.001, metavar='<learning-rate>')
    parser.add_argument('--dev-accuracy-path', type=str, required=True, metavar='<accuracy-path>')
    args = parser.parse_args()

    word_vec_dim = 300
    img_dim = 4096
    max_len = 30

    ######################
    #      Load Data     #
    ######################
    data_dir = '/home/mlds/data/0.05_val/'

    print('Loading data...')

    train_id_pairs, train_image_ids = LoadIds('train', data_dir)
    dev_id_pairs, dev_image_ids = LoadIds('dev', data_dir)

    train_questions = LoadQuestions('train', data_dir)
    dev_questions = LoadQuestions('dev', data_dir)

    train_choices = LoadChoices('train', data_dir)
    dev_choices = LoadChoices('dev', data_dir)

    train_answers = LoadAnswers('train', data_dir)
    dev_answers = LoadAnswers('dev', data_dir)

    print('Finished loading data.')
    print('Time: %f s' % (time.time()-start_time))

    ######################
    # Model Descriptions #
    ######################
    print('Generating and compiling model...')

    # image model (CNN features)
    image_model = Sequential()
    image_model.add(Reshape(
        input_shape=(img_dim,), dims=(img_dim,)
    ))

    # language model (LSTM)
    language_model = Sequential()
    if args.lstm_hidden_layers == 1:
        language_model.add(LSTM(
            output_dim=args.lstm_hidden_units, return_sequences=False,
            input_shape=(max_len, word_vec_dim)
        ))
    else:
        # first layer and all middle layers emit full sequences; only the
        # last layer collapses the sequence to a single vector
        language_model.add(LSTM(
            output_dim=args.lstm_hidden_units, return_sequences=True,
            input_shape=(max_len, word_vec_dim)
        ))
        for i in range(args.lstm_hidden_layers-2):
            language_model.add(LSTM(
                output_dim=args.lstm_hidden_units, return_sequences=True
            ))
        language_model.add(LSTM(
            output_dim=args.lstm_hidden_units, return_sequences=False
        ))

    # feedforward model (MLP)
    model = Sequential()
    model.add(Merge(
        [language_model, image_model], mode='concat', concat_axis=1
    ))
    for i in range(args.mlp_hidden_layers):
        model.add(Dense(
            args.mlp_hidden_units, init='uniform'
        ))
        model.add(Activation(args.mlp_activation))
        model.add(Dropout(args.dropout))
    # the output lives in word-vector space (no softmax); answers are chosen
    # later by cosine similarity against the choice vectors
    model.add(Dense(word_vec_dim))

    json_string = model.to_json()
    model_filename = 'models/vgg_lstm_units_%i_layers_%i_mlp_units_%i_layers_%i_%s_lr%.1e_dropout%.2f' % (args.lstm_hidden_units, args.lstm_hidden_layers, args.mlp_hidden_units, args.mlp_hidden_layers, args.mlp_activation, args.learning_rate, args.dropout)
    # close the handle deterministically instead of relying on the GC
    with open(model_filename + '.json', 'w') as json_file:
        json_file.write(json_string)

    # loss and optimizer
    rmsprop = RMSprop(lr=args.learning_rate)
    model.compile(loss=Loss, optimizer=rmsprop)
    print('Compilation finished.')
    print('Time: %f s' % (time.time()-start_time))

    ########################################
    #  Load CNN Features and Word Vectors  #
    ########################################

    # load VGG features
    print('Loading VGG features...')
    VGG_features, img_map = LoadVGGFeatures()
    print('VGG features loaded')
    print('Time: %f s' % (time.time()-start_time))

    # load GloVe vectors
    print('Loading GloVe vectors...')
    word_embedding, word_map = LoadGloVe()
    print('GloVe vectors loaded')
    print('Time: %f s' % (time.time()-start_time))

    ######################
    #    Make Batches    #
    ######################

    print('Making batches...')

    # training batches (the last batch is padded with the final example)
    train_question_batches = [ b for b in MakeBatches(train_questions, args.batch_size, fillvalue=train_questions[-1]) ]
    train_answer_batches = [ b for b in MakeBatches(train_answers['toks'], args.batch_size, fillvalue=train_answers['toks'][-1]) ]
    train_image_batches = [ b for b in MakeBatches(train_image_ids, args.batch_size, fillvalue=train_image_ids[-1]) ]
    train_indices = list(range(len(train_question_batches)))

    # validation batches
    dev_question_batches = [ b for b in MakeBatches(dev_questions, args.batch_size, fillvalue=dev_questions[-1]) ]
    dev_answer_batches = [ b for b in MakeBatches(dev_answers['labs'], args.batch_size, fillvalue=dev_answers['labs'][-1]) ]
    dev_choice_batches = [ b for b in MakeBatches(dev_choices, args.batch_size, fillvalue=dev_choices[-1]) ]
    dev_image_batches = [ b for b in MakeBatches(dev_image_ids, args.batch_size, fillvalue=dev_image_ids[-1]) ]

    print('Finished making batches.')
    print('Time: %f s' % (time.time()-start_time))

    ######################
    #      Training      #
    ######################

    # `with` guarantees the accuracy file is flushed and closed even when
    # the interrupt handler below leaves via sys.exit()
    with open(args.dev_accuracy_path, 'w') as acc_file:
        dev_accs = []
        max_acc = -1
        max_acc_epoch = -1

        # define interrupt handler
        def PrintDevAcc():
            print('Max validation accuracy epoch: %i' % max_acc_epoch)
            print(dev_accs)

        def InterruptHandler(sig, frame):
            print(str(sig))
            PrintDevAcc()
            sys.exit(-1)

        signal.signal(signal.SIGINT, InterruptHandler)
        signal.signal(signal.SIGTERM, InterruptHandler)

        # training information, echoed to stdout and to the accuracy file
        info_lines = [
            '-'*80,
            'Training Information',
            '# of LSTM hidden units: %i' % args.lstm_hidden_units,
            '# of LSTM hidden layers: %i' % args.lstm_hidden_layers,
            '# of MLP hidden units: %i' % args.mlp_hidden_units,
            '# of MLP hidden layers: %i' % args.mlp_hidden_layers,
            'Dropout: %f' % args.dropout,
            'MLP activation function: %s' % args.mlp_activation,
            '# of training epochs: %i' % args.num_epochs,
            'Batch size: %i' % args.batch_size,
            'Learning rate: %f' % args.learning_rate,
            '# of train questions: %i' % len(train_questions),
            '# of dev questions: %i' % len(dev_questions),
            '-'*80,
        ]
        for line in info_lines:
            print(line)
        for line in info_lines:
            acc_file.write(line + '\n')

        # number of real (non-padding) examples in the last dev batch
        num_dev_batches = len(dev_question_batches)
        num_padding = args.batch_size * num_dev_batches - len(dev_questions)
        last_idx = args.batch_size - num_padding

        # start training
        print('Training started...')
        for k in range(args.num_epochs):
            print('-'*80)
            print('Epoch %i' % (k+1))
            progbar = generic_utils.Progbar(len(train_indices)*args.batch_size)
            # shuffle batch indices
            random.shuffle(train_indices)
            for i in train_indices:
                X_question_batch = GetQuestionsTensor(train_question_batches[i], word_embedding, word_map)
                X_image_batch = GetImagesMatrix(train_image_batches[i], img_map, VGG_features)
                Y_answer_batch = GetAnswersMatrix(train_answer_batches[i], word_embedding, word_map)
                loss = model.train_on_batch([X_question_batch, X_image_batch], Y_answer_batch)
                loss = loss[0].tolist()
                progbar.add(args.batch_size, values=[('train loss', loss)])
            print('Time: %f s' % (time.time()-start_time))

            # evaluate on dev set
            pbar = generic_utils.Progbar(num_dev_batches*args.batch_size)

            dev_correct = 0

            # feed forward
            for i in range(num_dev_batches):
                X_question_batch = GetQuestionsTensor(dev_question_batches[i], word_embedding, word_map)
                X_image_batch = GetImagesMatrix(dev_image_batches[i], img_map, VGG_features)
                prob = model.predict_proba([X_question_batch, X_image_batch], args.batch_size, verbose=0)

                # get word vecs of choices
                choice_feats = GetChoicesTensor(dev_choice_batches[i], word_embedding, word_map)
                similarity = np.zeros((5, args.batch_size), float)
                # calculate cosine distances
                for j in range(5):
                    similarity[j] = np.diag(cosine_similarity(prob, choice_feats[j]))
                # take argmax of cosine distances (choices are numbered 1-5)
                pred = np.argmax(similarity, axis=0) + 1

                labels = np.asarray(dev_answer_batches[i])
                if i == num_dev_batches - 1:
                    # Drop the padding of the last batch.  The labels must be
                    # taken from batch i; slicing the list of batches (as the
                    # previous code did) compared the wrong data.
                    labels = labels[:last_idx]
                    pred = pred[:last_idx]
                dev_correct += np.count_nonzero(labels == pred)
                pbar.add(args.batch_size)

            dev_acc = float(dev_correct)/len(dev_questions)
            dev_accs.append(dev_acc)
            print('Validation Accuracy: %f' % dev_acc)
            print('Time: %f s' % (time.time()-start_time))

            # keep only the weights of the best epoch so far
            if dev_acc > max_acc:
                max_acc = dev_acc
                max_acc_epoch = k
                model.save_weights(model_filename + '_best.hdf5', overwrite=True)

        print(dev_accs)
        for acc in dev_accs:
            acc_file.write('%f\n' % acc)
        print('Best validation accuracy: %f; epoch#%i' % (max_acc,(max_acc_epoch+1)))
        acc_file.write('Best validation accuracy: %f; epoch#%i\n' % (max_acc,(max_acc_epoch+1)))
        print('Training finished.')
        acc_file.write('Training finished.\n')
        print('Time: %f s' % (time.time()-start_time))
        acc_file.write('Time: %f s\n' % (time.time()-start_time))
def main():
    """Evaluate a trained LSTM model on the dev split and save predictions.

    Loads the model architecture (``--model``) and weights (``--weights``),
    answers every dev question by choosing the option (1-5) whose vector has
    the highest cosine similarity to the model output, prints the accuracy
    to stdout and stderr, and passes the predictions together with the dev
    id pairs to ``SavePredictions`` (target: ``--output``).
    """
    start_time = time.time()
    parser = argparse.ArgumentParser(
        prog='testLSTM.py',
        description='Test LSTM model for visual question answering')
    parser.add_argument('--model', type=str, required=True,
                        metavar='<model-path>')
    parser.add_argument('--weights', type=str, required=True,
                        metavar='<weights-path>')
    parser.add_argument('--output', type=str, required=True,
                        metavar='<prediction-path>')
    args = parser.parse_args()

    batch_size = 128

    #######################
    #      Load Model     #
    #######################

    print('Loading model and weights...')
    # close the handle deterministically instead of relying on the GC
    with open(args.model, 'r') as model_file:
        model = model_from_json(model_file.read())
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    model.load_weights(args.weights)
    print('Model and weights loaded.')
    print('Time: %f s' % (time.time() - start_time))

    ######################
    #      Load Data     #
    ######################

    print('Loading data...')

    dev_id_pairs, dev_image_ids = LoadIds('dev')
    dev_questions = LoadQuestions('dev')
    dev_choices = LoadChoices('dev')
    dev_answers = LoadAnswers('dev')

    print('Finished loading data.')
    print('Time: %f s' % (time.time() - start_time))

    #######################
    #  Load Word Vectors  #
    #######################

    # load GloVe vectors
    print('Loading GloVe vectors...')
    word_embedding, word_map = LoadGloVe()
    print('GloVe vectors loaded')
    print('Time: %f s' % (time.time() - start_time))

    ######################
    #    Make Batches    #
    ######################

    print('Making batches...')

    # validation batches (the last batch is padded with the final example)
    dev_question_batches = [
        b for b in MakeBatches(
            dev_questions, batch_size, fillvalue=dev_questions[-1])
    ]
    dev_answer_batches = [
        b for b in MakeBatches(
            dev_answers['labs'], batch_size, fillvalue=dev_answers['labs'][-1])
    ]
    dev_choice_batches = [
        b for b in MakeBatches(
            dev_choices, batch_size, fillvalue=dev_choices[-1])
    ]

    print('Finished making batches.')
    print('Time: %f s' % (time.time() - start_time))

    ######################
    #       Testing      #
    ######################

    # evaluate on dev set
    widgets = [
        'Evaluating ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
        ' ', ETA()
    ]
    pbar = ProgressBar(widgets=widgets)

    num_examples = len(dev_questions)
    dev_correct = 0
    predictions = []

    for i in pbar(range(len(dev_question_batches))):
        # feed forward
        X_question_batch = GetQuestionsTensor(dev_question_batches[i],
                                              word_embedding, word_map)
        prob = model.predict_proba(X_question_batch, batch_size, verbose=0)

        # get word vecs of choices
        choice_feats = GetChoicesTensor(dev_choice_batches[i], word_embedding,
                                        word_map)
        similarity = np.zeros((5, batch_size), float)
        # calculate cosine distances
        for j in range(5):
            similarity[j] = np.diag(cosine_similarity(prob, choice_feats[j]))
        # take argmax of cosine distances (choices are numbered 1-5)
        pred = np.argmax(similarity, axis=0) + 1

        # Only the first `num_valid` entries of a batch are real examples;
        # the rest is padding and must be neither scored nor saved.
        # Assumes MakeBatches chunks sequentially and pads only the tail.
        num_valid = min(batch_size, num_examples - i * batch_size)
        pred = pred[:num_valid]
        labels = np.asarray(dev_answer_batches[i])[:num_valid]

        predictions.extend(pred.tolist())
        dev_correct += np.count_nonzero(labels == pred)

    dev_acc = float(dev_correct) / num_examples
    print('Validation Accuracy: %f' % dev_acc)
    print('Validation Accuracy: %f' % dev_acc, file=sys.stderr)
    SavePredictions(args.output, predictions, dev_id_pairs)
    print('Time: %f s' % (time.time() - start_time))
    print('Time: %f s' % (time.time() - start_time), file=sys.stderr)
    print('Testing finished.')
    print('Testing finished.', file=sys.stderr)
def main():
    """Train the image+question LSTM model and track dev accuracy.

    Parses the hyper-parameters, loads the train/dev data from a hard-coded
    data directory, builds a stacked LSTM whose last layer has
    ``word_vec_dim`` outputs followed by a softmax, and trains it for
    ``--num-epochs`` epochs.  After each epoch every dev question is
    answered by picking the choice (1-5) whose vector has the highest cosine
    similarity to the model output; whenever the accuracy improves the
    weights are saved to ``<model_filename>_best.hdf5``.

    Progress is printed to stdout; a summary is mirrored to stderr.
    SIGINT/SIGTERM are routed to the module-level ``InterruptHandler``.
    """
    start_time = time.time()
    signal.signal(signal.SIGINT, InterruptHandler)
    signal.signal(signal.SIGTERM, InterruptHandler)

    parser = argparse.ArgumentParser(
        prog='trainLSTM.py',
        description='Train LSTM model for visual question answering')
    parser.add_argument('--lstm-hidden-units', type=int, default=512,
                        metavar='<lstm-hidden-units>')
    parser.add_argument('--lstm-hidden-layers', type=int, default=1,
                        metavar='<lstm-hidden-layers>')
    parser.add_argument('--num-epochs', type=int, default=100,
                        metavar='<num-epochs>')
    parser.add_argument('--batch-size', type=int, default=128,
                        metavar='<batch-size>')
    args = parser.parse_args()

    Inc_features_dim = 2048
    word_vec_dim = 300
    max_len = 30
    data_dir = '/home/mlds/data/0.2_val/'

    ######################
    #      Load Data     #
    ######################

    print('Loading data...')

    train_id_pairs, train_image_ids = LoadIds('train', data_dir)
    dev_id_pairs, dev_image_ids = LoadIds('dev', data_dir)

    train_questions = LoadQuestions('train', data_dir)
    dev_questions = LoadQuestions('dev', data_dir)

    train_choices = LoadChoices('train', data_dir)
    dev_choices = LoadChoices('dev', data_dir)

    train_answers = LoadAnswers('train', data_dir)
    dev_answers = LoadAnswers('dev', data_dir)

    print('Finished loading data.')
    print('Time: %f s' % (time.time() - start_time))

    print('-' * 100, file=sys.stderr)
    print('Training Information', file=sys.stderr)
    print('# of LSTM hidden units: %i' % args.lstm_hidden_units,
          file=sys.stderr)
    print('# of LSTM hidden layers: %i' % args.lstm_hidden_layers,
          file=sys.stderr)
    print('# of training epochs: %i' % args.num_epochs, file=sys.stderr)
    print('Batch size: %i' % args.batch_size, file=sys.stderr)
    print('# of train questions: %i' % len(train_questions), file=sys.stderr)
    print('# of dev questions: %i' % len(dev_questions), file=sys.stderr)
    print('-' * 100, file=sys.stderr)

    ######################
    # Model Descriptions #
    ######################

    # LSTM model
    # NOTE(review): an input layer and an output layer are always added, so
    # with --lstm-hidden-layers <= 2 (including the default of 1) the model
    # has exactly two LSTM layers -- confirm this is intended.
    model = Sequential()
    model.add(
        LSTM(output_dim=args.lstm_hidden_units,
             return_sequences=True,
             input_shape=(max_len, Inc_features_dim + word_vec_dim)))
    for i in range(args.lstm_hidden_layers - 2):
        model.add(
            LSTM(output_dim=args.lstm_hidden_units, return_sequences=True))
    model.add(LSTM(output_dim=word_vec_dim, return_sequences=False))
    model.add(Activation('softmax'))

    json_string = model.to_json()
    model_filename = 'models/lstm_units_%i_layers_%i' % (
        args.lstm_hidden_units, args.lstm_hidden_layers)
    # close the handle deterministically instead of relying on the GC
    with open(model_filename + '.json', 'w') as json_file:
        json_file.write(json_string)

    # loss and optimizer
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print('Compilation finished.')
    print('Time: %f s' % (time.time() - start_time))

    ##############################################
    #  Load Word Vectors and Inception Features  #
    ##############################################

    # load GloVe vectors
    print('Loading GloVe vectors...')
    word_embedding, word_map = LoadGloVe()
    print('GloVe vectors loaded')
    print('Time: %f s' % (time.time() - start_time))

    print('Loading Inception features...')
    Inc_features, img_map = LoadInceptionFeatures()
    print('Inception features loaded')
    print('Time: %f s' % (time.time() - start_time))

    ######################
    #    Make Batches    #
    ######################

    print('Making batches...')

    # training batches (the last batch is padded with the final example)
    train_question_batches = [
        b for b in MakeBatches(
            train_questions, args.batch_size, fillvalue=train_questions[-1])
    ]
    train_answer_batches = [
        b for b in MakeBatches(train_answers['toks'],
                               args.batch_size,
                               fillvalue=train_answers['toks'][-1])
    ]
    train_image_batches = [
        b for b in MakeBatches(
            train_image_ids, args.batch_size, fillvalue=train_image_ids[-1])
    ]
    train_indices = list(range(len(train_question_batches)))

    # validation batches
    dev_question_batches = [
        b for b in MakeBatches(
            dev_questions, args.batch_size, fillvalue=dev_questions[-1])
    ]
    dev_answer_batches = [
        b for b in MakeBatches(dev_answers['labs'],
                               args.batch_size,
                               fillvalue=dev_answers['labs'][-1])
    ]
    dev_image_batches = [
        b for b in MakeBatches(
            dev_image_ids, args.batch_size, fillvalue=dev_image_ids[-1])
    ]
    dev_choice_batches = [
        b for b in MakeBatches(
            dev_choices, args.batch_size, fillvalue=dev_choices[-1])
    ]

    print('Finished making batches.')
    print('Time: %f s' % (time.time() - start_time))

    ######################
    #      Training      #
    ######################

    num_dev_examples = len(dev_questions)
    dev_accs = []
    max_acc = -1
    max_acc_epoch = -1

    print('Training started...')

    for k in range(args.num_epochs):
        print('Epoch %i' % (k + 1), file=sys.stderr)
        print('-' * 80)
        print('Epoch %i' % (k + 1))
        progbar = generic_utils.Progbar(len(train_indices) * args.batch_size)
        # shuffle batch indices
        random.shuffle(train_indices)
        for i in train_indices:
            X_imgquestion_batch = GetImgQuestionsTensor(
                train_image_batches[i], Inc_features, img_map,
                train_question_batches[i], word_embedding, word_map)
            Y_answer_batch = GetAnswersMatrix(train_answer_batches[i],
                                              word_embedding, word_map)
            loss = model.train_on_batch(X_imgquestion_batch, Y_answer_batch)
            loss = loss[0].tolist()
            progbar.add(args.batch_size, values=[('train loss', loss)])

        # evaluate on dev set
        progbar = generic_utils.Progbar(
            len(dev_question_batches) * args.batch_size)

        dev_correct = 0

        for i in range(len(dev_question_batches)):
            # feed forward
            X_imgquestion_batch = GetImgQuestionsTensor(
                dev_image_batches[i], Inc_features, img_map,
                dev_question_batches[i], word_embedding, word_map)
            prob = model.predict_proba(X_imgquestion_batch,
                                       args.batch_size,
                                       verbose=0)

            # get word vecs of choices
            choice_feats = GetChoicesTensor(dev_choice_batches[i],
                                            word_embedding, word_map)
            similarity = np.zeros((5, args.batch_size), float)
            # calculate cosine distances
            for j in range(5):
                similarity[j] = np.diag(
                    cosine_similarity(prob, choice_feats[j]))
            # take argmax of cosine distances (choices are numbered 1-5)
            pred = np.argmax(similarity, axis=0) + 1

            # Score only the real examples: the tail of the last batch is
            # padding and would otherwise be counted towards the accuracy.
            # Assumes MakeBatches chunks sequentially and pads only the tail.
            num_valid = min(args.batch_size,
                            num_dev_examples - i * args.batch_size)
            labels = np.asarray(dev_answer_batches[i])[:num_valid]
            dev_correct += np.count_nonzero(labels == pred[:num_valid])
            progbar.add(args.batch_size)

        dev_acc = float(dev_correct) / num_dev_examples
        dev_accs.append(dev_acc)
        print('Validation Accuracy: %f' % dev_acc)
        print('Validation Accuracy: %f' % dev_acc, file=sys.stderr)
        print('Time: %f s' % (time.time() - start_time))
        print('Time: %f s' % (time.time() - start_time), file=sys.stderr)

        # keep only the weights of the best epoch so far
        if dev_acc > max_acc:
            max_acc = dev_acc
            max_acc_epoch = k
            model.save_weights(model_filename + '_best.hdf5', overwrite=True)

    print(dev_accs, file=sys.stderr)
    # report the epoch 1-based, matching the 'Epoch %i' lines printed above
    print('Best validation accuracy: epoch#%i' % (max_acc_epoch + 1))
    print('Training finished.')
    print('Training finished.', file=sys.stderr)
    print('Time: %f s' % (time.time() - start_time))
    print('Time: %f s' % (time.time() - start_time), file=sys.stderr)
def main():
    """Evaluate a weighted ensemble of a VGG-based and an Inception-based model.

    Loads both models and their weights, runs each on the dev split with its
    own image features, mixes the two outputs as
    ``0.25 * vgg + 0.75 * inception`` and answers every question with the
    choice (1-5) whose vector has the highest cosine similarity to the mixed
    output.  Prints the resulting validation accuracy.
    """
    start_time = time.time()
    parser = argparse.ArgumentParser(prog='valLSTM_MLP.py',
            description='Test LSTM-MLP model for visual question answering')
    parser.add_argument('--model-vgg', type=str, required=True, metavar='<model-path>')
    parser.add_argument('--weights-vgg', type=str, required=True, metavar='<weights-path>')
    parser.add_argument('--model-inc', type=str, required=True, metavar='<model-path>')
    parser.add_argument('--weights-inc', type=str, required=True, metavar='<weights-path>')
    args = parser.parse_args()

    batch_size = 128
    # ensemble mixing weights (sum to 1)
    vgg_weight = 0.25
    inc_weight = 1 - vgg_weight

    #######################
    #     Load Models     #
    #######################

    print('Loading models and weights...')
    # close the handles deterministically instead of relying on the GC
    with open(args.model_vgg, 'r') as model_file:
        model_vgg = model_from_json(model_file.read())
    model_vgg.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    model_vgg.load_weights(args.weights_vgg)

    with open(args.model_inc, 'r') as model_file:
        model_inc = model_from_json(model_file.read())
    model_inc.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    model_inc.load_weights(args.weights_inc)
    print('Models and weights loaded.')
    print('Time: %f s' % (time.time()-start_time))

    ######################
    #      Load Data     #
    ######################

    print('Loading data...')

    # NOTE(review): the train split is loaded and batched below but nothing
    # in this function consumes it (a training-accuracy pass used to be
    # disabled here) -- consider dropping it to save start-up time.
    train_id_pairs, train_image_ids = LoadIds('train')
    dev_id_pairs, dev_image_ids = LoadIds('dev')

    train_questions = LoadQuestions('train')
    dev_questions = LoadQuestions('dev')

    train_choices = LoadChoices('train')
    dev_choices = LoadChoices('dev')

    train_answers = LoadAnswers('train')
    dev_answers = LoadAnswers('dev')

    print('Finished loading data.')
    print('Time: %f s' % (time.time()-start_time))

    ########################################
    #  Load CNN Features and Word Vectors  #
    ########################################

    # load VGG features
    print('Loading VGG features...')
    VGG_features, vgg_img_map = LoadVGGFeatures()
    print('VGG features loaded')
    print('Time: %f s' % (time.time()-start_time))

    # load Inception features
    print('Loading Inception features...')
    INC_features, inc_img_map = LoadInceptionFeatures()
    print('Inception features loaded')
    print('Time: %f s' % (time.time()-start_time))

    # load GloVe vectors
    print('Loading GloVe vectors...')
    word_embedding, word_map = LoadGloVe()
    print('GloVe vectors loaded')
    print('Time: %f s' % (time.time()-start_time))

    ######################
    #    Make Batches    #
    ######################

    print('Making batches...')

    # train batches
    train_question_batches = [ b for b in MakeBatches(train_questions, batch_size, fillvalue=train_questions[-1]) ]
    train_answer_batches = [ b for b in MakeBatches(train_answers['labs'], batch_size, fillvalue=train_answers['labs'][-1]) ]
    train_choice_batches = [ b for b in MakeBatches(train_choices, batch_size, fillvalue=train_choices[-1]) ]
    train_image_batches = [ b for b in MakeBatches(train_image_ids, batch_size, fillvalue=train_image_ids[-1]) ]

    # validation batches (the last batch is padded with the final example)
    dev_question_batches = [ b for b in MakeBatches(dev_questions, batch_size, fillvalue=dev_questions[-1]) ]
    dev_answer_batches = [ b for b in MakeBatches(dev_answers['labs'], batch_size, fillvalue=dev_answers['labs'][-1]) ]
    dev_choice_batches = [ b for b in MakeBatches(dev_choices, batch_size, fillvalue=dev_choices[-1]) ]
    dev_image_batches = [ b for b in MakeBatches(dev_image_ids, batch_size, fillvalue=dev_image_ids[-1]) ]

    print('Finished making batches.')
    print('Time: %f s' % (time.time()-start_time))

    ######################
    #       Testing      #
    ######################

    # number of real (non-padding) examples in the last dev batch
    num_dev_batches = len(dev_question_batches)
    num_padding = batch_size * num_dev_batches - len(dev_questions)
    last_idx = batch_size - num_padding

    # evaluate on dev set
    pbar = generic_utils.Progbar(num_dev_batches*batch_size)

    dev_correct = 0

    for i in range(num_dev_batches):
        # feed forward
        X_question_batch = GetQuestionsTensor(dev_question_batches[i], word_embedding, word_map)
        X_vgg_image_batch = GetImagesMatrix(dev_image_batches[i], vgg_img_map, VGG_features)
        X_inc_image_batch = GetImagesMatrix(dev_image_batches[i], inc_img_map, INC_features)
        prob_vgg = model_vgg.predict_proba([X_question_batch, X_vgg_image_batch], batch_size, verbose=0)
        prob_inc = model_inc.predict_proba([X_question_batch, X_inc_image_batch], batch_size, verbose=0)
        prob = (vgg_weight*prob_vgg + inc_weight*prob_inc)

        # get word vecs of choices
        choice_feats = GetChoicesTensor(dev_choice_batches[i], word_embedding, word_map)
        similarity = np.zeros((5, batch_size), float)
        # calculate cosine distances
        for j in range(5):
            similarity[j] = np.diag(cosine_similarity(prob, choice_feats[j]))
        # take argmax of cosine distances (choices are numbered 1-5)
        pred = np.argmax(similarity, axis=0) + 1

        labels = np.asarray(dev_answer_batches[i])
        if i == num_dev_batches - 1:
            # Drop the padding of the last batch.  The labels must be taken
            # from batch i; slicing the list of batches (as the previous
            # code did) compared the wrong data.
            labels = labels[:last_idx]
            pred = pred[:last_idx]
        dev_correct += np.count_nonzero(labels == pred)
        pbar.add(batch_size)

    print('Validation accuracy: %f' % (float(dev_correct)/len(dev_questions)))

    print('Time: %f s' % (time.time()-start_time))
    print('Testing finished.')
def main():
    """Evaluate a trained LSTM-MLP model and tally errors by QA type.

    Loads the model (``--model``) and weights (``--weights``), scores both
    the dev and the train split using Inception image features, and for
    every wrongly answered question increments the per-question-type and
    per-answer-type counters returned by ``LoadQAType``.  Prints the
    accuracies, the counters and the totals returned by ``LoadTotalQAType``.
    """
    start_time = time.time()
    parser = argparse.ArgumentParser(
        prog='valLSTM_MLP.py',
        description='Test LSTM-MLP model for visual question answering')
    parser.add_argument('--model', type=str, required=True,
                        metavar='<model-path>')
    parser.add_argument('--weights', type=str, required=True,
                        metavar='<weights-path>')
    args = parser.parse_args()

    batch_size = 128

    #######################
    #      Load Model     #
    #######################

    print('Loading model and weights...')
    # close the handle deterministically instead of relying on the GC
    with open(args.model, 'r') as model_file:
        model = model_from_json(model_file.read())
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    model.load_weights(args.weights)
    print('Model and weights loaded.')
    print('Time: %f s' % (time.time() - start_time))

    ######################
    #      Load Data     #
    ######################

    print('Loading data...')

    train_id_pairs, train_image_ids = LoadIds('train')
    dev_id_pairs, dev_image_ids = LoadIds('dev')

    train_questions = LoadQuestions('train')
    dev_questions = LoadQuestions('dev')

    train_choices = LoadChoices('train')
    dev_choices = LoadChoices('dev')

    train_answers = LoadAnswers('train')
    dev_answers = LoadAnswers('dev')

    train_qa_type, train_qtype_count, train_atype_count = LoadQAType('train')
    dev_qa_type, dev_qtype_count, dev_atype_count = LoadQAType('dev')
    train_total_qtype_count, train_total_atype_count = LoadTotalQAType('train')
    dev_total_qtype_count, dev_total_atype_count = LoadTotalQAType('dev')

    print('Finished loading data.')
    print('Time: %f s' % (time.time() - start_time))

    ########################################
    #  Load CNN Features and Word Vectors  #
    ########################################

    # load VGG features
    # NOTE(review): the VGG features are loaded but never used below (the
    # evaluation feeds Inception features) -- consider removing this load.
    print('Loading VGG features...')
    VGG_features, img_map = LoadVGGFeatures()
    print('VGG features loaded')
    print('Time: %f s' % (time.time() - start_time))

    # load Inception features
    print('Loading Inception features...')
    INC_features, inc_img_map = LoadInceptionFeatures()
    print('Inception features loaded')
    print('Time: %f s' % (time.time() - start_time))

    # load GloVe vectors
    print('Loading GloVe vectors...')
    word_embedding, word_map = LoadGloVe()
    print('GloVe vectors loaded')
    print('Time: %f s' % (time.time() - start_time))

    ######################
    #    Make Batches    #
    ######################

    print('Making batches...')

    # train batches (the last batch is padded with the final example)
    train_question_batches = [
        b for b in MakeBatches(
            train_questions, batch_size, fillvalue=train_questions[-1])
    ]
    train_answer_batches = [
        b for b in MakeBatches(train_answers['labs'],
                               batch_size,
                               fillvalue=train_answers['labs'][-1])
    ]
    train_choice_batches = [
        b for b in MakeBatches(
            train_choices, batch_size, fillvalue=train_choices[-1])
    ]
    train_image_batches = [
        b for b in MakeBatches(
            train_image_ids, batch_size, fillvalue=train_image_ids[-1])
    ]
    # pad with a QA-type entry (not an id pair) so every element of a batch
    # has the same (q_type, a_type) form
    train_qatype_batches = [
        b for b in MakeBatches(
            train_qa_type, batch_size, fillvalue=train_qa_type[-1])
    ]

    # validation batches
    dev_question_batches = [
        b for b in MakeBatches(
            dev_questions, batch_size, fillvalue=dev_questions[-1])
    ]
    dev_answer_batches = [
        b for b in MakeBatches(
            dev_answers['labs'], batch_size, fillvalue=dev_answers['labs'][-1])
    ]
    dev_choice_batches = [
        b for b in MakeBatches(
            dev_choices, batch_size, fillvalue=dev_choices[-1])
    ]
    dev_image_batches = [
        b for b in MakeBatches(
            dev_image_ids, batch_size, fillvalue=dev_image_ids[-1])
    ]
    dev_qatype_batches = [
        b for b in MakeBatches(
            dev_qa_type, batch_size, fillvalue=dev_qa_type[-1])
    ]

    print('Finished making batches.')
    print('Time: %f s' % (time.time() - start_time))

    ######################
    #       Testing      #
    ######################

    def evaluate(question_batches, answer_batches, choice_batches,
                 image_batches, qatype_batches, num_examples,
                 qtype_count, atype_count):
        """Score one split and return the number of correct predictions.

        ``qtype_count`` and ``atype_count`` are incremented in place for
        every incorrectly answered (non-padding) question.
        """
        pbar = generic_utils.Progbar(len(question_batches) * batch_size)
        correct = 0
        for i in range(len(question_batches)):
            # feed forward
            X_question_batch = GetQuestionsTensor(question_batches[i],
                                                  word_embedding, word_map)
            X_image_batch = GetImagesMatrix(image_batches[i], inc_img_map,
                                            INC_features)
            prob = model.predict_proba([X_question_batch, X_image_batch],
                                       batch_size,
                                       verbose=0)

            # get word vecs of choices
            choice_feats = GetChoicesTensor(choice_batches[i],
                                            word_embedding, word_map)
            similarity = np.zeros((5, batch_size), float)
            # calculate cosine distances
            for j in range(5):
                similarity[j] = np.diag(
                    cosine_similarity(prob, choice_feats[j]))
            # take argmax of cosine distances (choices are numbered 1-5)
            pred = np.argmax(similarity, axis=0) + 1

            # Only the first `num_valid` entries are real examples; the tail
            # of the last batch is padding and must not be scored or tallied.
            # Assumes MakeBatches chunks sequentially and pads only the tail.
            num_valid = min(batch_size, num_examples - i * batch_size)
            pred = pred[:num_valid]
            labels = np.asarray(answer_batches[i])[:num_valid]

            correct += np.count_nonzero(labels == pred)

            # count the incorrect question and answer types
            qatype_batch = qatype_batches[i]
            for idx in np.nonzero(labels != pred)[0].tolist():
                q_type, a_type = qatype_batch[idx]
                qtype_count[q_type] += 1
                atype_count[a_type] += 1
            pbar.add(batch_size)
        return correct

    # evaluate on dev set
    dev_correct = evaluate(dev_question_batches, dev_answer_batches,
                           dev_choice_batches, dev_image_batches,
                           dev_qatype_batches, len(dev_questions),
                           dev_qtype_count, dev_atype_count)
    print('Validation accuracy: %f' %
          (float(dev_correct) / len(dev_questions)))

    # evaluate on train set
    train_correct = evaluate(train_question_batches, train_answer_batches,
                             train_choice_batches, train_image_batches,
                             train_qatype_batches, len(train_questions),
                             train_qtype_count, train_atype_count)
    print('Training accuracy: %f' %
          (float(train_correct) / len(train_questions)))

    print('Validation QA types:')
    print(dev_qtype_count)
    print(dev_atype_count)
    print('Training QA types:')
    print(train_qtype_count)
    print(train_atype_count)
    print('Total QA types:')
    print(train_total_qtype_count)
    print(train_total_atype_count)
    print(dev_total_qtype_count)
    print(dev_total_atype_count)
    print('Time: %f s' % (time.time() - start_time))
    print('Testing finished.')
def main():
    """Train an MLP that maps CNN image features to an answer word vector.

    Hyper-parameters are read from the command line. After every epoch the
    dev set is scored by choosing, for each sample, the answer choice whose
    word vector has the highest cosine similarity with the model output.

    Side effects:
        * writes the model architecture to ``<model_filename>.json``;
        * writes periodic, best and final weights to
          ``<model_filename>_*.hdf5``;
        * prints progress to stdout and a summary log to stderr.
    """
    start_time = time.time()

    # SIGKILL cannot be intercepted, so only SIGINT and SIGTERM are hooked.
    # NOTE(review): InterruptHandler is not defined inside this function; it
    # is presumably a module-level handler -- confirm.
    signal.signal(signal.SIGINT, InterruptHandler)
    signal.signal(signal.SIGTERM, InterruptHandler)

    parser = argparse.ArgumentParser(
        prog='trainMLP.py',
        description='Train MLP model for visual question answering')
    parser.add_argument('--mlp-hidden-units', type=int, default=1024,
                        metavar='<mlp-hidden-units>')
    parser.add_argument('--mlp-hidden-layers', type=int, default=3,
                        metavar='<mlp-hidden-layers>')
    parser.add_argument('--dropout', type=float, default=0.5,
                        metavar='<dropout-rate>')
    parser.add_argument('--mlp-activation', type=str, default='relu',
                        metavar='<activation-function>')
    parser.add_argument('--num-epochs', type=int, default=100,
                        metavar='<num-epochs>')
    parser.add_argument('--model-save-interval', type=int, default=5,
                        metavar='<interval>')
    parser.add_argument('--batch-size', type=int, default=128,
                        metavar='<batch-size>')
    args = parser.parse_args()

    word_vec_dim = 300
    img_dim = 4096
    num_choices = 5  # every sample is scored against five answer choices

    ######################
    #      Load Data     #
    ######################

    print('Loading data...')

    train_id_pairs, train_image_ids = LoadIds('train')
    dev_id_pairs, dev_image_ids = LoadIds('dev')

    train_choices = LoadChoices('train')
    dev_choices = LoadChoices('dev')

    train_answers = LoadAnswers('train')
    dev_answers = LoadAnswers('dev')

    print('Finished loading data.')
    print('Time: %f s' % (time.time() - start_time))

    info_lines = [
        '-' * 100,
        'Training Information',
        '# of MLP hidden units: %i' % args.mlp_hidden_units,
        '# of MLP hidden layers: %i' % args.mlp_hidden_layers,
        'Dropout: %f' % args.dropout,
        'MLP activation function: %s' % args.mlp_activation,
        '# of training epochs: %i' % args.num_epochs,
        'Batch size: %i' % args.batch_size,
        '# of train images: %i' % len(train_image_ids),
        '# of dev images: %i' % len(dev_image_ids),
        '-' * 100,
    ]
    for line in info_lines:
        print(line, file=sys.stderr)

    ######################
    # Model Descriptions #
    ######################

    # MLP model
    model = Sequential()
    model.add(Dense(output_dim=args.mlp_hidden_units, input_dim=img_dim))
    model.add(Activation(args.mlp_activation))
    model.add(Dropout(args.dropout))
    for _ in range(args.mlp_hidden_layers - 1):
        model.add(Dense(args.mlp_hidden_units))
        model.add(Activation(args.mlp_activation))
        model.add(Dropout(args.dropout))
    model.add(Dense(word_vec_dim))
    model.add(Activation('softmax'))

    json_string = model.to_json()
    model_filename = 'models/mlp_units_%i_layers_%i' % (
        args.mlp_hidden_units, args.mlp_hidden_layers)
    # Use a context manager so the architecture file is flushed and closed.
    with open(model_filename + '.json', 'w') as json_file:
        json_file.write(json_string)

    # loss and optimizer
    model.compile(loss='categorical_crossentropy', optimizer='adagrad')
    print('Compilation finished.')
    print('Time: %f s' % (time.time() - start_time))

    ########################################
    #  Load CNN Features and Word Vectors  #
    ########################################

    # load VGG features
    print('Loading VGG features...')
    VGG_features, img_map = LoadVGGFeatures()
    print('VGG features loaded')
    print('Time: %f s' % (time.time() - start_time))

    # load GloVe vectors
    print('Loading GloVe vectors...')
    word_embedding, word_map = LoadGloVe()
    print('GloVe vectors loaded')
    print('Time: %f s' % (time.time() - start_time))

    ######################
    #    Make Batches    #
    ######################

    print('Making batches...')

    # training batches
    train_answer_batches = list(MakeBatches(
        train_answers['toks'], args.batch_size,
        fillvalue=train_answers['toks'][-1]))
    train_image_batches = list(MakeBatches(
        train_image_ids, args.batch_size, fillvalue=train_image_ids[-1]))
    # These indices select *batches*, so they must range over the number of
    # batches, not the number of samples (which would overrun the batch list).
    train_indices = list(range(len(train_image_batches)))

    # validation batches
    dev_answer_batches = list(MakeBatches(
        dev_answers['labs'], args.batch_size,
        fillvalue=dev_answers['labs'][-1]))
    dev_choice_batches = list(MakeBatches(
        dev_choices, args.batch_size, fillvalue=dev_choices[-1]))
    dev_image_batches = list(MakeBatches(
        dev_image_ids, args.batch_size, fillvalue=dev_image_ids[-1]))

    print('Finished making batches.')
    print('Time: %f s' % (time.time() - start_time))

    # The last dev batch is padded with copies of the final sample up to
    # batch_size; only its first `last_valid` entries are real samples.
    num_dev_batches = len(dev_image_batches)
    last_valid = len(dev_image_ids) - args.batch_size * (num_dev_batches - 1)

    ######################
    #      Training      #
    ######################

    dev_accs = []
    max_acc = -1
    max_acc_epoch = -1

    print('Training started...')

    for k in range(args.num_epochs):
        print('Epoch %i' % (k + 1), file=sys.stderr)
        print('-' * 80)
        print('Epoch %i' % (k + 1))
        progbar = generic_utils.Progbar(len(train_indices) * args.batch_size)

        # shuffle batch indices
        random.shuffle(train_indices)
        for i in train_indices:
            X_image_batch = GetImagesMatrix(
                train_image_batches[i], img_map, VGG_features)
            Y_answer_batch = GetAnswersMatrix(
                train_answer_batches[i], word_embedding, word_map)
            loss = model.train_on_batch(X_image_batch, Y_answer_batch)
            loss = loss[0].tolist()
            progbar.add(args.batch_size, values=[('train loss', loss)])

        if k % args.model_save_interval == 0:
            model.save_weights(
                model_filename + '_epoch_{:03d}.hdf5'.format(k + 1),
                overwrite=True)

        # evaluate on dev set
        widgets = [
            'Evaluating ', Percentage(), ' ',
            Bar(marker='#', left='[', right=']'), ' ', ETA()
        ]
        pbar = ProgressBar(widgets=widgets, redirect_stdout=True)

        dev_correct = 0

        for i in pbar(range(num_dev_batches)):
            # feed forward
            X_image_batch = GetImagesMatrix(
                dev_image_batches[i], img_map, VGG_features)
            prob = model.predict_proba(
                X_image_batch, args.batch_size, verbose=0)

            # get word vecs of choices
            choice_feats = GetChoicesTensor(
                dev_choice_batches[i], word_embedding, word_map)
            similarity = np.zeros((num_choices, args.batch_size), float)
            # calculate cosine distances
            for j in range(num_choices):
                similarity[j] = np.diag(
                    cosine_similarity(prob, choice_feats[j]))
            # take argmax of cosine distances (answer labels are 1-based)
            pred = np.argmax(similarity, axis=0) + 1

            # Ignore the padded tail of the last batch so duplicated samples
            # do not inflate the accuracy.
            valid = last_valid if i == num_dev_batches - 1 else args.batch_size
            answers = np.asarray(dev_answer_batches[i])
            dev_correct += np.count_nonzero(answers[:valid] == pred[:valid])

        dev_acc = float(dev_correct) / len(dev_image_ids)
        dev_accs.append(dev_acc)
        print('Validation Accuracy: %f' % dev_acc)
        print('Validation Accuracy: %f' % dev_acc, file=sys.stderr)
        print('Time: %f s' % (time.time() - start_time))
        print('Time: %f s' % (time.time() - start_time), file=sys.stderr)

        if dev_acc > max_acc:
            max_acc = dev_acc
            max_acc_epoch = k
            model.save_weights(model_filename + '_best.hdf5', overwrite=True)

    # NOTE(review): the original layout was lost; this save is assumed to run
    # once after the last epoch -- confirm. overwrite=True because the
    # periodic checkpoint above may already have written the same file name.
    if args.num_epochs > 0:
        model.save_weights(
            model_filename + '_epoch_{:03d}.hdf5'.format(args.num_epochs),
            overwrite=True)
    print(dev_accs, file=sys.stderr)
    # max_acc_epoch is reported 0-based, as before.
    print('Best validation accuracy: epoch#%i' % max_acc_epoch)
    print('Training finished.')
    print('Training finished.', file=sys.stderr)
    print('Time: %f s' % (time.time() - start_time))
    print('Time: %f s' % (time.time() - start_time), file=sys.stderr)
def main():
    """Train the memory-network VQA model and log dev accuracy per epoch.

    Hyper-parameters are read from the command line. The graph model built by
    ``CreateGraph`` is fed question tensors on its ``'question'`` input and
    caption tensors on its ``'image'`` input. After every epoch the dev set
    is scored by choosing, for each question, the answer choice whose word
    vector has the highest cosine similarity with the model output.

    Side effects:
        * writes the model architecture to ``<model_filename>.json``;
        * writes the best weights to ``<model_filename>_best.hdf5``;
        * (re)creates ``--dev-accuracy-path`` and appends one accuracy line
          per epoch plus a final summary;
        * installs SIGINT/SIGTERM handlers that print the accuracies seen so
          far and exit with status -1.
    """
    start_time = time.time()

    parser = argparse.ArgumentParser(
        prog='trainMemNN.py',
        description='Train MemmNN model for visual question answering')
    parser.add_argument('--mlp-hidden-units', type=int, default=1024,
                        metavar='<mlp-hidden-units>')
    parser.add_argument('--mlp-hidden-layers', type=int, default=3,
                        metavar='<mlp-hidden-layers>')
    parser.add_argument('--mlp-activation', type=str, default='tanh',
                        metavar='<activation-function>')
    parser.add_argument('--emb-dimension', type=int, default=50,
                        metavar='<embedding-dimension>')
    parser.add_argument('--num-epochs', type=int, default=100,
                        metavar='<num-epochs>')
    parser.add_argument('--batch-size', type=int, default=128,
                        metavar='<batch-size>')
    parser.add_argument('--hops', type=int, default=3,
                        metavar='<memnet-hops>')
    parser.add_argument('--learning-rate', type=float, default=0.001,
                        metavar='<learning-rate>')
    parser.add_argument('--dropout', type=float, default=0.2,
                        metavar='<dropout-rate>')
    parser.add_argument('--dev-accuracy-path', type=str, required=True,
                        metavar='<accuracy-path>')
    args = parser.parse_args()

    word_vec_dim = 300
    img_dim = 300
    img_feature_num = 125
    num_choices = 5  # every question is scored against five answer choices

    ######################
    #      Load Data     #
    ######################
    data_dir = '/home/mlds/data/0.05_val/'

    print('Loading data...')

    train_q_ids, train_image_ids = LoadIds('train', data_dir)
    dev_q_ids, dev_image_ids = LoadIds('dev', data_dir)

    train_questions = LoadQuestions('train', data_dir)
    dev_questions = LoadQuestions('dev', data_dir)

    train_choices = LoadChoices('train', data_dir)
    dev_choices = LoadChoices('dev', data_dir)

    train_answers = LoadAnswers('train', data_dir)
    dev_answers = LoadAnswers('dev', data_dir)

    # The same caption map is used for both the train and the dev batches.
    caption_map = LoadCaptions('train')

    print('Finished loading data.')
    print('Time: %f s' % (time.time() - start_time))

    ######################
    # Model Descriptions #
    ######################
    print('Generating and compiling model...')
    model = CreateGraph(args.emb_dimension, args.hops, args.mlp_activation,
                        args.mlp_hidden_units, args.mlp_hidden_layers,
                        word_vec_dim, img_dim, img_feature_num, args.dropout)

    json_string = model.to_json()
    model_filename = 'models/memNN.mlp_units_%i_layers_%i_%s.emb_dim_%i.hops_%i.lr%.1e.dropout_%.1e' % (
        args.mlp_hidden_units, args.mlp_hidden_layers, args.mlp_activation,
        args.emb_dimension, args.hops, args.learning_rate, args.dropout)
    # Use a context manager so the architecture file is flushed and closed.
    with open(model_filename + '.json', 'w') as json_file:
        json_file.write(json_string)

    # loss and optimizer
    rmsprop = RMSprop(lr=args.learning_rate)
    model.compile(loss={'output': Loss}, optimizer=rmsprop)
    print('Compilation finished.')
    print('Time: %f s' % (time.time() - start_time))

    ########################################
    #           Load Word Vectors          #
    ########################################

    # VGG features are not loaded here: the 'image' input of the graph is
    # fed caption tensors instead.

    # load GloVe vectors
    print('Loading GloVe vectors...')
    word_embedding, word_map = LoadGloVe()
    print('GloVe vectors loaded')
    print('Time: %f s' % (time.time() - start_time))

    ######################
    #    Make Batches    #
    ######################

    print('Making batches...')

    # training batches
    train_question_batches = list(MakeBatches(
        train_questions, args.batch_size, fillvalue=train_questions[-1]))
    train_answer_batches = list(MakeBatches(
        train_answers['toks'], args.batch_size,
        fillvalue=train_answers['toks'][-1]))
    train_image_batches = list(MakeBatches(
        train_image_ids, args.batch_size, fillvalue=train_image_ids[-1]))
    train_indices = list(range(len(train_question_batches)))

    # validation batches
    dev_question_batches = list(MakeBatches(
        dev_questions, args.batch_size, fillvalue=dev_questions[-1]))
    dev_answer_batches = list(MakeBatches(
        dev_answers['labs'], args.batch_size,
        fillvalue=dev_answers['labs'][-1]))
    dev_choice_batches = list(MakeBatches(
        dev_choices, args.batch_size, fillvalue=dev_choices[-1]))
    dev_image_batches = list(MakeBatches(
        dev_image_ids, args.batch_size, fillvalue=dev_image_ids[-1]))

    print('Finished making batches.')
    print('Time: %f s' % (time.time() - start_time))

    # The last dev batch is padded with copies of the final sample up to
    # batch_size; only its first `last_valid` entries are real questions.
    num_dev_batches = len(dev_question_batches)
    num_padding = args.batch_size * num_dev_batches - len(dev_questions)
    last_valid = args.batch_size - num_padding

    ######################
    #      Training      #
    ######################

    dev_accs = []
    max_acc = -1
    max_acc_epoch = -1

    # define interrupt handler
    def PrintDevAcc():
        print('Max validation accuracy epoch: %i' % max_acc_epoch)
        print(dev_accs)

    def InterruptHandler(sig, frame):
        print(str(sig))
        PrintDevAcc()
        sys.exit(-1)

    signal.signal(signal.SIGINT, InterruptHandler)
    signal.signal(signal.SIGTERM, InterruptHandler)

    # print training information (same banner goes to stdout and the log)
    info_lines = [
        '-' * 80,
        'Training Information',
        '# of MLP hidden units: %i' % args.mlp_hidden_units,
        '# of MLP hidden layers: %i' % args.mlp_hidden_layers,
        'MLP activation function: %s' % args.mlp_activation,
        '# of training epochs: %i' % args.num_epochs,
        'Batch size: %i' % args.batch_size,
        'Learning rate: %f' % args.learning_rate,
        '# of train questions: %i' % len(train_questions),
        '# of dev questions: %i' % len(dev_questions),
        '-' * 80,
    ]
    for line in info_lines:
        print(line)
    # Mode 'w' truncates any previous log at this path.
    with open(args.dev_accuracy_path, 'w') as acc_file:
        acc_file.write('\n'.join(info_lines) + '\n')

    # start training
    print('Training started...')
    for k in range(args.num_epochs):
        print('-' * 80)
        print('Epoch %i' % (k + 1))
        progbar = generic_utils.Progbar(len(train_indices) * args.batch_size)

        # shuffle batch indices
        random.shuffle(train_indices)
        for i in train_indices:
            X_question_batch = GetQuestionsTensor(
                train_question_batches[i], word_embedding, word_map)
            X_caption_batch = GetCaptionsTensor2(
                train_image_batches[i], word_embedding, word_map, caption_map)
            Y_answer_batch = GetAnswersMatrix(
                train_answer_batches[i], word_embedding, word_map)
            loss = model.train_on_batch({
                'question': X_question_batch,
                'image': X_caption_batch,
                'output': Y_answer_batch
            })
            loss = loss[0].tolist()
            progbar.add(args.batch_size, values=[('train loss', loss)])
        print('Time: %f s' % (time.time() - start_time))

        # evaluate on dev set
        pbar = generic_utils.Progbar(num_dev_batches * args.batch_size)

        dev_correct = 0

        # feed forward
        for i in range(num_dev_batches):
            X_question_batch = GetQuestionsTensor(
                dev_question_batches[i], word_embedding, word_map)
            X_caption_batch = GetCaptionsTensor2(
                dev_image_batches[i], word_embedding, word_map, caption_map)
            prob = model.predict_on_batch({
                'question': X_question_batch,
                'image': X_caption_batch
            })
            prob = prob[0]

            # get word vecs of choices
            choice_feats = GetChoicesTensor(
                dev_choice_batches[i], word_embedding, word_map)
            similarity = np.zeros((num_choices, args.batch_size), float)
            # calculate cosine distances
            for j in range(num_choices):
                similarity[j] = np.diag(
                    cosine_similarity(prob, choice_feats[j]))
            # take argmax of cosine distances (answer labels are 1-based)
            pred = np.argmax(similarity, axis=0) + 1

            # Only the real (un-padded) entries of the last batch count.
            # The slice must be taken from batch i itself, not from the list
            # of batches.
            valid = last_valid if i == num_dev_batches - 1 else args.batch_size
            answers = np.asarray(dev_answer_batches[i])
            dev_correct += np.count_nonzero(answers[:valid] == pred[:valid])
            pbar.add(args.batch_size)

        dev_acc = float(dev_correct) / len(dev_questions)
        dev_accs.append(dev_acc)
        with open(args.dev_accuracy_path, 'a') as acc_file:
            acc_file.write('%f\n' % dev_acc)
        print('Validation Accuracy: %f' % dev_acc)
        print('Time: %f s' % (time.time() - start_time))

        if dev_acc > max_acc:
            max_acc = dev_acc
            max_acc_epoch = k
            model.save_weights(model_filename + '_best.hdf5', overwrite=True)

    # Final summary: the best epoch is reported 1-based here.
    best_line = 'Best validation accuracy: %f; epoch#%i' % (
        max_acc, (max_acc_epoch + 1))
    print(dev_accs)
    print(best_line)
    print('Training finished.')
    time_line = 'Time: %f s' % (time.time() - start_time)
    print(time_line)
    with open(args.dev_accuracy_path, 'a') as acc_file:
        acc_file.write(best_line + '\n')
        acc_file.write('Training finished.\n')
        acc_file.write(time_line + '\n')