def main(): parser = argparse.ArgumentParser() parser.add_argument( '--model', type=str, default= '../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3.json' ) parser.add_argument( '--weights', type=str, default= '../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3_epoch_070.hdf5' ) parser.add_argument('--sample_size', type=int, default=25) parser.add_argument('--caffe', help='path to caffe installation') parser.add_argument('--model_def', help='path to model definition prototxt') parser.add_argument('--vggmodel', default='VGG_ILSVRC_16_layers.caffemodel', help='path to model parameters') args = parser.parse_args() print 'Loading Word2vec' nlp = English() print 'Loaded word2vec features' labelencoder = joblib.load('../models/labelencoder.pkl') print 'Loading Model' model = model_from_json(open(args.model).read()) print 'Loading Weights' model.load_weights(args.weights) model.compile(loss='categorical_crossentropy', optimizer='rmsprop') print 'Loaded' q = True while q: path = str(raw_input('Enter path to image : ')) if path != 'same': base_dir = os.path.dirname(path) os.system('python extract_features.py --caffe ' + str(args.caffe) + ' --model_def vgg_features.prototxt --gpu --model ' + str(args.vggmodel) + ' --image ' + path) print 'Loading VGGfeats' vgg_model_path = os.path.join(base_dir + '/vgg_feats.mat') features_struct = scipy.io.loadmat(vgg_model_path) VGGfeatures = features_struct['feats'] print "Loaded" question = unicode(raw_input("Ask a question: ")) if question == "quit": q = False timesteps = len(nlp(question)) X_q = get_questions_tensor_timeseries([question], nlp, timesteps) X_i = np.reshape(VGGfeatures, (1, 4096)) X = [X_q, X_i] y_predict = model.predict_classes(X, verbose=0) print labelencoder.inverse_transform(y_predict)
def main(): """ Before runnning this demo ensure that you have some images from the MS COCO validation set saved somewhere, and update the image_dir variable accordingly Also, this demo is designed to run with the models released with the visual-qa repo, if you would like to get use it with some other model (say an MLP based model or a langauge-only model) you will have to make some changes. """ image_dir = "../../vqa_images/" local_images = [f for f in listdir(image_dir) if isfile(join(image_dir, f))] parser = argparse.ArgumentParser() parser.add_argument( "-model", type=str, default="../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3.json", ) parser.add_argument( "-weights", type=str, default="../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3_epoch_070.hdf5", ) parser.add_argument("-sample_size", type=int, default=25) args = parser.parse_args() model = model_from_json(open(args.model).read()) model.load_weights(args.weights) model.compile(loss="categorical_crossentropy", optimizer="rmsprop") print "Model loaded and compiled" images_val = open("../data/preprocessed/images_val2014.txt", "r").read().decode("utf8").splitlines() nlp = English() print "Loaded word2vec features" labelencoder = joblib.load("../models/labelencoder.pkl") vgg_model_path = "../features/coco/vgg_feats.mat" features_struct = scipy.io.loadmat(vgg_model_path) VGGfeatures = features_struct["feats"] print "Loaded vgg features" image_ids = open("../features/coco_vgg_IDMap.txt").read().splitlines() img_map = {} for ids in image_ids: id_split = ids.split() img_map[id_split[0]] = int(id_split[1]) image_sample = random.sample(local_images, args.sample_size) for image in image_sample: p = subprocess.Popen(["display", image_dir + image]) q = unicode(raw_input("Ask a question about the image:")) coco_id = str(int(image[-16:-4])) timesteps = len(nlp(q)) # questions sorted in descending order of length X_q = 
get_questions_tensor_timeseries([q], nlp, timesteps) X_i = get_images_matrix([coco_id], img_map, VGGfeatures) X = [X_q, X_i] y_predict = model.predict_classes(X, verbose=0) print labelencoder.inverse_transform(y_predict) raw_input("Press enter to continue...") p.kill()
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model', type=str, default='../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3.json') parser.add_argument('--weights', type=str, default='../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3_epoch_070.hdf5') parser.add_argument('--sample_size', type=int, default=25) parser.add_argument('--caffe', help='path to caffe installation') parser.add_argument('--model_def', help='path to model definition prototxt') parser.add_argument('--vggmodel', default='VGG_ILSVRC_16_layers.caffemodel', help='path to model parameters') args = parser.parse_args() print 'Loading Word2vec' nlp = English() print 'Loaded word2vec features' labelencoder = joblib.load('../models/labelencoder.pkl') print 'Loading Model' model = model_from_json(open(args.model).read()) print 'Loading Weights' model.load_weights(args.weights) model.compile(loss='categorical_crossentropy', optimizer='rmsprop') print 'Loaded' q = True while q: path = str(raw_input('Enter path to image : ')) if path != 'same': base_dir = os.path.dirname(path) os.system('python extract_features.py --caffe ' + str(args.caffe) + ' --model_def vgg_features.prototxt --gpu --model ' + str(args.vggmodel) + ' --image ' + path ) print 'Loading VGGfeats' vgg_model_path = os.path.join(base_dir + '/vgg_feats.mat') features_struct = scipy.io.loadmat(vgg_model_path) VGGfeatures = features_struct['feats'] print "Loaded" question = unicode(raw_input("Ask a question: ")) if question == "quit": q = False timesteps = len(nlp(question)) X_q = get_questions_tensor_timeseries([question], nlp, timesteps) X_i = np.reshape(VGGfeatures, (1, 4096)) X = [X_q, X_i] y_predict = model.predict_classes(X, verbose=0) print labelencoder.inverse_transform(y_predict)
def predict(): path = str(raw_input('Enter path to image : ')) question = unicode(raw_input("Ask a question: ")) print(question, path) if path != 'same': base_dir = os.path.dirname(path) os.system('python extract_features.py --caffe ' + str(CAFFE_PATH) + ' --model_def vgg_features.prototxt --gpu --model ' + str(CAFFE_MODEL_PATH) + ' --image ' + path ) print 'Loading VGGfeats' vgg_model_path = os.path.join(base_dir + '/vgg_feats.mat') features_struct = scipy.io.loadmat(vgg_model_path) VGGfeatures = features_struct['feats'] print "Loaded" timesteps = len(nlp(question)) X_q = get_questions_tensor_timeseries([question], nlp, timesteps) X_i = np.reshape(VGGfeatures, (1, 4096)) X = [X_q, X_i] y_predict = model.predict_classes(X, verbose=0) ans = labelencoder.inverse_transform(y_predict) print(ans) return 'OK'
def main(): parser = argparse.ArgumentParser() parser.add_argument('-num_hidden_units', type=int, default=512) parser.add_argument('-num_lstm_layers', type=int, default=2) parser.add_argument('-dropout', type=float, default=0.2) parser.add_argument('-activation', type=str, default='tanh') parser.add_argument('-num_epochs', type=int, default=100) parser.add_argument('-model_save_interval', type=int, default=5) parser.add_argument('-batch_size', type=int, default=128) parser.add_argument('-word_vector', type=str, default='') args = parser.parse_args() questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines() questions_lengths_train = open('../data/preprocessed/questions_lengths_train2014.txt', 'r').read().decode('utf8').splitlines() answers_train = open('../data/preprocessed/answers_train2014.txt', 'r').read().decode('utf8').splitlines() images_train = open('../data/preprocessed/images_train2014.txt', 'r').read().decode('utf8').splitlines() max_answers = 1000 questions_train, answers_train, images_train = selectFrequentAnswers(questions_train,answers_train,images_train, max_answers) print 'Loaded questions, sorting by length...' 
questions_lengths_train, questions_train, answers_train = (list(t) for t in zip(*sorted(zip(questions_lengths_train, questions_train, answers_train)))) #encode the remaining answers labelencoder = preprocessing.LabelEncoder() labelencoder.fit(answers_train) nb_classes = len(list(labelencoder.classes_)) joblib.dump(labelencoder,'../models/labelencoder.pkl') max_len = 30 #25 is max for training, 27 is max for validation word_vec_dim = 300 model = Sequential() model.add(LSTM(output_dim = args.num_hidden_units, activation='tanh', return_sequences=True, input_shape=(max_len, word_vec_dim))) model.add(Dropout(args.dropout)) model.add(LSTM(args.num_hidden_units, return_sequences=False)) model.add(Dense(nb_classes, init='uniform')) model.add(Activation('softmax')) json_string = model.to_json() model_file_name = '../models/lstm_language_only_num_hidden_units_' + str(args.num_hidden_units) + '_num_lstm_layers_' + str(args.num_lstm_layers) + '_dropout_' + str(args.dropout) open(model_file_name + '.json', 'w').write(json_string) print 'Compiling model...' model.compile(loss='categorical_crossentropy', optimizer='rmsprop') print 'Compilation done...' #set up word vectors # Code to choose the word vectors, default is Goldberg but GLOVE is preferred if args.word_vector == 'glove': nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') else: nlp = English() print 'loaded ' + args.word_vector + ' word2vec features...' ## training # Moved few variables to args.parser (num_epochs, batch_size, model_save_interval) print 'Training started...' 
for k in xrange(args.num_epochs): progbar = generic_utils.Progbar(len(questions_train)) for qu_batch,an_batch,im_batch in zip(grouper(questions_train, args.batch_size, fillvalue=questions_train[0]), grouper(answers_train, args.batch_size, fillvalue=answers_train[0]), grouper(images_train, args.batch_size, fillvalue=images_train[0])): timesteps = len(nlp(qu_batch[-1])) #questions sorted in descending order of length X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps) Y_batch = get_answers_matrix(an_batch, labelencoder) loss = model.train_on_batch(X_q_batch, Y_batch) # fix for the Keras v0.3 issue #9 progbar.add(args.batch_size, values=[("train loss", loss[0])]) if k%args.model_save_interval == 0: model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k)) model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k+1))
def main(): parser = argparse.ArgumentParser() parser.add_argument('-model', type=str, required=True) parser.add_argument('-weights', type=str, required=True) parser.add_argument('-results', type=str, required=True) args = parser.parse_args() model = model_from_json(open(args.model).read()) model.load_weights(args.weights) model.compile(loss='categorical_crossentropy', optimizer='rmsprop') questions_val = open('../data/preprocessed/questions_val2014.txt', 'r').read().decode('utf8').splitlines() questions_lengths_val = open('../data/preprocessed/questions_lengths_val2014.txt', 'r').read().decode('utf8').splitlines() answers_val = open('../data/preprocessed/answers_val2014.txt', 'r').read().decode('utf8').splitlines() images_val = open('../data/preprocessed/images_val2014.txt', 'r').read().decode('utf8').splitlines() vgg_model_path = '../features/coco/vgg_feats.mat' questions_lengths_val, questions_val, answers_val, images_val = (list(t) for t in zip(*sorted(zip(questions_lengths_val, questions_val, answers_val, images_val)))) print 'Model compiled, weights loaded' labelencoder = joblib.load('../models/labelencoder.pkl') features_struct = scipy.io.loadmat(vgg_model_path) VGGfeatures = features_struct['feats'] print 'Loaded vgg features' image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines() img_map = {} for ids in image_ids: id_split = ids.split() img_map[id_split[0]] = int(id_split[1]) nlp = English() print 'Loaded word2vec features' nb_classes = 1000 y_predict_text = [] batchSize = 128 widgets = ['Evaluating ', Percentage(), ' ', Bar(marker='#',left='[',right=']'), ' ', ETA()] pbar = ProgressBar(widgets=widgets) for qu_batch,an_batch,im_batch in pbar(zip(grouper(questions_val, batchSize, fillvalue=questions_val[0]), grouper(answers_val, batchSize, fillvalue=answers_val[0]), grouper(images_val, batchSize, fillvalue=images_val[0]))): timesteps = len(nlp(qu_batch[-1])) #questions sorted in descending order of length X_q_batch = 
get_questions_tensor_timeseries(qu_batch, nlp, timesteps) if 'language_only' in args.model: X_batch = X_q_batch else: X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures) X_batch = [X_q_batch, X_i_batch] y_predict = model.predict_classes(X_batch, verbose=0) y_predict_text.extend(labelencoder.inverse_transform(y_predict)) incorrect_val=0 correct_val=0 f1 = open(args.results, 'w') for prediction, truth, question, image in zip(y_predict_text, answers_val, questions_val, images_val): temp_count=0 for _truth in truth.split(';'): if prediction == _truth: temp_count+=1 if temp_count>2: correct_val+=1 else: incorrect_val+=1 f1.write(question.encode('utf-8')) f1.write('\n') f1.write(image.encode('utf-8')) f1.write('\n') f1.write(prediction) f1.write('\n') f1.write(truth.encode('utf-8')) f1.write('\n') f1.write('\n') f1.write('Final Accuracy is ' + str(float(correct_val)/(incorrect_val+correct_val))) f1.close() f1 = open('../results/overall_results.txt', 'a') f1.write(args.weights + '\n') f1.write(str(float(correct_val)/(incorrect_val+correct_val)) + '\n\n') f1.close() print 'Final Accuracy on the validation set is', float(correct_val)/(incorrect_val+correct_val)
def main(): parser = argparse.ArgumentParser() parser.add_argument('-num_hidden_units_mlp', type=int, default=1024) parser.add_argument('-num_hidden_units_lstm', type=int, default=512) parser.add_argument('-num_hidden_layers_mlp', type=int, default=3) parser.add_argument('-num_hidden_layers_lstm', type=int, default=1) parser.add_argument('-dropout', type=float, default=0.5) parser.add_argument('-activation_mlp', type=str, default='tanh') parser.add_argument('-num_epochs', type=int, default=100) parser.add_argument('-model_save_interval', type=int, default=5) parser.add_argument('-batch_size', type=int, default=128) #TODO Feature parser.add_argument('-resume_training', type=str) #TODO Feature parser.add_argument('-language_only', type=bool, default= False) args = parser.parse_args() word_vec_dim = 300 img_dim = 4096 max_len = 30 nb_classes = 1000 #get the data questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines() questions_lengths_train = open( '../data/preprocessed/questions_lengths_train2014.txt', 'r').read().decode('utf8').splitlines() answers_train = open('../data/preprocessed/answers_train2014.txt', 'r').read().decode('utf8').splitlines() images_train = open('../data/preprocessed/images_train2014.txt', 'r').read().decode('utf8').splitlines() vgg_model_path = '../features/coco/vgg_feats.mat' max_answers = nb_classes questions_train, answers_train, images_train = selectFrequentAnswers( questions_train, answers_train, images_train, max_answers) questions_lengths_train, questions_train, answers_train, images_train = ( list(t) for t in zip(*sorted( zip(questions_lengths_train, questions_train, answers_train, images_train)))) #encode the remaining answers labelencoder = preprocessing.LabelEncoder() labelencoder.fit(answers_train) nb_classes = len(list(labelencoder.classes_)) joblib.dump(labelencoder, '../models/labelencoder.pkl') image_model = Sequential() image_model.add(Reshape(input_shape=(img_dim, ), 
dims=(img_dim, ))) language_model = Sequential() if args.num_hidden_layers_lstm == 1: language_model.add( LSTM(output_dim=args.num_hidden_units_lstm, return_sequences=False, input_shape=(max_len, word_vec_dim))) else: language_model.add( LSTM(output_dim=args.num_hidden_units_lstm, return_sequences=True, input_shape=(max_len, word_vec_dim))) for i in xrange(args.num_hidden_layers_lstm - 2): language_model.add( LSTM(output_dim=args.num_hidden_units_lstm, return_sequences=True)) language_model.add( LSTM(output_dim=args.num_hidden_units_lstm, return_sequences=False)) model = Sequential() model.add( Merge([language_model, image_model], mode='concat', concat_axis=1)) for i in xrange(args.num_hidden_layers_mlp): model.add(Dense(args.num_hidden_units_mlp, init='uniform')) model.add(Activation(args.activation_mlp)) model.add(Dropout(args.dropout)) model.add(Dense(nb_classes)) model.add(Activation('softmax')) json_string = model.to_json() model_file_name = '../models/lstm_1_num_hidden_units_lstm_' + str(args.num_hidden_units_lstm) + \ '_num_hidden_units_mlp_' + str(args.num_hidden_units_mlp) + '_num_hidden_layers_mlp_' + \ str(args.num_hidden_layers_mlp) + '_num_hidden_layers_lstm_' + str(args.num_hidden_layers_lstm) open(model_file_name + '.json', 'w').write(json_string) model.compile(loss='categorical_crossentropy', optimizer='rmsprop') print 'Compilation done' features_struct = scipy.io.loadmat(vgg_model_path) VGGfeatures = features_struct['feats'] print 'loaded vgg features' image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines() img_map = {} for ids in image_ids: id_split = ids.split() img_map[id_split[0]] = int(id_split[1]) nlp = English() print 'loaded word2vec features...' ## training print 'Training started...' 
for k in xrange(args.num_epochs): progbar = generic_utils.Progbar(len(questions_train)) for qu_batch, an_batch, im_batch in zip( grouper(questions_train, args.batch_size, fillvalue=questions_train[-1]), grouper(answers_train, args.batch_size, fillvalue=answers_train[-1]), grouper(images_train, args.batch_size, fillvalue=images_train[-1])): timesteps = len(nlp( qu_batch[-1])) #questions sorted in descending order of length X_q_batch = get_questions_tensor_timeseries( qu_batch, nlp, timesteps) X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures) Y_batch = get_answers_matrix(an_batch, labelencoder) loss = model.train_on_batch([X_q_batch, X_i_batch], Y_batch) progbar.add(args.batch_size, values=[("train loss", loss)]) if k % args.model_save_interval == 0: model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(k)) model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(k))
def main(): parser = argparse.ArgumentParser() parser.add_argument('-num_hidden_units', type=int, default=512) parser.add_argument('-num_lstm_layers', type=int, default=2) parser.add_argument('-dropout', type=float, default=0.2) parser.add_argument('-activation', type=str, default='tanh') args = parser.parse_args() questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines() questions_lengths_train = open( '../data/preprocessed/questions_lengths_train2014.txt', 'r').read().decode('utf8').splitlines() answers_train = open('../data/preprocessed/answers_train2014.txt', 'r').read().decode('utf8').splitlines() images_train = open('../data/preprocessed/images_train2014.txt', 'r').read().decode('utf8').splitlines() max_answers = 1000 questions_train, answers_train, images_train = selectFrequentAnswers( questions_train, answers_train, images_train, max_answers) print 'Loaded questions, sorting by length...' questions_lengths_train, questions_train, answers_train = ( list(t) for t in zip(*sorted( zip(questions_lengths_train, questions_train, answers_train)))) #encode the remaining answers labelencoder = preprocessing.LabelEncoder() labelencoder.fit(answers_train) nb_classes = len(list(labelencoder.classes_)) joblib.dump(labelencoder, '../models/labelencoder.pkl') max_len = 30 #25 is max for training, 27 is max for validation word_vec_dim = 300 model = Sequential() model.add( LSTM(output_dim=args.num_hidden_units, activation='tanh', return_sequences=True, input_shape=(max_len, word_vec_dim))) model.add(Dropout(args.dropout)) model.add(LSTM(args.num_hidden_units, return_sequences=False)) model.add(Dense(nb_classes, init='uniform')) model.add(Activation('softmax')) json_string = model.to_json() model_file_name = '../models/lstm_language_only_num_hidden_units_' + str( args.num_hidden_units) + '_num_lstm_layers_' + str( args.num_lstm_layers) + '_dropout_' + str(args.dropout) open(model_file_name + '.json', 'w').write(json_string) 
print 'Compiling model...' model.compile(loss='categorical_crossentropy', optimizer='rmsprop') print 'Compilation done...' #set up word vectors nlp = English() print 'loaded word2vec features...' ## training print 'Training started...' numEpochs = 100 model_save_interval = 5 batchSize = 128 for k in xrange(numEpochs): progbar = generic_utils.Progbar(len(questions_train)) for qu_batch, an_batch, im_batch in zip( grouper(questions_train, batchSize, fillvalue=questions_train[0]), grouper(answers_train, batchSize, fillvalue=answers_train[0]), grouper(images_train, batchSize, fillvalue=images_train[0])): timesteps = len(nlp( qu_batch[-1])) #questions sorted in descending order of length X_q_batch = get_questions_tensor_timeseries( qu_batch, nlp, timesteps) Y_batch = get_answers_matrix(an_batch, labelencoder) loss = model.train_on_batch(X_q_batch, Y_batch) progbar.add(batchSize, values=[("train loss", loss)]) if k % model_save_interval == 0: model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k)) model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k + 1))
def main(): parser = argparse.ArgumentParser() parser.add_argument('-model', type=str, required=True) parser.add_argument('-weights', type=str, required=True) parser.add_argument('-results', type=str, required=True) args = parser.parse_args() model = model_from_json(open(args.model).read()) model.load_weights(args.weights) model.compile(loss='categorical_crossentropy', optimizer='rmsprop') questions_val = open('../data/preprocessed/questions_val2014.txt', 'r').read().decode('utf8').splitlines() questions_lengths_val = open( '../data/preprocessed/questions_lengths_val2014.txt', 'r').read().decode('utf8').splitlines() answers_val = open('../data/preprocessed/answers_val2014_all.txt', 'r').read().decode('utf8').splitlines() images_val = open('../data/preprocessed/images_val2014.txt', 'r').read().decode('utf8').splitlines() vgg_model_path = '../features/coco/vgg_feats.mat' questions_lengths_val, questions_val, answers_val, images_val = ( list(t) for t in zip(*sorted( zip(questions_lengths_val, questions_val, answers_val, images_val)))) print 'Model compiled, weights loaded' labelencoder = joblib.load('../models/labelencoder.pkl') features_struct = scipy.io.loadmat(vgg_model_path) VGGfeatures = features_struct['feats'] print 'Loaded vgg features' image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines() img_map = {} for ids in image_ids: id_split = ids.split() img_map[id_split[0]] = int(id_split[1]) nlp = English() print 'Loaded word2vec features' nb_classes = 1000 y_predict_text = [] batchSize = 128 widgets = [ 'Evaluating ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] pbar = ProgressBar(widgets=widgets) for qu_batch, an_batch, im_batch in pbar( zip(grouper(questions_val, batchSize, fillvalue=questions_val[0]), grouper(answers_val, batchSize, fillvalue=answers_val[0]), grouper(images_val, batchSize, fillvalue=images_val[0]))): timesteps = len(nlp( qu_batch[-1])) #questions sorted in descending order of length X_q_batch = 
get_questions_tensor_timeseries(qu_batch, nlp, timesteps) if 'language_only' in args.model: X_batch = X_q_batch else: X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures) X_batch = [X_q_batch, X_i_batch] y_predict = model.predict_classes(X_batch, verbose=0) y_predict_text.extend(labelencoder.inverse_transform(y_predict)) total = 0 correct_val = 0.0 f1 = open(args.results, 'w') for prediction, truth, question, image in zip(y_predict_text, answers_val, questions_val, images_val): temp_count = 0 for _truth in truth.split(';'): if prediction == _truth: temp_count += 1 if temp_count > 2: correct_val += 1 else: correct_val += float(temp_count) / 3 total += 1 f1.write(question.encode('utf-8')) f1.write('\n') f1.write(image.encode('utf-8')) f1.write('\n') f1.write(prediction) f1.write('\n') f1.write(truth.encode('utf-8')) f1.write('\n') f1.write('\n') f1.write('Final Accuracy is ' + str(correct_val / total)) f1.close() f1 = open('../results/overall_results.txt', 'a') f1.write(args.weights + '\n') f1.write(str(correct_val / total) + '\n\n') f1.close() print 'Final Accuracy on the validation set is', correct_val / total
def main(): ''' Before runnning this demo ensure that you have some images from the MS COCO validation set saved somewhere, and update the image_dir variable accordingly Also, this demo is designed to run with the models released with the visual-qa repo, if you would like to get use it with some other model (say an MLP based model or a langauge-only model) you will have to make some changes. ''' image_dir = '../../vqa_images/' local_images = [ f for f in listdir(image_dir) if isfile(join(image_dir, f)) ] parser = argparse.ArgumentParser() parser.add_argument( '-model', type=str, default= '../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3_num_hidden_layers_lstm_1.json' ) parser.add_argument( '-weights', type=str, default= '../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3_num_hidden_layers_lstm_1_epoch_199.hdf5' ) parser.add_argument('-sample_size', type=int, default=25) args = parser.parse_args() model = model_from_json(open(args.model).read()) model.load_weights(args.weights) model.compile(loss='categorical_crossentropy', optimizer='rmsprop') print 'Model loaded and compiled' images_val = open('../data/preprocessed/images_val2014.txt', 'r').read().decode('utf8').splitlines() nlp = English() print 'Loaded word2vec features' labelencoder = joblib.load('../models/labelencoder.pkl') vgg_model_path = '../features/coco/vgg_feats.mat' features_struct = scipy.io.loadmat(vgg_model_path) VGGfeatures = features_struct['feats'] print 'Loaded vgg features' image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines() img_map = {} for ids in image_ids: id_split = ids.split() img_map[id_split[0]] = int(id_split[1]) image_sample = random.sample(local_images, args.sample_size) for image in image_sample: p = subprocess.Popen(["display", image_dir + image]) q = unicode(raw_input("Ask a question about the image:")) coco_id = str(int(image[-16:-4])) timesteps = len( nlp(q)) #questions sorted 
in descending order of length X_q = get_questions_tensor_timeseries([q], nlp, timesteps) X_i = get_images_matrix([coco_id], img_map, VGGfeatures) X = [X_q, X_i] y_predict = model.predict_classes(X, verbose=0) print labelencoder.inverse_transform(y_predict) raw_input('Press enter to continue...') p.kill()
def main(): parser = argparse.ArgumentParser() parser.add_argument('-num_hidden_units_mlp', type=int, default=1024) parser.add_argument('-num_hidden_units_lstm', type=int, default=512) parser.add_argument('-num_hidden_layers_mlp', type=int, default=3) parser.add_argument('-num_hidden_layers_lstm', type=int, default=1) parser.add_argument('-dropout', type=float, default=0.5) parser.add_argument('-activation_mlp', type=str, default='tanh') parser.add_argument('-num_epochs', type=int, default=100) parser.add_argument('-model_save_interval', type=int, default=5) parser.add_argument('-batch_size', type=int, default=128) parser.add_argument('-word_vector', type=str, default='') #TODO Feature parser.add_argument('-resume_training', type=str) #TODO Feature parser.add_argument('-language_only', type=bool, default= False) args = parser.parse_args() word_vec_dim= 300 img_dim = 4096 max_len = 30 nb_classes = 1000 #get the data questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines() questions_lengths_train = open('../data/preprocessed/questions_lengths_train2014.txt', 'r').read().decode('utf8').splitlines() answers_train = open('../data/preprocessed/answers_train2014_modal.txt', 'r').read().decode('utf8').splitlines() images_train = open('../data/preprocessed/images_train2014.txt', 'r').read().decode('utf8').splitlines() vgg_model_path = '../features/coco/vgg_feats.mat' max_answers = nb_classes questions_train, answers_train, images_train = selectFrequentAnswers(questions_train,answers_train,images_train, max_answers) questions_lengths_train, questions_train, answers_train, images_train = (list(t) for t in zip(*sorted(zip(questions_lengths_train, questions_train, answers_train, images_train)))) #encode the remaining answers labelencoder = preprocessing.LabelEncoder() labelencoder.fit(answers_train) nb_classes = len(list(labelencoder.classes_)) joblib.dump(labelencoder,'../models/labelencoder.pkl') image_model = Sequential() 
image_model.add(Reshape(input_shape = (img_dim,), dims=(img_dim,))) language_model = Sequential() if args.num_hidden_layers_lstm == 1: language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=False, input_shape=(max_len, word_vec_dim))) else: language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=True, input_shape=(max_len, word_vec_dim))) for i in xrange(args.num_hidden_layers_lstm-2): language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=True)) language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=False)) model = Sequential() model.add(Merge([language_model, image_model], mode='concat', concat_axis=1)) for i in xrange(args.num_hidden_layers_mlp): model.add(Dense(args.num_hidden_units_mlp, init='uniform')) model.add(Activation(args.activation_mlp)) model.add(Dropout(args.dropout)) model.add(Dense(nb_classes)) model.add(Activation('softmax')) json_string = model.to_json() model_file_name = '../models/lstm_1_num_hidden_units_lstm_' + str(args.num_hidden_units_lstm) + \ '_num_hidden_units_mlp_' + str(args.num_hidden_units_mlp) + '_num_hidden_layers_mlp_' + \ str(args.num_hidden_layers_mlp) + '_num_hidden_layers_lstm_' + str(args.num_hidden_layers_lstm) open(model_file_name + '.json', 'w').write(json_string) model.compile(loss='categorical_crossentropy', optimizer='rmsprop') print 'Compilation done' features_struct = scipy.io.loadmat(vgg_model_path) VGGfeatures = features_struct['feats'] print 'loaded vgg features' image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines() img_map = {} for ids in image_ids: id_split = ids.split() img_map[id_split[0]] = int(id_split[1]) # Code to choose the word vectors, default is Goldberg but GLOVE is preferred if args.word_vector == 'glove': nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') else: nlp = English() print 'loaded ' + args.word_vector + ' word2vec features...' 
## training print 'Training started...' for k in xrange(args.num_epochs): progbar = generic_utils.Progbar(len(questions_train)) for qu_batch,an_batch,im_batch in zip(grouper(questions_train, args.batch_size, fillvalue=questions_train[-1]), grouper(answers_train, args.batch_size, fillvalue=answers_train[-1]), grouper(images_train, args.batch_size, fillvalue=images_train[-1])): timesteps = len(nlp(qu_batch[-1])) #questions sorted in descending order of length X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps) X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures) Y_batch = get_answers_matrix(an_batch, labelencoder) loss = model.train_on_batch([X_q_batch, X_i_batch], Y_batch) # fix for the Keras v0.3 issue #9 progbar.add(args.batch_size, values=[("train loss", loss[0])]) if k%args.model_save_interval == 0: model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(k)) model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(k))
def process_input(self, question):
    """Turn ``question`` into the time-series tensor used as model input.

    Delegates to ``get_questions_tensor_timeseries``, passing this object's
    ``_nlp`` and ``_max_len`` as the second and third arguments.
    """
    encoded = get_questions_tensor_timeseries(
        question,
        self._nlp,
        self._max_len,
    )
    return encoded
def main():
    """Evaluate a saved VQA model on the validation split and dump its answers.

    Command-line arguments:
      -model         JSON dump of the saved model structure.
      -weights       Saved weights (checkpoint); ``labelencoder.pkl`` is loaded
                     from the directory that contains this file.
      -results       Text file receiving one question / image / prediction /
                     truth record per example, followed by the final accuracy.
      -results_json  File receiving a JSON list of
                     ``{"answer": ..., "question_id": ...}`` entries, one per
                     line, for the official VQA toolkit.
      -dataroot      Root directory of the preprocessed data and features
                     (default ``/data/vqa``).

    Every prediction is scored as ``min(1, matches / 3)``, where ``matches`` is
    the number of ``;``-separated ground-truth answers equal to the prediction.
    The mean score is written to the results file and printed.
    """
    # Local imports keep this block self-contained; both are standard library.
    import json
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument('-model', type=str, required=True,
                        help="JSON dump of saved model structure.")
    parser.add_argument('-weights', type=str, required=True,
                        help="Saved weights (checkpoint).")
    parser.add_argument('-results', type=str, required=True,
                        help="File where to write the results.")
    parser.add_argument('-results_json', type=str, required=True,
                        help="File where to dump the evaluation results in "
                             "JSON format, so that the official VQA toolkit "
                             "can read it.")
    parser.add_argument('-dataroot', type=str, default='/data/vqa')
    args = parser.parse_args()
    root = args.dataroot

    # Close the structure file as soon as it has been read.
    with open(args.model) as model_file:
        model = model_from_json(model_file.read())
    model.load_weights(args.weights)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    questions_val = lines(pjoin(root, 'Preprocessed', 'questions_val2014.txt'))
    questions_id = lines(
        pjoin(root, 'Preprocessed', 'questions_id_val2014.txt'))
    answers_val = lines(pjoin(root, 'Preprocessed', 'answers_val2014_all.txt'))
    images_val = lines(pjoin(root, 'Preprocessed', 'images_val2014_all.txt'))
    vgg_model_path = pjoin(root, 'coco', 'vgg_feats.mat')
    print('Model compiled, weights loaded...')

    # Load the encoder which converts answers to IDs, saved in the same
    # folder as the rest of the dumps. dirname() also copes with a weights
    # path that has no directory part, where slicing at rfind('/') == -1
    # would silently drop the last character of the file name.
    exp_root = os.path.dirname(args.weights)
    labelencoder = joblib.load(pjoin(exp_root, 'labelencoder.pkl'))

    features_struct = scipy.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct['feats']
    print('loaded vgg features')

    # Each line of the ID map is "<image id> <row index into the features>".
    image_ids = lines(pjoin(root, 'coco_vgg_IDMap.txt'))
    img_map = {}
    for ids in image_ids:
        id_split = ids.split()
        img_map[id_split[0]] = int(id_split[1])

    nlp = English()
    print('loaded word2vec features')

    y_predict_text = []

    # TODO(andrei): Configure this via args.
    batch_size = 512
    batches = batchify(batch_size, questions_val, answers_val, images_val)
    language_only = 'language_only' in args.model
    with click.progressbar(batches) as pbar:
        for (qu_batch, an_batch, im_batch) in pbar:
            # TODO(Bernhard): make this choose the right preprocessing and right model,
            # for now you have to plug it in manually
            #X_q_batch = get_questions_matrix_sum(qu_batch, nlp)          # for sum up model
            X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, 20)  # for LSTM model
            if language_only:
                y_predict = model.predict_classes([X_q_batch], verbose=0)
            else:
                X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures)
                y_predict = model.predict_classes([X_q_batch, X_i_batch],
                                                  verbose=0)
            # TODO(Bernhard): verify that predict_classes sets dropout to 0
            y_predict_text.extend(labelencoder.inverse_transform(y_predict))

    correct_val = 0.0
    total = 0
    print("Will dump resulting answers in JSON format to file: [{0}]".format(
        args.results_json))
    all_preds = list(
        zip(y_predict_text, answers_val, questions_val, questions_id,
            images_val))
    last_idx = len(all_preds) - 1

    # Both output files are closed even if scoring raises part-way through.
    with open(args.results, 'w') as f1, \
            open(args.results_json, 'w') as result_file_json:
        result_file_json.write("[")
        for idx, (prediction, truth, question, question_id,
                  image) in enumerate(all_preds):
            matches = sum(1 for _truth in truth.split(';')
                          if prediction == _truth)
            # Three or more agreeing annotators count as fully correct.
            correct_val += min(1.0, matches / 3.0)
            total += 1

            for field in (question, image, prediction, truth):
                f1.write(field)
                f1.write('\n')
            f1.write('\n')

            # json.dumps escapes quotes/backslashes inside the answer, which
            # plain string formatting did not. The id is emitted as a JSON
            # number, as before; sort_keys keeps "answer" ahead of
            # "question_id" so the layout of each line is unchanged.
            entry = json.dumps(
                {"answer": prediction, "question_id": int(question_id)},
                sort_keys=True)
            result_file_json.write(
                entry + (',' if idx < last_idx else '') + '\n')
        result_file_json.write("]\n")

        # An empty validation set yields 0.0 rather than ZeroDivisionError.
        accuracy = correct_val / total if total else 0.0
        f1.write('Final Accuracy is ' + str(accuracy))

    # TODO(andrei): Re-add this, so we are neat about keeping track of all our
    # results.
    # f1 = open('../results/overall_results.txt', 'a')
    # f1.write(args.weights + '\n')
    # f1.write(str(correct_val / total) + '\n')
    # f1.close()
    print('Final Accuracy on the validation set is', accuracy)
def main():
    """Run a saved VQA model over the test-dev2015 split and save its answers.

    Command-line arguments:
      -model    JSON dump of the saved model structure. If the path contains
                ``language_only`` the image features are not fed to the model.
      -weights  Saved weights (checkpoint).
      -results  Text file receiving one question / image / prediction /
                question-id record per example.

    In addition, a JSON list of ``{"question_id": ..., "answer": ...}`` entries
    is written to ``../results/submission_test-dev2015.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-model', type=str, required=True)
    parser.add_argument('-weights', type=str, required=True)
    parser.add_argument('-results', type=str, required=True)
    args = parser.parse_args()

    def read_lines(path):
        # Read a UTF-8 text file into a list of lines and close the handle.
        with open(path, 'r') as handle:
            return handle.read().decode('utf8').splitlines()

    with open(args.model) as model_file:
        model = model_from_json(model_file.read())
    model.load_weights(args.weights)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    questions_test = read_lines('../data/preprocessed/questions_test-dev2015.txt')
    questions_lengths_test = read_lines('../data/preprocessed/questions_lengths_test-dev2015.txt')
    questions_id_test = read_lines('../data/preprocessed/questions_id_test-dev2015.txt')
    images_test = read_lines('../data/preprocessed/images_test-dev2015.txt')
    vgg_model_path = '../features/coco/vgg_feats_test.mat'

    # The lengths are read as text, so a plain tuple sort would order them
    # lexicographically ("10" before "9"). Sort on the integer value instead so
    # that questions really are in ascending order of length.
    # NOTE(review): assumes every line of the lengths file is an integer;
    # int() raises ValueError otherwise.
    rows = sorted(zip(questions_lengths_test, questions_test, images_test,
                      questions_id_test),
                  key=lambda row: int(row[0]))
    questions_lengths_test, questions_test, images_test, questions_id_test = (
        list(column) for column in zip(*rows))
    print('Model compiled, weights loaded')

    labelencoder = joblib.load('../models/labelencoder_trainval.pkl')

    features_struct = scipy.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct['feats']
    print('Loaded vgg features')

    # Each line of the ID map is "<image id> <row index into the features>".
    with open('../features/coco_vgg_IDMap_test.txt') as id_file:
        image_ids = id_file.read().splitlines()
    img_map = {}
    for ids in image_ids:
        id_split = ids.split()
        img_map[id_split[0]] = int(id_split[1])

    nlp = English()
    print('Loaded word2vec features')

    y_predict_text = []
    batchSize = 128
    widgets = ['Evaluating ', Percentage(), ' ',
               Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets)
    language_only = 'language_only' in args.model

    # The final, partial batch is padded with the last (longest) example.
    batches = zip(grouper(questions_test, batchSize, fillvalue=questions_test[-1]),
                  grouper(images_test, batchSize, fillvalue=images_test[-1]))
    for qu_batch, im_batch in pbar(batches):
        # Questions are sorted by ascending length, so the last question of a
        # batch sets the number of timesteps for the whole batch.
        timesteps = len(nlp(qu_batch[-1]))
        X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps)
        if language_only:
            X_batch = X_q_batch
        else:
            X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures)
            X_batch = [X_q_batch, X_i_batch]
        y_predict = model.predict_classes(X_batch, verbose=0)
        y_predict_text.extend(labelencoder.inverse_transform(y_predict))

    results = []
    # zip() stops at the shortest input, which drops the predictions made for
    # the padding added to the final batch.
    with open(args.results, 'w') as f1:
        for prediction, question, question_id, image in zip(
                y_predict_text, questions_test, questions_id_test, images_test):
            answer = {}
            answer['question_id'] = int(question_id)
            answer['answer'] = prediction
            results.append(answer)
            f1.write(question.encode('utf-8'))
            f1.write('\n')
            f1.write(image.encode('utf-8'))
            f1.write('\n')
            f1.write(prediction)
            f1.write('\n')
            f1.write(question_id.encode('utf-8'))
            f1.write('\n')
            f1.write('\n')

    with open('../results/submission_test-dev2015.json', 'w') as f2:
        f2.write(json.dumps(results))
    print('Results saved to %s' % args.results)
def main():
    """Evaluate a saved VQA model on the val2014 split and record its accuracy.

    Command-line arguments:
      -model    JSON dump of the saved model structure. If the path contains
                ``language_only`` the image features are not fed to the model.
      -weights  Saved weights (checkpoint).
      -results  Text file receiving one question / image / prediction / truth
                record per example, followed by the final accuracy.

    A prediction counts as correct when more than two of the ``;``-separated
    ground-truth answers equal it. The accuracy is also appended, together with
    the weights path, to ``../results/overall_results.txt`` and printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-model", type=str, required=True)
    parser.add_argument("-weights", type=str, required=True)
    parser.add_argument("-results", type=str, required=True)
    args = parser.parse_args()

    def read_lines(path):
        # Read a UTF-8 text file into a list of lines and close the handle.
        with open(path, "r") as handle:
            return handle.read().decode("utf8").splitlines()

    with open(args.model) as model_file:
        model = model_from_json(model_file.read())
    model.load_weights(args.weights)
    model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

    questions_val = read_lines("../data/preprocessed/questions_val2014.txt")
    questions_lengths_val = read_lines("../data/preprocessed/questions_lengths_val2014.txt")
    answers_val = read_lines("../data/preprocessed/answers_val2014.txt")
    images_val = read_lines("../data/preprocessed/images_val2014.txt")
    vgg_model_path = "../features/coco/vgg_feats.mat"

    # The lengths are read as text, so a plain tuple sort would order them
    # lexicographically ("10" before "9"). Sort on the integer value instead so
    # that questions really are in ascending order of length.
    # NOTE(review): assumes every line of the lengths file is an integer;
    # int() raises ValueError otherwise.
    rows = sorted(zip(questions_lengths_val, questions_val, answers_val, images_val),
                  key=lambda row: int(row[0]))
    questions_lengths_val, questions_val, answers_val, images_val = (
        list(column) for column in zip(*rows))
    print("Model compiled, weights loaded")

    labelencoder = joblib.load("../models/labelencoder.pkl")

    features_struct = scipy.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct["feats"]
    print("Loaded vgg features")

    # Each line of the ID map is "<image id> <row index into the features>".
    with open("../features/coco/coco_vgg_IDMap.txt") as id_file:
        image_ids = id_file.read().splitlines()
    img_map = {}
    for ids in image_ids:
        id_split = ids.split()
        img_map[id_split[0]] = int(id_split[1])

    nlp = English()
    print("Loaded word2vec features")

    y_predict_text = []
    batchSize = 128
    widgets = ["Evaluating ", Percentage(), " ",
               Bar(marker="#", left="[", right="]"), " ", ETA()]
    pbar = ProgressBar(widgets=widgets)
    language_only = "language_only" in args.model

    # Pad the final, partial batch with the LAST (longest) example. Padding
    # with the first (shortest) one would make qu_batch[-1] the shortest
    # question, and the timestep count below would then be too small for the
    # real questions in that batch.
    batches = zip(
        grouper(questions_val, batchSize, fillvalue=questions_val[-1]),
        grouper(answers_val, batchSize, fillvalue=answers_val[-1]),
        grouper(images_val, batchSize, fillvalue=images_val[-1]),
    )
    for qu_batch, an_batch, im_batch in pbar(batches):
        # Questions are sorted by ascending length, so the last question of a
        # batch sets the number of timesteps for the whole batch.
        timesteps = len(nlp(qu_batch[-1]))
        X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps)
        if language_only:
            X_batch = X_q_batch
        else:
            X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures)
            X_batch = [X_q_batch, X_i_batch]
        y_predict = model.predict_classes(X_batch, verbose=0)
        y_predict_text.extend(labelencoder.inverse_transform(y_predict))

    incorrect_val = 0
    correct_val = 0
    # zip() stops at the shortest input, which drops the predictions made for
    # the padding added to the final batch.
    with open(args.results, "w") as f1:
        for prediction, truth, question, image in zip(
                y_predict_text, answers_val, questions_val, images_val):
            temp_count = sum(1 for _truth in truth.split(";")
                             if prediction == _truth)
            if temp_count > 2:
                correct_val += 1
            else:
                incorrect_val += 1
            f1.write(question.encode("utf-8"))
            f1.write("\n")
            f1.write(image.encode("utf-8"))
            f1.write("\n")
            f1.write(prediction)
            f1.write("\n")
            f1.write(truth.encode("utf-8"))
            f1.write("\n")
            f1.write("\n")

        # Computed once and reused for the file, the log and the console.
        accuracy = float(correct_val) / (incorrect_val + correct_val)
        f1.write("Final Accuracy is " + str(accuracy))

    with open("../results/overall_results.txt", "a") as overall:
        overall.write(args.weights + "\n")
        overall.write(str(accuracy) + "\n\n")
    print("Final Accuracy on the validation set is %s" % accuracy)