Example #1
def update_output(n_clicks, value):
    vocab_data = data_loader.get_question_answer_vocab("2")
    qvocab = vocab_data['question_vocab']
    q_map = { vocab_data['question_vocab'][qw] : qw for qw in vocab_data['question_vocab']}
    # `filen` is the image path; in the original app it is a global set elsewhere
    print('filename::', filen)
    fc7_features = utils.extract_fc7_features(filen, 'Data/vgg16.tfmodel')
    model_options = {
        'num_lstm_layers': 2,
        'rnn_size': 512,
        'embedding_size': 512,
        'word_emb_dropout': 0.5,
        'image_dropout': 0.5,
        'fc7_feature_length': 4096,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }
    
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']), dtype = 'int32')
    print('qst', value)
    question_words = re.findall(word_regex, value)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[ question_words[i] ]
        else:
            question_ids[0][base + i] = question_vocab['UNK']
    ans_map = { vocab_data['answer_vocab'][ans] : ans for ans in vocab_data['answer_vocab']}
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, 'Data/Models/modelnew99.ckpt')
    pred, answer_probab = sess.run([t_prediction, t_ans_probab], feed_dict={
        input_tensors['fc7']:fc7_features,
        input_tensors['sentence']:question_ids,
    })
    print("answerprediction",pred[0])
    #model.summary()
    #plot_model(model,to_file='predictmodel.png')
    print ("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx) for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print ("Top Answers")
    for i in range(1):
        print (ans_map[ answer_probab_tuples[0][1] ])
        #ans=(ans_map[answer_probab_tuples[i][1] ])
        lang = "en"
        text="This is a "+ans_map[ answer_probab_tuples[0][1] ]
        speech = Speech(text, lang)

        sox_effects = ("speed", "0.8")
        speech.play(sox_effects)
        
    return ans_map[answer_probab_tuples[0][1]]
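Example #1 reads like a Dash callback rather than a standalone script. A minimal wiring sketch for it, assuming a Dash app; the component ids ('question-input', 'ask-button', 'answer-output') are hypothetical, and the original app is responsible for setting the global `filen` to the chosen image path:

import dash
from dash import dcc, html
from dash.dependencies import Input, Output, State

app = dash.Dash(__name__)
app.layout = html.Div([
    dcc.Input(id='question-input', type='text'),  # hypothetical id
    html.Button('Ask', id='ask-button'),          # hypothetical id
    html.Div(id='answer-output'),                 # hypothetical id
])

# Fire on the button click, passing the current question text, to match
# the update_output(n_clicks, value) signature above.
app.callback(Output('answer-output', 'children'),
             [Input('ask-button', 'n_clicks')],
             [State('question-input', 'value')])(update_output)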
Example #2
def calcFeatures(image_path):
    data_dir = 'Data'
    vocab_data = data_loader.get_question_answer_vocab(data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }

    fc7_features = utils.extract_fc7_features(image_path,
                                              join(data_dir, 'vgg16.tfmodel'))
    return fc7_features
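A minimal usage sketch for calcFeatures, assuming the Data/ directory layout and the data_loader/utils modules from the other examples are importable:

fc7 = calcFeatures('Data/cat.jpeg')
# fc7 should match 'fc7_feature_length': 4096 from the model options,
# most likely with a leading batch dimension, e.g. (1, 4096)
print(fc7.shape)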
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path',
                        type=str,
                        default='Data/cat.jpeg',
                        help='Image Path')
    parser.add_argument('--model_path',
                        type=str,
                        default='Data/Models/model2.ckpt',
                        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default='Which animal is this?',
                        help='Question')

    args = parser.parse_args()

    print("Image:", args.image_path)
    print("Question:", args.question)

    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }

    fc7_features = utils.extract_fc7_features(
        args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: fc7_features,
            input_tensors['sentence']: question_ids,
        })

    print("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
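The negated-probability tuple sort above is just a descending sort over answer indices; an equivalent, more idiomatic way to take the top five (up to tie-breaking), assuming answer_probab has shape (1, ans_vocab_size):

import numpy as np

top5 = np.argsort(-answer_probab[0])[:5]  # indices of the five most probable answers
for idx in top5:
    print(ans_map[idx])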
Example #4
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('--image_path', type=str, default='Data/cat.jpeg',
                       help='Image Path')
	parser.add_argument('--model_path', type=str, default='Data/Models/model2.ckpt',
                       help='Model Path')
	parser.add_argument('--num_lstm_layers', type=int, default=2,
                       help='num_lstm_layers')
	parser.add_argument('--fc7_feature_length', type=int, default=4096,
                       help='fc7_feature_length')
	parser.add_argument('--rnn_size', type=int, default=512,
                       help='rnn_size')
	parser.add_argument('--embedding_size', type=int, default=512,
                       help='embedding_size')
	parser.add_argument('--word_emb_dropout', type=float, default=0.5,
                       help='word_emb_dropout')
	parser.add_argument('--image_dropout', type=float, default=0.5,
                       help='image_dropout')
	parser.add_argument('--data_dir', type=str, default='Data',
                       help='Data directory')
	parser.add_argument('--question', type=str, default='Which animal is this?',
                       help='Question')

	args = parser.parse_args()

	print "Image:", args.image_path
	print "Question:", args.question

	vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
	qvocab = vocab_data['question_vocab']
	q_map = { vocab_data['question_vocab'][qw] : qw for qw in vocab_data['question_vocab']}
	
	fc7_features = utils.extract_fc7_features(args.image_path, join(args.data_dir, 'vgg16.tfmodel'))
	
	model_options = {
		'num_lstm_layers' : args.num_lstm_layers,
		'rnn_size' : args.rnn_size,
		'embedding_size' : args.embedding_size,
		'word_emb_dropout' : args.word_emb_dropout,
		'image_dropout' : args.image_dropout,
		'fc7_feature_length' : args.fc7_feature_length,
		'lstm_steps' : vocab_data['max_question_length'] + 1,
		'q_vocab_size' : len(vocab_data['question_vocab']),
		'ans_vocab_size' : len(vocab_data['answer_vocab'])
	}
	
	question_vocab = vocab_data['question_vocab']
	word_regex = re.compile(r'\w+')
	question_ids = np.zeros((1, vocab_data['max_question_length']), dtype = 'int32')
	question_words = re.findall(word_regex, args.question)
	base = vocab_data['max_question_length'] - len(question_words)
	for i in range(0, len(question_words)):
		if question_words[i] in question_vocab:
			question_ids[0][base + i] = question_vocab[ question_words[i] ]
		else:
			question_ids[0][base + i] = question_vocab['UNK']

	ans_map = { vocab_data['answer_vocab'][ans] : ans for ans in vocab_data['answer_vocab']}
	model = vis_lstm_model.Vis_lstm_model(model_options)
	input_tensors, t_prediction, t_ans_probab = model.build_generator()
	sess = tf.InteractiveSession()
	saver = tf.train.Saver()
	saver.restore(sess, args.model_path)
	
	pred, answer_probab = sess.run([t_prediction, t_ans_probab], feed_dict={
        input_tensors['fc7']:fc7_features,
        input_tensors['sentence']:question_ids,
    })

	print("Ans:", ans_map[pred[0]])
	answer_probab_tuples = [(-answer_probab[0][idx], idx) for idx in range(len(answer_probab[0]))]
	answer_probab_tuples.sort()
	print "Top Answers"
	for i in range(5):
		print(ans_map[answer_probab_tuples[i][1]])
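The encoding loop shared by all of these examples left-pads the question: word ids are written starting at offset base, so shorter questions keep zeros at the front. A self-contained toy run with a made-up vocabulary (the real question_vocab comes from data_loader):

import re
import numpy as np

question_vocab = {'UNK': 1, 'Which': 2, 'animal': 3, 'is': 4, 'this': 5}  # toy vocab
max_question_length = 8
question = 'Which animal is this?'

words = re.findall(r'\w+', question)
ids = np.zeros((1, max_question_length), dtype='int32')
base = max_question_length - len(words)
for i, w in enumerate(words):
    ids[0][base + i] = question_vocab.get(w, question_vocab['UNK'])
print(ids)  # [[0 0 0 0 2 3 4 5]]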
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path',
                        type=str,
                        default='/home/vmhatre/vqa_supervised/Data/cat.jpeg',
                        help='Image Path')
    parser.add_argument(
        '--model_path',
        type=str,
        default='/home/vmhatre/vqa_supervised/Data/Models/model2.ckpt',
        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default='----Question to ask?',
                        help='Question')

    args = parser.parse_args()
    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    # Load the question/answer vocabulary
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }
    # Extract fc7 features from the VGG model
    fc7_features = utils.extract_fc7_features(
        args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }
    # Encode the question as left-padded word ids
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    # Map answer ids back to answer strings
    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    # Restore the trained model from its checkpoint
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)
    # Run the prediction on the fc7 features and the encoded question
    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: fc7_features,
            input_tensors['sentence']: question_ids,
        })
    # Rank answers by probability and print the five most likely
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_path',
        type=str,
        default='Data/train_2014/COCO_train2014_000000581922.jpg',
        help='Image Path')
    parser.add_argument(
        '--model_path',
        type=str,
        default='Data/train2014/Tri Training 1/Models/model11.ckpt',
        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data/train2014/Tri Training 1/',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default='What is this product?',
                        help='Question')

    args = parser.parse_args()
    vizwiz_file_path = 'Data/Images'
    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }
    vizwiz_questions_path = 'VizWiz_to_VQA_Questions.json'
    with open(vizwiz_questions_path, 'r') as input_file:
        vizwiz_questions = json.loads(input_file.read())
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    i = 0
    for file in os.listdir(vizwiz_file_path):
        if file.endswith(".jpg"):
            args.image_path = join(vizwiz_file_path, file)
            args.question = vizwiz_questions['questions'][i]['question']
            i += 1
            print("Image:", args.image_path)
            print("Question:", args.question)

            #fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'val')
            fc7_features = utils.extract_fc7_features(
                args.image_path,
                'Data/train2014/Tri Training 1/vgg16-20160129.tfmodel')

            model_options = {
                'num_lstm_layers': args.num_lstm_layers,
                'rnn_size': args.rnn_size,
                'embedding_size': args.embedding_size,
                'word_emb_dropout': args.word_emb_dropout,
                'image_dropout': args.image_dropout,
                'fc7_feature_length': args.fc7_feature_length,
                'lstm_steps': vocab_data['max_question_length'] + 1,
                'q_vocab_size': len(vocab_data['question_vocab']),
                'ans_vocab_size': len(vocab_data['answer_vocab'])
            }

            question_words = re.findall(word_regex, args.question)
            base = vocab_data['max_question_length'] - len(question_words)
            # Reset the shared buffer so a shorter question does not keep stale
            # ids, and use a separate loop variable so the outer question
            # index `i` is not clobbered
            question_ids[:] = 0
            for w in range(len(question_words)):
                if question_words[w] in question_vocab:
                    question_ids[0][base + w] = question_vocab[question_words[w]]
                else:
                    question_ids[0][base + w] = question_vocab['UNK']

            ans_map = {
                vocab_data['answer_vocab'][ans]: ans
                for ans in vocab_data['answer_vocab']
            }
            model = vis_lstm_model.Vis_lstm_model(model_options)
            input_tensors, t_prediction, t_ans_probab = model.build_generator()
            sess = tf.InteractiveSession()
            saver = tf.train.Saver()
            saver.restore(sess, args.model_path)

            pred, answer_probab = sess.run(
                [t_prediction, t_ans_probab],
                feed_dict={
                    input_tensors['fc7']: fc7_features,
                    input_tensors['sentence']: question_ids,
                })

            print("Ans:", ans_map[pred[0]])
            answer_probab_tuples = [(-answer_probab[0][idx], idx)
                                    for idx in range(len(answer_probab[0]))]
            answer_probab_tuples.sort()
            print("Top Answers")
            for rank in range(5):
                print(ans_map[answer_probab_tuples[rank][1]])
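Example #6 rebuilds the graph, session, and saver on every image, which is slow and keeps adding nodes to the default graph. A sketch of the usual restructuring, under the same modules and model_options as above (vgg_model_path stands in for the tfmodel path used in the example): build and restore once, then loop.

model = vis_lstm_model.Vis_lstm_model(model_options)
input_tensors, t_prediction, t_ans_probab = model.build_generator()
sess = tf.InteractiveSession()
saver = tf.train.Saver()
saver.restore(sess, args.model_path)

for file in sorted(os.listdir(vizwiz_file_path)):
    if not file.endswith('.jpg'):
        continue
    fc7_features = utils.extract_fc7_features(join(vizwiz_file_path, file), vgg_model_path)
    # ... encode the question into question_ids as above, then:
    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={input_tensors['fc7']: fc7_features,
                   input_tensors['sentence']: question_ids})
    print("Ans:", ans_map[pred[0]])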