Example

import os
import logging

import numpy as np
import mxnet as mx
from mxnet import autograd, gluon, nd
from nltk.tokenize import word_tokenize

# Project-local helpers; the import paths below are assumptions, adjust to your layout.
from glove_model import GloveModel
from vgg16_feature_extractor import Vgg16FeatureExtractor
from net2 import Net2

class VQANet(object):
    model_name = 'vqa-net-2'

    def __init__(self, model_ctx=mx.cpu(), data_ctx=mx.cpu()):
        self.model = None
        self.version = '0'
        self.model_ctx = model_ctx
        self.data_ctx = data_ctx
        self.input_mode_answer = 'int'
        self.input_mode_question = 'add'
        self.nb_classes = 1001
        self.batch_size = 64
        self.out_dim = 10000
        self.meta = None
        self.glove_model = GloveModel()
        self.fe = Vgg16FeatureExtractor()

    def get_config_file_path(self, model_dir_path):
        return os.path.join(model_dir_path, VQANet.model_name + '-v' + self.version + '-config.npy')

    def get_params_file_path(self, model_dir_path):
        return os.path.join(model_dir_path, VQANet.model_name + '-v' + self.version + '-net.params')

    def evaluate_accuracy(self, data_iterator):
        metric = mx.metric.Accuracy()
        data_iterator.reset()
        for batch in data_iterator:
            data1 = batch.data[0].as_in_context(self.model_ctx)
            data2 = batch.data[1].as_in_context(self.model_ctx)
            data = [data1, data2]
            label = batch.label[0].as_in_context(self.model_ctx)
            output = self.model(data)
            # Accuracy.update takes (labels, preds), each a list of NDArrays
            metric.update([label], [output])
        return metric.get()[1]

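    # Both evaluate_accuracy() and fit() expect a DataIter whose batches carry two
    # data arrays (image features first, question encoding second) and one label
    # array of answer-class ids. A hypothetical construction; the 4096 and 300
    # feature widths here are assumptions, not fixed by this class:
    #
    #   img_feats = np.random.randn(n, 4096).astype(np.float32)
    #   question_vecs = np.random.randn(n, 300).astype(np.float32)
    #   answer_ids = np.random.randint(0, 1001, size=(n,)).astype(np.float32)
    #   data_eva = mx.io.NDArrayIter(data=[img_feats, question_vecs],
    #                                label=answer_ids, batch_size=64)
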
    def load_model(self, model_dir_path):
        config = np.load(self.get_config_file_path(model_dir_path), allow_pickle=True).item()
        self.input_mode_answer = config['input_mode_answer']
        self.input_mode_question = config['input_mode_question']
        self.nb_classes = config['nb_classes']
        self.batch_size = config['batch_size']
        self.meta = config['meta']
        self.out_dim = config['out_dim']
        self.model = Net2(model_ctx=self.model_ctx, out_dim=self.out_dim, nb_classes=self.nb_classes, batch_size=self.batch_size)
        self.model.load_params(self.get_params_file_path(model_dir_path), ctx=self.model_ctx)

    def checkpoint(self, model_dir_path):
        self.model.save_params(self.get_params_file_path(model_dir_path))

    def save_history(self, history, model_dir_path):
        np.save(os.path.join(model_dir_path, VQANet.model_name + '-v' + self.version + '-history.npy'), history)
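    # The saved history is a pickled dict of per-epoch accuracy lists; a
    # hypothetical readback for plotting (file name assumes version '0'):
    #
    #   history = np.load('models/vqa-net-2-v0-history.npy', allow_pickle=True).item()
    #   plt.plot(history['train_acc']); plt.plot(history['val_acc'])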

    def fit(self, data_train, data_eva, meta, model_dir_path, epochs=10, batch_size=64, learning_rate=0.01):

        self.batch_size = batch_size

        config = dict()
        config['batch_size'] = batch_size
        config['input_mode_answer'] = self.input_mode_answer
        config['input_mode_question'] = self.input_mode_question
        config['nb_classes'] = self.nb_classes
        config['out_dim'] = self.out_dim
        config['meta'] = meta
        np.save(self.get_config_file_path(model_dir_path), config)

        loss = gluon.loss.SoftmaxCrossEntropyLoss()

        self.model = Net2(model_ctx=self.model_ctx, out_dim=self.out_dim, batch_size=batch_size, nb_classes=self.nb_classes)
        self.model.collect_params().initialize(init=mx.init.Xavier(), ctx=self.model_ctx)
        trainer = gluon.Trainer(self.model.collect_params(), 'adam', {'learning_rate': learning_rate})

        history = dict()
        history['train_acc'] = list()
        history['val_acc'] = list()

        moving_loss = 0.
        best_eva = 0
        for e in range(epochs):
            data_train.reset()
            for i, batch in enumerate(data_train):
                data1 = batch.data[0].as_in_context(self.model_ctx)
                data2 = batch.data[1].as_in_context(self.model_ctx)
                data = [data1, data2]
                label = batch.label[0].as_in_context(self.model_ctx)
                with autograd.record():
                    output = self.model(data)
                    cross_entropy = loss(output, label)
                cross_entropy.backward()
                trainer.step(batch_size)

                # track an exponential moving average of the mean batch loss
                batch_loss = np.mean(cross_entropy.asnumpy())
                if i == 0:
                    moving_loss = batch_loss
                else:
                    moving_loss = .99 * moving_loss + .01 * batch_loss
                if i % 50 == 0:
                    logging.debug("Epoch %s, batch %s. Moving avg of loss: %s", e, i, moving_loss)
            eva_accuracy = self.evaluate_accuracy(data_iterator=data_eva)
            train_accuracy = self.evaluate_accuracy(data_iterator=data_train)
            history['train_acc'].append(train_accuracy)
            history['val_acc'].append(eva_accuracy)
            print("Epoch %s. Loss: %s, Train_acc %s, Eval_acc %s" % (e, moving_loss, train_accuracy, eva_accuracy))
            if eva_accuracy > best_eva:
                best_eva = eva_accuracy
                logging.info('Best validation acc found. Checkpointing...')
                self.checkpoint(model_dir_path)

        self.save_history(history, model_dir_path)
        return history

    def predict_answer_class(self, img_path, question):
        f = self.fe.extract_image_features(img_path)
        questions_matrix_shape = self.meta['questions_matrix_shape']
        if len(questions_matrix_shape) == 2:
            max_seq_length = questions_matrix_shape[0]
            question_matrix = np.zeros(shape=(1, max_seq_length, 300))
            words = word_tokenize(question.lower())
            for i, word in enumerate(words[0:min(max_seq_length, len(words))]):
                question_matrix[0, i, :] = self.glove_model.encode_word(word)
            input_data = [f.as_in_context(self.model_ctx),
                          nd.array(question_matrix, ctx=self.model_ctx).reshape(1, max_seq_length * 300)]
            output = self.model(input_data)
            # plain int: a uint8 cast would overflow for answer classes above 255
            return int(nd.argmax(output, axis=1).asscalar())
        else:
            words = word_tokenize(question.lower())
            E = np.zeros(shape=(300, len(words)))
            for j, word in enumerate(words):
                E[:, j] = self.glove_model.encode_word(word)
            question_matrix = np.sum(E, axis=1)
            input_data = [f.as_in_context(self.model_ctx),
                          nd.array(question_matrix, ctx=self.model_ctx).reshape(1, 300)]
            output = self.model(input_data)
            # plain int: a uint8 cast would overflow for answer classes above 255
            return int(nd.argmax(output, axis=1).asscalar())

    def load_glove_300(self, data_dir_path):
        self.glove_model.load(data_dir_path, embedding_dim=300)
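
A hypothetical end-to-end run for reference. The GloVe directory, model directory, image path, feature widths, and the random stand-in data below are illustrative assumptions; a real run would feed VGG16 features and encoded questions through the same iterators:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    n = 8
    img_feats = np.random.randn(n, 4096).astype(np.float32)
    question_vecs = np.random.randn(n, 300).astype(np.float32)
    answer_ids = np.random.randint(0, 1001, size=(n,)).astype(np.float32)
    data_train = mx.io.NDArrayIter(data=[img_feats, question_vecs], label=answer_ids, batch_size=4)
    data_eva = mx.io.NDArrayIter(data=[img_feats, question_vecs], label=answer_ids, batch_size=4)
    meta = {'questions_matrix_shape': (300,)}  # summed-GloVe question encoding

    net = VQANet()
    net.load_glove_300('./data/glove')  # hypothetical GloVe data directory
    net.fit(data_train, data_eva, meta, model_dir_path='./models', epochs=2, batch_size=4)
    print(net.predict_answer_class('./data/example.jpg', 'what is on the table?'))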