def load_model(model_path, wv):
    with open(model_path) as f:
        setup = json.load(f)

    with open(setup['vocab_path']) as f:
        vocab = json.load(f)
    n_class = setup['n_class']

    # Setup a model
    if setup['model'] == 'rnn':
        Encoder = nets.RNNEncoder
    elif setup['model'] == 'cnn':
        Encoder = nets.CNNEncoder
    elif setup['model'] == 'bow':
        Encoder = nets.BOWMLPEncoder
    else:
        raise ValueError('unknown model type: {}'.format(setup['model']))
    encoder = Encoder(n_layers=setup['layer'],
                      n_vocab=len(vocab),
                      n_units=setup['unit'],
                      dropout=setup['dropout'],
                      wv=wv)
    model = nets.TextClassifier(encoder, n_class)
    chainer.serializers.load_npz(setup['model_path'], model)

    gpu = -1  # TODO: make the GPU id configurable
    if gpu >= 0:
        # Make the specified GPU current
        chainer.backends.cuda.get_device_from_id(gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    return model, vocab, setup
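
# A minimal usage sketch (assumptions: the JSON file is the `args.json`
# written by `get_result` below, and `wv` is the pre-trained embedding
# matrix the encoder was built with):
#
#     model, vocab, setup = load_model(
#         '/tmp/text_classification/args.json', wv)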
    def get_result(self,
                   embs,
                   path_dataset,
                   path_output='/tmp/text_classification/'):
        self.out = path_output
        self.unit = embs.matrix.shape[1]

        if not os.path.isdir(path_output):
            os.makedirs(path_output)

        # Load a dataset
        self.path_dataset = path_dataset
        if self.path_dataset == 'dbpedia':
            train, test, vocab = text_datasets.get_dbpedia(
                char_based=self.char_based,
                vocab=embs.vocabulary.dic_words_ids,
                shrink=self.shrink)
        elif self.path_dataset.startswith('imdb.'):
            train, test, vocab = text_datasets.get_imdb(
                fine_grained=self.path_dataset.endswith('.fine'),
                char_based=self.char_based,
                vocab=embs.vocabulary.dic_words_ids,
                shrink=self.shrink)
        elif self.path_dataset in [
                'TREC', 'stsa.binary', 'stsa.fine', 'custrev', 'mpqa',
                'rt-polarity', 'subj'
        ]:
            train, test, vocab = text_datasets.get_other_text_dataset(
                self.path_dataset,
                char_based=self.char_based,
                vocab=embs.vocabulary.dic_words_ids,
                shrink=self.shrink)
        else:  # finally, if the dataset is not downloadable, load it from a local path
            train, test, vocab = text_datasets.get_dataset_from_path(
                path_dataset,
                vocab=embs.vocabulary.dic_words_ids,
                char_based=self.char_based,
                shrink=self.shrink)

        print('# train data: {}'.format(len(train)))
        print('# test  data: {}'.format(len(test)))
        print('# vocab: {}'.format(len(vocab)))
        n_class = len({int(d[1]) for d in train})
        print('# class: {}'.format(n_class))

        train_iter = chainer.iterators.SerialIterator(train, self.batchsize)
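        # The test iterator makes a single ordered pass per evaluation, so it
        # must neither repeat nor shuffle.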
        test_iter = chainer.iterators.SerialIterator(test,
                                                     self.batchsize,
                                                     repeat=False,
                                                     shuffle=False)

        # Setup a model
        if self.model == 'rnn':
            Encoder = nets.RNNEncoder
        elif self.model == 'cnn':
            Encoder = nets.CNNEncoder
        elif self.model == 'bow':
            Encoder = nets.BOWMLPEncoder
        else:
            raise ValueError('unknown model type: {}'.format(self.model))
        encoder = Encoder(n_layers=self.layer,
                          n_vocab=len(vocab),
                          n_units=self.unit,
                          dropout=self.dropout,
                          wv=embs.matrix)
        model = nets.TextClassifier(encoder, n_class)
        if self.gpu >= 0:
            # Make a specified GPU current
            chainer.backends.cuda.get_device_from_id(self.gpu).use()
            model.to_gpu()  # Copy the model to the GPU

        # Setup an optimizer
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(model)
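        # WeightDecay applies L2 regularization: each update adds
        # rate * parameter to the gradient.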
        optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

        # Set up a trainer
        updater = training.StandardUpdater(train_iter,
                                           optimizer,
                                           converter=convert_seq,
                                           device=self.gpu)
        trainer = training.Trainer(updater, (self.epoch, 'epoch'),
                                   out=self.out)

        # Evaluate the model with the test dataset for each epoch
        trainer.extend(
            extensions.Evaluator(test_iter,
                                 model,
                                 converter=convert_seq,
                                 device=self.gpu))

        # Take a best snapshot
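        # (MaxValueTrigger fires each time the watched value reaches a new
        # maximum, checked once per epoch.)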
        record_trigger = training.triggers.MaxValueTrigger(
            'validation/main/accuracy', (1, 'epoch'))
        trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                       trigger=record_trigger)

        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))

        # Print a progress bar to stdout
        trainer.extend(extensions.ProgressBar())

        # Save vocabulary and model's setting
        if not os.path.isdir(self.out):
            os.mkdir(self.out)
        vocab_path = os.path.join(self.out, 'vocab.json')
        with open(vocab_path, 'w') as f:
            json.dump(vocab, f)
        model_path = os.path.join(self.out, 'best_model.npz')
        # Copy __dict__ so the extra keys below do not mutate the instance
        experiment_setup = dict(self.__dict__)
        experiment_setup['vocab_path'] = vocab_path
        experiment_setup['model_path'] = model_path
        experiment_setup['n_class'] = n_class
        experiment_setup['datetime'] = self.current_datetime
        with open(os.path.join(self.out, 'args.json'), 'w') as f:
            json.dump(experiment_setup, f)

        # Run the training
        trainer.run()

        result = {}
        result['experiment_setup'] = experiment_setup
        result['log'] = load_json(os.path.join(self.out, 'log'))
        result['result'] = result['log'][-1]['validation/main/accuracy']
        return result
# Example 3: variant of get_result that can also load a local dataset
# through an adapter.py module.
    def get_result(self,
                   embeddings,
                   path_dataset,
                   path_output='/tmp/text_classification/'):
        self.out = path_output
        self.unit = embeddings.matrix.shape[1]

        if not os.path.isdir(path_output):
            os.makedirs(path_output)

        # TODO: move this to protonn ds management
        self.path_dataset = path_dataset
        # if self.path_dataset == 'dbpedia':
        #     train, test, vocab = text_datasets.get_dbpedia(
        #         char_based=self.char_based,
        #         vocab=embeddings.vocabulary.dic_words_ids,
        #         shrink=self.shrink)
        # elif self.path_dataset.startswith('imdb.'):
        #     train, test, vocab = text_datasets.get_imdb(
        #         fine_grained=self.path_dataset.endswith('.fine'),
        #         char_based=self.char_based,
        #         vocab=embeddings.vocabulary.dic_words_ids,
        #         shrink=self.shrink)
        # elif self.path_dataset in ['TREC', 'stsa.binary', 'stsa.fine',
        #                            'custrev', 'mpqa', 'rt-polarity', 'subj']:
        #     train, test, vocab = text_datasets.get_other_text_dataset(
        #         self.path_dataset,
        #         char_based=self.char_based,
        #         vocab=embeddings.vocabulary.dic_words_ids,
        #         shrink=self.shrink)
        # else:  # finally, if the dataset is not downloadable, load it from a local path
        print('# dataset path: {}'.format(path_dataset))
        path_adapter = os.path.join(path_dataset, "adapter.py")
        if os.path.isfile(path_adapter):
            spec = importlib.util.spec_from_file_location(
                "ds_adapter", path_adapter)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
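            # The adapter is assumed to expose read() -> (train, test, extra),
            # where each split is an iterable of (tokens, label) pairs; a
            # minimal sketch of such an adapter follows this method.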
            adapter = module.Adapter()
            train, test, _ = adapter.read()
            vocab = embeddings.vocabulary.dic_words_ids
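            # Map token sequences (and labels) to integer arrays via the
            # embedding vocabulary.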
            train = nlp_utils.transform_to_array(train, vocab)
            test = nlp_utils.transform_to_array(test, vocab)

        else:
            train, test, vocab = text_datasets.get_dataset_from_path(
                path_dataset,
                vocab=embeddings.vocabulary.dic_words_ids,
                char_based=self.char_based,
                shrink=self.shrink)

        print('# cnt train samples: {}'.format(len(train)))
        print('# cnt test  samples: {}'.format(len(test)))
        print('# size vocab: {}'.format(len(vocab)))
        n_class = len({int(d[1]) for d in train})
        print('# cnt classes: {}'.format(n_class))

        train_iter = chainer.iterators.SerialIterator(train, self.batchsize)
        test_iter = chainer.iterators.SerialIterator(test,
                                                     self.batchsize,
                                                     repeat=False,
                                                     shuffle=False)

        # Setup a model
        if self.model == 'rnn':
            Encoder = nets.RNNEncoder
        elif self.model == 'cnn':
            Encoder = nets.CNNEncoder
        elif self.model == 'bow':
            Encoder = nets.BOWMLPEncoder
        else:
            raise ValueError('unknown model type: {}'.format(self.model))
        encoder = Encoder(n_layers=self.layer,
                          n_vocab=len(vocab),
                          n_units=self.unit,
                          dropout=self.dropout,
                          wv=embeddings.matrix)
        model = nets.TextClassifier(encoder, n_class)
        if self.gpu >= 0:
            # Make a specified GPU current
            chainer.backends.cuda.get_device_from_id(self.gpu).use()
            model.to_gpu()  # Copy the model to the GPU

        # Setup an optimizer
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

        # Set up a trainer
        updater = training.StandardUpdater(train_iter,
                                           optimizer,
                                           converter=nlp_utils.convert_seq,
                                           device=self.gpu)
        trainer = training.Trainer(updater, (self.epoch, 'epoch'),
                                   out=self.out)

        # Evaluate the model with the test dataset for each epoch
        trainer.extend(
            extensions.Evaluator(test_iter,
                                 model,
                                 converter=nlp_utils.convert_seq,
                                 device=self.gpu))

        # Take a best snapshot
        record_trigger = training.triggers.MaxValueTrigger(
            'validation/main/accuracy', (1, 'epoch'))
        trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                       trigger=record_trigger)

        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))

        # Print a progress bar to stdout
        trainer.extend(extensions.ProgressBar())

        # Save vocabulary and model's setting
        if not os.path.isdir(self.out):
            os.mkdir(self.out)
        vocab_path = os.path.join(self.out, 'vocab.json')
        with open(vocab_path, 'w') as f:
            json.dump(vocab, f)
        model_path = os.path.join(self.out, 'best_model.npz')
        # Copy __dict__ so the extra keys below do not mutate the instance
        experiment_setup = dict(self.__dict__)
        # TODO: move all this to the parent class
        experiment_setup['task'] = "text classification"
        experiment_setup['vocab_path'] = vocab_path
        experiment_setup['model_path'] = model_path
        experiment_setup['n_class'] = n_class
        experiment_setup['datetime'] = self.current_datetime
        with open(os.path.join(self.out, 'args.json'), 'w') as f:
            json.dump(experiment_setup, f)

        # Run the training
        trainer.run()

        result = {}
        result['experiment_setup'] = experiment_setup
        result['experiment_setup']['default_measurement'] = 'accuracy'
        result['experiment_setup']['dataset'] = os.path.basename(
            os.path.normpath(path_dataset))
        result['experiment_setup']['method'] = self.model
        result['experiment_setup']['embeddings'] = embeddings.metadata
        result['log'] = load_json(os.path.join(self.out, 'log'))

        # TODO: the old version returned the last validation value instead of
        # the best one; note this when comparing results.
        # result['result'] = {"accuracy": result['log'][-1]['validation/main/accuracy']}
        accuracy = max(entry['validation/main/accuracy']
                       for entry in result['log'])
        result['result'] = {"accuracy": accuracy}
        return [result]
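

# A minimal sketch of the adapter protocol assumed by get_result above.
# Illustrative only: the names Adapter and read(), and the (tokens, label)
# pair format, are inferred from the call site and from
# nlp_utils.transform_to_array, not from a documented contract.
class Adapter:
    def read(self):
        # Each split is a list of (token_list, label) pairs; the third
        # return value is unused by get_result.
        train = [(['a', 'fine', 'film'], 0), (['a', 'dull', 'film'], 1)]
        test = [(['quite', 'fine'], 0)]
        return train, test, None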