Example #1
    def test_load_spm_split2(self):
        # split_mode=2 yields a three-way train/valid/test split
        x_train, y_train, x_valid, y_valid, x_test, y_test = \
            load_spm_data_and_labels(self.test_file, split_mode=2, split_size=0.4)
        assert len(x_train[0]) == len(x_train[1]) == len(y_train)
        assert len(x_valid[0]) == len(x_valid[1]) == len(y_valid)
        assert len(x_test[0]) == len(x_test[1]) == len(y_test)
        assert len(x_train[0]) > 0 and len(x_test[0]) > 0
Example #2
    def test_load_spm_split1(self):
        # split_mode=1 yields a two-way train/test split
        x_train, y_train, x_test, y_test = load_spm_data_and_labels(
            self.test_file, split_mode=1)
        assert len(x_train[0]) == len(x_train[1])
        assert len(x_test[0]) == len(x_test[1])
        assert len(x_train[0]) == len(y_train) and len(x_test[0]) == len(y_test)
        assert len(x_train[0]) > 0 and len(x_test[0]) > 0
Example #3
    def setup_class(self):
        self.train_data, self.train_labels, self.valid_data, self.valid_labels = \
            load_spm_data_and_labels(self.test_file, split_mode=1, split_size=0.3)

        self.checkpoint_dir = os.path.dirname(__file__)
        self.model_name = 'siamese_cnn_spm'
        self.json_file = os.path.join(self.checkpoint_dir,
                                      'siamese_cnn_spm.json')
        self.weights_file = os.path.join(self.checkpoint_dir,
                                         'siamese_cnn_spm.hdf5')
        self.swa_weights_file = os.path.join(self.checkpoint_dir,
                                             'siamese_cnn_spm_swa.hdf5')
        self.preprocessor_file = os.path.join(self.checkpoint_dir,
                                              'siamese_cnn_preprocessor.pkl')
Example #4
    def test_spm_generator(self):
        test_file = os.path.join(os.path.dirname(__file__),
                                 '../../../data/spm/webank/example.txt')
        x_train, y_train = load_spm_data_and_labels(test_file)

        preprocessor = SPMPreprocessor(x_train, y_train)
        # no labels are passed to the generator, so it runs in prediction mode
        generator = SPMGenerator(preprocessor, x_train, batch_size=64)
        assert len(generator) == math.ceil(len(x_train[0]) / 64)
        for i, (features, y) in enumerate(generator):
            if i < len(generator) - 1:
                # every batch except the last holds exactly batch_size pairs
                assert features[0].shape[0] == features[1].shape[0] == 64
                assert y is None
            else:
                # the last batch holds the remainder
                assert features[0].shape[0] == features[1].shape[0] == \
                       len(x_train[0]) - 64 * (len(generator) - 1)
                assert y is None
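
The assertions above encode the usual batch arithmetic: a generator over N sentence pairs yields math.ceil(N / batch_size) batches, and only the last batch may be short. A standalone sketch of that arithmetic; the pair count of 100 is a hypothetical value, not one taken from the test:

import math

num_pairs = 100   # hypothetical len(x_train[0])
batch_size = 64

num_batches = math.ceil(num_pairs / batch_size)            # 2 batches
last_batch = num_pairs - batch_size * (num_batches - 1)    # 36 pairs in the last one

print(num_batches, last_batch)  # -> 2 36
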
Example #5
    def setup_class(self):
        x_train, y_train = load_spm_data_and_labels(self.test_file)
        self.preprocessor = SPMPreprocessor(
            x_train,
            y_train,
            use_word=True,
            use_char=True,
            use_bert=False,
            bert_vocab_file=self.bert_vocab_file,
            char_embed_type='word2vec',
            word_embed_type='word2vec',
            max_len=10)
        self.num_class = self.preprocessor.num_class
        self.char_embeddings = self.preprocessor.char_embeddings
        self.char_vocab_size = self.preprocessor.char_vocab_size
        self.char_embed_dim = self.preprocessor.char_embed_dim

        self.word_embeddings = self.preprocessor.word_embeddings
        self.word_vocab_size = self.preprocessor.word_vocab_size
        self.word_embed_dim = self.preprocessor.word_embed_dim
        self.checkpoint_dir = os.path.dirname(__file__)
Example #6
    def test_load_spm(self):
        # default mode: no split; x_train is a (text_a_list, text_b_list) pair
        x_train, y_train = load_spm_data_and_labels(self.test_file)
        assert len(x_train) == 2
        assert len(x_train[0]) == len(x_train[1])
        assert len(x_train[0]) == len(y_train)
        assert len(x_train[0]) > 0
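
Taken together, Examples #1, #2 and #6 pin down the return shapes of load_spm_data_and_labels for each split mode. A side-by-side sketch; the file path is a placeholder, and the reading of split_size as the held-out fraction is an assumption inferred from the tests:

from fancy_nlp.utils import load_spm_data_and_labels

test_file = 'data/spm/webank/example.txt'  # placeholder path

# default: no split; x_train is a (text_a_list, text_b_list) pair
x_train, y_train = load_spm_data_and_labels(test_file)

# split_mode=1: train/test split
x_train, y_train, x_test, y_test = \
    load_spm_data_and_labels(test_file, split_mode=1)

# split_mode=2: train/valid/test split
x_train, y_train, x_valid, y_valid, x_test, y_test = \
    load_spm_data_and_labels(test_file, split_mode=2, split_size=0.4)
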
Example #7
import os

from fancy_nlp.utils import load_spm_data_and_labels
from fancy_nlp.applications import SPM

train_file = 'datasets/spm/webank/BQ_train.txt'
valid_file = 'datasets/spm/webank/BQ_dev.txt'
test_file = 'datasets/spm/webank/BQ_test.txt'

model_name = 'webank_spm_siamese_cnn_word'
checkpoint_dir = 'pretrained_models'

if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

train_data, train_labels = load_spm_data_and_labels(train_file)
valid_data, valid_labels = load_spm_data_and_labels(valid_file)
test_data, test_labels = load_spm_data_and_labels(test_file)

spm_app = SPM(use_pretrained=False)

spm_app.fit(train_data,
            train_labels,
            valid_data,
            valid_labels,
            spm_model_type='siamese_cnn',
            word_embed_trainable=True,
            callback_list=['modelcheckpoint', 'earlystopping', 'swa'],
            checkpoint_dir=checkpoint_dir,
            model_name=model_name,
            max_len=60)
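
Example #3 suggests the on-disk naming convention for a trained model: a <model_name>.json architecture file, a <model_name>.hdf5 weights file and, when the 'swa' callback is used, a <model_name>_swa.hdf5 file. Assuming the script above follows the same convention, a quick check of the artifacts it should leave behind (continuing the script):

# assumption: file names follow the '<model_name>{.json,.hdf5,_swa.hdf5}'
# pattern seen in Example #3
for suffix in ('.json', '.hdf5', '_swa.hdf5'):
    path = os.path.join(checkpoint_dir, model_name + suffix)
    print(path, 'exists:', os.path.exists(path))
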
Example #8
    def setup_class(self):
        self.train_data, self.train_labels, self.valid_data, self.valid_labels = \
            load_spm_data_and_labels(self.test_file, split_mode=1)
        self.preprocessor = SPMPreprocessor(
            (self.train_data[0] + self.valid_data[0],
             self.train_data[1] + self.valid_data[1]),
            self.train_labels + self.valid_labels,
            use_word=True,
            use_char=True,
            bert_vocab_file=self.bert_vocab_file,
            word_embed_type='word2vec',
            char_embed_type='word2vec',
            max_len=10)
        self.num_class = self.preprocessor.num_class
        self.char_embeddings = self.preprocessor.char_embeddings
        self.char_vocab_size = self.preprocessor.char_vocab_size
        self.char_embed_dim = self.preprocessor.char_embed_dim

        self.word_embeddings = self.preprocessor.word_embeddings
        self.word_vocab_size = self.preprocessor.word_vocab_size
        self.word_embed_dim = self.preprocessor.word_embed_dim
        self.checkpoint_dir = os.path.dirname(__file__)

        self.spm_model = SiameseCNN(
            num_class=self.num_class,
            use_word=True,
            word_embeddings=self.word_embeddings,
            word_vocab_size=self.word_vocab_size,
            word_embed_dim=self.word_embed_dim,
            word_embed_trainable=False,
            use_char=True,
            char_embeddings=self.char_embeddings,
            char_vocab_size=self.char_vocab_size,
            char_embed_dim=self.char_embed_dim,
            char_embed_trainable=False,
            use_bert=False,
            bert_config_file=self.bert_config_file,
            bert_checkpoint_file=self.bert_model_file,
            bert_trainable=True,
            max_len=self.preprocessor.max_len,
            max_word_len=self.preprocessor.max_word_len).build_model()

        # a second copy of the same architecture, apparently intended to hold
        # the SWA-averaged weights (cf. the 'swa' callback and *_swa.hdf5 file)
        self.swa_model = SiameseCNN(
            num_class=self.num_class,
            use_word=True,
            word_embeddings=self.word_embeddings,
            word_vocab_size=self.word_vocab_size,
            word_embed_dim=self.word_embed_dim,
            word_embed_trainable=False,
            use_char=True,
            char_embeddings=self.char_embeddings,
            char_vocab_size=self.char_vocab_size,
            char_embed_dim=self.char_embed_dim,
            char_embed_trainable=False,
            use_bert=False,
            bert_config_file=self.bert_config_file,
            bert_checkpoint_file=self.bert_model_file,
            bert_trainable=True,
            max_len=self.preprocessor.max_len,
            max_word_len=self.preprocessor.max_word_len).build_model()

        self.spm_trainer = SPMTrainer(self.spm_model, self.preprocessor)

        self.json_file = os.path.join(self.checkpoint_dir,
                                      'siamese_cnn_spm.json')
        self.weights_file = os.path.join(self.checkpoint_dir,
                                         'siamese_cnn_spm.hdf5')
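
For reference, a '<name>.json' / '<name>.hdf5' pair like json_file and weights_file above can be written and read back with plain Keras calls. This is a generic sketch, not fancy_nlp's own persistence code, which may differ (for instance, custom layers would need custom_objects):

from tensorflow.keras.models import model_from_json

def save_keras_model(model, json_path, weights_path):
    # architecture as JSON, weights as HDF5
    with open(json_path, 'w') as f:
        f.write(model.to_json())
    model.save_weights(weights_path)

def load_keras_model(json_path, weights_path, custom_objects=None):
    with open(json_path) as f:
        model = model_from_json(f.read(), custom_objects=custom_objects)
    model.load_weights(weights_path)
    return model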