def test_load_spm_split2(self):
    # split_mode=2 produces a three-way train/valid/test split
    x_train, y_train, x_valid, y_valid, x_test, y_test = \
        load_spm_data_and_labels(self.test_file, split_mode=2, split_size=0.4)
    assert len(x_train[0]) == len(x_train[1]) == len(y_train)
    assert len(x_valid[0]) == len(x_valid[1]) == len(y_valid)
    assert len(x_test[0]) == len(x_test[1]) == len(y_test)
    assert len(x_train[0]) > 0 and len(x_test[0]) > 0
def test_load_spm_split1(self):
    # split_mode=1 produces a two-way train/test split
    x_train, y_train, x_test, y_test = load_spm_data_and_labels(
        self.test_file, split_mode=1)
    assert len(x_train[0]) == len(x_train[1])
    assert len(x_test[0]) == len(x_test[1])
    assert len(x_train[0]) == len(y_train) and len(x_test[0]) == len(y_test)
    assert len(x_train[0]) > 0 and len(x_test[0]) > 0
def setup_class(self):
    self.train_data, self.train_labels, self.valid_data, self.valid_labels = \
        load_spm_data_and_labels(self.test_file, split_mode=1, split_size=0.3)
    self.checkpoint_dir = os.path.dirname(__file__)
    self.model_name = 'siamese_cnn_spm'
    self.json_file = os.path.join(self.checkpoint_dir, 'siamese_cnn_spm.json')
    self.weights_file = os.path.join(self.checkpoint_dir, 'siamese_cnn_spm.hdf5')
    self.swa_weights_file = os.path.join(self.checkpoint_dir, 'siamese_cnn_spm_swa.hdf5')
    self.preprocessor_file = os.path.join(self.checkpoint_dir,
                                          'siamese_cnn_preprocessor.pkl')
def test_spm_generator(self):
    test_file = os.path.join(os.path.dirname(__file__),
                             '../../../data/spm/webank/example.txt')
    x_train, y_train = load_spm_data_and_labels(test_file)
    preprocessor = SPMPreprocessor(x_train, y_train)
    generator = SPMGenerator(preprocessor, x_train, batch_size=64)

    # the generator should yield ceil(num_samples / batch_size) batches
    assert len(generator) == math.ceil(len(x_train[0]) / 64)
    for i, (features, y) in enumerate(generator):
        if i < len(generator) - 1:
            # every full batch contains exactly batch_size sentence pairs;
            # no labels were passed to the generator, so y is None
            assert features[0].shape[0] == features[1].shape[0] == 64
            assert y is None
        else:
            # the final batch holds whatever samples remain
            assert features[0].shape[0] == features[1].shape[0] == \
                len(x_train[0]) - 64 * (len(generator) - 1)
            assert y is None
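Since the test shows the generator yielding (features, None) batches when no labels are supplied, it can drive batch-wise prediction. A minimal sketch under that assumption; `spm_model` is a hypothetical compiled Keras model whose two inputs match the preprocessor's features:

import numpy as np

# Hedged sketch: iterate the generator exactly as the test does and predict
# batch by batch, concatenating the per-batch outputs at the end.
pred_generator = SPMGenerator(preprocessor, x_train, batch_size=64)
batch_probs = []
for features, _ in pred_generator:  # y is None in prediction mode
    batch_probs.append(spm_model.predict(features))
probs = np.concatenate(batch_probs, axis=0)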
def setup_class(self):
    x_train, y_train = load_spm_data_and_labels(self.test_file)
    self.preprocessor = SPMPreprocessor(
        x_train, y_train, use_word=True, use_char=True, use_bert=False,
        bert_vocab_file=self.bert_vocab_file, char_embed_type='word2vec',
        word_embed_type='word2vec', max_len=10)
    self.num_class = self.preprocessor.num_class
    self.char_embeddings = self.preprocessor.char_embeddings
    self.char_vocab_size = self.preprocessor.char_vocab_size
    self.char_embed_dim = self.preprocessor.char_embed_dim
    self.word_embeddings = self.preprocessor.word_embeddings
    self.word_vocab_size = self.preprocessor.word_vocab_size
    self.word_embed_dim = self.preprocessor.word_embed_dim
    self.checkpoint_dir = os.path.dirname(__file__)
def test_load_spm(self):
    x_train, y_train = load_spm_data_and_labels(self.test_file)
    # x_train is a pair (texts_a, texts_b) of equal-length text lists,
    # with one label per sentence pair
    assert len(x_train) == 2
    assert len(x_train[0]) == len(x_train[1])
    assert len(x_train[0]) == len(y_train)
    assert len(x_train[0]) > 0
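For context, the loader is assumed here to read a tab-separated file of sentence pairs, one "text_a&lt;TAB&gt;text_b&lt;TAB&gt;label" record per line, mirroring the webank data used elsewhere in these tests. A minimal sketch with a hypothetical example file:

from fancy_nlp.utils import load_spm_data_and_labels

# Hypothetical two-line example file (assumed format, not shipped with the repo)
with open('pairs_example.txt', 'w', encoding='utf-8') as f:
    f.write('花呗如何还款\t花呗怎么还钱\t1\n')
    f.write('花呗如何还款\t如何开通花呗\t0\n')

x, y = load_spm_data_and_labels('pairs_example.txt')
print(x[0])  # first sentences of each pair
print(x[1])  # second sentences of each pair
print(y)     # one match label per pair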
import os

from fancy_nlp.utils import load_spm_data_and_labels
from fancy_nlp.applications import SPM

train_file = 'datasets/spm/webank/BQ_train.txt'
valid_file = 'datasets/spm/webank/BQ_dev.txt'
test_file = 'datasets/spm/webank/BQ_test.txt'
model_name = 'webank_spm_siamese_cnn_word'
checkpoint_dir = 'pretrained_models'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

train_data, train_labels = load_spm_data_and_labels(train_file)
valid_data, valid_labels = load_spm_data_and_labels(valid_file)
test_data, test_labels = load_spm_data_and_labels(test_file)

spm_app = SPM(use_pretrained=False)
spm_app.fit(train_data, train_labels, valid_data, valid_labels,
            spm_model_type='siamese_cnn',
            word_embed_trainable=True,
            callback_list=['modelcheckpoint', 'earlystopping', 'swa'],
            checkpoint_dir=checkpoint_dir,
            model_name=model_name,
            max_len=60)
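After training, the script would typically evaluate on the held-out test split. A hedged continuation, assuming the SPM application exposes score and predict methods analogous to fit (neither appears in the snippet above):

# Hedged sketch: spm_app.score and spm_app.predict are assumed, not confirmed
test_score = spm_app.score(test_data, test_labels)
print('test score:', test_score)

# match prediction for one hypothetical sentence pair
print(spm_app.predict(('花呗如何还款', '花呗怎么还钱')))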
def setup_class(self):
    self.train_data, self.train_labels, self.valid_data, self.valid_labels = \
        load_spm_data_and_labels(self.test_file, split_mode=1)
    self.preprocessor = SPMPreprocessor(
        (self.train_data[0] + self.valid_data[0],
         self.train_data[1] + self.valid_data[1]),
        self.train_labels + self.valid_labels,
        use_word=True,
        use_char=True,
        bert_vocab_file=self.bert_vocab_file,
        word_embed_type='word2vec',
        char_embed_type='word2vec',
        max_len=10)
    self.num_class = self.preprocessor.num_class
    self.char_embeddings = self.preprocessor.char_embeddings
    self.char_vocab_size = self.preprocessor.char_vocab_size
    self.char_embed_dim = self.preprocessor.char_embed_dim
    self.word_embeddings = self.preprocessor.word_embeddings
    self.word_vocab_size = self.preprocessor.word_vocab_size
    self.word_embed_dim = self.preprocessor.word_embed_dim
    self.checkpoint_dir = os.path.dirname(__file__)

    self.spm_model = SiameseCNN(
        num_class=self.num_class,
        use_word=True,
        word_embeddings=self.word_embeddings,
        word_vocab_size=self.word_vocab_size,
        word_embed_dim=self.word_embed_dim,
        word_embed_trainable=False,
        use_char=True,
        char_embeddings=self.char_embeddings,
        char_vocab_size=self.char_vocab_size,
        char_embed_dim=self.char_embed_dim,
        char_embed_trainable=False,
        use_bert=False,
        bert_config_file=self.bert_config_file,
        bert_checkpoint_file=self.bert_model_file,
        bert_trainable=True,
        max_len=self.preprocessor.max_len,
        max_word_len=self.preprocessor.max_word_len).build_model()

    # a second, identically configured model instance, used to hold the
    # SWA-averaged weights during training
    self.swa_model = SiameseCNN(
        num_class=self.num_class,
        use_word=True,
        word_embeddings=self.word_embeddings,
        word_vocab_size=self.word_vocab_size,
        word_embed_dim=self.word_embed_dim,
        word_embed_trainable=False,
        use_char=True,
        char_embeddings=self.char_embeddings,
        char_vocab_size=self.char_vocab_size,
        char_embed_dim=self.char_embed_dim,
        char_embed_trainable=False,
        use_bert=False,
        bert_config_file=self.bert_config_file,
        bert_checkpoint_file=self.bert_model_file,
        bert_trainable=True,
        max_len=self.preprocessor.max_len,
        max_word_len=self.preprocessor.max_word_len).build_model()

    self.spm_trainer = SPMTrainer(self.spm_model, self.preprocessor)
    self.json_file = os.path.join(self.checkpoint_dir, 'siamese_cnn_spm.json')
    self.weights_file = os.path.join(self.checkpoint_dir, 'siamese_cnn_spm.hdf5')