def test_reuse_text_processor_by_json(self):
    sample_text = ['sample text sentence']
    processor = text_processor(sample_text)
    target = processor(sample_text)
    json_cfg = processor.to_json()

    # rebuild the processor from the JSON config
    processor_from_json = text_processor(json_cfg, from_json=True)
    processed = processor_from_json(sample_text)
    self.assertTrue(np.all(processed == target))
def test_text_processor(self):
    sample = ['test sentence', 'new sentence']
    vocab = set([word for sent in sample for word in sent.split()]
                + ['<pad>', '<unk>'])
    init_size = len(vocab)

    processor = text_processor(sample)
    processed = processor(sample)
    self.assertEqual(processor.vocab_size, init_size)

    # check that every word in each sentence is processed
    for s, p in zip(sample, processed):
        self.assertEqual(len(s.split()), len(p))

    # one new sentence containing unseen words
    new_sample = ['test with new words']
    processed = processor(new_sample)
    # the new sample has words which are not in the original one,
    # so they should be mapped to the '<unk>' token index
    self.assertEqual(len(processed[0]), len(new_sample[0].split()))

    for sent in new_sample:
        for word in sent.split():
            vocab.add(word)

    # the vocabulary is unchanged until update() is called
    self.assertEqual(processor.vocab_size, init_size)

    processor.update(new_sample)
    # the vocab was updated, so the sizes have to match
    self.assertEqual(processor.vocab_size, len(vocab))

    processed = processor(new_sample)
    self.assertEqual(len(processed[0]), len(new_sample[0].split()))
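# ---------------------------------------------------------------------------
# A minimal sketch of the interface the tests above appear to exercise.
# This is an illustration only, NOT the project's text_processor (which may
# well wrap tf.keras' Tokenizer); the class name SketchTextProcessor and all
# internals are assumptions inferred from the assertions: a callable that
# returns padded id arrays, plus vocab_size, update(), and to_json().
# ---------------------------------------------------------------------------
import json as _json

import numpy as np


class SketchTextProcessor:
    def __init__(self, texts=None, maxlen=None):
        self.maxlen = maxlen
        # reserve ids for the special tokens up front
        self.word_index = {'<pad>': 0, '<unk>': 1}
        if texts is not None:
            self.update(texts)

    @property
    def vocab_size(self):
        return len(self.word_index)

    def update(self, texts):
        # register unseen words with fresh ids; accept a bare string too
        if isinstance(texts, str):
            texts = [texts]
        for sent in texts:
            for word in sent.split():
                self.word_index.setdefault(word, len(self.word_index))

    def __call__(self, texts):
        if isinstance(texts, str):
            texts = [texts]
        unk = self.word_index['<unk>']
        seqs = [[self.word_index.get(w, unk) for w in s.split()]
                for s in texts]
        # pad (or truncate) every sequence to a common length
        maxlen = self.maxlen or max(len(s) for s in seqs)
        return np.array([s[:maxlen] + [0] * (maxlen - len(s)) for s in seqs])

    def to_json(self):
        # serialize enough state for a from_json-style round trip
        return _json.dumps({'word_index': self.word_index,
                            'maxlen': self.maxlen})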
def test_text_processor_passing_str(self):
    sample = 'test sentence'
    vocab = set(sample.split() + ['<pad>', '<unk>'])
    init_size = len(vocab)

    processor = text_processor(sample)
    self.assertEqual(processor.vocab_size, init_size)
def test_processed_sentence_lengths_are_fixed_by_given_number(self):
    maxlen = 4
    sample_text = [
        'sample text sentence',
        'This is another sentence',
        'This is not processed yet and this is treated as long sentence example for test'
    ]
    processor = text_processor(sample_text, maxlen=maxlen)
    processed = processor(sample_text)
    self.assertEqual(processed.shape, (len(sample_text), maxlen))
def test_angle_brackets_not_filtered(self):
    sample = '<these> <are> <sample> <tags>'
    processor = text_processor(sample)
    processed = processor(sample)
    self.assertEqual(processed.shape, (1, len(sample.split())))

    # check that every registered vocabulary entry keeps its <> tags
    for word in processor.word_index.keys():
        self.assertTrue(re.search(r'^<\w+>$', word))
async def run_server(host, port):
    global _processor
    _processor = text_processor(num_words=15, from_config=True)

    server = await asyncio.start_server(run_prediction, host, port)
    addr = server.sockets[0].getsockname()
    log.info(f'Serving on {addr!r}')

    async with server:
        await server.serve_forever()
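# run_prediction is referenced above but not defined in this excerpt. Below
# is a minimal sketch, assuming a line-delimited request protocol, of what
# such an asyncio handler could look like; the helper name and the response
# format are hypothetical, not the project's actual handler.
async def run_prediction_sketch(reader, writer):
    # read one newline-terminated sentence from the client
    data = await reader.readline()
    sentence = data.decode().strip()

    # encode it with the module-level processor and echo the ids back
    encoded = _processor(sentence)
    writer.write(f'{encoded.tolist()}\n'.encode())
    await writer.drain()
    writer.close()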
def test_processed_sentence_lengths_are_same(self):
    sample_text = [
        'sample text sentence',
        'This is another sentence',
        'This is not processed yet'
    ]
    processor = text_processor(sample_text)

    examples = [
        'this should be processed by tokenizer',
        'this is also processed'
    ]
    processed = processor(examples)
    self.assertEqual(len(processed[0]), len(processed[1]))
def test_fixed_vocab_size(self):
    num_words = 100
    start = 99
    processor = text_processor(num_words=num_words, from_config=True)

    # only words whose ids are less than 100 are kept;
    # the rest are mapped to the single '<unk>' id
    test_sents = ' '.join(processor.index_word[i] for i in range(start, 200))
    processed = processor(test_sents)
    num_kinds_of_words = len(set(processed[0]))
    self.assertEqual(num_kinds_of_words, num_words - start + 1)
def __init__(self):
    if PredictionModel.__instance is None:
        self._build_model()
        self._processor = text_processor(
            num_words=20000,
            maxlen=Config.MODELS.get('QTYPE').get('seq_length'),
            from_config=True)
        PredictionModel.__instance = self
    else:
        raise RuntimeError(
            f'This object cannot be instantiated. '
            f'Use {self.__class__.__name__}.get_model() instead')
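# A sketch of the get_model() accessor implied by the RuntimeError message
# above, assuming the usual singleton pattern; this classmethod is an
# illustration, not code copied from the project.
@classmethod
def get_model(cls):
    # build the singleton on first access, then hand back the same instance
    if cls.__instance is None:
        cls()
    return cls.__instance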
print()
print(' Parameters')
print(' ----------')
print(' Data size:')
print(f'   Train: {len(train):>7}')
print(f'   Val:   {len(val):>7}')
print()
print(f' Epoch: {epochs}')
print(f' Batch Size: {batch_size}')
print(f' Hidden unit size: {units}')
print(f' Vocabulary size: {vocab_size}')
print()

# load processors built primarily from the words in the training set
# (saved config)
processor = text_processor(maxlen=seq_length, from_config=True)
ans_processor = text_processor(maxlen=ans_length, from_config=True)

print('Time to setup: {:.4f}s'.format(time.time() - st))
print()

run(model_type,
    train,
    val,
    units=units,
    embedding_dim=embedding_dim,
    vocab_size=vocab_size,
    learning_rate=learning_rate,
    sequence_length=ans_length,
    save=save)

print('Training completed')
save = args.no_save

st = time.time()
print('Setting up dataset')
with open('./data/answer_yes_no.json', 'r') as f:
    dataset = json.load(f)
print('Total loaded data size:', len(dataset))

random.shuffle(dataset)
train, val = dataset[:data_size], dataset[data_size:data_size + val_size]
print('Data size: Train: {} Val: {}'.format(len(train), len(val)))

if args.no_config:
    # use only the words that appeared in the training set
    words = [d['question'] for d in train]
    processor = text_processor(words, maxlen=pad_max_len)
    assert processor(words).shape[1] == pad_max_len
else:
    processor = text_processor(num_words=vocab_size,
                               maxlen=pad_max_len,
                               from_config=True)

print('Time to setup: {:.4f}s'.format(time.time() - st))

main(train, val, save=save)
print('Training completed')
print('Total running time: {:.4f}s'.format(time.time() - st))
def main(*, training=True, save_to=None, load_from=None, val=0.2):
    global data_size
    global num_classes
    global processor

    vqa = VQA()
    vqa.load_data(num_data=data_size)
    questions, question_types, _, _ = next(vqa.data_generator())
    labels = [
        q2id[q] if q in q2id else q2id['none of the above']
        for q in question_types
    ]

    # build the processor from the training dataset
    # unless a saved processor is reused
    if training:
        # preprocess the dataset: split it into train and validation sets
        train_size = int(data_size * (1 - val))

        # inputs
        inputs_train = questions[:train_size]
        inputs_val = questions[train_size:]

        # process inputs
        # if a tokenizer is not loaded, create a new one
        if processor is None:
            processor = text_processor(inputs_train)

    # initialize the model
    model = QuestionTypeClassification(
        embedding_dim=embedding_dim,
        units=hidden_units,
        vocab_size=vocab_size,  # need to add 1 due to the Embedding implementation
        num_classes=num_classes)

    # set initial weights on the model
    if load_from is not None:
        print('Loading weights...')
        model.load_weights(load_from)

    # TRAINING STEP
    if training:
        min_loss_val = 1.0
        print('Start training')

        inputs_train = processor(inputs_train)
        inputs_val = [processor(inputs_val)]

        # labels
        labels = np.array(labels, dtype=np.int32)
        labels_train = labels[:train_size]
        labels_val = labels[train_size:]

        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        train_cls_step = make_training_cls_model(
            model, optimizer, loss='sparse_categorical_crossentropy')

        # execute training
        for epoch in range(epochs):
            print('=====' * 10)
            print('  Epoch {}'.format(epoch + 1))
            print('=====' * 10)

            dataset = data_generator(inputs_train, labels_train, batch_size)
            for batch, (ins, outs) in enumerate(dataset):
                st = time.time()
                ins = [ins]
                batch_loss, accuracy = train_cls_step(ins, outs)
                end = time.time()

                if batch % 100 == 0:
                    out_val = model(*inputs_val)
                    cost_val = tf.keras.losses.sparse_categorical_crossentropy(
                        labels_val, out_val, from_logits=True)
                    loss_val = tf.reduce_mean(cost_val)
                    acc_val = calculate_accuracy(out_val, labels_val)

                    if DEBUG:
                        print('[DEBUG] Batch:', batch)
                        for layer in model.layers:
                            print('  Layer:', model.name + ':' + layer.name)
                            print('    Weights:')
                            print('      mean:', np.mean(layer.get_weights()[0]))
                            print('      std:', np.std(layer.get_weights()[0]))
                        print()

                    batch_loss = batch_loss.numpy()
                    print('  Batch:', batch)
                    print(
                        '    Loss: {:.4f} Accuracy(Train): {:.4f} '
                        'Loss(Val): {:.4f} Accuracy(Val): {:.4f} '
                        'Time(batch): {:.4f}s'.format(
                            batch_loss, accuracy, loss_val, acc_val,
                            end - st))

                    if loss_val < min_loss_val:
                        min_loss_val = loss_val
                        print('Saving models...')
                        # save the tokenizer config for reuse
                        processor.to_json('./.env/tokenizer_config.json')
                        model.save_weights(save_to)
                        print('Saved!!')
                        print()

        print('Training completed')
    else:
        # if not in training mode, evaluate with all the given data
        st = time.time()
        inputs = processor(questions)
        out = model(inputs)
        labels = tf.Variable(labels, dtype=tf.int32)
        accuracy = calculate_accuracy(out, labels)
        end = time.time()
        print('Evaluated score: Accuracy: {:.4f} Time: {:.4f}s'.format(
            accuracy, end - st))

    return model
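# data_generator, used in main() above, is defined elsewhere. A minimal
# sketch consistent with how it is consumed (iterating over (inputs, labels)
# mini-batches); the body below is an assumption, not the project's code.
def data_generator_sketch(inputs, labels, batch_size):
    # yield successive mini-batches until the data is exhausted
    for i in range(0, len(inputs), batch_size):
        yield inputs[i:i + batch_size], labels[i:i + batch_size]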
import logging

import tensorflow as tf

from main.settings import Config
from main.utils.loader import fetch_question_types, load_image
from main.utils.preprocess import text_processor

from .common import get_mobilenet_encoder
from ._base import BaseModel
from ._models import (
    QuestionTypeClassification,
    ClassificationModel,
    QuestionAnswerModel,
)

log = logging.getLogger(__name__)

processor = text_processor(num_words=20000, from_config=True)
img_encoder = get_mobilenet_encoder()

# TODO: tmp
classes = fetch_question_types()
id2q = [q for q in classes]

# ids of the special tokens used by the models
_tokens = {
    'bos': processor.word_index['<bos>'],
    'eos': processor.word_index['<eos>'],
    'unk': processor.word_index['<unk>'],
    'pad': processor.word_index['<pad>'],
}


class PredictionModel(BaseModel):
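    # (sketch) What the special-token ids in _tokens above are typically used
    # for: decoding a predicted id sequence back into words. This method is
    # an illustration added here, not part of the original class.
    def _decode_ids_sketch(self, ids):
        words = []
        for i in ids:
            if i == _tokens['eos']:
                break  # stop at the end-of-sequence marker
            if i in (_tokens['pad'], _tokens['bos']):
                continue  # skip padding and the beginning-of-sequence marker
            words.append(processor.index_word.get(i, '<unk>'))
        return ' '.join(words)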
def test_from_config(self):
    # use the preprocessed config
    processor = text_processor(from_config=True)
    self.assertGreater(processor.vocab_size, 0)
def test_text_processor_load_data_by_config(self):
    processor = text_processor(from_config=True)
    self.assertGreater(processor.vocab_size, 0)
        print()
        print(
            '  Validation(approx.): Loss - {:.4f} Acc - {:.4f} Time - {:.4f}s'
            .format(loss_val, acc_val, end_val - st_val))
        print('  Total time per epoch: {:.4f}s'
              .format(time.time() - epoch_start))
        print()


if __name__ == '__main__':
    st = time.time()
    print('Setting up dataset')
    with open('./data/answer_yes_no.json', 'r') as f:
        dataset = json.load(f)

    random.shuffle(dataset)
    train, val = dataset[:data_size], dataset[data_size:data_size + val_size]

    # use only the words that appeared in the training set
    words = [d['question'] for d in train]
    processor = text_processor(words, maxlen=pad_max_len)
    assert processor(words).shape[1] == pad_max_len

    print('Time to setup: {:.4f}s'.format(time.time() - st))

    main(train, val)
    print('Total running time: {:.4f}s'.format(time.time() - st))