def main(args):
    token_to_index, index_to_token = Vocabulary.load(args.vocab_file)

    # Load pre-trained embeddings saved next to the vocabulary file, if any.
    root, _ = os.path.splitext(args.vocab_file)
    basepath, basename = os.path.split(root)
    embed_path = f'{basepath}/embedding_{basename}.npy'
    embeddings = np.load(embed_path) if os.path.exists(embed_path) else None

    model = FastQA(len(token_to_index), args.embed, args.hidden,
                   question_limit=args.q_len, context_limit=args.c_len,
                   dropout=args.dropout, pretrained_embeddings=embeddings,
                   with_feature=not args.without_feature).build()
    opt = Adam()
    # Only the start/end probability outputs are trained; the decoded index
    # outputs carry no loss.
    model.compile(optimizer=opt,
                  loss=['sparse_categorical_crossentropy',
                        'sparse_categorical_crossentropy', None, None],
                  loss_weights=[1, 1, 0, 0])

    train_dataset = SquadReader(args.train_path)
    dev_dataset = SquadReader(args.dev_path)
    tokenizer = get_tokenizer(lower=args.lower, as_str=False)
    converter = SquadConverter(token_to_index, PAD_TOKEN, UNK_TOKEN, tokenizer,
                               question_max_len=args.q_len,
                               context_max_len=args.c_len)
    eval_converter = SquadEvalConverter(token_to_index, PAD_TOKEN, UNK_TOKEN,
                                        tokenizer, question_max_len=args.q_len,
                                        context_max_len=args.c_len)
    train_generator = Iterator(train_dataset, args.batch, converter)
    dev_generator_loss = Iterator(dev_dataset, args.batch, converter,
                                  shuffle=False)
    dev_generator_f1 = Iterator(dev_dataset, args.batch, eval_converter,
                                repeat=False, shuffle=False)

    trainer = SquadTrainer(model, train_generator, args.epoch,
                           dev_generator_loss,
                           './models/fastqa.{epoch:02d}-{val_loss:.2f}.h5')
    trainer.add_callback(FastQALRScheduler(dev_generator_f1,
                                           val_answer_file=args.answer_path,
                                           steps=args.steps))
    trainer.add_callback(FastQACheckpoint('./models/fastqa.{steps:06d}.h5',
                                          steps=args.steps))
    if args.use_tensorboard:
        trainer.add_callback(TensorBoard(log_dir='./graph',
                                         batch_size=args.batch))
    history = trainer.run()
    dump_graph(history, 'loss_graph.png')
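# The argument parser that feeds this entry point is not shown in this
# section. A minimal sketch covering the flags `main` consumes might look
# like the following; the flag names match the attributes used above, but
# the defaults are assumptions, not the project's actual CLI.
def parse_args():
    import argparse
    parser = argparse.ArgumentParser(description='Train FastQA on SQuAD')
    parser.add_argument('--vocab_file', required=True)
    parser.add_argument('--train_path', required=True)
    parser.add_argument('--dev_path', required=True)
    parser.add_argument('--answer_path', required=True)
    parser.add_argument('--embed', type=int, default=300)
    parser.add_argument('--hidden', type=int, default=300)
    parser.add_argument('--q_len', type=int, default=50)
    parser.add_argument('--c_len', type=int, default=400)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch', type=int, default=32)
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--steps', type=int, default=1000)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--without_feature', action='store_true')
    parser.add_argument('--use_tensorboard', action='store_true')
    return parser.parse_args()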
def main(args):
    token_to_index, _ = Vocabulary.load(args.vocab_file)

    model = FastQA(len(token_to_index), args.embed, args.hidden,
                   question_limit=args.q_len, context_limit=args.c_len,
                   with_feature=not args.without_feature).build()
    model.load_weights(args.model_path)

    test_dataset = SquadReader(args.test_path)
    tokenizer = get_tokenizer(lower=args.lower, as_str=False)
    converter = SquadEvalConverter(token_to_index, PAD_TOKEN, UNK_TOKEN,
                                   tokenizer, question_max_len=args.q_len,
                                   context_max_len=args.c_len)
    test_generator = Iterator(test_dataset, args.batch, converter,
                              repeat=False, shuffle=False)

    # Decode each predicted (start, end) index pair back into the answer
    # string by joining the corresponding context tokens.
    predictions = {}
    for inputs, (contexts, ids) in test_generator:
        _, _, start_indices, end_indices = model.predict_on_batch(inputs)
        for i, (start, end) in enumerate(zip(start_indices, end_indices)):
            prediction = ' '.join(contexts[i][j] for j in range(start, end + 1))
            predictions[ids[i]] = prediction

    basename = osp.splitext(osp.basename(args.model_path))[0]
    save_path = osp.join(args.save_dir, f'predictions_{basename}.json')
    with open(save_path, 'w') as f:
        json.dump(predictions, f, indent=2)
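# The saved file maps each SQuAD question id to its predicted answer
# string, the shape the official SQuAD evaluation script consumes
# (illustrative placeholder values, not real ids):
#
#   {
#     "<question-id-1>": "<predicted answer 1>",
#     "<question-id-2>": "<predicted answer 2>"
#   }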
def test_load(self):
    filename = '/path/to/vocab.pkl'
    open_ = patch('data.open', mock_open()).start()
    pickle_load = patch('data.pickle.load').start()
    pickle_load.return_value = ('token_to_index', 'index_to_token')
    self.addCleanup(patch.stopall)  # stop the patches even if an assertion fails

    token_to_index, index_to_token = Vocabulary.load(filename)

    self.assertEqual(token_to_index, 'token_to_index')
    self.assertEqual(index_to_token, 'index_to_token')
    open_.assert_called_with(filename, mode='rb')
    pickle_load.assert_called_with(open_.return_value)
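# The test pins down the behaviour of Vocabulary.load: the file is opened
# in binary mode and a (token_to_index, index_to_token) tuple is unpickled
# from it. A minimal implementation consistent with these assertions (a
# sketch, not necessarily the project's actual code):
import pickle

class Vocabulary:
    @classmethod
    def load(cls, filename):
        with open(filename, mode='rb') as f:
            return pickle.load(f)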
def main(args):
    token_to_index, index_to_token = Vocabulary.load(args.vocab_file)

    # Load pre-trained embeddings saved next to the vocabulary file, if any.
    root, _ = os.path.splitext(args.vocab_file)
    basepath, basename = os.path.split(root)
    embed_path = f'{basepath}/embedding_{basename}.npy'
    embeddings = np.load(embed_path) if os.path.exists(embed_path) else None

    batch_size = args.batch  # Batch size for training.
    epochs = args.epoch  # Number of epochs to train for.

    converter = SquadDepConverter(token_to_index, PAD_TOKEN, UNK_TOKEN)

    if args.model == 'qanet':
        model = DependencyQANet(len(token_to_index), args.embed,
                                len(converter._dep_to_index), args.hidden,
                                args.num_heads, dropout=args.dropout,
                                num_blocks=args.encoder_layer,
                                num_convs=args.encoder_conv,
                                embeddings=embeddings).build()
    elif args.model == 'lstm':
        model = DependencyLSTM(len(token_to_index), args.embed,
                               len(converter._dep_to_index), args.hidden,
                               dropout=args.dropout,
                               embeddings=embeddings).build()
    else:
        raise ValueError(f'unknown model: {args.model}')
    opt = Adam(lr=0.001, beta_1=0.8, beta_2=0.999, epsilon=1e-7, clipnorm=5.)
    model.compile(optimizer=opt,
                  loss=['sparse_categorical_crossentropy'],
                  metrics=['sparse_categorical_accuracy'])

    train_dataset = SquadReader(args.train_path)
    dev_dataset = SquadReader(args.dev_path)
    train_generator = Iterator(train_dataset, batch_size, converter)
    dev_generator = Iterator(dev_dataset, batch_size, converter)
    trainer = SquadTrainer(model, train_generator, epochs, dev_generator,
                           './model/dep.{epoch:02d}-{val_loss:.2f}.h5')
    trainer.add_callback(BatchLearningRateScheduler())
    trainer.add_callback(ExponentialMovingAverage(0.999))
    if args.use_tensorboard:
        trainer.add_callback(TensorBoard(log_dir='./graph',
                                         batch_size=batch_size))
    history = trainer.run()
    dump_graph(history, 'loss_graph.png')

    # Report test loss and accuracy with the trained model.
    test_dataset = SquadReader(args.test_path)
    test_generator = Iterator(test_dataset, args.batch, converter,
                              repeat=False, shuffle=False)
    print(model.evaluate_generator(test_generator, steps=len(test_generator)))
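# BatchLearningRateScheduler is not shown in this section. The QANet paper
# warms the learning rate up from 0 to 0.001 over the first 1000 steps with
# an inverse-exponential schedule; a hypothetical Keras callback along those
# lines (a sketch, not the project's actual class) could look like this:
import math

from keras import backend as K
from keras.callbacks import Callback

class WarmupLearningRateScheduler(Callback):
    def __init__(self, base_lr=0.001, warmup_steps=1000):
        super().__init__()
        self.base_lr = base_lr
        self.warmup_steps = warmup_steps
        self.step = 0

    def on_batch_begin(self, batch, logs=None):
        self.step += 1
        if self.step <= self.warmup_steps:
            # lr = min(base_lr, base_lr / log(warmup) * log(step + 1))
            lr = min(self.base_lr,
                     self.base_lr / math.log(self.warmup_steps)
                     * math.log(self.step + 1))
            K.set_value(self.model.optimizer.lr, lr)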
def main(args):
    token_to_index, index_to_token = Vocabulary.load(args.vocab_file)

    root, _ = os.path.splitext(args.vocab_file)
    basepath, basename = os.path.split(root)
    embed_path = f'{basepath}/embedding_{basename}.npy'
    embeddings = np.load(embed_path) if os.path.exists(embed_path) else None

    batch_size = args.batch  # Batch size for training.
    epochs = args.epoch  # Number of epochs to train for.

    model = QANet(len(token_to_index), args.embed, args.hidden, args.num_heads,
                  encoder_num_blocks=args.encoder_layer,
                  encoder_num_convs=args.encoder_conv,
                  output_num_blocks=args.output_layer,
                  output_num_convs=args.output_conv,
                  dropout=args.dropout, embeddings=embeddings).build()
    opt = Adam(lr=0.001, beta_1=0.8, beta_2=0.999, epsilon=1e-7, clipnorm=5.)
    model.compile(optimizer=opt,
                  loss=['sparse_categorical_crossentropy',
                        'sparse_categorical_crossentropy', None, None],
                  loss_weights=[1, 1, 0, 0])

    train_dataset = SquadReader(args.train_path)
    dev_dataset = SquadReader(args.dev_path)
    converter = SquadConverter(token_to_index, PAD_TOKEN, UNK_TOKEN,
                               lower=args.lower)
    train_generator = Iterator(train_dataset, batch_size, converter)
    dev_generator = Iterator(dev_dataset, batch_size, converter)
    trainer = SquadTrainer(model, train_generator, epochs, dev_generator,
                           './model/qanet.{epoch:02d}-{val_loss:.2f}.h5')
    trainer.add_callback(BatchLearningRateScheduler())
    # trainer.add_callback(ExponentialMovingAverage(0.999))
    if args.use_tensorboard:
        trainer.add_callback(TensorBoard(log_dir='./graph',
                                         batch_size=batch_size))
    history = trainer.run()
    dump_graph(history, 'loss_graph.png')
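# ExponentialMovingAverage (commented out above, and used with decay 0.999
# in the dependency trainer) keeps a decayed running average of the model
# weights, which is typically swapped in for evaluation. A hypothetical
# sketch of that idea, not the project's actual implementation:
from keras.callbacks import Callback

class WeightEMA(Callback):
    def __init__(self, decay=0.999):
        super().__init__()
        self.decay = decay
        self.shadow = None  # running average of the weights

    def on_batch_end(self, batch, logs=None):
        weights = self.model.get_weights()
        if self.shadow is None:
            self.shadow = [w.copy() for w in weights]
        else:
            self.shadow = [self.decay * s + (1. - self.decay) * w
                           for s, w in zip(self.shadow, weights)]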
def main(args):
    token_to_index, _ = Vocabulary.load(args.vocab_path)

    if (os.path.exists(args.embed_array_path)
            and os.path.exists(args.embed_dict_path)):
        # Reuse the cached pretrained vocabulary and embedding matrix.
        with open(args.embed_dict_path, 'rb') as f:
            pretrained_token_to_index = pickle.load(f)
        pretrained_embeddings = np.load(args.embed_array_path)
    elif os.path.exists(args.embed_path):
        # First run: convert the raw embedding file into cached .npy form.
        pretrained_token_to_index, pretrained_embeddings = \
            save_word_embedding_as_npy(args.embed_path, args.dim)
    else:
        raise FileNotFoundError('Please download a pre-trained embedding file')

    # Select the rows that correspond to the training vocabulary.
    embeddings = extract_embeddings(token_to_index, pretrained_token_to_index,
                                    pretrained_embeddings)

    root, _ = os.path.splitext(args.vocab_path)
    basepath, basename = os.path.split(root)
    filename = f'{basepath}/embedding_{basename}.npy'
    np.save(filename, embeddings)
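# extract_embeddings builds a matrix whose rows line up with the training
# vocabulary, copying pretrained vectors where available. A minimal sketch
# of that behaviour (assumed; the real function may differ, e.g. in how
# out-of-vocabulary tokens are initialised):
import numpy as np

def extract_embeddings_sketch(token_to_index, pretrained_token_to_index,
                              pretrained_embeddings):
    dim = pretrained_embeddings.shape[1]
    # Randomly initialise every row, then overwrite rows whose token has a
    # pretrained vector.
    matrix = np.random.normal(0., 0.1, (len(token_to_index), dim))
    for token, index in token_to_index.items():
        if token in pretrained_token_to_index:
            matrix[index] = pretrained_embeddings[
                pretrained_token_to_index[token]]
    return matrix.astype('float32')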