Example #1
0
def main(args):
    """Train a FastQA model on SQuAD and plot the training loss history.

    Loads the vocabulary, optionally picks up a pre-computed embedding
    matrix stored next to the vocabulary file, trains with periodic
    F1-based LR scheduling and checkpointing, then dumps a loss graph.
    """
    token_to_index, index_to_token = Vocabulary.load(args.vocab_file)

    # An embedding matrix, if present, lives next to the vocab file as
    # embedding_<vocabname>.npy; fall back to random init otherwise.
    root, _ = os.path.splitext(args.vocab_file)
    basepath, basename = os.path.split(root)
    embed_path = f'{basepath}/embedding_{basename}.npy'
    pretrained = np.load(embed_path) if os.path.exists(embed_path) else None

    model = FastQA(
        len(token_to_index), args.embed, args.hidden,
        question_limit=args.q_len, context_limit=args.c_len,
        dropout=args.dropout, pretrained_embeddings=pretrained,
        with_feature=not args.without_feature).build()
    # Only the two pointer outputs (start/end) contribute to the loss;
    # the remaining outputs are inference-only.
    model.compile(optimizer=Adam(),
                  loss_weights=[1, 1, 0, 0],
                  loss=['sparse_categorical_crossentropy',
                        'sparse_categorical_crossentropy', None, None])

    tokenizer = get_tokenizer(lower=args.lower, as_str=False)
    train_data = SquadReader(args.train_path)
    dev_data = SquadReader(args.dev_path)
    converter = SquadConverter(token_to_index, PAD_TOKEN, UNK_TOKEN,
                               tokenizer,
                               question_max_len=args.q_len,
                               context_max_len=args.c_len)
    eval_converter = SquadEvalConverter(token_to_index, PAD_TOKEN, UNK_TOKEN,
                                        tokenizer,
                                        question_max_len=args.q_len,
                                        context_max_len=args.c_len)
    train_batches = Iterator(train_data, args.batch, converter)
    dev_loss_batches = Iterator(dev_data, args.batch, converter,
                                shuffle=False)
    dev_f1_batches = Iterator(dev_data, args.batch, eval_converter,
                              repeat=False, shuffle=False)

    trainer = SquadTrainer(model, train_batches, args.epoch,
                           dev_loss_batches,
                           './models/fastqa.{epoch:02d}-{val_loss:.2f}.h5')
    trainer.add_callback(FastQALRScheduler(
        dev_f1_batches, val_answer_file=args.answer_path, steps=args.steps))
    trainer.add_callback(
        FastQACheckpoint('./models/fastqa.{steps:06d}.h5', steps=args.steps))
    if args.use_tensorboard:
        trainer.add_callback(TensorBoard(log_dir='./graph',
                                         batch_size=args.batch))

    dump_graph(trainer.run(), 'loss_graph.png')
Example #2
0
def main(args):
    """Run a trained FastQA model on a test set and dump answers to JSON.

    For each question the predicted start/end token indices are turned
    back into a context substring; results are keyed by question id and
    written to predictions_<modelname>.json in args.save_dir.
    """
    token_to_index, _ = Vocabulary.load(args.vocab_file)

    model = FastQA(len(token_to_index), args.embed, args.hidden,
                   question_limit=args.q_len, context_limit=args.c_len,
                   with_feature=not args.without_feature).build()
    model.load_weights(args.model_path)

    tokenizer = get_tokenizer(lower=args.lower, as_str=False)
    converter = SquadEvalConverter(token_to_index, PAD_TOKEN, UNK_TOKEN,
                                   tokenizer,
                                   question_max_len=args.q_len,
                                   context_max_len=args.c_len)
    test_generator = Iterator(SquadReader(args.test_path), args.batch,
                              converter, False, False)

    predictions = {}
    for inputs, (contexts, ids) in test_generator:
        _, _, start_indices, end_indices = model.predict_on_batch(inputs)
        for i, (start, end) in enumerate(zip(start_indices, end_indices)):
            # The predicted span is inclusive of both endpoints.
            answer_tokens = (contexts[i][j] for j in range(start, end + 1))
            predictions[ids[i]] = ' '.join(answer_tokens)

    basename = osp.splitext(osp.basename(args.model_path))[0]
    save_path = osp.join(args.save_dir, f'predictions_{basename}.json')
    with open(save_path, 'w') as f:
        json.dump(predictions, f, indent=2)
Example #3
0
 def test_load(self):
     """Vocabulary.load should unpickle (token_to_index, index_to_token)."""
     filename = '/path/to/vocab.pkl'
     # BUG FIX: patch(...).start() without a matching stop leaks the mocks
     # into every test that runs afterwards; stop them all on teardown.
     self.addCleanup(patch.stopall)
     open_ = patch('data.open', mock_open()).start()
     pickle_load = patch('data.pickle.load').start()
     pickle_load.return_value = ('token_to_index', 'index_to_token')
     token_to_index, index_to_token = Vocabulary.load(filename)
     self.assertEqual(token_to_index, 'token_to_index')
     self.assertEqual(index_to_token, 'index_to_token')
     # The file must be opened in binary mode and handed to pickle.load.
     open_.assert_called_with(filename, mode='rb')
     pickle_load.assert_called_with(open_.return_value)
Example #4
0
def main(args):
    """Train a dependency-prediction model (QANet or LSTM) on SQuAD data.

    Loads the vocabulary and an optional pre-computed embedding matrix,
    builds the model selected by ``args.model``, trains it, plots the
    loss history, and finally evaluates on the test set.

    Raises:
        ValueError: if ``args.model`` is not 'qanet' or 'lstm'.
    """
    token_to_index, index_to_token = Vocabulary.load(args.vocab_file)

    # An embedding matrix, if present, lives next to the vocab file as
    # embedding_<vocabname>.npy; fall back to random init otherwise.
    root, _ = os.path.splitext(args.vocab_file)
    basepath, basename = os.path.split(root)
    embed_path = f'{basepath}/embedding_{basename}.npy'
    embeddings = np.load(embed_path) if os.path.exists(embed_path) else None

    batch_size = args.batch  # Batch size for training.
    epochs = args.epoch  # Number of epochs to train for.
    converter = SquadDepConverter(token_to_index, PAD_TOKEN, UNK_TOKEN)

    if args.model == 'qanet':
        model = DependencyQANet(len(token_to_index),
                                args.embed,
                                len(converter._dep_to_index),
                                args.hidden,
                                args.num_heads,
                                dropout=args.dropout,
                                num_blocks=args.encoder_layer,
                                num_convs=args.encoder_conv,
                                embeddings=embeddings).build()
    elif args.model == 'lstm':
        model = DependencyLSTM(len(token_to_index),
                               args.embed,
                               len(converter._dep_to_index),
                               args.hidden,
                               dropout=args.dropout,
                               embeddings=embeddings).build()
    else:
        # BUG FIX: previously an unknown model name left `model` unbound and
        # crashed later with a confusing NameError; fail fast instead.
        raise ValueError(f"unknown model '{args.model}', expected 'qanet' or 'lstm'")

    opt = Adam(lr=0.001, beta_1=0.8, beta_2=0.999, epsilon=1e-7, clipnorm=5.)
    model.compile(optimizer=opt,
                  loss=['sparse_categorical_crossentropy'],
                  metrics=['sparse_categorical_accuracy'])
    train_dataset = SquadReader(args.train_path)
    dev_dataset = SquadReader(args.dev_path)
    train_generator = Iterator(train_dataset, batch_size, converter)
    dev_generator = Iterator(dev_dataset, batch_size, converter)
    trainer = SquadTrainer(model, train_generator, epochs, dev_generator,
                           './model/dep.{epoch:02d}-{val_loss:.2f}.h5')
    trainer.add_callback(BatchLearningRateScheduler())
    trainer.add_callback(ExponentialMovingAverage(0.999))
    if args.use_tensorboard:
        trainer.add_callback(
            TensorBoard(log_dir='./graph', batch_size=batch_size))
    history = trainer.run()
    dump_graph(history, 'loss_graph.png')

    # Final evaluation on the held-out test set (no repeat, no shuffle).
    test_dataset = SquadReader(args.test_path)
    test_generator = Iterator(test_dataset, args.batch, converter, False,
                              False)
    print(model.evaluate_generator(test_generator, steps=len(test_generator)))
Example #5
0
def main(args):
    """Train a QANet model on SQuAD and plot the training loss history."""
    token_to_index, index_to_token = Vocabulary.load(args.vocab_file)

    # An embedding matrix, if present, lives next to the vocab file as
    # embedding_<vocabname>.npy; fall back to random init otherwise.
    root, _ = os.path.splitext(args.vocab_file)
    basepath, basename = os.path.split(root)
    embed_path = f'{basepath}/embedding_{basename}.npy'
    pretrained = np.load(embed_path) if os.path.exists(embed_path) else None

    batch_size = args.batch  # Batch size for training.
    epochs = args.epoch  # Number of epochs to train for.

    model = QANet(len(token_to_index),
                  args.embed,
                  args.hidden,
                  args.num_heads,
                  encoder_num_blocks=args.encoder_layer,
                  encoder_num_convs=args.encoder_conv,
                  output_num_blocks=args.output_layer,
                  output_num_convs=args.output_conv,
                  dropout=args.dropout,
                  embeddings=pretrained).build()
    # Optimizer settings follow the QANet paper; clipnorm guards against
    # exploding gradients. Only the start/end outputs carry loss weight.
    optimizer = Adam(lr=0.001, beta_1=0.8, beta_2=0.999, epsilon=1e-7,
                     clipnorm=5.)
    model.compile(optimizer=optimizer,
                  loss=['sparse_categorical_crossentropy',
                        'sparse_categorical_crossentropy', None, None],
                  loss_weights=[1, 1, 0, 0])

    converter = SquadConverter(token_to_index,
                               PAD_TOKEN,
                               UNK_TOKEN,
                               lower=args.lower)
    train_batches = Iterator(SquadReader(args.train_path), batch_size,
                             converter)
    dev_batches = Iterator(SquadReader(args.dev_path), batch_size, converter)

    trainer = SquadTrainer(model, train_batches, epochs, dev_batches,
                           './model/qanet.{epoch:02d}-{val_loss:.2f}.h5')
    trainer.add_callback(BatchLearningRateScheduler())
    # NOTE: ExponentialMovingAverage(0.999) is intentionally disabled here.
    if args.use_tensorboard:
        trainer.add_callback(
            TensorBoard(log_dir='./graph', batch_size=batch_size))

    dump_graph(trainer.run(), 'loss_graph.png')
Example #6
0
def main(args):
    """Build a vocabulary-aligned embedding matrix and save it as .npy.

    Obtains the pre-trained embeddings either from previously converted
    .npy/.pkl files or by converting the raw embedding text file, extracts
    the rows matching the vocabulary, and saves the result next to the
    vocabulary file as embedding_<vocabname>.npy.

    Raises:
        FileNotFoundError: if no pre-trained embedding source is available.
    """
    token_to_index, _ = Vocabulary.load(args.vocab_path)

    if os.path.exists(args.embed_array_path) and os.path.exists(
            args.embed_dict_path):
        # Re-use the previously converted pre-trained embeddings.
        with open(args.embed_dict_path, 'rb') as f:
            pretrained_token_to_index = pickle.load(f)
        pretrained_embeddings = np.load(args.embed_array_path)
    elif os.path.exists(args.embed_path):
        # First run: convert the raw embedding text file to .npy/.pkl once.
        pretrained_token_to_index, pretrained_embeddings = \
            save_word_embedding_as_npy(args.embed_path, args.dim)
    else:
        raise FileNotFoundError(
            'Please download pre-trained embedding file')

    # BUG FIX: previously the conversion path saved the FULL pre-trained
    # matrix unmodified, so the saved file was not aligned with (or sized
    # for) the vocabulary; now both paths extract the vocabulary rows.
    embeddings = extract_embeddings(token_to_index,
                                    pretrained_token_to_index,
                                    pretrained_embeddings)

    root, _ = os.path.splitext(args.vocab_path)
    basepath, basename = os.path.split(root)
    filename = f'{basepath}/embedding_{basename}.npy'
    np.save(filename, embeddings)