Ejemplo n.º 1
0
def evaluate(args):
    """Evaluate a saved BiGruCrf checkpoint on ``test.tsv`` and print chunk metrics.

    Args:
        args: Configuration object (presumably an ``argparse.Namespace``)
            providing ``device``, ``data_dir``, ``max_seq_len``,
            ``batch_size``, ``emb_dim``, ``hidden_size`` and
            ``init_checkpoint``.

    Returns:
        None. Precision, recall and F1 accumulated over the whole test set
        are printed to stdout.
    """
    paddle.set_device(args.device)

    # Create dataset and load the vocabularies stored next to it.
    test_ds = load_dataset(datafiles=(os.path.join(args.data_dir, 'test.tsv')))
    word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic'))
    label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic'))
    # q2b.dic is used to replace DBC case to SBC case
    normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic'))

    trans_func = partial(
        convert_example,
        max_seq_len=args.max_seq_len,
        word_vocab=word_vocab,
        label_vocab=label_vocab,
        normlize_vocab=normlize_vocab)
    test_ds.map(trans_func)

    # Collate one batch: pad the two id sequences with 0 and stack the lengths.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
        Pad(axis=0, pad_val=0, dtype='int64'),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader; keep the original order and every sample.
    test_sampler = paddle.io.BatchSampler(
        dataset=test_ds,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    test_loader = paddle.io.DataLoader(
        dataset=test_ds,
        batch_sampler=test_sampler,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    model = BiGruCrf(args.emb_dim, args.hidden_size,
                     len(word_vocab), len(label_vocab))
    chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)

    # Load the model and start predicting
    model_dict = paddle.load(args.init_checkpoint)
    model.load_dict(model_dict)

    model.eval()
    chunk_evaluator.reset()
    # No gradients are needed for evaluation, so skip autograd bookkeeping.
    with paddle.no_grad():
        for batch in test_loader:
            token_ids, length, labels = batch
            preds = model(token_ids, length)
            num_infer_chunks, num_label_chunks, num_correct_chunks = chunk_evaluator.compute(
                length, preds, labels)
            chunk_evaluator.update(num_infer_chunks.numpy(),
                                   num_label_chunks.numpy(),
                                   num_correct_chunks.numpy())

    # Accumulate once, after all batches were counted. Doing this outside the
    # loop avoids recomputing the metrics per batch and keeps the names bound
    # even when the loader yields no batch (previously a NameError).
    precision, recall, f1_score = chunk_evaluator.accumulate()
    print("eval precision: %f, recall: %f, f1: %f" %
          (precision, recall, f1_score))
Ejemplo n.º 2
0
def main():
    """Export a trained BiGruCrf checkpoint as a static-graph model.

    Takes no parameters; configuration is read from the module-level ``args``
    object (``data_dir``, ``emb_dim``, ``hidden_size``, ``params_path`` and
    ``output_path``). The converted model is saved to ``args.output_path``.
    """
    data_dir = args.data_dir
    # The vocabularies are only needed for their sizes.
    vocab_size = len(load_vocab(os.path.join(data_dir, 'word.dic')))
    num_labels = len(load_vocab(os.path.join(data_dir, 'tag.dic')))

    # Rebuild the network and restore the trained parameters.
    network = BiGruCrf(args.emb_dim, args.hidden_size, vocab_size, num_labels)
    network.set_dict(paddle.load(args.params_path))
    network.eval()

    # Input signature of the exported model: two int64 inputs with
    # unconstrained (None) dimensions.
    token_ids_spec = InputSpec(
        shape=[None, None], dtype="int64", name='token_ids')
    length_spec = InputSpec(shape=[None], dtype="int64", name='length')

    static_network = paddle.jit.to_static(
        network, input_spec=[token_ids_spec, length_spec])

    # Save in static graph model.
    paddle.jit.save(static_network, args.output_path)
Ejemplo n.º 3
0
def infer(args):
    """Tag ``infer.tsv`` with a saved BiGruCrf checkpoint and write the result.

    Args:
        args: Configuration object (presumably an ``argparse.Namespace``)
            providing ``device``, ``data_dir``, ``max_seq_len``,
            ``batch_size``, ``emb_dim``, ``hidden_size`` and
            ``init_checkpoint``.

    Returns:
        None. One line per parsed sentence, formatted as concatenated
        ``(token, tag)`` pairs, is written to ``results.txt`` (relative to
        the current working directory); the first ten lines are also printed.
    """
    paddle.set_device(args.device)

    # Create dataset and load the vocabularies stored next to it.
    infer_ds = load_dataset(datafiles=(os.path.join(args.data_dir,
                                                    'infer.tsv')))
    word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic'))
    label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic'))
    # q2b.dic is used to replace DBC case to SBC case
    normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic'))

    trans_func = partial(
        convert_example,
        max_seq_len=args.max_seq_len,
        word_vocab=word_vocab,
        label_vocab=label_vocab,
        normlize_vocab=normlize_vocab)
    infer_ds.map(trans_func)

    # Collate one batch: pad the word ids with 0 and stack the lengths.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
    ): fn(samples)

    # Create sampler for dataloader; keep the original order and every sample.
    infer_sampler = paddle.io.BatchSampler(
        dataset=infer_ds,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    infer_loader = paddle.io.DataLoader(
        dataset=infer_ds,
        batch_sampler=infer_sampler,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network
    model = BiGruCrf(args.emb_dim, args.hidden_size,
                     len(word_vocab), len(label_vocab))

    # Load the model and start predicting
    model_dict = paddle.load(args.init_checkpoint)
    model.load_dict(model_dict)

    model.eval()
    results = []
    # No gradients are needed for inference, so skip autograd bookkeeping.
    with paddle.no_grad():
        for batch in infer_loader:
            token_ids, length = batch
            preds = model(token_ids, length)
            result = parse_result(token_ids.numpy(),
                                  preds.numpy(),
                                  length.numpy(), word_vocab, label_vocab)
            results.extend(result)

    # Render each (sentence, tags) pair as "(token, tag)(token, tag)...".
    sent_tags = []
    for sent, tags in results:
        sent_tag = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
        sent_tags.append(''.join(sent_tag))

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(sent_tags))

    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(sent_tags[:10]))