def evaluate(args):
    """Score a saved BiGruCrf checkpoint on ``test.tsv`` and print chunk metrics.

    The test split and the ``word.dic`` / ``tag.dic`` / ``q2b.dic``
    vocabularies are read from ``args.data_dir``; the model weights are read
    from ``args.init_checkpoint``. Precision, recall and F1 accumulated by a
    ``ChunkEvaluator`` over the whole split are printed to stdout.

    Args:
        args: Options object. The attributes read here are ``device``,
            ``data_dir``, ``max_seq_len``, ``batch_size``, ``emb_dim``,
            ``hidden_size`` and ``init_checkpoint``.
    """
    paddle.set_device(args.device)

    # Create dataset. A single path (not a tuple) is passed on purpose, so a
    # single dataset object is expected back.
    test_ds = load_dataset(datafiles=os.path.join(args.data_dir, 'test.tsv'))
    word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic'))
    label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic'))
    # q2b.dic is used to replace DBC-case characters with SBC-case ones.
    normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic'))

    trans_func = partial(
        convert_example,
        max_seq_len=args.max_seq_len,
        word_vocab=word_vocab,
        label_vocab=label_vocab,
        normlize_vocab=normlize_vocab)
    test_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
        Pad(axis=0, pad_val=0, dtype='int64'),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader. Order is kept and no batch is dropped so
    # every test sample contributes to the metric.
    test_sampler = paddle.io.BatchSampler(
        dataset=test_ds,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    test_loader = paddle.io.DataLoader(
        dataset=test_ds,
        batch_sampler=test_sampler,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    model = BiGruCrf(args.emb_dim, args.hidden_size,
                     len(word_vocab), len(label_vocab))
    chunk_evaluator = ChunkEvaluator(
        label_list=label_vocab.keys(), suffix=True)

    # Load the model and start predicting
    model_dict = paddle.load(args.init_checkpoint)
    model.load_dict(model_dict)

    model.eval()
    chunk_evaluator.reset()
    # Evaluation never calls backward(), so skip autograd bookkeeping for the
    # forward passes.
    with paddle.no_grad():
        for token_ids, length, labels in test_loader:
            preds = model(token_ids, length)
            num_infer_chunks, num_label_chunks, num_correct_chunks = (
                chunk_evaluator.compute(length, preds, labels))
            chunk_evaluator.update(num_infer_chunks.numpy(),
                                   num_label_chunks.numpy(),
                                   num_correct_chunks.numpy())

    precision, recall, f1_score = chunk_evaluator.accumulate()
    print("eval precision: %f, recall: %f, f1: %f" %
          (precision, recall, f1_score))
def train(args):
    """Train a BiGruCrf tagger on ``train.tsv``, checkpointing as it goes.

    Every ``args.logging_steps`` steps the mean loss and step rate are
    printed. Every ``args.save_steps`` steps, and on the final step, the
    process whose distributed rank is 0 calls ``evaluate`` on the test loader
    and then writes ``model_<global_step>.pdparams`` into
    ``args.model_save_dir``.

    Args:
        args: Options object. The attributes read here are ``device``,
            ``data_dir``, ``max_seq_len``, ``batch_size``, ``emb_dim``,
            ``hidden_size``, ``base_lr``, ``init_checkpoint``, ``epochs``,
            ``logging_steps``, ``save_steps`` and ``model_save_dir``.
    """
    paddle.set_device(args.device)

    def data_path(file_name):
        # Resolve a file that lives directly under the data directory.
        return os.path.join(args.data_dir, file_name)

    # Create dataset.
    train_ds, test_ds = load_dataset(
        datafiles=(data_path('train.tsv'), data_path('test.tsv')))
    word_vocab = load_vocab(data_path('word.dic'))
    label_vocab = load_vocab(data_path('tag.dic'))
    # q2b.dic is used to replace DBC-case characters with SBC-case ones.
    normlize_vocab = load_vocab(data_path('q2b.dic'))

    trans_func = partial(convert_example,
                         max_seq_len=args.max_seq_len,
                         word_vocab=word_vocab,
                         label_vocab=label_vocab,
                         normlize_vocab=normlize_vocab)
    for split in (train_ds, test_ds):
        split.map(trans_func)

    collate = Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
        Pad(axis=0, pad_val=0, dtype='int64'),  # label_ids
    )

    def batchify_fn(samples):
        return collate(samples)

    # Create sampler for dataloader
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(dataset=train_ds,
                                        batch_sampler=train_sampler,
                                        return_list=True,
                                        collate_fn=batchify_fn)

    test_sampler = paddle.io.BatchSampler(dataset=test_ds,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_sampler=test_sampler,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and its loss
    model = BiGruCrf(args.emb_dim, args.hidden_size,
                     len(word_vocab), len(label_vocab))

    # Prepare optimizer, loss and metric evaluator
    optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr,
                                      parameters=model.parameters())
    chunk_evaluator = ChunkEvaluator(
        label_list=label_vocab.keys(), suffix=True)

    if args.init_checkpoint:
        model.load_dict(paddle.load(args.init_checkpoint))

    # Start training
    global_step = 0
    last_step = args.epochs * len(train_loader)
    tic_train = time.time()
    for _ in range(args.epochs):
        for token_ids, length, label_ids in train_loader:
            global_step += 1
            avg_loss = paddle.mean(model(token_ids, length, label_ids))

            if global_step % args.logging_steps == 0:
                print("global step %d / %d, loss: %f, speed: %.2f step/s" %
                      (global_step, last_step, avg_loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            avg_loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            # The rank is only queried once a save point is reached.
            at_save_point = (global_step % args.save_steps == 0
                             or global_step == last_step)
            if at_save_point and paddle.distributed.get_rank() == 0:
                evaluate(model, chunk_evaluator, test_loader)
                checkpoint_name = "model_%d.pdparams" % global_step
                paddle.save(
                    model.state_dict(),
                    os.path.join(args.model_save_dir, checkpoint_name))
def train(args):
    """Train a BiGruCrf tagger using loaders built by ``create_data_loader``.

    Initialises the parallel environment when more than one trainer is
    present, optionally warm-starts from ``args.init_checkpoint``, and runs
    ``args.epochs`` passes over the training loader. Timing statistics are
    logged every ``args.logging_steps`` steps. Every ``args.save_steps`` steps
    and on the final step, rank 0 saves ``model_<global_step>.pdparams`` into
    ``args.model_save_dir``; when ``args.do_eval`` is set it also evaluates on
    the test loader and saves ``best_model.pdparams`` whenever F1 improves.

    Args:
        args: Options object. The attributes read here are ``device``,
            ``emb_dim``, ``hidden_size``, ``crf_lr``, ``base_lr``,
            ``init_checkpoint``, ``epochs``, ``batch_size``,
            ``logging_steps``, ``save_steps``, ``model_save_dir`` and
            ``do_eval``; the whole object is also handed to
            ``create_data_loader``.
    """
    paddle.set_device(args.device)
    set_seed(102)
    trainer_num = paddle.distributed.get_world_size()
    if trainer_num > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    word_vocab, label_vocab, train_loader, test_loader = create_data_loader(
        args)

    # Define the model network and its loss
    model = BiGruCrf(args.emb_dim,
                     args.hidden_size,
                     len(word_vocab),
                     len(label_vocab),
                     crf_lr=args.crf_lr)

    # Prepare optimizer, loss and metric evaluator
    optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr,
                                      parameters=model.parameters())
    chunk_evaluator = ChunkEvaluator(
        label_list=label_vocab.keys(), suffix=True)

    if args.init_checkpoint:
        if os.path.exists(args.init_checkpoint):
            logger.info("Init checkpoint from %s" % args.init_checkpoint)
            model_dict = paddle.load(args.init_checkpoint)
            model.load_dict(model_dict)
        else:
            # Training continues from scratch, which the user did not ask
            # for, so surface this above plain info messages.
            logger.warning(
                "Cannot init checkpoint from %s which doesn't exist" %
                args.init_checkpoint)
    logger.info("Start training")

    # Start training
    global_step = 0
    last_step = args.epochs * len(train_loader)
    train_reader_cost = 0.0
    train_run_cost = 0.0
    total_samples = 0
    reader_start = time.time()
    max_f1_score = -1
    for _ in range(args.epochs):
        for batch in train_loader:
            train_reader_cost += time.time() - reader_start
            global_step += 1
            token_ids, length, label_ids = batch

            # Time the complete optimisation step (forward, backward and
            # parameter update) so avg_batch_cost and ips describe real
            # training throughput rather than the forward pass alone.
            train_start = time.time()
            loss = model(token_ids, length, label_ids)
            avg_loss = paddle.mean(loss)
            avg_loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            train_run_cost += time.time() - train_start
            total_samples += args.batch_size

            if global_step % args.logging_steps == 0:
                batch_cost = train_reader_cost + train_run_cost
                logger.info(
                    "global step %d / %d, loss: %f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec"
                    % (global_step, last_step, avg_loss,
                       train_reader_cost / args.logging_steps,
                       batch_cost / args.logging_steps,
                       total_samples / args.logging_steps,
                       total_samples / batch_cost))
                # Statistics are per logging window; start a fresh one.
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0

            if global_step % args.save_steps == 0 or global_step == last_step:
                if rank == 0:
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.model_save_dir,
                                     "model_%d.pdparams" % global_step))
                    logger.info("Save %d steps model." % (global_step))
                    if args.do_eval:
                        _, _, f1_score = evaluate(model, chunk_evaluator,
                                                  test_loader)
                        if f1_score > max_f1_score:
                            max_f1_score = f1_score
                            paddle.save(
                                model.state_dict(),
                                os.path.join(args.model_save_dir,
                                             "best_model.pdparams"))
                            logger.info("Save best model.")

            # Restart the reader clock last so saving/evaluation time is not
            # charged to data loading.
            reader_start = time.time()
def infer(args):
    """Tag every sentence in ``infer.tsv`` and write the result to ``results.txt``.

    The input split and the ``word.dic`` / ``tag.dic`` / ``q2b.dic``
    vocabularies are read from ``args.data_dir``; the model weights are read
    from ``args.init_checkpoint``. Each sentence becomes one output line made
    of concatenated ``(char, tag)`` pairs. The file is written to
    ``results.txt`` in the current working directory and the first ten lines
    are echoed to stdout.

    Args:
        args: Options object. The attributes read here are ``device``,
            ``data_dir``, ``max_seq_len``, ``batch_size``, ``emb_dim``,
            ``hidden_size`` and ``init_checkpoint``.
    """
    paddle.set_device(args.device)

    # Create dataset. A single path (not a tuple) is passed on purpose, so a
    # single dataset object is expected back.
    infer_ds = load_dataset(datafiles=os.path.join(args.data_dir, 'infer.tsv'))
    word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic'))
    label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic'))
    # q2b.dic is used to replace DBC-case characters with SBC-case ones.
    normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic'))

    trans_func = partial(
        convert_example,
        max_seq_len=args.max_seq_len,
        word_vocab=word_vocab,
        label_vocab=label_vocab,
        normlize_vocab=normlize_vocab)
    infer_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
    ): fn(samples)

    # Create sampler for dataloader. Order is kept and no batch is dropped so
    # output lines follow the input order.
    infer_sampler = paddle.io.BatchSampler(
        dataset=infer_ds,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    infer_loader = paddle.io.DataLoader(
        dataset=infer_ds,
        batch_sampler=infer_sampler,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network
    model = BiGruCrf(args.emb_dim, args.hidden_size,
                     len(word_vocab), len(label_vocab))

    # Load the model and start predicting
    model_dict = paddle.load(args.init_checkpoint)
    model.load_dict(model_dict)

    model.eval()
    results = []
    # Inference never calls backward(), so skip autograd bookkeeping for the
    # forward passes.
    with paddle.no_grad():
        for token_ids, length in infer_loader:
            preds = model(token_ids, length)
            result = parse_result(token_ids.numpy(), preds.numpy(),
                                  length.numpy(), word_vocab, label_vocab)
            results += result

    # One output line per sentence: "(ch, tag)" pairs joined without a
    # separator.
    sent_tags = [
        ''.join('(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags))
        for sent, tags in results
    ]

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(sent_tags))

    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(sent_tags[:10]))
def train(args):
    """Train a BiGruCrf tagger on ``train.tsv`` with periodic save and evaluation.

    Initialises the parallel environment when more than one trainer is
    present, builds the train/test loaders from files under
    ``args.data_dir``, optionally warm-starts from ``args.init_checkpoint``,
    and runs ``args.epochs`` passes over the training loader. Timing
    statistics are logged every ``args.logging_steps`` steps. Every
    ``args.save_steps`` steps and on the final step, rank 0 saves
    ``model_<global_step>.pdparams`` into ``args.model_save_dir``; when
    ``args.do_eval`` is set it also evaluates on the test loader and saves
    ``best_model.pdparams`` whenever F1 improves.

    Args:
        args: Options object. The attributes read here are ``device``,
            ``data_dir``, ``max_seq_len``, ``batch_size``, ``emb_dim``,
            ``hidden_size``, ``crf_lr``, ``base_lr``, ``init_checkpoint``,
            ``epochs``, ``logging_steps``, ``save_steps``,
            ``model_save_dir`` and ``do_eval``.
    """
    paddle.set_device(args.device)
    trainer_num = paddle.distributed.get_world_size()
    if trainer_num > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    # Create dataset.
    train_ds, test_ds = load_dataset(
        datafiles=(os.path.join(args.data_dir, 'train.tsv'),
                   os.path.join(args.data_dir, 'test.tsv')))
    word_vocab = load_vocab(os.path.join(args.data_dir, 'word.dic'))
    label_vocab = load_vocab(os.path.join(args.data_dir, 'tag.dic'))
    # q2b.dic is used to replace DBC-case characters with SBC-case ones.
    normlize_vocab = load_vocab(os.path.join(args.data_dir, 'q2b.dic'))

    trans_func = partial(convert_example,
                         max_seq_len=args.max_seq_len,
                         word_vocab=word_vocab,
                         label_vocab=label_vocab,
                         normlize_vocab=normlize_vocab)
    train_ds.map(trans_func)
    test_ds.map(trans_func)

    # Pad values come from the vocabularies ("[PAD]" for words, "O" for
    # labels) and fall back to id 0 when the entry is absent.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=word_vocab.get("[PAD]", 0), dtype='int64'
            ),  # word_ids
        Stack(dtype='int64'),  # length
        Pad(axis=0, pad_val=label_vocab.get("O", 0), dtype='int64'
            ),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(dataset=train_ds,
                                        batch_sampler=train_sampler,
                                        return_list=True,
                                        collate_fn=batchify_fn)

    test_sampler = paddle.io.BatchSampler(dataset=test_ds,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_sampler=test_sampler,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and its loss
    model = BiGruCrf(args.emb_dim,
                     args.hidden_size,
                     len(word_vocab),
                     len(label_vocab),
                     crf_lr=args.crf_lr)

    # Prepare optimizer, loss and metric evaluator
    optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr,
                                      parameters=model.parameters())
    chunk_evaluator = ChunkEvaluator(
        label_list=label_vocab.keys(), suffix=True)

    if args.init_checkpoint:
        if os.path.exists(args.init_checkpoint):
            logger.info("Init checkpoint from %s" % args.init_checkpoint)
            model_dict = paddle.load(args.init_checkpoint)
            model.load_dict(model_dict)
        else:
            # Training continues from scratch, which the user did not ask
            # for, so surface this above plain info messages.
            logger.warning(
                "Cannot init checkpoint from %s which doesn't exist" %
                args.init_checkpoint)
    logger.info("Start training")

    # Start training
    global_step = 0
    last_step = args.epochs * len(train_loader)
    train_reader_cost = 0.0
    train_run_cost = 0.0
    total_samples = 0
    reader_start = time.time()
    max_f1_score = -1
    for _ in range(args.epochs):
        for batch in train_loader:
            train_reader_cost += time.time() - reader_start
            global_step += 1
            token_ids, length, label_ids = batch

            # Time the complete optimisation step (forward, backward and
            # parameter update) so avg_batch_cost and ips describe real
            # training throughput rather than the forward pass alone.
            train_start = time.time()
            loss = model(token_ids, length, label_ids)
            avg_loss = paddle.mean(loss)
            avg_loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            train_run_cost += time.time() - train_start
            total_samples += args.batch_size

            if global_step % args.logging_steps == 0:
                batch_cost = train_reader_cost + train_run_cost
                logger.info(
                    "global step %d / %d, loss: %f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec"
                    % (global_step, last_step, avg_loss,
                       train_reader_cost / args.logging_steps,
                       batch_cost / args.logging_steps,
                       total_samples / args.logging_steps,
                       total_samples / batch_cost))
                # Statistics are per logging window; start a fresh one.
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0

            if global_step % args.save_steps == 0 or global_step == last_step:
                if rank == 0:
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.model_save_dir,
                                     "model_%d.pdparams" % global_step))
                    logger.info("Save %d steps model." % (global_step))
                    if args.do_eval:
                        _, _, f1_score = evaluate(model, chunk_evaluator,
                                                  test_loader)
                        if f1_score > max_f1_score:
                            max_f1_score = f1_score
                            paddle.save(
                                model.state_dict(),
                                os.path.join(args.model_save_dir,
                                             "best_model.pdparams"))
                            logger.info("Save best model.")

            # Restart the reader clock last so saving/evaluation time is not
            # charged to data loading.
            reader_start = time.time()