def __init__(self, model_name="wordtag", term_linking=True, tag_path=None): """Initialize method of the predictor. Args: model_name (`str`): The pre-trained model name. tag_path (`str`): The tag vocab path. """ term_schema_path = self._download_termtree("termtree_type.csv") term_data_path = self._download_termtree("termtree.rawbase") if tag_path is None: tag_path = self._download_termtree("termtree_tags.txt") self._tags_to_index, self._index_to_tags = self._load_labels(tag_path) self._model = ErnieCtmWordtagModel.from_pretrained( model_name, num_cls_label=4, num_tag=len(self._tags_to_index), ignore_index=self._tags_to_index["O"]) self._model.eval() self._tokenizer = ErnieCtmTokenizer.from_pretrained(model_name) self._summary_num = self._model.ernie_ctm.content_summary_index + 1 if term_schema_path is not None: self._term_schema = self._load_schema(term_schema_path) if term_data_path is not None: self._term_dict = self._load_term_tree_data(term_data_path) if term_data_path is not None and term_schema_path is not None and term_linking: self._linking = True else: self._linking = False
def __init__(self, model_dir, tag_path, linking_path=None):
    """Initialize method of the predictor.

    Args:
        model_dir (`str`): The pre-trained model checkpoint dir.
        tag_path (`str`): The tag vocab path.
        linking_path (`str`, optional): The path of the linking feature file;
            required to enable linking mode. Defaults to ``None``.
    """
    self._tags_to_index, self._index_to_tags = self._load_labels(tag_path)
    self._model = ErnieCtmWordtagModel.from_pretrained(
        model_dir,
        num_cls_label=4,
        num_tag=len(self._tags_to_index),
        ignore_index=self._tags_to_index["O"])
    self._model.eval()
    self._tokenizer = ErnieCtmTokenizer.from_pretrained(model_dir)
    self._summary_num = self._model.ernie_ctm.content_summary_index + 1

    self.linking = False
    if linking_path is not None:
        self.linking_dict = {}
        with open(linking_path, encoding="utf-8") as fp:
            for line in fp:
                data = json.loads(line)
                if data["label"] not in self.linking_dict:
                    self.linking_dict[data["label"]] = []
                self.linking_dict[data["label"]].append({
                    "sid": data["sid"],
                    "cls": paddle.to_tensor(data["cls1"]).unsqueeze(0),
                    "term": paddle.to_tensor(data["term"]).unsqueeze(0)
                })
        self.linking = True
        self.sim_fct = nn.CosineSimilarity(dim=1)
def __init__(self, model_dir, tag_path, term_schema_path=None, term_data_path=None):
    """Initialize method of the predictor.

    Args:
        model_dir (`str`): The pre-trained model checkpoint dir.
        tag_path (`str`): The tag vocab path.
        term_schema_path (`str`, optional): The term schema path; required to
            enable linking mode. Defaults to ``None``.
        term_data_path (`str`, optional): The term data path; required to
            enable linking mode. Defaults to ``None``.
    """
    self._tags_to_index, self._index_to_tags = self._load_labels(tag_path)
    self._model = ErnieCtmWordtagModel.from_pretrained(
        model_dir,
        num_cls_label=4,
        num_tag=len(self._tags_to_index),
        ignore_index=self._tags_to_index["O"])
    self._model.eval()
    self._tokenizer = ErnieCtmTokenizer.from_pretrained(model_dir)
    self._summary_num = self._model.ernie_ctm.content_summary_index + 1

    if term_schema_path is not None:
        self._term_schema = self._load_schema(term_schema_path)
    if term_data_path is not None:
        self._term_dict = self._load_term_tree_data(term_data_path)
    if term_data_path is not None and term_schema_path is not None:
        self._linking = True
    else:
        self._linking = False
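# A minimal usage sketch of the variant directly above. ``WordtagPredictor`` is an
# assumed name for the class these ``__init__`` methods belong to, and the model/data
# paths are placeholders; none of these names come from the snippets in this section.
predictor = WordtagPredictor(model_dir="./wordtag_model",
                             tag_path="./data/tags.txt")  # tagging only, no linking

linking_predictor = WordtagPredictor(
    model_dir="./wordtag_model",
    tag_path="./data/tags.txt",
    term_schema_path="./data/termtree_type.csv",  # supplying both paths enables linking
    term_data_path="./data/termtree.rawbase")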
            if cls_label_can in name_dict:
                result['label'] = cls_label_can
                break
        else:
            labels_can = bk_tree.search_similar_word(label)
            result['label'] = labels_can[0][0]

        result['category'] = name_dict[result['label']]
        results.append(result)

    return results


if __name__ == "__main__":
    paddle.set_device(args.device)

    data = [
        '刘德华',
        '快乐薯片',
        '自适应共振理论映射',
    ]

    model = ErnieCtmNptagModel.from_pretrained("nptag")
    tokenizer = ErnieCtmTokenizer.from_pretrained("nptag")

    if args.params_path and os.path.isfile(args.params_path):
        state_dict = paddle.load(args.params_path)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.params_path)

    results = do_predict(data, model, tokenizer, batch_size=args.batch_size)
    print(results)
def do_train(args):
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(read_custom_data,
                            filename=os.path.join(args.data_dir, "train.txt"),
                            is_test=False,
                            lazy=False)
    dev_ds = load_dataset(read_custom_data,
                          filename=os.path.join(args.data_dir, "dev.txt"),
                          is_test=False,
                          lazy=False)

    tokenizer = ErnieCtmTokenizer.from_pretrained("nptag")
    model = ErnieCtmNptagModel.from_pretrained("nptag")
    vocab_size = model.ernie_ctm.config["vocab_size"]

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_len=args.max_seq_len)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # token_type_ids
        Pad(axis=0, pad_val=-100, dtype='int64'),  # labels
    ): fn(samples)

    train_data_loader = create_dataloader(train_ds,
                                          mode="train",
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode="dev",
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)

    metric = NPTagAccuracy()
    criterion = paddle.nn.CrossEntropyLoss()

    global_step = 0
    for epoch in range(1, args.num_train_epochs + 1):
        logger.info(f"Epoch {epoch} beginning")
        start_time = time.time()
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, labels = batch

            logits = model(input_ids, token_type_ids)
            loss = criterion(logits.reshape([-1, vocab_size]),
                             labels.reshape([-1]))

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            if global_step % args.logging_steps == 0 and rank == 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, loss.numpy().item(), speed))
                start_time = time.time()

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps) and rank == 0:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model._layers.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)

        evaluate(model, metric, criterion, dev_data_loader, vocab_size)
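# A minimal, self-contained sketch of what the ``batchify_fn`` built in the training
# function above does with ``paddlenlp.data.Tuple``/``Pad``: each field of the
# per-sample tuples is padded to the longest sample in the batch. The toy ids and the
# pad value 0 are illustrative only, not values taken from this script.
from paddlenlp.data import Pad, Tuple

toy_batchify_fn = Tuple(
    Pad(axis=0, pad_val=0, dtype='int64'),  # input_ids
    Pad(axis=0, pad_val=0, dtype='int64'),  # token_type_ids
)
samples = [([1, 2, 3], [0, 0, 0]), ([4, 5], [0, 0])]
input_ids, token_type_ids = toy_batchify_fn(samples)
# input_ids is now [[1, 2, 3], [4, 5, 0]]; token_type_ids is padded the same way.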
def do_train(args):
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(read_custom_data,
                            filename=os.path.join(args.data_dir, "train.txt"),
                            is_test=False,
                            lazy=False)
    dev_ds = load_dataset(read_custom_data,
                          filename=os.path.join(args.data_dir, "dev.txt"),
                          is_test=False,
                          lazy=False)

    tags_to_idx = load_dict(os.path.join(args.data_dir, "tags.txt"))

    tokenizer = ErnieCtmTokenizer.from_pretrained("wordtag")
    model = ErnieCtmWordtagModel.from_pretrained("wordtag",
                                                 num_tag=len(tags_to_idx))
    model.crf_loss = LinearChainCrfLoss(
        LinearChainCrf(len(tags_to_idx), 0.1, with_start_stop_tag=False))

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_len=args.max_seq_len,
                         tags_to_idx=tags_to_idx)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=tags_to_idx["O"], dtype='int64'),  # tags
    ): fn(samples)

    train_data_loader = create_dataloader(train_ds,
                                          mode="train",
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode="dev",
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.num_train_epochs
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         warmup)
    num_train_optimization_steps = len(
        train_ds) / args.batch_size * args.num_train_epochs

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)
    logger.info("WarmUp steps: %s" % warmup)

    metric = SequenceAccuracy()

    total_loss = 0
    global_step = 0
    for epoch in range(1, args.num_train_epochs + 1):
        logger.info(f"Epoch {epoch} beginning")
        start_time = time.time()
        for total_step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, seq_len, tags = batch

            loss, _ = model(input_ids,
                            token_type_ids,
                            lengths=seq_len,
                            tag_labels=tags)
            loss = loss.mean()
            total_loss += loss

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            if global_step % args.logging_steps == 0 and rank == 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, total_loss / args.logging_steps,
                       speed))
                start_time = time.time()
                total_loss = 0

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps) and rank == 0:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)

        evaluate(model, metric, dev_data_loader, tags, tags_to_idx)
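# A small sketch of the learning-rate schedule created above
# (paddlenlp.transformers.LinearDecayWithWarmup): the rate rises linearly over the
# warmup portion and then decays linearly toward 0. The 10-step horizon and the
# 1e-4 peak are illustrative numbers, not values from this script.
from paddlenlp.transformers import LinearDecayWithWarmup

scheduler = LinearDecayWithWarmup(1e-4, 10, 0.2)  # warmup=0.2 -> 2 warmup steps
lrs = []
for _ in range(10):
    lrs.append(scheduler.get_lr())
    scheduler.step()
# lrs climbs toward 1e-4 during the first two steps, then falls off linearly.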
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_ds = load_dataset(datafiles=('./data/train.json'))
    tags_to_idx = load_dict("./data/tags.txt")
    labels_to_idx = load_dict("./data/classifier_labels.txt")

    tokenizer = ErnieCtmTokenizer.from_pretrained(args.model_dir)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_len=args.max_seq_len,
                         tags_to_idx=tags_to_idx,
                         labels_to_idx=labels_to_idx)
    train_ds.map(trans_func)

    ignore_label = tags_to_idx["O"]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=ignore_label, dtype='int64'),  # tags
        Stack(dtype='int64'),  # cls_label
    ): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=False, drop_last=True)
    train_data_loader = DataLoader(train_ds,
                                   batch_sampler=train_batch_sampler,
                                   num_workers=0,
                                   collate_fn=batchify_fn,
                                   return_list=True)

    model = ErnieCtmWordtagModel.from_pretrained(
        args.model_dir,
        num_cls_label=len(labels_to_idx),
        num_tag=len(tags_to_idx),
        ignore_index=tags_to_idx["O"])
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         warmup)
    num_train_optimization_steps = len(
        train_ds) / args.batch_size * args.num_train_epochs

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)
    logger.info("WarmUp steps: %s" % warmup)

    cls_acc = paddle.metric.Accuracy()
    seq_acc = SequenceAccuracy()

    total_loss = 0
    global_step = 0
    for epoch in range(1, args.num_train_epochs + 1):
        logger.info(f"Epoch {epoch} beginning")
        start_time = time.time()
        for total_step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, seq_len, tags, cls_label = batch

            outputs = model(input_ids,
                            token_type_ids,
                            lengths=seq_len,
                            tag_labels=tags,
                            cls_label=cls_label)
            loss, seq_logits, cls_logits = outputs[0], outputs[1], outputs[2]
            loss = loss.mean()
            total_loss += loss

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            cls_correct = cls_acc.compute(
                pred=cls_logits.reshape([-1, len(labels_to_idx)]),
                label=cls_label.reshape([-1]))
            cls_acc.update(cls_correct)
            seq_correct = seq_acc.compute(
                pred=seq_logits.reshape([-1, len(tags_to_idx)]),
                label=tags.reshape([-1]),
                ignore_index=tags_to_idx["O"])
            seq_acc.update(seq_correct)

            if global_step % args.logging_steps == 0 and global_step != 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "[Training][epoch: %s/%s][step: %s/%s] loss: %6f, "
                    "Classification Accuracy: %6f, Sequence Labeling Accuracy: %6f, "
                    "speed: %6f" %
                    (epoch, args.num_train_epochs, global_step,
                     num_training_steps, total_loss / args.logging_steps,
                     cls_acc.accumulate(), seq_acc.accumulate(), speed))
                start_time = time.time()
                cls_acc.reset()
                seq_acc.reset()
                total_loss = 0

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps
                    ) and paddle.distributed.get_rank() == 0:
                output_dir = os.path.join(
                    args.output_dir,
                    "ernie_ctm_ft_model_%d.pdparams" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
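# A minimal sketch of the compute/update/accumulate/reset cycle that the classification
# metric above relies on (paddle.metric.Accuracy); the toy logits and labels here are
# invented purely for illustration.
import paddle

acc = paddle.metric.Accuracy()
logits = paddle.to_tensor([[0.1, 0.9], [0.8, 0.2]])  # argmax -> [1, 0]
labels = paddle.to_tensor([[1], [1]])                # second prediction is wrong
correct = acc.compute(pred=logits, label=labels)
acc.update(correct)
print(acc.accumulate())  # 0.5
acc.reset()              # start a fresh window, as done after each logging step above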