def dump_node_feat(config):
    """Tokenize every term in ``terms.txt`` and dump the id matrix to
    ``<graph_work_path>/node_feat/term_ids.npy``.

    Args:
        config: namespace-like object providing ``graph_work_path``,
            ``encoding``, ``ernie_name`` and ``max_seqlen``.
    """
    log.info("Dump node feat starting...")
    # terms.txt is tab-separated; the term string is the last column.
    id2str = [
        line.strip("\n").split("\t")[-1]
        for line in io.open(
            os.path.join(config.graph_work_path, "terms.txt"),
            encoding=config.encoding)
    ]
    if "tiny" in config.ernie_name:
        tokenizer = ErnieTinyTokenizer.from_pretrained(config.ernie_name)
        # Hoist the partial out of the loop (the original rebuilt it per term).
        # NOTE(review): tiny branch tokenizes in-process rather than in a
        # pool — presumably the tiny tokenizer is not picklable; confirm.
        worker = partial(term2id, tokenizer=tokenizer,
                         max_seqlen=config.max_seqlen)
        term_ids = [worker(s) for s in id2str]
    else:
        tokenizer = ErnieTokenizer.from_pretrained(config.ernie_name)
        worker = partial(term2id, tokenizer=tokenizer,
                         max_seqlen=config.max_seqlen)
        pool = multiprocessing.Pool()
        try:
            term_ids = pool.map(worker, id2str)
        finally:
            # close()+join() lets workers drain and exit cleanly; the
            # original terminate() stops them abruptly and ran even on error.
            pool.close()
            pool.join()
    node_feat_path = os.path.join(config.graph_work_path, "node_feat")
    if not os.path.exists(node_feat_path):
        os.makedirs(node_feat_path)
    # Reuse node_feat_path instead of re-joining the same components.
    np.save(os.path.join(node_feat_path, "term_ids.npy"),
            np.array(term_ids, np.uint16))
    log.info("Dump node feat done.")
def _initialize(self):
    """Initialize with the necessary elements."""
    self.tokenizer = ErnieTokenizer.from_pretrained(
        "ernie-1.0", mask_token=None)
    # Inverse vocabulary: token id -> token string.
    vocab_items = self.tokenizer.vocab.items()
    self.rev_dict = dict((idx, tok) for tok, idx in vocab_items)
    # Vectorized id->token lookup; reads self.rev_dict at call time.
    self.rev_lookup = np.vectorize(lambda idx: self.rev_dict[idx])
    self._model = None
def init_lstm_var(args):
    """Build the BiLSTM-attention model, its tokenizer, and the dev
    dataloader for the configured language.

    Returns:
        (model, tokenizer, dev_loader) tuple.
    """
    # Different languages use different tokenizers.
    if args.language == "ch":
        tokenizer = ErnieTokenizer.from_pretrained(args.vocab_path)
        padding_idx = tokenizer.vocab.get('[PAD]')
        # inverse_vocab: token strings ordered by their token id.
        by_id = sorted(tokenizer.vocab.items(), key=lambda kv: kv[1])
        tokenizer.inverse_vocab = [tok for tok, _ in by_id]
    else:
        vocab = Vocab.load_vocabulary(args.vocab_path,
                                      unk_token='[UNK]',
                                      pad_token='[PAD]')
        tokenizer = CharTokenizer(vocab)
        padding_idx = vocab.token_to_idx.get('[PAD]', 0)

    trans_fn = partial(convert_example,
                       tokenizer=tokenizer,
                       is_test=True,
                       language=args.language)

    # Init attention layer on top of the BiLSTM encoder.
    lstm_hidden_size = 196
    attention = SelfInteractiveAttention(hidden_size=2 * lstm_hidden_size)
    model = BiLSTMAttentionModel(attention_layer=attention,
                                 vocab_size=len(tokenizer.vocab),
                                 lstm_hidden_size=lstm_hidden_size,
                                 num_classes=2,
                                 padding_idx=padding_idx)

    # Reads data and generates mini-batches.
    dev_ds = SentiData().read(os.path.join(args.data_dir, 'dev'),
                              args.language)

    # Build the batching function once; it pads input_ids and stacks lengths.
    collate = Tuple(
        Pad(axis=0, pad_val=padding_idx),  # input_ids
        Stack(dtype="int64"),  # seq len
    )

    def batchify_fn(samples):
        return list(collate(samples))

    dev_loader = create_dataloader(dev_ds,
                                   trans_fn=trans_fn,
                                   batch_size=args.batch_size,
                                   mode='validation',
                                   batchify_fn=batchify_fn)
    return model, tokenizer, dev_loader
    # NOTE(review): this span starts inside a parser.add_argument(...) call
    # whose opening line is outside this view.
    help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE')
parser.add_argument('--warmup_proportion', type=float, default=0.1)
parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
parser.add_argument('--eval', action='store_true')
parser.add_argument('--save_dir', type=str, default=None,
                    help='model output directory')
parser.add_argument('--wd', type=float, default=0.01,
                    help='weight decay, aka L2 regularizer')
args = parser.parse_args()

tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
#tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained)

# Run everything inside a dygraph guard on GPU 0.
place = F.CUDAPlace(0)
with FD.guard(place):
    # 3-way sequence classification head on top of the pretrained model.
    model = ErnieModelForSequenceClassification.from_pretrained(
        args.from_pretrained, num_labels=3, name='')

    if not args.eval:
        # Training path: text column tokenized by the ERNIE tokenizer,
        # plus an integer label column.
        feature_column = propeller.data.FeatureColumns([
            propeller.data.TextColumn('seg_a',
                                      unk_id=tokenizer.unk_id,
                                      vocab_dict=tokenizer.vocab,
                                      tokenizer=tokenizer.tokenize),
            propeller.data.LabelColumn('label'),
        ])
parser.add_argument('--init_checkpoint', type=str, default=None,
                    help='checkpoint to warm start from')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--use_type', action='store_true')
parser.add_argument('--ohem_ratio', type=float, default=0)
parser.add_argument('--use_test_data', action='store_true')
parser.add_argument('--use_nil_as_cand', action='store_true')
parser.add_argument('--kfold', type=int, default=None)
parser.add_argument('--save_steps', type=int, default=1000)
parser.add_argument('--eval_steps', type=int, default=1000)
parser.add_argument('--use_dev_data', action='store_true')
args = parser.parse_args()

# NOTE(review): '###' is registered as a special token so the tokenizer
# keeps it intact — presumably it marks spans in the input; confirm with
# the data pipeline.
tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained,
                                           special_token_list=['###'])
#tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained)

# Knowledge base plus train/dev splits; --use_type switches to the
# NIL-typed variants produced under work/result/.
kb = read_json('./data/data60899/kb.json')
if args.use_type:
    train = read_json('work/result/train_nil_all.json')
    dev = read_json('work/result/dev_nil_all.json')
else:
    train = read_json('./data/data60899/train.json')
    dev = read_json('./data/data60899/dev.json')
# --kfold replaces the training split with one pre-built fold.
if args.kfold is not None:
    print('reading fold %s data...' % args.kfold)
    train = read_json('./work/data/train_fold%s.json' % args.kfold)
# Optionally augment training data with previous prediction results.
if args.use_test_data:
    train = train + read_json('work/result/result.json')
    # NOTE(review): this span starts inside a parser.add_argument(...) call
    # whose opening line is outside this view.
    default=None, help='inference model output directory')
parser.add_argument('--init_checkpoint', type=str, default=None)
parser.add_argument('--save_dir', type=str, default=None,
                    help='model output directory')
parser.add_argument('--wd', type=float, default=0.01,
                    help='weight decay, aka L2 regularizer')
args = parser.parse_args()

place = F.CUDAPlace(D.parallel.Env().dev_id)
# NOTE(review): the guard is entered but never exited, leaving the dygraph
# context open for the rest of the process; a `with D.guard(place):` block
# would be safer — confirm nothing relies on the open context.
D.guard(place).__enter__()

ernie = ErnieModelForGeneration.from_pretrained(args.from_pretrained)
tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained,
                                           mask_token=None)
# Inverse vocab (id -> token) used to decode generated ids back to text;
# PAD and UNK ids map to the empty string so they vanish from output.
rev_dict = {v: k for k, v in tokenizer.vocab.items()}
rev_dict[tokenizer.pad_id] = ''  # replace [PAD]
# NOTE(review): original comment said "[PAD]" here too, but this entry
# blanks out [UNK].
rev_dict[tokenizer.unk_id] = ''  # replace [PAD]

# Optionally warm-start from a checkpoint before decoding.
if args.init_checkpoint is not None:
    log.info('loading checkpoint from %s' % args.init_checkpoint)
    sd, _ = D.load_dygraph(args.init_checkpoint)
    ernie.set_dict(sd)

seq2seq(ernie, tokenizer, args)
from ernie.optimization import AdamW, LinearDecay # 本例子采用chnsenticorp中文情感识别任务作为示范;并且事先通过数据增强扩充了蒸馏所需的无监督数据 # # 请从“”下载数据;并数据存放在 ./chnsenticorp-data/ # 数据分为3列:原文;空格切词;情感标签 # 其中第一列为ERNIE的输入;第二列为BoW词袋模型的输入 # 事先统计好的BoW 词典在 ./chnsenticorp-data/vocab.bow.txt # 定义finetune teacher模型所需要的超参数 SEQLEN = 256 BATCH = 32 EPOCH = 10 LR = 5e-5 tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') student_vocab = { i.strip(): l for l, i in enumerate( open('./chnsenticorp-data/vocab.bow.txt').readlines()) } def space_tokenizer(i): return i.decode('utf8').split() feature_column = propeller.data.FeatureColumns([ propeller.data.TextColumn('seg_a', unk_id=tokenizer.unk_id,
MODEL_DIR = './model/ernie1.0.1'
OUTPUT_PATH = './data/ernie_output.csv'
SUBSITUTION_NUM = 10

eval_path = EVAL_PATH
model_dir = MODEL_DIR
substitution_num = SUBSITUTION_NUM
output_path = OUTPUT_PATH

# Each sentence comes paired with the difficult word to be substituted.
sentences, difficult_words = read_dataset(eval_path)

place = F.CUDAPlace(D.parallel.Env().dev_id)
# NOTE(review): the guard is entered but never exited; a `with` block
# would be safer.
D.guard(place).__enter__()

# Initialize the tokenizer
tokenizer = ErnieTokenizer.from_pretrained(model_dir)
# Inverse vocab (id -> token); PAD and UNK ids decode to the empty string.
rev_dict = {v: k for k, v in tokenizer.vocab.items()}
rev_dict[tokenizer.pad_id] = ''  # replace [PAD]
# NOTE(review): original comment said "[PAD]" here too, but this is [UNK].
rev_dict[tokenizer.unk_id] = ''  # replace [PAD]


@np.vectorize
def rev_lookup(i):
    """Map a token id (or an array of ids) back to its token string."""
    return rev_dict[i]


ernie = ErnieGenerate.from_pretrained(model_dir)

for sentence, difficult_word in zip(sentences, difficult_words):
    print(sentence, difficult_word)
    # Word prediction
def load_tokenizer(ernie_name):
    """Return the tokenizer matching *ernie_name*.

    Tiny-model names get the ErnieTinyTokenizer; everything else gets the
    standard ErnieTokenizer.

    Args:
        ernie_name: pretrained model name or path.

    Returns:
        The loaded tokenizer instance.
    """
    # Bug fix: the original tested the global ``config.ernie_name`` instead
    # of the ``ernie_name`` argument, so the parameter was partly ignored.
    if "tiny" in ernie_name:
        tokenizer = ErnieTinyTokenizer.from_pretrained(ernie_name)
    else:
        tokenizer = ErnieTokenizer.from_pretrained(ernie_name)
    return tokenizer
    # NOTE(review): this `else` belongs to a function whose opening lines are
    # outside this view.
    else:
        # Plain (optionally shuffled) batching when no special sampler applies.
        sampler = paddle.io.BatchSampler(dataset=dataset,
                                         batch_size=batch_size,
                                         shuffle=shuffle)
    dataloader = paddle.io.DataLoader(dataset,
                                      batch_sampler=sampler,
                                      collate_fn=batchify_fn)
    return dataloader


if __name__ == "__main__":
    paddle.set_device(args.device)
    set_seed()
    if args.language == 'ch':
        tokenizer = ErnieTokenizer.from_pretrained(args.vocab_path)
        # Loads dataset.
        train_ds, dev_ds = load_dataset(
            "chnsenticorp", splits=["train", "dev"]
        )  # train_ds, dev_ds: <class 'paddlenlp.datasets.dataset.MapDataset'>
        # Constructs the network.
        vocab_size = len(tokenizer.vocab)
        num_classes = len(train_ds.label_list)
        # NOTE(review): pad_token_id may be None if '[PAD]' is missing,
        # while pad_value falls back to 0 — confirm both defaults are
        # intended downstream.
        pad_token_id = tokenizer.vocab.get('[PAD]')
        pad_value = tokenizer.vocab.get('[PAD]', 0)
    else:
        # Loads vocab.
        # NOTE(review): this branch is truncated at the end of this view.
        if not os.path.exists(args.vocab_path):
            raise RuntimeError(