class TestTokenEmbeddingSimilarity(TestTokenEmbedding):
    def setUp(self):
        super().setUp()
        self.config["extended_vocab_path"] = self.test_data_file
        self.config["keep_extended_vocab_only"] = True

    def get_dot(self, vec_a, vec_b):
        return np.sum(vec_a * vec_b)

    def get_cosine(self, vec_a, vec_b):
        return self.get_dot(vec_a, vec_b) / (np.sqrt(
            self.get_dot(vec_a, vec_a) * self.get_dot(vec_b, vec_b)))

    def get_random_word_vec(self, vocab_list):
        vocab_size = len(vocab_list)
        ids = np.random.randint(vocab_size, size=2)
        word_a, word_b = vocab_list[ids[0]], vocab_list[ids[1]]
        vec_a, vec_b = self.embedding.search([word_a, word_b])
        return word_a, word_b, vec_a, vec_b

    def test_cosine_sim(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        word_a, word_b, vec_a, vec_b = self.get_random_word_vec(vocab_list)
        result = self.embedding.cosine_sim(word_a, word_b)
        expected_result = self.get_cosine(vec_a, vec_b)
        self.check_output_equal(result, expected_result)

    def test_dot(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        word_a, word_b, vec_a, vec_b = self.get_random_word_vec(vocab_list)
        result = self.embedding.dot(word_a, word_b)
        expected_result = self.get_dot(vec_a, vec_b)
        self.check_output_equal(result, expected_result)
def test_extended_vocab(self):
    self.embedding = TokenEmbedding(**self.config)
    vocab_list = get_vocab_list(self.config["extended_vocab_path"])
    vocab_size = len(vocab_list)
    # +1 accounts for the [PAD] token.
    self.check_output_equal(vocab_size + 1,
                            len(self.embedding._word_to_idx))
class TestTokenEmbeddingExtendedVocab(TestTokenEmbedding):
    def setUp(self):
        super().setUp()
        self.config["extended_vocab_path"] = self.test_data_file

    def test_extended_vocab(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        emb_idx = set(self.embedding.get_idx_list_from_words(vocab_list))
        vocab_idx = set([i for i in range(len(vocab_list))])
        self.assertEqual(emb_idx, vocab_idx)
        self.check_output_equal(emb_idx, vocab_idx)
def __init__(self, embedding_name):
    super(Embedding, self).__init__()
    self.embedding = TokenEmbedding(embedding_name)
    self.embedding_dim = self.embedding.embedding_dim
    weight_attr = paddle.framework.ParamAttr(
        name="linear_weight",
        initializer=paddle.nn.initializer.XavierNormal())
    bias_attr = paddle.framework.ParamAttr(
        name="linear_bias",
        initializer=paddle.nn.initializer.XavierNormal())
    self.mlp = paddle.nn.Linear(self.embedding_dim * 2,
                                self.embedding_dim,
                                weight_attr=weight_attr,
                                bias_attr=bias_attr)
    self.gru = nn.GRU(input_size=self.embedding_dim,
                      hidden_size=self.embedding_dim // 2,
                      num_layers=1,
                      direction="bidirectional")
def load_model(cls):
    cls.wordemb1 = spacy.load('zh_core_web_sm')
    cls.wordemb2 = TokenEmbedding(
        "w2v.baidu_encyclopedia.target.word-word.dim300")
    stopwords = []
    for word in open('static/dict/chineseStopWords.txt', 'r',
                     encoding='utf-8'):
        stopwords.append(word.strip())
    cls.stopwords = stopwords
    print('模型加载完成')  # "Model loading finished"
class TestTokenEmbeddingUNK(TestTokenEmbedding):
    def setUp(self):
        super().setUp()
        self.config["unknown_token"] = "[unk]"  # default is [UNK]; change it
        self.config["unknown_token_vector"] = np.random.normal(
            scale=0.02, size=300).astype(paddle.get_default_dtype())

    def test_unk_token(self):
        self.embedding = TokenEmbedding(**self.config)
        self.check_output_equal(self.config["unknown_token"],
                                self.embedding.unknown_token)
        self.check_output_equal(
            self.config["unknown_token_vector"],
            self.embedding.search(self.embedding.unknown_token)[0])
def __init__(self,
             embed_dim,
             hidden_size,
             vocab_size,
             output_dim,
             vocab_path,
             padding_idx=0,
             num_layers=1,
             dropout_prob=0.0,
             init_scale=0.1,
             embedding_name=None):
    super(BiLSTM, self).__init__()
    if embedding_name is not None:
        self.embedder = TokenEmbedding(embedding_name,
                                       extended_vocab_path=vocab_path,
                                       keep_extended_vocab_only=True)
        embed_dim = self.embedder.embedding_dim
    else:
        self.embedder = nn.Embedding(vocab_size, embed_dim, padding_idx)
    self.lstm = nn.LSTM(embed_dim,
                        hidden_size,
                        num_layers,
                        'bidirectional',
                        dropout=dropout_prob)
    self.fc = nn.Linear(
        hidden_size * 2,
        hidden_size,
        weight_attr=paddle.ParamAttr(
            initializer=I.Uniform(low=-init_scale, high=init_scale)))
    self.fc_1 = nn.Linear(
        hidden_size * 8,
        hidden_size,
        weight_attr=paddle.ParamAttr(
            initializer=I.Uniform(low=-init_scale, high=init_scale)))
    self.output_layer = nn.Linear(
        hidden_size,
        output_dim,
        weight_attr=paddle.ParamAttr(
            initializer=I.Uniform(low=-init_scale, high=init_scale)))
def __init__(self, emb_size, hidden_size, word_num, label_num,
             use_w2v_emb=False):
    super(BiGRUWithCRF, self).__init__()
    if use_w2v_emb:
        self.word_emb = TokenEmbedding(extended_vocab_path='./conf/word.dic',
                                       unknown_token='OOV')
    else:
        self.word_emb = nn.Embedding(word_num, emb_size)
    self.gru = nn.GRU(emb_size,
                      hidden_size,
                      num_layers=2,
                      direction='bidirectional')
    self.fc = nn.Linear(hidden_size * 2, label_num + 2)  # BOS, EOS
    self.crf = LinearChainCrf(label_num)
    self.decoder = ViterbiDecoder(self.crf.transitions)
def __init__(self, emb_size, hidden_size, word_num, label_num,
             use_w2v_emb=False):
    super(BiGRUWithCRF, self).__init__()
    if use_w2v_emb:
        self.word_emb = TokenEmbedding(extended_vocab_path='./data/word.dic',
                                       unknown_token='OOV')
    else:
        self.word_emb = nn.Embedding(word_num, emb_size)
    self.gru = nn.GRU(emb_size,
                      hidden_size,
                      num_layers=2,
                      direction='bidirect')
    # We need `label_num + 2` for appending the BOS and EOS tags.
    self.fc = nn.Linear(hidden_size * 2, label_num + 2)
    self.crf = LinearChainCrf(label_num)
    self.crf_loss = LinearChainCrfLoss(self.crf)
    self.viterbi_decoder = ViterbiDecoder(self.crf.transitions)
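Only the constructor of this BiGRU-CRF variant appears above. A forward pass that wires these layers together might look like the sketch below; the method signature, argument names (token_ids, lengths, labels), and the exact loss/decoding calls are assumptions for illustration, not the original author's code.

# Hypothetical forward pass for the BiGRUWithCRF module above (a sketch,
# assuming integer token ids and true sequence lengths as inputs).
def forward(self, token_ids, lengths, labels=None):
    embs = self.word_emb(token_ids)      # (batch, seq_len, emb_size)
    output, _ = self.gru(embs)           # (batch, seq_len, 2 * hidden_size)
    emission = self.fc(output)           # (batch, seq_len, label_num + 2)
    if labels is not None:
        # Training: LinearChainCrfLoss consumes emissions, lengths and gold tags.
        return self.crf_loss(emission, lengths, labels)
    # Inference: ViterbiDecoder returns best-path scores and decoded tag ids.
    _, prediction = self.viterbi_decoder(emission, lengths)
    return prediction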
def __init__(self,
             vocab_size,
             num_classes,
             vocab_path,
             emb_dim=300,
             hidden_size=128,
             fc_hidden_size=96,
             use_token_embedding=True):
    super().__init__()
    if use_token_embedding:
        # `args` here is a module-level argparse namespace in the original script.
        self.embedder = TokenEmbedding(args.embedding_name,
                                       extended_vocab_path=vocab_path)
        emb_dim = self.embedder.embedding_dim
    else:
        padding_idx = vocab_size - 1
        self.embedder = nn.Embedding(vocab_size,
                                     emb_dim,
                                     padding_idx=padding_idx)
    self.bow_encoder = paddlenlp.seq2vec.BoWEncoder(emb_dim)
    self.fc1 = nn.Linear(self.bow_encoder.get_output_dim(), hidden_size)
    self.fc2 = nn.Linear(hidden_size, fc_hidden_size)
    self.dropout = nn.Dropout(p=0.3, axis=1)
    self.output_layer = nn.Linear(fc_hidden_size, num_classes)
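The bag-of-words classifier above likewise shows only its constructor. A minimal forward pass consistent with these layers could look like the following sketch (an illustrative assumption, not the original implementation):

# Hypothetical forward pass for the BoW classifier above.
def forward(self, text):
    # Shape: (batch_size, num_tokens, embedding_dim)
    embedded_text = self.embedder(text)
    # Sum token embeddings into one vector per example, then regularize.
    summed = self.dropout(self.bow_encoder(embedded_text))
    encoded_text = paddle.tanh(summed)
    fc1_out = paddle.tanh(self.fc1(encoded_text))
    fc2_out = paddle.tanh(self.fc2(fc1_out))
    # Shape: (batch_size, num_classes)
    logits = self.output_layer(fc2_out)
    return logits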
def test_trainable(self):
    self.embedding = TokenEmbedding(**self.config)
    self.check_output_not_equal(self.config["trainable"],
                                self.embedding.weight.stop_gradient)
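The unit-test fragments above all inherit from a TestTokenEmbedding base class and call helpers (get_vocab_list, check_output_equal, check_output_not_equal) that are not shown in this section. The sketch below is only a guess at what that fixture provides, so the fragments can be read in context; the vocab file path, tolerance, and default config keys are assumptions.

import unittest

import numpy as np


def get_vocab_list(vocab_path):
    # Assumed helper: the extended vocab file contains one token per line.
    with open(vocab_path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


class TestTokenEmbedding(unittest.TestCase):
    # Assumed base fixture shared by the test subclasses above.
    def setUp(self):
        self.test_data_file = "./test_vocab.txt"  # hypothetical vocab file
        self.config = {
            "embedding_name": "w2v.baidu_encyclopedia.target.word-word.dim300",
            "trainable": True,
        }

    def check_output_equal(self, result, expected, atol=1e-5):
        if isinstance(result, (set, str)):
            self.assertEqual(result, expected)
        else:
            self.assertTrue(
                np.allclose(np.array(result), np.array(expected), atol=atol))

    def check_output_not_equal(self, result, expected):
        self.assertFalse(np.array_equal(np.array(result), np.array(expected)))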
def set_log(args):
    # Note: despite its name, this function runs the full training loop.
    set_seeds(args)
    if True:  # args.do_train:
        # Prepare embeddings.
        tokens_emb = TokenEmbedding(
            "w2v.baidu_encyclopedia.target.word-word.dim300")

        # Prepare the train dataset.
        assert args.raw_train_file is not None, \
            "--raw_train_file should be set when training!"
        if not os.path.exists(args.train_file):
            process_data(args.raw_train_file, args.train_file, tokens_emb)
        with open(args.train_file, mode="r", encoding="utf-8") as rfp:
            train_ex = json.load(rfp)
        train_dataset = DuReaderDataset(train_ex)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_dataset, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_dataset,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify,
            return_list=True)

        # Prepare the dev dataset.
        assert args.raw_dev_file is not None, \
            "--raw_dev_file should be set when training!"
        if not os.path.exists(args.dev_file):
            process_data(args.raw_dev_file, args.dev_file, tokens_emb)
        # The original read args.train_file here, which looks like a bug;
        # the dev examples should come from args.dev_file.
        with open(args.dev_file, mode="r", encoding="utf-8") as rfp:
            dev_ex = json.load(rfp)
        dev_dataset = DuReaderDataset(dev_ex)
        dev_batch_sampler = paddle.io.DistributedBatchSampler(
            dev_dataset, batch_size=args.dev_batch_size, shuffle=True)
        dev_data_loader = paddle.io.DataLoader(
            dataset=dev_dataset,
            batch_sampler=dev_batch_sampler,
            collate_fn=batchify,
            return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs

        if paddle.distributed.get_rank() == 0:
            dev_count = paddle.fluid.core.get_cuda_device_count()
            logger.info("Device count: %d" % dev_count)
            logger.info("Num train examples: %d" % len(train_dataset))
            logger.info("Num dev examples: %d" % len(dev_dataset))
            logger.info("Max train steps: %d" % num_training_steps)

        model = DocReader(args)
        model.init_lr_scheduler(args, num_training_steps)
        model.init_optimizer(args)
        model.init_loss(args)

        # Training loop.
        global_step = 0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                loss = model.update(batch)
                if global_step % args.logging_steps == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Need a better way to get the inner model of DataParallel.
                        model_file = os.path.join(output_dir + '.ckpt')
                        model.save(model_file)

        model_file = os.path.join(args.output_dir,
                                  args.model_name + "-global.ckpt")
        model.save(model_file)

    if args.do_predict:
        # Prepare test datasets.
        pass
words = jiagu.seg(text)  # word segmentation
print(words)

pos = jiagu.pos(words)  # part-of-speech tagging
print(pos)

ner = jiagu.ner(words)  # named entity recognition
print(ner)

from paddlenlp.datasets import ChnSentiCorp

train_ds, dev_ds, test_ds = ChnSentiCorp.get_datasets(['train', 'dev', 'test'])

from paddlenlp.embeddings import TokenEmbedding

wordemb = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300")
print(wordemb.cosine_sim("苹果", "香蕉"))
wordemb.cosine_sim("艺术", "火车")
wordemb.cosine_sim("狗", "香蕉")

for token1 in ['狗', '猫', '香蕉']:
    for token2 in ['狗', '猫', '香蕉']:
        print(wordemb.cosine_sim(token1, token2))

vv = wordemb.search(['狗', '猫', '香蕉'])  # a list of tokens -> one vector per token
vv2 = wordemb.search('狗猫香蕉')  # a plain string is looked up as a single token