def main() -> None:
    tokenizer = Tokenizer(args.vocab_file)
    vocabulary_size = len(tokenizer)
    searcher = BeamSearch(tokenizer.eos_index, beam_size=args.search_width)
    model = VAE(
        num_embeddings=len(tokenizer),
        dim_embedding=args.dim_embedding,
        dim_hidden=args.dim_hidden,
        dim_latent=args.dim_latent,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional,
        dropout=0.,
        word_dropout=0.,
        dropped_index=tokenizer.unk_index,
    ).to(device)
    model.load_state_dict(torch.load(args.checkpoint_file, map_location=device))
    model.eval()

    sentence1 = input('Please input sentence1: ')
    sentence2 = input('Please input sentence2: ')
    s1 = [tokenizer.bos_index] + tokenizer.encode(sentence1) + [tokenizer.eos_index]
    s2 = [tokenizer.bos_index] + tokenizer.encode(sentence2) + [tokenizer.eos_index]

    # Encode both sentences into the latent space.
    z1, _ = model.encode(
        torch.tensor([s1]).to(device), torch.tensor([len(s1)]).to(device))
    z2, _ = model.encode(
        torch.tensor([s2]).to(device), torch.tensor([len(s2)]).to(device))

    print("\nGenerate intermediate sentences")
    print("      %s" % sentence1)
    for r in range(1, 10):
        # Linearly interpolate between the two latent codes.
        z = (1 - 0.1 * r) * z1 + 0.1 * r * z2
        hidden = model.fc_hidden(z)
        hidden = hidden.view(1, -1, model.dim_hidden).transpose(0, 1).contiguous()
        start_predictions = torch.zeros(1, device=device).fill_(
            tokenizer.bos_index).long()
        start_state = {'hidden': hidden.permute(1, 0, 2)}
        predictions, log_probabilities = searcher.search(
            start_predictions, start_state, model.step)
        tokens = predictions[0, 0]
        tokens = tokens[tokens != tokenizer.eos_index].tolist()
        print("[%d:%d] %s" % (10 - r, r, tokenizer.decode(tokens)))
    print("      %s" % sentence2)
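# Standalone sketch of the interpolation schedule used above: a convex
# combination of two latent codes. Random tensors stand in for the VAE
# posterior means.
import torch

z1, z2 = torch.randn(1, 16), torch.randn(1, 16)
for r in range(1, 10):
    alpha = 0.1 * r
    z = (1 - alpha) * z1 + alpha * z2  # z sweeps from z1 (alpha=0) to z2 (alpha=1)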
class IntentClassifier(tf.keras.Model):
    def __init__(self, n_intents=None, dropout=0.2,
                 model_name="bert-base-uncased"):
        super().__init__(name="intent_classifier")
        self.tokenizer = Tokenizer()
        # TFBertModel returns (sequence_output, pooled_output), which is what
        # both call() and get_embedding() unpack below. The original used
        # TFBertForSequenceClassification, which only returns logits.
        self.bert = TFBertModel.from_pretrained(model_name)
        self.dropout = Dropout(dropout)
        self.intent_classifier = Dense(n_intents, activation='softmax')

    def call(self, inputs, **kwargs):
        # The second output of the main BERT layer corresponds to the [CLS]
        # token and gives a pooled representation of the full sequence.
        _, pooled_output = self.bert(inputs, **kwargs)
        pooled_output = self.dropout(pooled_output)
        intent = self.intent_classifier(pooled_output)
        return intent

    def get_embedding(self, plain_text, **kwargs):
        encoded = self.tokenizer.encode(plain_text)
        print(">> encoded", encoded)
        _, pooled_output = self.bert(encoded, **kwargs)
        return pooled_output.numpy()
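# Hedged usage sketch; the custom Tokenizer's encode() output keys follow the
# Engine class further below and may differ in your setup.
clf = IntentClassifier(n_intents=5)
encoded = clf.tokenizer.encode(["turn on the lights"])
probs = clf(encoded["input_ids"])  # shape (1, n_intents) of softmax scores
print(probs.numpy().argmax(axis=-1))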
class DreamDataset(Dataset):
    """
    Custom dataset for the Zhougong ("Duke of Zhou") dream-interpretation
    corpus; defines how individual examples are fetched.
    """
    def __init__(self):
        # __init__ typically loads all the data up front.
        super(DreamDataset, self).__init__()
        # Read the raw data.
        self.sents_src, self.sents_tgt = read_corpus(
            Config.dream_train_corpus_path)
        self.word2idx = load_bert_vocab()
        self.idx2word = {k: v for v, k in self.word2idx.items()}
        self.tokenizer = Tokenizer(self.word2idx)

    def __getitem__(self, i):
        # Fetch a single example.
        src = self.sents_src[i]
        tgt = self.sents_tgt[i]
        token_ids, token_type_ids = self.tokenizer.encode(src, tgt)
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
        }
        return output

    def __len__(self):
        return len(self.sents_src)
class BertDataset(Dataset):
    """
    Dataset for a specific corpus; defines how individual examples are fetched.
    """
    def __init__(self, sents_src, sents_tgt, vocab_path):
        # __init__ typically loads all the data up front.
        super(BertDataset, self).__init__()
        # Read the raw data.
        # self.sents_src, self.sents_tgt = read_corpus(poem_corpus_dir)
        self.sents_src = sents_src
        self.sents_tgt = sents_tgt
        self.word2idx = load_chinese_base_vocab(vocab_path)
        self.idx2word = {k: v for v, k in self.word2idx.items()}
        self.tokenizer = Tokenizer(self.word2idx)

    def __getitem__(self, i):
        # Fetch a single example.
        src = self.sents_src[i]
        tgt = self.sents_tgt[i]
        token_ids, token_type_ids = self.tokenizer.encode(src, tgt,
                                                          max_length=256)
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
        }
        return output

    def __len__(self):
        return len(self.sents_src)
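# Hedged companion sketch for the two datasets above: their examples contain
# variable-length token_ids, so a padding collate_fn is needed before batches
# can be stacked. The pad id 0 is an assumption.
import torch
from torch.utils.data import DataLoader

def collate_fn(batch, pad_id=0):
    max_len = max(len(x["token_ids"]) for x in batch)

    def pad(seq):
        return seq + [pad_id] * (max_len - len(seq))

    token_ids = torch.tensor([pad(x["token_ids"]) for x in batch],
                             dtype=torch.long)
    token_type_ids = torch.tensor([pad(x["token_type_ids"]) for x in batch],
                                  dtype=torch.long)
    return token_ids, token_type_ids

# loader = DataLoader(BertDataset(srcs, tgts, vocab_path), batch_size=8,
#                     shuffle=True, collate_fn=collate_fn)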
def main(input_path, output_path, sp_model_path, n_val, n_test, seed):
    tokenizer = Tokenizer(sp_model_path, bos_eos=True)

    train_dir = os.path.join(output_path, 'train')
    val_dir = os.path.join(output_path, 'val')
    test_dir = os.path.join(output_path, 'test')
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    input_file_paths = sorted(glob.glob(os.path.join(input_path, '*.txt')))
    random.seed(seed)
    random.shuffle(input_file_paths)

    for i, input_file_path in enumerate(input_file_paths):
        print(f'\r{i + 1} / {len(input_file_paths)}', end='')
        file_name = os.path.basename(input_file_path)
        with open(input_file_path) as f:
            tids = tokenizer.encode(f.read())
        tids = torch.tensor(tids, dtype=torch.long)
        if i < n_val:
            torch.save(tids,
                       os.path.join(val_dir, file_name.replace('.txt', '.pt')))
        elif n_val <= i < n_val + n_test:
            torch.save(tids,
                       os.path.join(test_dir, file_name.replace('.txt', '.pt')))
        else:
            torch.save(tids,
                       os.path.join(train_dir, file_name.replace('.txt', '.pt')))
    print()
    print('done.')
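# Hedged usage sketch: each shard written above is a 1-D LongTensor of token
# ids; the path below is illustrative.
import torch

tids = torch.load('output/train/example.pt')
print(tids.shape, tids.dtype)  # e.g. torch.Size([12345]) torch.int64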
def generate(
    x: str,
    beam_width: int,
    device: torch.device,
    max_seq_len: int,
    model: Transformer,
    tokenizer: Tokenizer
) -> str:
    model.eval()
    seq = torch.LongTensor([tokenizer.bos_id]).to(device)
    x = torch.LongTensor([tokenizer.encode(x, max_len=-1)]).to(device)
    accum_prob = torch.zeros(beam_width).to(device)

    for _ in range(max_seq_len):
        pred_y = model.predict(x, seq)
        top_k_in_all_beams = []

        # Expand each live beam by its top-k next tokens.
        for out_beam in range(seq.size(0)):
            top_k_prob_in_beam, top_k_index_in_beam = \
                pred_y[out_beam, -1].topk(k=beam_width, dim=-1)
            for in_beam in range(beam_width):
                # Accumulate log-probabilities. (The original subtracted the
                # log term, which made the final topk keep the *least*
                # probable beams; assuming model.predict returns
                # probabilities, addition is the correct accumulation.)
                prob = accum_prob[out_beam] + \
                    top_k_prob_in_beam[in_beam].log()
                prob = prob.unsqueeze(0)
                temp_seq = torch.cat([
                    seq[out_beam],
                    top_k_index_in_beam[in_beam].unsqueeze(0)
                ], dim=-1).unsqueeze(0)
                top_k_in_all_beams.append({'prob': prob, 'seq': temp_seq})

        # Keep the beam_width highest-scoring candidates across all beams.
        _, top_k_index_in_all_beams = torch.cat([
            beam['prob'] for beam in top_k_in_all_beams
        ]).topk(k=beam_width, dim=0)

        seq = torch.cat([
            top_k_in_all_beams[index]['seq']
            for index in top_k_index_in_all_beams
        ], dim=0)
        accum_prob = torch.cat([
            top_k_in_all_beams[index]['prob']
            for index in top_k_index_in_all_beams
        ], dim=0)

        # Broadcast the source sequence to match the number of beams.
        if x.size(0) != seq.size(0):
            x = x.repeat(seq.size(0) // x.size(0), 1)

    decoded = tokenizer.batch_decode(seq.tolist())
    for sent in decoded:
        print(sent)
    return decoded[0]
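# Standalone sketch of the flattened top-k selection step used in the beam
# search above (and again in beam_search further below): sum running beam
# scores with per-token log-probs, flatten, and keep the best beam_width.
import torch

beam_width, vocab = 3, 10
accum = torch.tensor([-0.1, -0.9, -1.5])                # running log-prob per beam
step = torch.rand(beam_width, vocab).softmax(-1).log()  # toy next-token log-probs
scores = (accum.unsqueeze(1) + step).view(-1)           # all beam*vocab candidates
top_scores, flat_idx = scores.topk(beam_width)
beam_idx, tok_idx = flat_idx // vocab, flat_idx % vocab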
def __init__(self, tokenizer: Tokenizer, file_path: str, block_size: int,
             overwrite_cache=False):
    super(TextDataset, self).__init__()
    self.path = file_path
    assert os.path.isfile(file_path)
    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory,
        "cached_lm_{}_{}_{}".format(
            tokenizer.__class__.__name__,
            str(block_size),
            filename,
        ),
    )

    # Make sure only the first process in distributed training processes the
    # dataset; the others will use the cache.
    lock_path = cached_features_file + ".lock"
    with FileLock(lock_path):
        if os.path.exists(cached_features_file) and not overwrite_cache:
            start = time.time()
            with open(cached_features_file, "rb") as handle:
                self.data = pickle.load(handle)
            logger.info(
                "Loading features from cached file %s [took %.3f s]",
                cached_features_file, time.time() - start)
        else:
            logger.info(f"Creating features from dataset file at {directory}")
            self.data = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()
            tokenized_text = tokenizer.encode(text)
            # Truncate in blocks of block_size.
            for i in range(0, len(tokenized_text.ids) - block_size + 1,
                           block_size):
                self.data.append(tokenized_text.ids[i:i + block_size])
            # Note that we are losing the last truncated example here for the
            # sake of simplicity (no padding). If your dataset is small, first
            # you should look for a bigger one :-) and second you can change
            # this behavior by adding (model-specific) padding.

            start = time.time()
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.data, handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
            logger.info(
                "Saving features into cached file %s [took %.3f s]",
                cached_features_file, time.time() - start)
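# Hedged sketch: the original snippet only shows __init__; as a Dataset,
# TextDataset would also need __len__ and __getitem__, presumably along
# these lines (these two methods are assumptions, not part of the original).
def __len__(self):
    return len(self.data)

def __getitem__(self, i) -> torch.Tensor:
    return torch.tensor(self.data[i], dtype=torch.long)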
def collate(data: List[str], tokenizer: Tokenizer, block_size: int) -> Batch:
    ids = tokenizer.encode(data, block_size)
    mask = tokenizer.mask(ids)
    return Batch(ids=ids, attention_mask=mask)
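# Hedged usage sketch: functools.partial binds the tokenizer and block size so
# collate matches the single-argument interface DataLoader expects.
from functools import partial
from torch.utils.data import DataLoader

# loader = DataLoader(dataset, batch_size=16,
#                     collate_fn=partial(collate, tokenizer=tokenizer,
#                                        block_size=128))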
class Seq2SeqModel(nn.Module):
    """
    Seq2seq model: BERT encoder with an LM prediction head as decoder.
    """
    def __init__(self, config: BertConfig):
        super(Seq2SeqModel, self).__init__()
        # Read the configuration.
        self.hidden_dim = config.hidden_size
        self.vocab_size = config.vocab_size
        # Encoder and decoder.
        self.bert = BertModel(config)
        self.decoder = BertLMPredictionHead(
            config, self.bert.embeddings.word_embeddings.weight)
        # Load the vocabulary and tokenizer.
        self.word2ix = load_bert_vocab()
        self.tokenizer = Tokenizer(self.word2ix)

    def compute_loss(self, predictions, labels, target_mask):
        """
        target_mask: 0 for sentence a and padding, 1 for sentence b.
        """
        predictions = predictions.view(-1, self.vocab_size)
        labels = labels.view(-1)
        target_mask = target_mask.view(-1).float()
        loss = nn.CrossEntropyLoss(ignore_index=0, reduction="none")
        # The mask removes the contribution of padding and of sentence a.
        return (loss(predictions, labels) * target_mask).sum() / target_mask.sum()

    def forward(self, input_tensor, token_type_id, position_enc=None,
                labels=None, device="cpu"):
        """
        :param input_tensor: input token ids
        :param token_type_id: segment ids (0 for sentence a, 1 for sentence b)
        :param position_enc: position encodings
        :param labels: target sentence for decoding
        :param device:
        :return:
        """
        input_shape = input_tensor.size()
        seq_len = input_shape[1]
        # Build the special seq2seq attention mask: tokens in sentence a
        # attend bidirectionally, tokens in sentence b attend causally.
        ones = torch.ones((1, 1, seq_len, seq_len),
                          dtype=torch.float32, device=device)
        a_mask = ones.tril()  # lower-triangular matrix
        s_ex12 = token_type_id.unsqueeze(1).unsqueeze(2).float()
        s_ex13 = token_type_id.unsqueeze(1).unsqueeze(3).float()
        a_mask = (1.0 - s_ex12) * (1.0 - s_ex13) + s_ex13 * a_mask
        # a_mask: (batch_size, 1, seq_len, seq_len)

        enc_layers, _ = self.bert(input_tensor,
                                  position_ids=position_enc,
                                  token_type_ids=token_type_id,
                                  attention_mask=a_mask,
                                  output_all_encoded_layers=True)
        # _ is the pooled output, (batch_size, hidden_size)
        sequence_out = enc_layers[-1]  # last layer, (batch_size, seq_len, hidden_size)
        predictions = self.decoder(sequence_out)  # (batch_size, seq_len, vocab_size)

        if labels is not None:
            # Computing the loss needs a special output mask. The prediction
            # at the final [SEP] position is dropped, hence the :-1 slice.
            predictions = predictions[:, :-1].contiguous()
            target_mask = token_type_id[:, 1:].contiguous()
            loss = self.compute_loss(predictions, labels, target_mask)
            return predictions, loss
        else:
            return predictions

    def generate(self, text, out_max_length=50, beam_size=1, device="cpu"):
        # Generate an output for a single sentence. The input budget is the
        # total maximum length minus the output budget; longer inputs are
        # truncated, which is usually acceptable.
        self.out_max_length = out_max_length
        input_max_length = Config.max_length - out_max_length
        token_ids, token_type_ids = self.tokenizer.encode(
            text, max_length=input_max_length)
        token_ids = torch.tensor(token_ids, device=device).view(1, -1)
        token_type_ids = torch.tensor(token_type_ids,
                                      device=device).view(1, -1)
        output_ids = self.beam_search(token_ids,
                                      token_type_ids,
                                      self.word2ix,
                                      beam_size=beam_size,
                                      device=device)
        # Decode into text.
        return self.tokenizer.decode(output_ids)

    def beam_search(self, token_ids, token_type_ids, word2ix, beam_size=1,
                    device="cpu"):
        """
        Beam-search decoding.
        """
        sep_id = word2ix["[SEP]"]
        # Output sequences so far.
        output_ids = [[]]
        # Accumulated scores.
        output_scores = torch.zeros(token_ids.shape[0], device=device)
        for step in range(self.out_max_length):
            scores = self.forward(token_ids, token_type_ids, device=device)
            if step == 0:
                # Repeat the input ids beam_size times.
                token_ids = token_ids.view(1, -1).repeat(beam_size, 1)
                token_type_ids = token_type_ids.view(1, -1).repeat(beam_size, 1)
            # Log-probabilities of the next token, (beam_size, vocab_size).
            logit_score = torch.log_softmax(scores, dim=-1)[:, -1]
            logit_score = output_scores.view(-1, 1) + logit_score  # accumulate
            # Flatten before calling topk across all (beam, token) pairs.
            logit_score = logit_score.view(-1)
            hype_score, hype_pos = torch.topk(logit_score, beam_size)
            indice1 = hype_pos // scores.shape[-1]  # row (beam) index
            indice2 = hype_pos % scores.shape[-1]   # column (token) index

            # Rebuild the beam state; the lists below exist so finished
            # sequences can be filtered out.
            new_hype_scores = []
            new_hype_ids = []
            # Newly predicted tokens, appended to the inputs for the next step.
            next_chars = []
            for i_1, i_2, score in zip(indice1, indice2, hype_score):
                i_1 = i_1.item()
                i_2 = i_2.item()
                # Keep the full output sequence, not just the newest token.
                hype_id = output_ids[i_1] + [i_2]
                if i_2 == sep_id:
                    # This beam has finished decoding.
                    if score == torch.max(hype_score).item():
                        # It is also the best-scoring beam: return it directly.
                        return hype_id[:-1]
                    else:
                        # Finished, but not the best; drop this beam.
                        beam_size -= 1
                else:
                    new_hype_ids.append(hype_id)
                    new_hype_scores.append(score)
                    next_chars.append(i_2)

            output_ids = new_hype_ids
            output_scores = torch.tensor(new_hype_scores,
                                         dtype=torch.float32,
                                         device=device)
            # Rebuild the inputs: previous inputs plus the newly generated
            # token, fed back into BERT to predict the next one. Truncate to
            # drop beams that have already finished.
            token_ids = token_ids[:len(output_ids)].contiguous()
            token_type_ids = token_type_ids[:len(output_ids)].contiguous()
            next_chars = torch.tensor(next_chars, dtype=torch.long,
                                      device=device).view(-1, 1)
            next_token_type_ids = torch.ones_like(next_chars, device=device)
            token_ids = torch.cat((token_ids, next_chars), dim=1)
            token_type_ids = torch.cat((token_type_ids, next_token_type_ids),
                                       dim=1)
            if beam_size < 1:
                break

        # Maximum length reached: return the best-scoring output sequence.
        return output_ids[output_scores.argmax().item()]
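# Hedged usage sketch for Seq2SeqModel; the config values, checkpoint path
# and prompt below are placeholders.
# config = BertConfig(vocab_size=21128)
# model = Seq2SeqModel(config)
# model.load_state_dict(torch.load("seq2seq_model.bin", map_location="cpu"))
# model.eval()
# with torch.no_grad():
#     print(model.generate("your input text", beam_size=3))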
class Engine:
    def __init__(self, data_file_path="intents_db.pkl"):
        self.data = []
        self.known_intents = []
        self.n_intents = 0
        self.intents_labels = {}
        self.intents_embeddings = {}
        self.data_file_path = data_file_path
        # self.model_file_path = "intent_classifier.h5"
        self.encoder = Tokenizer()
        self.model = None

    def initialize(self):
        reload_data = os.path.exists(self.data_file_path)
        if reload_data:
            logger.info("Reloading data from file {}".format(
                self.data_file_path))
            with open(self.data_file_path, 'rb') as f:
                self.data = pickle.load(f)
            self.update_intents()
        else:
            self.data = []
            with open("raw_intents.csv", 'r') as f:
                rows = f.readlines()
            for row in rows:
                if len(row) == 0:
                    continue
                if row.startswith("#"):
                    continue
                query, intent = row.split(",")
                self.data.append({
                    'raw': {
                        'query': query.lower().strip(),
                        'intent': intent.lower().strip()
                    },
                    'nlu': {}
                })
            self.update_intents()
        self.tokenize_data()
        print("Known intents:")
        print(self.known_intents)

        self.model = IntentClassifier(n_intents=self.n_intents)
        self.model.compile(optimizer='adam',
                           loss='categorical_crossentropy',
                           metrics=['categorical_accuracy'])
        X, y = self.make_training_dataset(self.data,
                                          num_classes=self.n_intents)
        print(X["input_ids"].shape, X['attention_masks'].shape, y.shape)
        self.model.train_on_batch(X, y)
        self.model.summary()
        self.make_intents_embeddings()

    def update_intents(self, new_intent=None):
        if new_intent is None:
            self.known_intents = list(
                set([x['raw']['intent'] for x in self.data
                     if 'intent' in x['raw']]))
        else:
            if new_intent not in self.known_intents:
                self.known_intents.append(new_intent)
        self.known_intents.sort()
        self.n_intents = len(self.known_intents)
        self.intents_labels = {k: i for i, k in enumerate(self.known_intents)}

    def make_intents_embeddings(self):
        embeddings = self.model.get_embedding(self.known_intents)
        self.intents_embeddings = {
            k: emb for k, emb in zip(self.known_intents, embeddings)
        }
        print(self.known_intents)

    def tokenize_data(self):
        query = [x['raw']['query'] for x in self.data]
        encoded = self.encoder.encode(query)
        for i in range(len(self.data)):
            x = self.data[i]
            x['nlu'] = {
                'input_ids': encoded['input_ids'][i],
                'attention_masks': encoded['attention_masks'][i],
                'label': self.intents_labels[x['raw']['intent']]
            }

    @staticmethod
    def make_training_dataset(batch, num_classes=None):
        X = {
            "input_ids": np.array([x['nlu']['input_ids'] for x in batch]),
            'attention_masks':
                np.array([x['nlu']['attention_masks'] for x in batch])
        }
        y = np.array([x['nlu']['label'] for x in batch], dtype=np.int64)
        # Pass num_classes explicitly so single-example batches still produce
        # n_intents-wide one-hot labels (otherwise to_categorical infers the
        # width from the largest label in the batch).
        y = to_categorical(y, num_classes=num_classes)
        return X, y

    def write_out(self):
        # Write out to file.
        print("Saving data to file {}".format(self.data_file_path))
        with open(self.data_file_path, 'wb') as f:
            pickle.dump(self.data, f)
        # model.save(model_file_path)

    def predict_intent(self, txt):
        this_embedding = self.model.get_embedding([txt])
        all_embeddings = [
            self.intents_embeddings[i] for i in self.known_intents
        ]
        scores = cosine_similarity(this_embedding, all_embeddings)
        k = np.argmax(scores[0])
        confidence = scores[0][k]
        closest_intent = self.known_intents[k]
        return closest_intent, confidence

    def loop(self):
        while True:
            print("Tell me what you would like to do")
            txt = input()
            txt = txt.lower()
            if txt in ['q', 'quit', 'stop']:
                return
            intent, confidence = self.predict_intent(txt)
            print("Is this your purpose? {} (confidence={:.3f})".format(
                intent, confidence))
            reply = input().lower()
            if reply in ['n', 'no', 'nope']:
                print("What is the purpose?")
                intent = input().lower()
                if intent not in self.known_intents:
                    closest_intent, confidence = self.predict_intent(intent)
                    print("Is this the same as {} (confidence={:.3f})? [y, n]"
                          .format(closest_intent, confidence))
                    reply = input().lower()
                    if reply in ['y', 'yes']:
                        intent = closest_intent
                    else:
                        print("This is a new intent to me")
                        self.update_intents(intent)
                        self.make_intents_embeddings()
            print("Ok, so you are asking for: {}".format(intent))
            encoded = self.encoder.encode(txt)
            entry = {
                'raw': {
                    'query': txt,
                    'intent': intent
                },
                'nlu': {
                    'input_ids': encoded['input_ids'][0],
                    'attention_masks': encoded['attention_masks'][0],
                    'label': self.intents_labels[intent]
                }
            }
            self.data.append(entry)
            X, y = self.make_training_dataset([entry],
                                              num_classes=self.n_intents)
            self.model.train_on_batch(X, y)
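# Hedged entry-point sketch for the Engine above, using only the methods it
# defines; write_out persists newly learned intents on exit.
if __name__ == '__main__':
    engine = Engine()
    engine.initialize()
    try:
        engine.loop()
    finally:
        engine.write_out()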
                    type=int,
                    default=20,
                    help='length of the generated text')
parser.add_argument('--num_beams', type=int)
parser.add_argument('--num_return_sequences', default=1, type=int)
parser.add_argument('--no_repeat_ngram_size', type=int)
parser.add_argument('--temperature', type=float)
parser.add_argument('--top_k', type=int)
parser.add_argument('--top_p', type=float)
args = parser.parse_args()

model = GPT2LMHeadModel.from_pretrained(args.output_dir
                                        or train_args.output_dir)
tokenizer = Tokenizer(train_args.tokenizer_path)
model.eval()

input_ids = tokenizer.encode([args.start])
if args.top_k:
    if args.top_p:
        outputs = model.generate(
            input_ids,
            do_sample=True,
            max_length=args.length,
            top_k=args.top_k,
            top_p=args.top_p,
            no_repeat_ngram_size=args.no_repeat_ngram_size,
            num_return_sequences=args.num_return_sequences)
    else:
        outputs = model.generate(
            input_ids,
            do_sample=True,
            max_length=args.length,