def chat(folder_bert, voc, testing=False):
    tf.random.set_seed(1)
    tokenizer = BertTokenizer(vocab_file=folder_bert + voc)
    if testing:
        tokens = tokenizer.tokenize("jeg tror det skal regne")
        print(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        print(ids)
        print("Vocab size:", len(tokenizer.vocab))
    config = BertConfig.from_json_file(folder_bert + "/config.json")
    model = BertLMHeadModel.from_pretrained(folder_bert, config=config)
    while True:
        text = input(">>User: ")
        # The encoding/generation step was garbled in the source; this is a
        # minimal reconstruction consistent with the decode calls below.
        input_ids = tokenizer.encode(text, return_tensors="pt")
        sample_output = model.generate(input_ids, max_length=128, do_sample=True)
        print("Bot: {}".format(tokenizer.decode(sample_output[0])))
        print("Bot: {}".format(
            tokenizer.decode(sample_output[:, input_ids.shape[-1]:][0],
                             skip_special_tokens=True)))
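# Hedged usage sketch (not in the source): the model folder and vocab file
# name are assumptions; chat() expects config.json and the vocab file inside
# the same folder. Shown commented out because it starts an interactive loop.
# chat("./norwegian-bert", "/vocab.txt", testing=True)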
def generate_samples(
    model: PreTrainedModel,
    tokenizer: BertTokenizer,
    prompt_text: str,
    max_length=args['max_length'],
    temperature=args['temperature'],
    top_k=args['k'],
    top_p=args['p'],
    repetition_penalty=args['repetition_penalty'],
    num_return_sequences=args['num_return_sequences'],
    stop_token=args['stop'],
):
    encoded_prompt = tokenizer.encode(prompt_text,
                                      add_special_tokens=True,
                                      return_tensors='pt')
    encoded_prompt = encoded_prompt.to(model.device)
    input_ids = encoded_prompt if encoded_prompt.shape[-1] > 0 else None

    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        num_return_sequences=num_return_sequences,
    )

    # Squeeze a leading dimension if generate() returned a 3D tensor.
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = tokenizer.decode(generated_sequence,
                                clean_up_tokenization_spaces=True)

        # Remove all text after the stop token
        text = text[: text.find(stop_token) if stop_token else None]

        # Add the prompt at the beginning of the sequence. Remove the excess
        # text that was used for pre-processing.
        total_sequence = (
            prompt_text +
            text[len(tokenizer.decode(encoded_prompt[0],
                                      clean_up_tokenization_spaces=True)):]
        )

        generated_sequences.append(total_sequence)
        print(total_sequence)

    return generated_sequences
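# Hedged usage sketch (not in the source): the checkpoint path and prompt are
# assumptions, and the args dict referenced above must already be populated
# when generate_samples is defined, since its defaults read from it.
from transformers import BertLMHeadModel, BertTokenizer

lm_tokenizer = BertTokenizer.from_pretrained("./bert-lm")  # hypothetical path
lm_model = BertLMHeadModel.from_pretrained("./bert-lm")
samples = generate_samples(lm_model, lm_tokenizer,
                           prompt_text="Once upon a time")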
def find_answer(tokenizer: BertTokenizer,
                answer_model: BertForQuestionAnswering,
                query: str, text: str) -> str:
    with torch.no_grad():
        start, end = answer_model(**tokenizer.encode_plus(
            query, text, max_length=256, truncation=True, return_tensors="pt"))
    start_pos = torch.argmax(start).item()
    end_pos = torch.argmax(end).item()
    if start_pos >= end_pos:
        # The best start comes after the best end: fall back to the
        # next-best candidates and keep whichever span scores higher.
        start = torch.softmax(start, dim=1)
        end = torch.softmax(end, dim=1)
        k = -2
        start_args = torch.argsort(start).tolist()[0]
        end_args = torch.argsort(end).tolist()[0]
        calc_score = lambda s, e: start[0][s] * end[0][e]
        s_score, e_score = 0, 0
        s_pos, e_pos = start_pos, end_pos
        while s_score == 0 or e_score == 0:
            s_pos = start_args[k]
            e_pos = end_args[k]
            s_score = 0 if s_pos > end_pos else calc_score(s_pos, end_pos)
            e_score = 0 if e_pos < start_pos else calc_score(start_pos, e_pos)
            k -= 1
        if s_score > e_score:
            start_pos = s_pos
        else:
            end_pos = e_pos
    return tokenizer.decode(tokenizer.encode(query, text)[start_pos:end_pos])
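# Hedged usage sketch (not in the source): the tuple unpacking in find_answer
# assumes an older transformers release where the model call returns
# (start_logits, end_logits) rather than a ModelOutput.
from transformers import BertForQuestionAnswering, BertTokenizer

qa_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
qa_tokenizer = BertTokenizer.from_pretrained(qa_name)
qa_model = BertForQuestionAnswering.from_pretrained(qa_name)
qa_model.eval()
print(find_answer(qa_tokenizer, qa_model,
                  "Who wrote Hamlet?",
                  "Hamlet is a tragedy written by William Shakespeare."))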
def find_paragraph(tokenizer: BertTokenizer,
                   model: BertForNextSentencePrediction,
                   question: str, context: str,
                   max_len=256, batch_size=16):
    q_len = len(tokenizer.tokenize(question))
    context_tokens = tokenizer.tokenize(context)
    part_len = max_len - q_len - 3  # room for [CLS] and two [SEP] tokens

    # Split the context into half-overlapping windows of part_len tokens.
    parts = []
    n = 0
    while n < len(context_tokens):
        parts += [context_tokens[n:n + part_len]]
        n += part_len // 2

    results = []
    all_parts = parts[:]
    while len(parts) > 0:
        batch = tokenizer.batch_encode_plus(
            list(zip([question] * batch_size, parts[:batch_size])),
            max_length=max_len,
            truncation=True,
            pad_to_max_length=True,
            return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = model(**batch)[0]
        # Score each window as logit("is next") - logit("is not next").
        results += [a - b for a, b in output.cpu().tolist()]
        parts = parts[batch_size:]

    return np.array(results), [
        tokenizer.decode(tokenizer.encode(part), skip_special_tokens=True)
        for part in all_parts
    ]
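# Hedged usage sketch (not in the source): ranks the overlapping windows that
# find_paragraph returns. The checkpoint name and document are placeholders,
# and a CUDA device is assumed because the function moves batches to "cuda".
from transformers import BertForNextSentencePrediction, BertTokenizer

nsp_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
nsp_model = BertForNextSentencePrediction.from_pretrained(
    "bert-base-uncased").to("cuda")
long_document = " ".join(["The bridge was built in 1932."] * 200)  # toy stand-in
scores, windows = find_paragraph(nsp_tokenizer, nsp_model,
                                 question="When was the bridge built?",
                                 context=long_document)
best_window = windows[int(np.argmax(scores))]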
def find_answer(tokenizer: BertTokenizer, model: BertForQuestionAnswering,
                context: str, question: str):
    input_data = tokenizer.encode_plus(question, context, return_tensors="pt")
    with torch.no_grad():
        out = model(**input_data)
    start, end = out[0], out[1]
    start = torch.argmax(start).item()
    end = torch.argmax(end).item()
    return tokenizer.decode(tokenizer.encode(question, context)[start:end])
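# Hedged sketch (not in the source): chains the two helpers above, first
# selecting the best window with find_paragraph and then extracting a span
# from it; all names reuse the placeholders from the previous sketches.
question = "When was the bridge built?"
scores, windows = find_paragraph(nsp_tokenizer, nsp_model,
                                 question, long_document)
answer = find_answer(qa_tokenizer, qa_model,
                     context=windows[int(np.argmax(scores))],
                     question=question)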
def print_output(out_ids, tokenizer: BertTokenizer):
    out_str = tokenizer.decode(out_ids)
    turn = 'B :'
    out_str = f'Meena: {out_str[4:]}'  # drop the leading speaker tag
    out_list = out_str.split(turn)
    if len(out_list) == 1:
        print(out_str)
    elif len(out_list) > 1:
        for item in out_list:
            print(item)
            time.sleep(1.1)
def make_new_source_input(tokenizer: BertTokenizer,
                          target_input_ids: torch.Tensor,
                          source_input_ids: torch.Tensor):
    list_target_input_ids = target_input_ids.tolist()[0]
    list_target_input_ids.append(tokenizer.sep_token_id)
    source_input_ids = remove_pad_token(tokenizer, source_input_ids[0])
    # Append the new target turn (minus its [CLS]) to the running source.
    source_input_ids = source_input_ids + list_target_input_ids[1:]
    # Keep only the last 127 tokens, re-adding [CLS] if it was cut off.
    if source_input_ids[-127:][0] == tokenizer.cls_token_id:
        source_input_ids = source_input_ids[-127:]
    else:
        source_input_ids = [tokenizer.cls_token_id] + source_input_ids[-127:]
    source_input_str = tokenizer.decode(source_input_ids,
                                        clean_up_tokenization_spaces=True)
    return torch.tensor(source_input_ids), source_input_str
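# Hedged usage sketch (not in the source): remove_pad_token is defined
# elsewhere in the original project, so this only illustrates the call shape;
# the utterances and checkpoint name are made up.
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")
history_ids = tok.encode("hello there", return_tensors="pt")
reply_ids = tok.encode("hi, how can I help?", return_tensors="pt")
new_ids, new_str = make_new_source_input(tok, reply_ids, history_ids)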
def decode_one_example(
    tokenizer: BertTokenizer,
    label_list: List[str],
    inputs: Dict[str, torch.Tensor],
    logits: Optional[torch.FloatTensor] = None
) -> Union[Tuple[str, str], Tuple[str, str, str]]:
    if inputs["input_ids"].shape[0] != 1:
        raise ValueError

    X = tokenizer.decode(inputs["input_ids"][0])
    Y = label_list[inputs["labels"].item()]
    if logits is not None:
        _Y_hat = logits.argmax(dim=-1).item()
        Y_hat = label_list[_Y_hat]
        return X, Y, Y_hat
    else:
        return X, Y
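# Hedged usage sketch (not in the source): a single-example batch with a
# hypothetical binary label list.
import torch
from transformers import BertTokenizer

clf_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
inputs = {
    "input_ids": clf_tokenizer("this is great",
                               return_tensors="pt")["input_ids"],
    "labels": torch.tensor(1),
}
X, Y = decode_one_example(clf_tokenizer, ["negative", "positive"], inputs)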
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", default="0", type=str, required=False,
                        help="generation device")
    parser.add_argument("--length", default=-1, type=int, required=False,
                        help="generation length")
    parser.add_argument("--batch_size", default=1, type=int, required=False,
                        help="batch size for generation")
    parser.add_argument("--nsamples", default=10, type=int, required=False,
                        help="number of samples to generate")
    parser.add_argument("--temperature", default=1, type=float, required=False,
                        help="generation temperature")
    parser.add_argument("--topk", default=8, type=int, required=False,
                        help="top-k: sample from the k most likely tokens")
    parser.add_argument("--topp", default=0, type=float, required=False,
                        help="top-p: cumulative-probability (nucleus) sampling")
    parser.add_argument("--model_config", default="config/model_config.json",
                        type=str, required=False, help="model config")
    parser.add_argument("--tokenizer_path", default="vocab/vocab.txt",
                        type=str, required=False, help="vocabulary path")
    parser.add_argument("--model_path", default="model/epoch=0-step=99.ckpt",
                        type=str, required=False, help="model path")
    parser.add_argument("--prefix", default="我", type=str, required=False,
                        help="prefix that starts the generated text")
    parser.add_argument("--no_wordpiece", action="store_true",
                        help="skip WordPiece tokenization")
    parser.add_argument("--segment", action="store_true",
                        help="tokenize Chinese at the word level")
    parser.add_argument("--fast_pattern", action="store_true",
                        help="use the faster generation path")
    parser.add_argument("--save_samples", action="store_true",
                        help="save the generated samples")
    parser.add_argument("--save_samples_path", default=".", type=str,
                        required=False, help="path for saved samples")
    parser.add_argument("--repetition_penalty", default=1.0, type=float,
                        required=False)

    args = parser.parse_args()
    print("args:\n" + args.__repr__())

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs to use
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty
    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = BertTokenizer(vocab_file=args.tokenizer_path)
    model_config = GPT2Config.from_json_file(args.model_config)
    model = GPT2LMHeadModel(config=model_config)
    # Strip the "model." prefix that PyTorch Lightning adds to checkpoint keys.
    state_dict = {
        key[6:]: value
        for key, value in torch.load(args.model_path,
                                     map_location="cpu")["state_dict"].items()
    }
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    for i in range(nsamples):
        raw_text = args.prefix
        encoded = tokenizer.encode_plus(raw_text)["input_ids"]
        out = sample_sequence(
            model,
            encoded,
            length=512,
            n_ctx=1024,
            tokenizer=tokenizer,
            temperature=temperature,
            top_k=topk,
            top_p=topp,
            repitition_penalty=repetition_penalty,  # keyword spelled this way in sample_sequence
            device=device,
        )
        print(tokenizer.decode(out))
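# Hypothetical invocation (the script name is an assumption; the defaults
# match the argparse definitions above):
#   python generate.py --model_path model/epoch=0-step=99.ckpt \
#       --tokenizer_path vocab/vocab.txt --topk 8 --temperature 1.0 --prefix 我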
class Dialog(Agent):
    def __init__(self, model_file=DEFAULT_MODEL_URL, name="Dialog"):
        super(Dialog, self).__init__(name=name)
        if not os.path.exists(os.path.join(DEFAULT_DIRECTORY, 'multiwoz/data')):
            os.mkdir(os.path.join(DEFAULT_DIRECTORY, 'multiwoz/data'))
            ### download multiwoz data
            print('download data from', DEFAULT_ARCHIVE_FILE_URL)
        if not os.path.exists(os.path.join(DEFAULT_DIRECTORY, 'multiwoz/save')):
            os.mkdir(os.path.join(DEFAULT_DIRECTORY, 'multiwoz/save'))
            ### download trained model
            print('download model from', DEFAULT_MODEL_URL)
        model_path = ""
        config = Config()
        parser = config.parser
        config = parser.parse_args()
        with open("assets/never_split.txt") as f:
            never_split = f.read().split("\n")
        self.tokenizer = BertTokenizer("assets/vocab.txt",
                                       never_split=never_split)
        self.nlu = BERTNLU()
        self.dst_ = DST(config).cuda()
        ckpt = torch.load(
            "save/model_Sun_Jun_21_07:08:48_2020.pt",
            map_location=lambda storage, loc: storage.cuda(local_rank))
        self.dst_.load_state_dict(ckpt["model"])
        self.dst_.eval()
        self.policy = RulePolicy()
        self.nlg = TemplateNLG(is_user=False)
        self.init_session()
        self.slot_mapping = {"leave": "leaveAt", "arrive": "arriveBy"}

    def init_session(self):
        self.nlu.init_session()
        self.policy.init_session()
        self.nlg.init_session()
        self.history = []
        self.state = default_state()

    def response(self, user):
        self.history.append(["user", user])
        user_action = []
        self.input_action = self.nlu.predict(
            user, context=[x[1] for x in self.history[:-1]])
        self.input_action = deepcopy(self.input_action)
        for act in self.input_action:
            intent, domain, slot, value = act
            if intent == "Request":
                user_action.append(act)
                if not self.state["request_state"].get(domain):
                    self.state["request_state"][domain] = {}
                if slot not in self.state["request_state"][domain]:
                    self.state['request_state'][domain][slot] = 0
        context = " ".join([utterance[1] for utterance in self.history])
        context = context[-MAX_CONTEXT_LENGTH:]
        context = self.tokenizer.encode(context)
        context = torch.tensor(context,
                               dtype=torch.int64).unsqueeze(dim=0).cuda()  # [1, len]
        belief_gen = self.dst_(None, context, 0, test=True)[0]  # [slots, len]
        for slot_idx, domain_slot in enumerate(ontology.all_info_slots):
            domain, slot = domain_slot.split("-")
            slot = self.slot_mapping.get(slot, slot)
            value = belief_gen[slot_idx][:-1]  # remove <EOS>
            value = self.tokenizer.decode(value)
            if value != "none":
                if slot in self.state["belief_state"][domain]["book"].keys():
                    if self.state["belief_state"][domain]["book"][slot] == "":
                        action = ["Inform", domain.capitalize(),
                                  REF_USR_DA[domain].get(slot, slot), value]
                        user_action.append(action)
                        self.state["belief_state"][domain]["book"][slot] = value
                elif slot in self.state["belief_state"][domain]["semi"].keys():
                    if self.state["belief_state"][domain]["semi"][slot] == "":
                        action = ["Inform", domain.capitalize(),
                                  REF_USR_DA[domain].get(slot, slot), value]
                        user_action.append(action)
                        self.state["belief_state"][domain]["semi"][slot] = value
        self.state["user_action"] = user_action
        self.output_action = deepcopy(self.policy.predict(self.state))
        model_response = self.nlg.generate(self.output_action)
        self.history.append(["sys", model_response])
        return model_response
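# Hedged usage sketch (not in the source): constructing Dialog requires the
# ConvLab-style assets, the saved DST checkpoint, and a GPU, so the calls are
# shown commented out.
# agent = Dialog()
# print(agent.response("I am looking for a cheap hotel in the north."))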
                  max_seq_len=max_len,
                  causal=True)

checkpoint = torch.load(PATH, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model_state_dict'])
# model.load_state_dict(torch.load(PATH, map_location=torch.device('cpu')))
model.eval()

sent = '사람이 철학적으로 생각하는 것은'
pad_token_id = tokenizer.pad_token_id
tokenized_sentence = tokenizer.encode(sent, add_special_tokens=False)

while True:
    input_ids = sentence_mask_to_max_length(
        [tokenizer.cls_token_id] + tokenized_sentence, 128, 0)
    input_ids = torch.tensor(input_ids).unsqueeze(0)
    output = model(input_ids)
    pred = output[0]
    # Logits for the position right after the current sentence.
    next_token_pred = pred.squeeze()[len(tokenized_sentence)]
    top_k_sample = top_k(next_token_pred, 9)
    # gen = tokenizer.decode(top_k_sample).replace(' ', '')
    tokenized_sentence = tokenized_sentence + top_k_sample.tolist()
    # if gen == '[SEP]':
    #     pass
    # if '##' in gen:
    #     sent += gen.replace('##', '')
    # else:
    #     sent += ' ' + gen
    print(tokenizer.decode(tokenized_sentence, skip_special_tokens=True))
    # tokenized_sentence = tokenizer.encode(sent, add_special_tokens=False)
def mask_and_fill_by_threshold(
    text: str,
    bert: BertForMaskedLM,
    tokenizer: BertTokenizer,
    threshold: float,
    device,
    mlm_score,
):
    print("=== New example ===")
    print("Original Text: {}".format(text))
    context, response, word_list, score_list, sorted_index = mlm_score
    if score_list is None:
        raise ValueError
    if len(text.split()) < 3:
        return None
    assert len(sorted_index[0]) == len(sorted_index[1]) == len(score_list)
    response = response.strip()
    text = text.strip()
    assert response == text
    if max(score_list) < threshold or len(score_list) < 3:
        return None

    encoded = tokenizer.encode_plus(
        text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    assert len(encoded["input_ids"][0]) == len(word_list) + 2

    # Mask every token whose score reaches the threshold (+1 skips [CLS]).
    masked_token_indices = []
    masked_token_original_list = []
    for idx, score in enumerate(score_list):
        if score >= threshold:
            masked_token_original_list.append(
                encoded["input_ids"][0][idx + 1].clone().detach())
            encoded["input_ids"][0][idx + 1] = tokenizer.mask_token_id
            masked_token_indices.append(idx + 1)

    encoded = {k: v.to(device) for k, v in encoded.items()}
    with torch.no_grad():
        output = bert(**encoded)[0]

    # For each mask, take the best prediction that differs from the original.
    changed_indices = []
    for mask_order, mask_index in enumerate(masked_token_indices):
        while True:
            decoded_index = torch.argmax(output[0][mask_index]).item()
            if decoded_index not in [masked_token_original_list[mask_order]]:
                break
            output[0][mask_index, decoded_index] = -100
        changed_indices.append(decoded_index)

    for idx, mask_position in enumerate(masked_token_indices):
        encoded["input_ids"][0][mask_position] = changed_indices[idx]

    changed_response = tokenizer.decode(
        encoded["input_ids"][0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print("Changed: {}".format(changed_response))
    return changed_response
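# Hedged usage sketch (not in the source): mlm_score is assumed to be the
# (context, response, word_list, score_list, sorted_index) tuple produced by
# an upstream per-token MLM scoring step, with one score per input token.
# changed = mask_and_fill_by_threshold(
#     text, bert, tokenizer, threshold=0.8, device="cpu", mlm_score=mlm_score)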
        ToTensor(tokenizer)
    ]))
a = DataLoader(x, batch_size=10, sampler=SubsetRandomSampler(x.sampler))
for i, s in enumerate(a):
    print(i)
    print(s)

# Tokenizer exploration
tokenizer.ids_to_tokens[0]
tokenizer.convert_ids_to_tokens
tokenizer.vocab.keys()
tokenizer.build_inputs_with_special_tokens([95, 209], [95, 209])
tokenizer.pretrained_vocab_files_map
tokenizer.encode("<BOS> I like tea")

s = idr.dataset.tokens[0]
tokenizer.decode(tokenizer.encode(" ".join(s)))

encoded_tensor = torch.Tensor([
    tokenizer.encode(idr.dataset.partial[j])
    for j in range(len(idr.dataset.partial))
])
encoded_tensor = torch.Tensor(tokenizer.encode(idr.dataset.partial[0]))

model = BertModel.from_pretrained("bert-base-uncased",
                                  output_hidden_states=True)
model.eval()
with torch.no_grad():
    outputs = model(encoded_tensor)
hidden_states = outputs[2]

data_root = os.path.join(pathlib.Path(__file__).parents[1], "data")
os.listdir(data_root)
            new_node_embedding = node_decoder_hidden
            topv, topi = node_decoder_output.topk(1)
            if teacher_forcing:
                node_decoder_input = target[:, ni].view(batch_size, 1)
            else:
                node_decoder_input = topi.squeeze().detach()  # detach from history as input
            output.append(topi.squeeze().detach().cpu().numpy())
            loss += criterion(node_decoder_output.view(batch_size, -1),
                              target[:, ni].view(batch_size))
            # find the index of the corresponding new nodes
            if node_decoder_input == 1:
                break
            n_id = y[j][1].index(X[j][k])
            target_id = y[j][0][n_id]
            phrase_loss, p_output, y_tokenize = gen_phrase(
                phrase_generator, new_node_embedding, target_id)
            phrase_output.append(p_output)
            target_output.append(
                tokenizer.decode(y_tokenize.view(-1).cpu().numpy()))

        for i in range(len(phrase_output)):
            total += 1
            if phrase_output[i] == target_output[i]:
                count += 1
            print("{} - {} / {}".format(index2phrase[X[j][i]],
                                        phrase_output[i], target_output[i]))
        # print(phrase_output)
        # print(target_output)
        print("-" * 100)
    print("ACC:{:.4f}".format(count / total))