def chat(folder_bert, voc, testing=False):
    tf.random.set_seed(1)
    tokenizer = BertTokenizer(vocab_file=folder_bert + voc)
    if testing:
        tokens = tokenizer.tokenize("jeg tror det skal regne")
        print(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        print(ids)
        print("Vocab size:", len(tokenizer.vocab))

    config = BertConfig.from_json_file(folder_bert + "/config.json")
    model = BertLMHeadModel.from_pretrained(folder_bert, config=config)
    while True:
        text = input(">>User: ")
        # NOTE: the encode/generate step was garbled in the source; the two lines below are a
        # plausible reconstruction so that input_ids and sample_output are actually defined.
        input_ids = tokenizer.encode(text, return_tensors="pt")
        sample_output = model.generate(input_ids, max_length=64, do_sample=True)
        print("Bot: {}".format(
            tokenizer.decode(sample_output[:, input_ids.shape[-1]:][0],
                             skip_special_tokens=True)))
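Note: BertLMHeadModel is intended for causal generation and expects is_decoder=True in its config; if the saved config does not already set it, a sketch of the adjustment (reusing the same folder_bert layout as above) would be:

config = BertConfig.from_json_file(folder_bert + "/config.json")
config.is_decoder = True  # enable causal attention for generation with BertLMHeadModel
model = BertLMHeadModel.from_pretrained(folder_bert, config=config)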
Example #2
def generate_samples(
    model: PreTrainedModel,
    tokenizer: BertTokenizer,
    prompt_text: str,
    max_length=args['max_length'],
    temperature=args['temperature'],
    top_k=args['k'],
    top_p=args['p'],
    repetition_penalty=args['repetition_penalty'],
    num_return_sequences=args['num_return_sequences'],
    stop_token=args['stop']
    ):

    encoded_prompt=tokenizer.encode(prompt_text, add_special_tokens=True, return_tensors='pt')
    encoded_prompt=encoded_prompt.to(model.device)
    input_ids=encoded_prompt if encoded_prompt.shape[-1]>0 else None

    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        num_return_sequences=num_return_sequences,
    )

    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []

    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

        # Remove all text after the stop token
        text = text[: text.find(stop_token) if stop_token else None]

        # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
        total_sequence = (
            prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
        )

        generated_sequences.append(total_sequence)
        print(total_sequence)

    return generated_sequences
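A minimal call sketch, assuming a module-level args dict (it has to exist before the function definition, since the defaults are read at def time) and hypothetical checkpoint paths:

from transformers import BertTokenizer, GPT2LMHeadModel

args = {
    "max_length": 64, "temperature": 1.0, "k": 50, "p": 0.95,
    "repetition_penalty": 1.0, "num_return_sequences": 2, "stop": None,
}
tokenizer = BertTokenizer.from_pretrained("path/to/vocab_dir")  # hypothetical path
model = GPT2LMHeadModel.from_pretrained("path/to/model_dir")    # hypothetical path
model.eval()

samples = generate_samples(model, tokenizer, "once upon a time")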
Example #3
def find_answer(tokenizer: BertTokenizer,
                answer_model: BertForQuestionAnswering, query: str,
                text: str) -> str:
    with torch.no_grad():
        start, end = answer_model(**tokenizer.encode_plus(
            query, text, max_length=256, truncation=True, return_tensors="pt"))
    start_pos = torch.argmax(start).item()
    end_pos = torch.argmax(end).item()
    if start_pos >= end_pos:
        start = torch.softmax(start, dim=1)
        end = torch.softmax(end, dim=1)
        k = -2
        start_args = torch.argsort(start).tolist()[0]
        end_args = torch.argsort(end).tolist()[0]
        calc_score = lambda start_pos, end_pos: start[0][start_pos] * end[0][
            end_pos]
        s_score, e_score = 0, 0
        s_pos, e_pos = start_pos, end_pos
        while s_score == 0 or e_score == 0:
            s_pos = start_args[k]
            e_pos = end_args[k]
            s_score = 0 if s_pos > end_pos else calc_score(s_pos, end_pos)
            e_score = 0 if e_pos < start_pos else calc_score(start_pos, e_pos)
            k -= 1
        if s_score > e_score:
            start_pos = s_pos
        else:
            end_pos = e_pos
    return tokenizer.decode(tokenizer.encode(query, text)[start_pos:end_pos])
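For reference, a usage sketch with a SQuAD-finetuned checkpoint; the tuple unpacking of the model output above assumes tuple-style returns, so return_dict=False is passed explicitly here:

from transformers import BertTokenizer, BertForQuestionAnswering

name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(name)
model = BertForQuestionAnswering.from_pretrained(name, return_dict=False)
model.eval()

print(find_answer(tokenizer, model, "Who wrote Hamlet?",
                  "Hamlet is a tragedy written by William Shakespeare around 1600."))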
Example #4
def find_paragraph(tokenizer: BertTokenizer,
                   model: BertForNextSentencePrediction,
                   question: str,
                   context: str,
                   max_len=256,
                   batch_size=16):
    q_len = len(tokenizer.tokenize(question))
    context_tokens = tokenizer.tokenize(context)
    part_len = max_len - q_len - 3
    parts = []
    n = 0
    while n < len(context_tokens):
        parts += [context_tokens[n:n + part_len]]
        n += part_len // 2
    results = []
    all_parts = parts[:]
    while len(parts) > 0:
        batch = tokenizer.batch_encode_plus(list(
            zip([question] * batch_size, parts[:batch_size])),
                                            max_length=max_len,
                                            truncation=True,
                                            pad_to_max_length=True,
                                            return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = model(**batch)[0]
        results += [a - b for a, b in output.cpu().tolist()]
        parts = parts[batch_size:]
    return np.array(results), [
        tokenizer.decode(tokenizer.encode(part), skip_special_tokens=True)
        for part in all_parts
    ]
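find_paragraph and find_answer are presumably meant to be chained: score overlapping chunks of a long context against the question, then answer on the best chunk. A sketch of that flow (the model names are assumptions; note that find_paragraph moves its batches to CUDA, so a GPU is required):

import numpy as np
from transformers import (BertTokenizer, BertForNextSentencePrediction,
                          BertForQuestionAnswering)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
nsp_model = (BertForNextSentencePrediction
             .from_pretrained("bert-base-uncased", return_dict=False)
             .to("cuda").eval())
qa_model = (BertForQuestionAnswering
            .from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad",
                             return_dict=False)
            .eval())

question = "Who wrote Hamlet?"
long_context = ("Hamlet is a tragedy written by William Shakespeare sometime between "
                "1599 and 1601. It is Shakespeare's longest play. " * 20)

scores, chunks = find_paragraph(tokenizer, nsp_model, question, long_context)
best_chunk = chunks[int(np.argmax(scores))]
print(find_answer(tokenizer, qa_model, question, best_chunk))  # Example #3's signature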
Example #5
def find_answer(tokenizer: BertTokenizer, model: BertForQuestionAnswering,
                context: str, question: str):
    input_data = tokenizer.encode_plus(question, context, return_tensors="pt")
    with torch.no_grad():
        out = model(**input_data)
    start, end = out[0], out[1]
    start = torch.argmax(start).item()
    end = torch.argmax(end).item()
    return tokenizer.decode(tokenizer.encode(question, context)[start:end])
Example #6
def print_output(out_ids, tokenizer: BertTokenizer):
    out_str = tokenizer.decode(out_ids)
    turn = 'B :'
    out_str = f'Meena: {out_str[4:]}'
    out_list = out_str.split(turn)

    if len(out_list) == 1:
        print(out_str)
    elif len(out_list) > 1:
        for item in out_list:
            print(item)
            time.sleep(1.1)
Example #7
def make_new_source_input(tokenizer: BertTokenizer,
                          target_input_ids: torch.Tensor,
                          source_input_ids: torch.Tensor):
    list_target_input_ids = target_input_ids.tolist()[0]
    list_target_input_ids.append(tokenizer.sep_token_id)

    source_input_ids = remove_pad_token(tokenizer, source_input_ids[0])
    source_input_ids = source_input_ids + list_target_input_ids[1:]
    if source_input_ids[-127:][0] == tokenizer.cls_token_id:
        source_input_ids = source_input_ids[-127:]
    else:
        source_input_ids = [tokenizer.cls_token_id] + source_input_ids[-127:]

    source_input_str = tokenizer.decode(source_input_ids,
                                        clean_up_tokenization_spaces=True)

    return torch.tensor(source_input_ids), source_input_str
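remove_pad_token is not shown in this example; a plausible shape for it, assuming it simply strips [PAD] ids and returns a plain Python list, would be:

import torch
from transformers import BertTokenizer

def remove_pad_token(tokenizer: BertTokenizer, input_ids: torch.Tensor) -> list:
    # drop [PAD] ids so only real tokens are carried over into the new source input
    return [i for i in input_ids.tolist() if i != tokenizer.pad_token_id]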
def decode_one_example(
    tokenizer: BertTokenizer,
    label_list: List[str],
    inputs: Dict[str, torch.Tensor],
    logits: Optional[torch.FloatTensor] = None
) -> Union[Tuple[str, str], Tuple[str, str, str]]:

    if inputs["input_ids"].shape[0] != 1:
        raise ValueError

    X = tokenizer.decode(inputs["input_ids"][0])
    Y = label_list[inputs["labels"].item()]
    if logits is not None:
        _Y_hat = logits.argmax(dim=-1).item()
        Y_hat = label_list[_Y_hat]
        return X, Y, Y_hat
    else:
        return X, Y
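A minimal usage sketch (the label names and the single-example batch are assumptions):

import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
label_list = ["negative", "positive"]  # hypothetical label names

inputs = tokenizer("the movie was great", return_tensors="pt")
inputs["labels"] = torch.tensor([1])

X, Y = decode_one_example(tokenizer, label_list, inputs)
print(X, "->", Y)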
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--device",
                        default="0",
                        type=str,
                        required=False,
                        help="生成设备")
    parser.add_argument("--length",
                        default=-1,
                        type=int,
                        required=False,
                        help="生成长度")
    parser.add_argument("--batch_size",
                        default=1,
                        type=int,
                        required=False,
                        help="生成的batch size")
    parser.add_argument("--nsamples",
                        default=10,
                        type=int,
                        required=False,
                        help="生成几个样本")
    parser.add_argument("--temperature",
                        default=1,
                        type=float,
                        required=False,
                        help="生成温度")
    parser.add_argument("--topk",
                        default=8,
                        type=int,
                        required=False,
                        help="最高几选一")
    parser.add_argument("--topp",
                        default=0,
                        type=float,
                        required=False,
                        help="最高积累概率")
    parser.add_argument(
        "--model_config",
        default="config/model_config.json",
        type=str,
        required=False,
        help="模型参数",
    )
    parser.add_argument(
        "--tokenizer_path",
        default="vocab/vocab.txt",
        type=str,
        required=False,
        help="词表路径",
    )
    parser.add_argument(
        "--model_path",
        default="model/epoch=0-step=99.ckpt",
        type=str,
        required=False,
        help="模型路径",
    )
    parser.add_argument("--prefix",
                        default="我",
                        type=str,
                        required=False,
                        help="生成文章的开头")
    parser.add_argument("--no_wordpiece",
                        action="store_true",
                        help="不做word piece切词")
    parser.add_argument("--segment", action="store_true", help="中文以词为单位")
    parser.add_argument("--fast_pattern",
                        action="store_true",
                        help="采用更加快的方式生成文本")
    parser.add_argument("--save_samples", action="store_true", help="保存产生的样本")
    parser.add_argument("--save_samples_path",
                        default=".",
                        type=str,
                        required=False,
                        help="保存样本的路径")
    parser.add_argument("--repetition_penalty",
                        default=1.0,
                        type=float,
                        required=False)

    args = parser.parse_args()
    print("args:\n" + args.__repr__())

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = BertTokenizer(vocab_file=args.tokenizer_path)
    model_config = GPT2Config.from_json_file(args.model_config)
    model = GPT2LMHeadModel(config=model_config)
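    # the .ckpt below appears to be a PyTorch Lightning checkpoint whose "state_dict" keys are
    # presumably prefixed with "model." (6 characters), which key[6:] strips off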
    state_dict = {
        key[6:]: value
        for key, value in torch.load(args.model_path, map_location="cpu")
        ["state_dict"].items()
    }
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    for i in range(10):
        raw_text = args.prefix
        encoded = tokenizer.encode_plus(raw_text)["input_ids"]
        out = sample_sequence(
            model,
            encoded,
            length=512,
            n_ctx=1024,
            tokenizer=tokenizer,
            temperature=temperature,
            top_k=topk,
            top_p=topp,
            repitition_penalty=repetition_penalty,
            device=device,
        )
        print(tokenizer.decode(out))
Example #10
class Dialog(Agent):
    def __init__(self, model_file=DEFAULT_MODEL_URL, name="Dialog"):
        super(Dialog, self).__init__(name=name)
        if not os.path.exists(os.path.join(DEFAULT_DIRECTORY,'multiwoz/data')):
            os.mkdir(os.path.join(DEFAULT_DIRECTORY,'multiwoz/data'))
            ### download multiwoz data
            print('downloading data from', DEFAULT_ARCHIVE_FILE_URL)
        if not os.path.exists(os.path.join(DEFAULT_DIRECTORY,'multiwoz/save')):
            os.mkdir(os.path.join(DEFAULT_DIRECTORY,'multiwoz/save'))
            ### download trained model
            print('downloading model from', DEFAULT_MODEL_URL)
        model_path = ""

        config = Config()
        parser = config.parser
        config = parser.parse_args()
        with open("assets/never_split.txt") as f:
            never_split = f.read().split("\n")
        self.tokenizer = BertTokenizer("assets/vocab.txt", never_split=never_split)
        self.nlu = BERTNLU()
        self.dst_ = DST(config).cuda()
        ckpt = torch.load("save/model_Sun_Jun_21_07:08:48_2020.pt", map_location = lambda storage, loc: storage.cuda(local_rank))
        self.dst_.load_state_dict(ckpt["model"])
        self.dst_.eval()
        self.policy = RulePolicy()
        self.nlg = TemplateNLG(is_user=False)
        self.init_session()
        self.slot_mapping = {
            "leave": "leaveAt",
            "arrive": "arriveBy"
        }

    def init_session(self):
        self.nlu.init_session()
        self.policy.init_session()
        self.nlg.init_session()
        self.history = []
        self.state = default_state()
        pass

    def response(self, user):
        self.history.append(["user", user])

        user_action = []

        self.input_action = self.nlu.predict(user, context=[x[1] for x in self.history[:-1]])
        self.input_action = deepcopy(self.input_action)
        for act in self.input_action:
            intent, domain, slot, value = act
            if intent == "Request":
                user_action.append(act)
                if not self.state["request_state"].get(domain):
                    self.state["request_state"][domain] = {}
                if slot not in self.state["request_state"][domain]:
                    self.state['request_state'][domain][slot] = 0
        
        context = " ".join([utterance[1] for utterance in self.history])
        context = context[-MAX_CONTEXT_LENGTH:]
        context = self.tokenizer.encode(context)
        context = torch.tensor(context, dtype=torch.int64).unsqueeze(dim=0).cuda()  # [1, len]

        belief_gen = self.dst_(None, context, 0, test=True)[0]  # [slots, len]
        for slot_idx, domain_slot in enumerate(ontology.all_info_slots):
            domain, slot = domain_slot.split("-")
            slot = self.slot_mapping.get(slot, slot)
            value = belief_gen[slot_idx][:-1]  # remove <EOS>
            value = self.tokenizer.decode(value)
            if value != "none":
                if slot in self.state["belief_state"][domain]["book"].keys():
                    if self.state["belief_state"][domain]["book"][slot] == "":
                        action = ["Inform", domain.capitalize(), REF_USR_DA[domain].get(slot, slot), value]
                        user_action.append(action)
                    self.state["belief_state"][domain]["book"][slot] = value
                elif slot in self.state["belief_state"][domain]["semi"].keys():
                    if self.state["belief_state"][domain]["semi"][slot] == "":
                        action = ["Inform", domain.capitalize(), REF_USR_DA[domain].get(slot, slot), value]
                        user_action.append(action)
                    self.state["belief_state"][domain]["semi"][slot] = value

        self.state["user_action"] = user_action

        self.output_action = deepcopy(self.policy.predict(self.state))
        model_response = self.nlg.generate(self.output_action)
        self.history.append(["sys", model_response])

        return model_response
                                        max_seq_len=max_len,
                                        causal=True)
    checkpoint = torch.load(PATH, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model_state_dict'])
    # model.load_state_dict(torch.load(PATH, map_location=torch.device('cpu')))
    model.eval()
    sent = '사람이 철학적으로 생각하는 것은'
    pad_token_id = tokenizer.pad_token_id
    tokenized_sentence = tokenizer.encode(sent, add_special_tokens=False)
    while 1:
        input_ids = sentence_mask_to_max_length([
            tokenizer.cls_token_id,
        ] + tokenized_sentence, 128, 0)
        input_ids = torch.tensor(input_ids).unsqueeze(0)

        output = model(input_ids)
        pred = output[0]
        next_token_pred = pred.squeeze()[len(tokenized_sentence)]
        top_k_sample = top_k(next_token_pred, 9)
        # gen = tokenizer.decode(top_k_sample).replace(' ','')
        tokenized_sentence = tokenized_sentence + top_k_sample.tolist()
        # if gen == '[SEP]':
        #     pass
        #
        # if '##'in gen:
        #   sent += gen.replace('##','')
        # else:
        #   sent += ' '+gen
        print(tokenizer.decode(tokenized_sentence, skip_special_tokens=True))
        # tokenized_sentence = tokenizer.encode(sent, add_special_tokens=False)
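sentence_mask_to_max_length and top_k are not defined in the snippet above; plausible shapes for them, consistent with how they are called (assumptions, not the original helpers):

import torch

def sentence_mask_to_max_length(ids, max_len, pad_id):
    # right-pad the token id list with pad_id up to max_len
    return ids + [pad_id] * (max_len - len(ids))

def top_k(logits, k):
    # sample a single token id from the k highest-scoring logits
    values, indices = torch.topk(logits, k)
    choice = torch.multinomial(torch.softmax(values, dim=-1), num_samples=1)
    return indices[choice]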
def mask_and_fill_by_threshold(
    text: str,
    bert: BertForMaskedLM,
    tokenizer: BertTokenizer,
    threshold: float,
    device,
    mlm_score,
):
    print("=== New example ===")
    print("Original Text: {}".format(text))
    context, response, word_list, score_list, sorted_index = mlm_score
    if score_list is None:
        raise ValueError
    if len(text.split()) < 3:
        return None
    assert len(sorted_index[0]) == len(sorted_index[1]) == len(score_list)
    response = response.strip()
    text = text.strip()
    assert response == text

    if max(score_list) < threshold or len(score_list) < 3:
        return None

    encoded = tokenizer.encode_plus(
        text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    assert len(encoded["input_ids"][0]) == len(word_list) + 2

    masked_token_indices = []
    masked_token_original_list = []
    for idx, score in enumerate(score_list):
        if score >= threshold:
            masked_token_original_list.append(
                encoded["input_ids"][0][idx + 1].clone().detach())
            encoded["input_ids"][0][idx + 1] = tokenizer.mask_token_id
            masked_token_indices.append(idx + 1)

    encoded = {k: v.to(device) for k, v in encoded.items()}
    with torch.no_grad():
        output = bert(**encoded)[0]

    changed_indices = []
    for mask_order, mask_index in enumerate(masked_token_indices):
        while True:
            decoded_index = torch.argmax(output[0][mask_index]).item()
            if decoded_index not in [masked_token_original_list[mask_order]]:
                break
            output[0][mask_index, decoded_index] = -100
        changed_indices.append(decoded_index)

    for idx, mask_position in enumerate(masked_token_indices):
        encoded["input_ids"][0][mask_position] = changed_indices[idx]

    changed_response = tokenizer.decode(
        encoded["input_ids"][0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    print("Changed: {}".format(changed_response))
    return changed_response
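The mlm_score argument is consumed positionally as (context, response, word_list, score_list, sorted_index), with one score per wordpiece of text. A toy call sketch (the scores here are made-up placeholders, not real MLM scores):

from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertForMaskedLM.from_pretrained("bert-base-uncased").eval()

text = "the quick brown fox jumps over the lazy dog"
word_list = tokenizer.tokenize(text)                            # one entry per wordpiece
score_list = [0.9 if w == "fox" else 0.1 for w in word_list]    # placeholder scores
order = sorted(range(len(score_list)), key=lambda i: score_list[i])
mlm_score = (None, text, word_list, score_list, (order, order))

print(mask_and_fill_by_threshold(text, bert, tokenizer, 0.5, "cpu", mlm_score))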
Example #13
                           ToTensor(tokenizer)
                       ]))
a = DataLoader(x, batch_size=10, sampler=SubsetRandomSampler(x.sampler))
for i, s in enumerate(a):
    print(i)
    print(s)

tokenizer.ids_to_tokens[0]
tokenizer.convert_ids_to_tokens
tokenizer.vocab.keys()
tokenizer.build_inputs_with_special_tokens([95, 209], [95, 209])
tokenizer.pretrained_vocab_files_map

tokenizer.encode("<BOS> I like tea")
s = idr.dataset.tokens[0]
tokenizer.decode(tokenizer.encode(" ".join(s)))
encoded_tensor = torch.Tensor([
    tokenizer.encode(idr.dataset.partial[j])
    for j in range(len(idr.dataset.partial))
])
encoded_tensor = torch.Tensor(tokenizer.encode(idr.dataset.partial[0]))

model = BertModel.from_pretrained("bert-base-uncased",
                                  output_hidden_states=True)
model.eval()
with torch.no_grad():
    outputs = model(encoded_tensor)
    hidden_states = outputs[2]

data_root = os.path.join(pathlib.Path(__file__).parents[1], "data")
os.listdir(data_root)
Example #14
             new_node_embedding = node_decoder_hidden
             topv, topi = node_decoder_output.topk(1)
             if teacher_forcing:
                 node_decoder_input = target[:, ni].view(batch_size, 1)
             else:
                 node_decoder_input = topi.squeeze().detach()  # detach from history as input
             output.append(topi.squeeze().detach().cpu().numpy())
             loss += criterion(node_decoder_output.view(batch_size, -1),
                               target[:, ni].view(batch_size))
             # find the index of the corresponding new nodes
             if node_decoder_input == 1:
                 break
             n_id = y[j][1].index(X[j][k])
             target_id = y[j][0][n_id]
             phrase_loss, p_output, y_tokenize = gen_phrase(
                 phrase_generator, new_node_embedding, target_id)
             phrase_output.append(p_output)
             target_output.append(
                 tokenizer.decode(y_tokenize.view(-1).cpu().numpy()))
     for i in range(len(phrase_output)):
         total += 1
         if phrase_output[i] == target_output[i]:
             count += 1
         print("{} - {} / {}".format(index2phrase[X[j][i]],
                                     phrase_output[i],
                                     target_output[i]))
     #print(phrase_output)
     #print(target_output)
     print("-" * 100)
 print("ACC:{:.4f}".format(count / total))