# Assumes the older pytorch-pretrained-bert API, in which BertForMaskedLM
# returns the prediction-scores tensor directly
import torch
import torch.nn.functional as F
from pytorch_pretrained_bert import BertForMaskedLM, BertTokenizer


def predict_word(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer, tgt_word: str, tgt_pos: int):
    # print('Template sentence: ', text)
    mask_positions = []

    # Replace each '_' placeholder with [MASK] and record its position
    tokenized_text = tokenizer.tokenize(text)

    for i in range(len(tokenized_text)):
        if tokenized_text[i] == '_':
            tokenized_text[i] = '[MASK]'
            mask_positions.append(i)

    # Convert tokens to vocab indices
    token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([token_ids])

    # Call BERT to compute unnormalized scores for every position
    model.eval()
    with torch.no_grad():
        predictions = model(tokens_tensor)

    # normalize by softmax
    predictions = F.softmax(predictions, dim=2)

    # For the target word position, get probabilities for each word of interest
    normalized = predictions[0, tgt_pos, :]
    out_prob = normalized[tokenizer.vocab[tgt_word]].item()

    # Also fill each blank with its highest-probability token for inspection
    for mask_pos in mask_positions:
        predicted_index = torch.argmax(predictions[0, mask_pos, :]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        tokenized_text[mask_pos] = predicted_token

    for mask_pos in mask_positions:
        tokenized_text[mask_pos] = "_" + tokenized_text[mask_pos] + "_"
    pred_sent = ' '.join(tokenized_text).replace(' ##', '')
    # print(pred_sent)
    return out_prob, pred_sent
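
A minimal usage sketch; the checkpoint name, template sentence, and target position below are illustrative, and tgt_pos must be the index of the blank in the tokenized text:

model = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 'the cat sat on the _ .' tokenizes so that '_' lands at index 5
prob, sent = predict_word('the cat sat on the _ .', model, tokenizer,
                          tgt_word='mat', tgt_pos=5)
print(prob, sent)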
Example #2
    # The snippet starts mid-script: `sentences`, `res`, `predict_res`,
    # `mask_cnt`, `args`, `bert_model`, and `bert_tokenizer` are defined
    # earlier in the original file; this loop header is a reconstruction.
    for sent in sentences:
        tokens = bert_tokenizer.tokenize(sent)
        if len(tokens) == 0:
            continue
        if tokens[0] != CLS:
            tokens = [CLS] + tokens
        if tokens[-1] != SEP:
            tokens.append(SEP)
        token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer)
        with torch.no_grad():
            logits = bert_model(token_idx, segment_idx,
                                mask, masked_lm_labels=None)
        logits = logits.squeeze(0)
        probs = torch.softmax(logits, dim=-1)
        for idx, token in enumerate(tokens):
            if token == MASK:
                mask_cnt += 1
                print('Top {} predictions for {}th {}:'.format(
                    args.topk, mask_cnt, MASK))
                topk_prob, topk_indices = torch.topk(probs[idx, :], args.topk)
                topk_tokens = bert_tokenizer.convert_ids_to_tokens(
                    topk_indices.cpu().numpy())
                for prob, tok in zip(topk_prob, topk_tokens):
                    print('{} {}'.format(tok, prob.item()))
                    # every top-k token is appended, so the accuracy check
                    # below lines up with `res` only when args.topk == 1
                    predict_res.append(tok)
                print('='*80)
    cnt = correct_cnt = 0
    for item1, item2 in zip(res, predict_res):
        if item1 == item2:
            correct_cnt = correct_cnt+1
            print(item1 + ' correct!')
        cnt = cnt + 1
    print('correct rate is: %.2f' % (correct_cnt / cnt))
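
Example #2 calls a to_bert_input helper that is not shown in the snippet. A plausible minimal version, consistent with how it is called above (batched token ids, single-segment ids, an all-ones attention mask) and assuming torch is imported:

def to_bert_input(tokens, bert_tokenizer):
    # Map wordpiece tokens to vocab ids; build single-sentence segment
    # ids and an all-ones attention mask, each batched to size 1
    token_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
    token_idx = torch.tensor([token_ids])
    segment_idx = torch.zeros_like(token_idx)
    mask = torch.ones_like(token_idx)
    return token_idx, segment_idx, mask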
Example #3
class BertWithJumanModel():
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        # Load Juman so Japanese text can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the MaskedLM head of the pretrained BERT model
        self.model = BertForMaskedLM.from_pretrained(bert_path)
        # Load the tokenizer of the pretrained BERT model
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        # Flag controlling whether to run on a CUDA GPU
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        # Preprocessing: strip half-width spaces from the text
        return text.replace(" ", "")  # for Juman

    def paraphrase(self, text):
        # Strip half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # Split the Japanese text into a list of morpheme tokens
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with spaces and run BERT's wordpiece tokenizer
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # The input is capped at 128 tokens: header + 126 tokens + footer
        # Convert the tokens to vocab ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        generated_token_ids = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            # Move the input ids and the model to the GPU
            generated_token_ids = generated_token_ids.to('cuda')
            self.model.to('cuda')

        # Switch the model to evaluation mode
        self.model.eval()
        with torch.no_grad():
            for i in range(10):
                # Iterate over the wordpiece tokens; the original looped over
                # the Juman tokens, whose count can differ after wordpiece
                # splitting, so index against bert_tokens instead
                for j, _ in enumerate(bert_tokens[:126]):
                    # Replace one token of the sentence with [MASK];
                    # start at +1 to skip the [CLS] header
                    masked_index = j + 1

                    # kept for the commented-out repetition check below
                    pre_token = generated_token_ids[0, masked_index].item()

                    generated_token_ids[
                        0, masked_index] = self.bert_tokenizer.vocab["[MASK]"]

                    outputs = self.model(generated_token_ids)
                    predictions = outputs[0]

                    _, predicted_indexes = torch.topk(
                        predictions[0, masked_index], k=5)
                    predicted_tokens = self.bert_tokenizer.convert_ids_to_tokens(
                        predicted_indexes.tolist())

                    print(predicted_tokens)

                    predict_token = predicted_indexes.tolist()[0]

                    # if pre_token == predict_token:
                    #     predict_token = predicted_indexes.tolist()[1]

                    generated_token_ids[0, masked_index] = predict_token

                    # Convert the ids back to strings and join them
                    sampled_sequence = [
                        self.bert_tokenizer.ids_to_tokens[token_id]
                        for token_id in generated_token_ids[0].cpu().numpy()
                    ]
                    sampled_sequence = "".join([
                        token[2:] if token.startswith("##") else token
                        for token in list(
                            filter(lambda x: x != '[PAD]', sampled_sequence))
                    ])

                    logger.info(
                        "sampled sequence: {}".format(sampled_sequence))
Example #4
class Generater:
    def __init__(self, bert_path):
        vocab_file_name = 'vocab.txt'
        # Load Juman so Japanese text can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the pretrained BERT base model
        # (immediately superseded by the BertForMaskedLM load below)
        self.model = BertModel.from_pretrained(bert_path)
        # Load the tokenizer of the pretrained BERT model
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False, do_basic_tokenize=False)
        self.vocab_size = len(self.bert_tokenizer.vocab)

        # Load the MaskedLM head of the pretrained BERT model
        self.model = BertForMaskedLM.from_pretrained(bert_path)

        # Header and punctuation tokens to exclude from sampling
        except_tokens = [
            "[MASK]",
            # "[PAD]",
            "[UNK]", "[CLS]", "[SEP]",
            "(", ")", "・", "/", "、", "。", "!", "?", "「", "」", "…", "’", "』", "『", ":", "※",
        ]
        self.except_ids = [self.bert_tokenizer.vocab[token] for token in except_tokens]

        # Every vocab id outside except_ids is a sampling candidate
        self.candidate_ids = [i for i in range(self.vocab_size)
                              if i not in self.except_ids]


    def _preprocess_text(self, text):
        # Preprocessing: strip half-width spaces and '#' from the text
        return text.replace(" ", "").replace('#', '')  # for Juman

    def text2tokens(self, text):
        # Strip half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # Split the Japanese text into a list of morpheme tokens
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with spaces and run BERT's wordpiece tokenizer
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # The input is capped at 128 tokens: header + 126 tokens + footer
        # Convert the tokens to vocab ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        generated_token_ids = torch.tensor(ids).reshape(1, -1)
        return generated_token_ids

    def tokens2text(self, tokens):
        # Map ids back to tokens, drop special tokens, undo wordpiece '##'
        sampled_sequence = [self.bert_tokenizer.ids_to_tokens[token_id]
                            for token_id in tokens[0].cpu().numpy()]
        sampled_sequence = "".join(
            [
                token[2:] if token.startswith("##") else token
                for token in sampled_sequence
                if token not in ('[PAD]', '[CLS]', '[SEP]')
            ]
        )
        return sampled_sequence


    def likelihood(self, tokens):
        # Score a sequence by summing the raw logit each actual token
        # receives at its own position (a rough pseudo-likelihood)
        outputs = self.model(tokens)
        predictions = outputs[0]

        score_sum = 0.0
        for idx, scores in zip(tokens[0].tolist(), predictions[0].tolist()):
            score_sum += scores[idx]
        return score_sum

    def initialization_text(self, length=10):
        init_tokens = []
        # Header
        init_tokens.append(self.bert_tokenizer.vocab["[CLS]"])
        for _ in range(length):
            # Pick a random candidate token
            init_tokens.append(random.choice(self.candidate_ids))
        # Footer
        init_tokens.append(self.bert_tokenizer.vocab["[SEP]"])

        return torch.tensor(init_tokens).reshape(1, -1)

    def scoring(self, tokens):
        # Total fitness: BERT pseudo-likelihood plus two tanka-specific scores
        text = self.tokens2text(tokens)
        return (self.likelihood(tokens)
                + self.juman_tokenizer.tanka_score_subsets(text)
                + self.juman_tokenizer.tanka_score_flow(text))

    def select(self, l_tokens, size=5):
        # Rank candidate sequences by score and keep the top `size`
        # (the original computed `selected` but never applied `size`)
        scores = list(map(self.scoring, l_tokens))
        print(sorted(scores, reverse=True)[:3])
        selected = list(map(
            lambda x: x[0],
            sorted(
                list(zip(l_tokens, scores)),
                key=lambda x: x[1],
                reverse=True
            )
        ))

        return selected[:size]

    def crossover(self, tokens_0, tokens_1):
        # Copy a random contiguous span from parent 1 into parent 0,
        # never touching the [CLS]/[SEP] positions at the ends
        l_tokens_0 = tokens_0.numpy().reshape(-1).tolist()
        l_tokens_1 = tokens_1.numpy().reshape(-1).tolist()

        start = random.randint(1, len(l_tokens_0) - 3)
        end = random.randint(start, len(l_tokens_0) - 2)

        for num in range(start, end):
            l_tokens_0[num] = l_tokens_1[num]

        return torch.tensor(l_tokens_0).reshape(1, -1)

    def mutation(self, tokens, N=3):
        # Mask N random positions and resample each from BERT's top-10
        # predictions, minus the excluded special/punctuation tokens
        l_tokens = tokens.numpy().reshape(-1).tolist()

        for _ in range(N):
            pos = random.randint(1, len(l_tokens) - 2)
            l_tokens[pos] = self.bert_tokenizer.vocab["[MASK]"]

            outputs = self.model(torch.tensor(l_tokens).reshape(1, -1))
            predictions = outputs[0]
            _, predicted_indexes = torch.topk(predictions[0, pos], k=10)

            # random_tokens = [random.choice(self.candidate_ids) for i in range(1)]
            random_tokens = []

            predicted_indexes = list(
                set(predicted_indexes.tolist() + random_tokens) - set(self.except_ids)
            )

            predict_token = random.choice(predicted_indexes)

            l_tokens[pos] = predict_token

        return torch.tensor(l_tokens).reshape(1, -1)
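
A minimal driver sketch for the genetic loop in Example #4; the checkpoint path, population size, and generation count are assumptions, and random must be imported as the class already requires:

gen = Generater(bert_path='./Japanese_L-12_H-768_A-12')

# Start from random sequences, then alternate selection, crossover, mutation
population = [gen.initialization_text(length=10) for _ in range(20)]
for _ in range(30):  # generations
    survivors = gen.select(population, size=5)
    children = [gen.crossover(random.choice(survivors), random.choice(survivors))
                for _ in range(10)]
    population = survivors + [gen.mutation(child) for child in children]

print(gen.tokens2text(gen.select(population, size=1)[0]))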