Example 1
    def __init__(self,
                 device,
                 tokenizer: BertTokenizer,
                 token_rate: float,
                 exclude_names: bool = False):
        super().__init__(device)
        self.tokenizer = tokenizer
        self.token_rate = token_rate
        self.exclude_names = exclude_names
        self.mask_id = tokenizer.convert_tokens_to_ids(["[MASK]"])[0]
        self.sep_id = tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
Example 2
class NERDataSet(Dataset):
    def __init__(self, data_path, config, add_cls=False, add_sep=False):
        self.config = config
        self.sents, self.tags = load_tsv(data_path,
                                         add_cls=add_cls,
                                         add_sep=add_sep)
        self.tokenizer = BertTokenizer(vocab_file=config.vocab_path,
                                       do_lower_case=False)
        self.tokenize()

    def __len__(self):
        return len(self.sents)

    def tokenize(self):
        alltok_sents, alltok_tags = [], []
        for sent_words, sent_tags in zip(self.sents, self.tags):
            tok_sent, tok_tag = [], []
            for w, t in zip(sent_words, sent_tags):  # tokenize the words
                tokens = self.tokenizer.tokenize(w)
                tok_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                tok_tags = [t] + [self.config.piece_tag] * (len(tokens) - 1)
                ttags_ids = [self.config.tag2idx[tt] for tt in tok_tags]
                tok_sent.extend(tok_ids)
                tok_tag.extend(ttags_ids)
            alltok_sents.append(tok_sent)
            alltok_tags.append(tok_tag)
        self.tok_sents = alltok_sents
        self.tok_tags = alltok_tags

    def __getitem__(self, idx):
        return self.tok_sents[idx], self.tok_tags[idx]
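Usage note (not part of the original snippet): the dataset returns variable-length id/tag lists, so batching needs a padding collate function. A minimal sketch, assuming a hypothetical config object with vocab_path, piece_tag and tag2idx attributes and placeholder file paths:

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

class DummyConfig:                     # hypothetical stand-in for the real config
    vocab_path = "vocab.txt"           # placeholder path
    piece_tag = "X"                    # tag assigned to non-first word pieces
    tag2idx = {"O": 0, "B-PER": 1, "I-PER": 2, "X": 3}

def pad_collate(batch):
    """Pad variable-length token/tag id sequences to the longest item in the batch."""
    sents, tags = zip(*batch)
    sents = pad_sequence([torch.tensor(s) for s in sents], batch_first=True, padding_value=0)
    tags = pad_sequence([torch.tensor(t) for t in tags], batch_first=True, padding_value=0)
    return sents, tags

# dataset = NERDataSet("train.tsv", DummyConfig())      # path is a placeholder
# loader = DataLoader(dataset, batch_size=32, collate_fn=pad_collate)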
Example 3
class JapaneseWorker:
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_basic_tokenize=False)
        self.cls_id = self.bert_tokenizer.vocab['[CLS]']
        self.mask_id = self.bert_tokenizer.vocab['[MASK]']
        self.bert_model = 'PATH_TO_BERTJPN'

        self.cp = 'checkpoint/jp/cp_step_710000.pt'
        self.opt = 'checkpoint/jp/opt_step_710000.pt'

    @staticmethod
    def linesplit(src):
        """
        :param src: type str, String type article
        :return: type list, punctuation-separated sentences
        """
        def remove_newline(x):
            x = x.replace('\n', '')
            return x

        def remove_blank(x):
            x = x.replace(' ', '')
            return x

        def remove_unknown(x):
            unknown = ['\u3000']
            for h in unknown:
                x = x.replace(h, '')
            return x

        src = remove_blank(src)
        src = remove_newline(src)
        src = remove_unknown(src)
        src_line = re.split('。(?!」)|!(?!」)|?(?!」)', src)
        src_line = [x for x in src_line if x != '']
        return src_line

    def tokenizer(self, src):
        """
        :param src: type list, punctuation-separated sentences
        :return: token: type list, tokens
                 token_id: type list, numeric token ids
        """
        token = []
        token_id = []

        def _preprocess_text(text):
            return text.replace(" ", "")  # for Juman

        for sentence in src:
            preprocessed_text = _preprocess_text(sentence)
            juman_tokens = self.juman_tokenizer(preprocessed_text)
            tokens = self.bert_tokenizer.tokenize(" ".join(juman_tokens))
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
            token += tokens
            token_id += ids
        return token, token_id
Example 4
class BertWithJumanModel():
    """学習済みBertを使うやつ Fork:https://github.com/yagays/pytorch_bert_japanese"""
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        self.juman_tokenizer = JumanTokenizer()
        self.model = BertModel.from_pretrained(bert_path)
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        return text.replace(" ", "")

    def get_sentence_embedding(self,
                               text,
                               pooling_layer=-2,
                               pooling_strategy="REDUCE_MEAN"):
        preprocessed_text = self._preprocess_text(text)
        # split into chunks so each piece stays within Juman's input-length limit
        chunk_size = 2048
        result = [
            preprocessed_text[idx:idx + chunk_size]
            for idx in range(0, len(preprocessed_text), chunk_size)
        ]
        tokens = []
        for t in result:
            tokens += self.juman_tokenizer.tokenize(t)
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        tokens_tensor = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            tokens_tensor = tokens_tensor.to('cuda')
            self.model.to('cuda')

        self.model.eval()
        with torch.no_grad():
            all_encoder_layers, _ = self.model(tokens_tensor)

        embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0]
        if pooling_strategy == "REDUCE_MEAN":
            return np.mean(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MAX":
            return np.max(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MEAN_MAX":
            return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)]
        elif pooling_strategy == "CLS_TOKEN":
            return embedding[0]
        else:
            raise ValueError(
                "specify valid pooling_strategy: {REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}"
            )
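A minimal usage sketch (not from the original project): the checkpoint directory is a placeholder and must hold a pytorch_pretrained_bert-compatible Japanese model plus vocab.txt, with Juman++ installed for JumanTokenizer.

model = BertWithJumanModel("/path/to/japanese_bert", use_cuda=False)   # placeholder path
vec_mean = model.get_sentence_embedding("吾輩は猫である。")                               # (768,) mean over tokens
vec_cls = model.get_sentence_embedding("吾輩は猫である。", pooling_strategy="CLS_TOKEN")
vec_both = model.get_sentence_embedding("吾輩は猫である。", pooling_strategy="REDUCE_MEAN_MAX")  # (1536,)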
Example 5
def get_words_for_blank_slow_decode(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer):
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)


    mask_positions = []
    tokenized_text = tokenizer.tokenize(text)
    top_words_all = []
    for i in range(len(tokenized_text)):
        if tokenized_text[i] == '_':
            tokenized_text[i] = '[MASK]'
            mask_positions.append(i)

    while mask_positions:
        top_words = []
        # Convert tokens to vocab indices
        token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([token_ids])

        # Call BERT to calculate unnormalized probabilities for all pos
        model.eval()
        predictions = model(tokens_tensor)

        # get predictions
        mask_preds = predictions[0, mask_positions, :]

        candidates = [] #(word, prob)
        for mask_pos in mask_positions:
            mask_preds = predictions[0, mask_pos, :]

            top_idxs = mask_preds.detach().numpy().argsort()[::-1]
            top_idx = top_idxs[0]
            top_prob = mask_preds[top_idx]
            top_word = tokenizer.ids_to_tokens[top_idx]
            candidates.append((top_word, top_prob.detach().item()))
            top_words_pos = []
            for i in top_idxs[:20]:
                top_words_pos.append((tokenizer.ids_to_tokens[i], mask_preds[i].detach().item()))
            top_words.append(top_words_pos)
        best_candidate = max(candidates, key = lambda x: x[1])
        best_pos = mask_positions[candidates.index(best_candidate)]

        tokenized_text[best_pos] = best_candidate[0]
        mask_positions = [i for i in mask_positions if i != best_pos]

        top_words_all.append(top_words[candidates.index(best_candidate)])

    pred_sent = ' '.join(tokenized_text).replace(' ##', '')
    return (pred_sent, top_words_all)
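A minimal usage sketch, assuming the pytorch_pretrained_bert-era API in which BertForMaskedLM returns the logits tensor directly (as the function above expects); the model name is the standard public checkpoint.

from pytorch_pretrained_bert import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
sentence, top_words = get_words_for_blank_slow_decode(
    "the _ sat on the _ and purred", model, tokenizer)
print(sentence)   # blanks filled greedily, highest-confidence position first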
Example 6
class FedPredictDataset(Dataset):
    def __init__(self,
                 texts,
                 vocab_path,
                 max_seq_length=512,
                 vocab='finance-uncased'):
        self.texts = texts
        self.dict_labels = {'lower': 0, 'maintain': 1, 'raise': 2}

        self.max_seq_length = max_seq_length
        self.vocab = vocab
        if self.vocab == 'finance-uncased':
            self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                           do_lower_case=True,
                                           do_basic_tokenize=True)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        tokenized_review = self.tokenizer.tokenize(self.texts[index])

        if len(tokenized_review) > self.max_seq_length:
            tokenized_review = tokenized_review[:self.max_seq_length]

        ids_review = self.tokenizer.convert_tokens_to_ids(tokenized_review)

        mask_input = [1] * len(ids_review)

        padding = [0] * (self.max_seq_length - len(ids_review))
        ids_review += padding
        mask_input += padding

        input_type = [0] * self.max_seq_length

        assert len(ids_review) == self.max_seq_length
        assert len(mask_input) == self.max_seq_length
        assert len(input_type) == self.max_seq_length

        ids_review = torch.tensor(ids_review)
        mask_input = torch.tensor(mask_input)
        input_type = torch.tensor(input_type)

        input_feature = {
            "token_type_ids": input_type,
            "attention_mask": mask_input,
            "input_ids": ids_review
        }

        return input_feature
Example 7
def _bert_embed_sentence(sentence, bert_model: BertModel, bert_tokenizer: BertTokenizer):
    text = "[CLS] {} [SEP]".format(sentence)
    tokenized_text = bert_tokenizer.tokenize(text)
    indexed_tokens = bert_tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_ids = [0] * len(indexed_tokens)
    segments_tensors = torch.tensor([segments_ids])

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokens_tensor = tokens_tensor.to(device)
    segments_tensors = segments_tensors.to(device)

    with torch.no_grad():
        encoded_layers, _ = bert_model(tokens_tensor, segments_tensors, output_all_encoded_layers=False)

    # Embedding of the [CLS] token
    return encoded_layers[0][0]
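A minimal usage sketch, assuming the same pytorch_pretrained_bert API (BertModel returns (encoded_layers, pooled_output)); the model must already sit on the device the helper picks.

import torch
from pytorch_pretrained_bert import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()
model.to('cuda' if torch.cuda.is_available() else 'cpu')
cls_vec = _bert_embed_sentence("a sentence to embed", model, tokenizer)
print(cls_vec.shape)   # torch.Size([768]) for bert-base models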
Example 8
class BertWithJumanModel:
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        self.juman_tokenizer = JumanTokenizer()
        self.model = BertModel.from_pretrained(bert_path)
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        return text.replace(" ", "")  # for Juman

    def get_sentence_embedding(self,
                               text,
                               pooling_layer=-2,
                               pooling_strategy="REDUCE_MEAN"):
        preprocessed_text = self._preprocess_text(text)
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        tokens_tensor = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            tokens_tensor = tokens_tensor.to('cuda')
            self.model.to('cuda')

        self.model.eval()
        with torch.no_grad():
            all_encoder_layers, _ = self.model(tokens_tensor)

        embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0]
        if pooling_strategy == "REDUCE_MEAN":
            return np.mean(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MAX":
            return np.max(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MEAN_MAX":
            return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)]
        elif pooling_strategy == "CLS_TOKEN":
            return embedding[0]
        else:
            raise ValueError(
                "specify valid pooling_strategy: {REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}"
            )
Example 9
def get_sample_bert_token_id_seq(bert_tokenizer: BertTokenizer, left_seq_str,
                                 right_seq_str, max_seq_len):
    left_bert_token_seq = bert_tokenizer.tokenize(left_seq_str)
    right_bert_token_seq = bert_tokenizer.tokenize(right_seq_str)

    if len(right_bert_token_seq) + 3 > max_seq_len:
        right_bert_token_seq = right_bert_token_seq[:max_seq_len - 3]

    if len(right_bert_token_seq) + len(left_bert_token_seq) + 3 > max_seq_len:
        left_bert_token_seq = left_bert_token_seq[:max_seq_len -
                                                  len(right_bert_token_seq) -
                                                  3]

    bert_token_seq = ['[CLS]'] + left_bert_token_seq + [
        '[SEP]'
    ] + right_bert_token_seq + ['[SEP]']
    # print(bert_token_seq)
    bert_token_id_seq = bert_tokenizer.convert_tokens_to_ids(bert_token_seq)
    return bert_token_id_seq
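A short usage sketch (model name assumed): the right-hand text is capped first, then the left-hand text is trimmed so the pair plus the three special tokens never exceeds max_seq_len.

from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ids = get_sample_bert_token_id_seq(tokenizer,
                                   "what is the capital of france",
                                   "paris is the capital and largest city of france",
                                   max_seq_len=16)
print(len(ids))   # <= 16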
Example 10
def convert_data2(path1, path2, max_length, number, seq1, seq2):
    """Convert tokens to IDs, pad, and add the [CLS]/[SEP] markers."""
    tokenizer = BertTokenizer('./model/bert-base-chinese/vocab.txt')
    input_id = []
    input_mask = []
    segment_id = []
    # number = 0
    print(len(seq1))

    for i in range(number):
        tokens_a = tokenizer.tokenize(seq1[i])
        tokens_b = tokenizer.tokenize(seq2[i])
        # print(seq2[i])
        # print(tokens_b)
        while True:
            if (len(tokens_a) + len(tokens_b)) <= max_length - 3:
                break
            else:
                # print(tokens_b)
                # tokens_b.pop()
                tokens_a = tokens_a[: int((max_length - 3) * len(tokens_a)/(len(tokens_a) + len(tokens_b)))]
                tokens_b = tokens_b[: int((max_length - 3) * len(tokens_b)/(len(tokens_a) + len(tokens_b)))]
        # add [CLS] at the head and [SEP] at the tail
        tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
        tokens = tokens_a + tokens_b + ['[SEP]']
        input_id_ = tokenizer.convert_tokens_to_ids(tokens)
        segment_id_ = [0] * len(tokens_a) + [1] * (len(tokens_b) + 1)
        input_mask_ = [1] * len(tokens)
        # segment_id distinguishes tokens_a from tokens_b
        # input_mask distinguishes real tokens from padding
        padding_ = [0] * (max_length - len(tokens))
        # every input fed to BERT has to be padded to max_length
        input_id_ += padding_
        segment_id_ += padding_
        input_mask_ += padding_
        # append each example to the lists: [sentence_num, MAX_LENGTH]
        input_id.append(input_id_)
        input_mask.append(input_mask_)
        segment_id.append(segment_id_)

    return input_id, input_mask, segment_id
Example 11
def convert_tokens_to_features(tokens: List[str],
                               tokenizer: BertTokenizer,
                               do_lower_case: bool = True) -> Features:
    input_tokens: List[str] = ['[CLS]']
    ids_to_original: List[int] = [-1]
    for k, token in enumerate(tokens):
        for wp in tokenizer.wordpiece_tokenizer.tokenize(
                token.lower() if do_lower_case else token):  # lower_case
            input_tokens.append(wp)
            ids_to_original.append(k)
    input_tokens.append('[SEP]')
    ids_to_original.append(-1)

    features: Features = Features(
        tokens=input_tokens,
        ids_to_original=torch.tensor(ids_to_original, dtype=torch.long),
        input_ids=torch.tensor(tokenizer.convert_tokens_to_ids(input_tokens),
                               dtype=torch.long),
    )

    return features
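For illustration only: the Features container is imported from elsewhere in the source project; a hypothetical minimal equivalent (so the snippet can be exercised standalone) and a call might look like this.

from typing import List, NamedTuple
import torch
from pytorch_pretrained_bert import BertTokenizer

class Features(NamedTuple):            # hypothetical stand-in for the project's Features type
    tokens: List[str]
    ids_to_original: torch.Tensor
    input_ids: torch.Tensor

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
feats = convert_tokens_to_features(["Subword", "tokenization", "example"], tokenizer)
print(feats.tokens)            # word pieces bracketed by [CLS]/[SEP]
print(feats.ids_to_original)   # maps each piece back to its source word index (-1 for specials)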
Example 12
def predict_word(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer, tgt_word: str, tgt_pos: int):
    # print('Template sentence: ', text)
    mask_positions = []

    # insert mask tokens
    tokenized_text = tokenizer.tokenize(text)

    for i in range(len(tokenized_text)):
        if tokenized_text[i] == '_':
            tokenized_text[i] = '[MASK]'
            mask_positions.append(i)

    # Convert tokens to vocab indices
    token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([token_ids])

    # Call BERT to calculate unnormalized probabilities for all pos
    model.eval()
    predictions = model(tokens_tensor)

    # normalize by softmax
    predictions = F.softmax(predictions, dim=2)

    # For the target word position, get probabilities for each word of interest
    normalized = predictions[0, tgt_pos, :]
    out_prob = normalized[tokenizer.vocab[tgt_word]].item()

    # Also, fill in all blanks by max prob, and print for inspection
    for mask_pos in mask_positions:
        predicted_index = torch.argmax(predictions[0, mask_pos, :]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        tokenized_text[mask_pos] = predicted_token

    for mask_pos in mask_positions:
        tokenized_text[mask_pos] = "_" + tokenized_text[mask_pos] + "_"
    pred_sent = ' '.join(tokenized_text).replace(' ##', '')
    # print(pred_sent)
    return out_prob, pred_sent
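A brief usage sketch under the same pytorch_pretrained_bert assumptions as the earlier masked-LM example; the sentence and target word are toy placeholders.

from pytorch_pretrained_bert import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
prob, filled = predict_word("the _ chased the mouse", model, tokenizer,
                            tgt_word="cat", tgt_pos=1)
print(prob, filled)   # P('cat') at token position 1, plus the argmax-filled sentence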
Example 13
def bert_sentence_pair_preprocessing(dataset: pd.DataFrame, tokenizer: BertTokenizer, max_sequence_length=64):
    max_bert_input_length = 70

    dataset_input_ids = torch.empty((len(dataset), max_bert_input_length), dtype=torch.long)
    dataset_token_type_ids = torch.empty((len(dataset), max_bert_input_length), dtype=torch.long)
    dataset_attention_masks = torch.empty((len(dataset), max_bert_input_length), dtype=torch.long)
    dataset_lengths = torch.empty((len(dataset), 1), dtype=torch.long)
    dataset_labels = torch.empty((len(dataset), 1), dtype=torch.long)
    dataset_other_type_ids = torch.empty((len(dataset), 18), dtype=torch.long)
    # dataset_input_tensors = torch.empty(len(dataset), 4, max_bert_input_length, dtype=torch.float)

    for idx, data in dataset.iterrows():
        tokens = []
        input_type_ids = []

        # extra (non-text) feature columns
        other_type_cols = ['addr0', 'addr1', 'addr2', 'addr3', 'addr4', 'addr5',
                           'phone0', 'phone1', 'phone2', 'phone3',
                           'cate0', 'cate1', 'cate2', 'cate3', 'cate4',
                           'cname0', 'cname1', 'cname2']
        other_type_ids = [data[col] for col in other_type_cols]

        dataset_other_type_ids[idx] = torch.tensor(other_type_ids, dtype=torch.long)

        sentence_1_tokenized, sentence_2_tokenized = tokenizer.tokenize(data['full_placename1']), tokenizer.tokenize(data['full_placename2'])

        tokens.append("[CLS]")
        input_type_ids.append(0)

        for token in sentence_1_tokenized:
            tokens.append(token)
            input_type_ids.append(0)

        tokens.append("[SEP]")
        input_type_ids.append(0)

        for token in sentence_2_tokenized:
            tokens.append(token)
            input_type_ids.append(1)

        tokens.append("[SEP]")
        input_type_ids.append(1)

        # get vocab indices from the assembled tokens
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # build the attention mask over the real tokens
        attention_masks = [1] * len(input_ids)

        # record the unpadded input_ids length
        dataset_lengths[idx] = torch.tensor(len(input_ids), dtype=torch.long)

        while len(input_ids) < max_bert_input_length:
            input_ids.append(0)
            attention_masks.append(0)
            input_type_ids.append(0)

        dataset_input_ids[idx] = torch.tensor(input_ids, dtype=torch.long)
        dataset_token_type_ids[idx] = torch.tensor(input_type_ids, dtype=torch.long)
        dataset_attention_masks[idx] = torch.tensor(attention_masks, dtype=torch.long)

        dataset_labels[idx] = torch.tensor(data['label'], dtype=torch.long)

    return dataset_input_ids, dataset_token_type_ids, dataset_attention_masks, dataset_other_type_ids, dataset_lengths, dataset_labels
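The tensors returned above can be wrapped straight into a TensorDataset for batching; a short sketch, assuming df carries the columns referenced above and tokenizer is a loaded BertTokenizer.

from torch.utils.data import DataLoader, TensorDataset

# input_ids, token_type_ids, attention_masks, other_type_ids, lengths, labels = \
#     bert_sentence_pair_preprocessing(df, tokenizer)
# dataset = TensorDataset(input_ids, token_type_ids, attention_masks,
#                         other_type_ids, lengths, labels)
# loader = DataLoader(dataset, batch_size=32, shuffle=True)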
Example 14
class Preprocess:
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.rouge_calculator = RougeNCalc()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        self.trim_input = 0
        self.trim_clss = 0

    def __call__(self, data_dic, length):
        self.src_body = data_dic['body']
        self.src_summary = data_dic['summary'].split('<sep>')
        self._init_data()

        if self.src_body == '':
            raise ValueError('Empty data')

        # step 1. article to lines
        self._split_line()
        # step 2. pick extractive summary by rouge
        self._rougematch()
        # step 3. tokenize
        self._tokenize()
        # step 4. clss process
        self._prep_clss()
        # step 5. segs process
        self._prep_segs()
        # step 6. trim length for input
        self._set_length(length)

        return {
            'src': self.tokenid,
            'labels': self.label,
            'segs': self.segs,
            'mask': self.mask,
            'mask_cls': self.mask_cls,
            'clss': self.clss,
            'src_str': self.src_line
        }

    def _init_data(self):
        self.src_line = []
        self.label = []
        self.tokenid = []
        self.token = []
        self.clss = []
        self.segs = []
        self.mask = []
        self.mask_cls = []

    # step 1.
    def _split_line(self):
        # regex note: (?!...) Negative Lookahead
        # e.g. /foo(?!bar)/ for "foobar foobaz" get "foobaz" only
        self.src_line = re.split('。(?!」)|!(?!」)|?(?!」)', self.src_body)
        self.src_line = [x for x in self.src_line if x != '']

    # step 2.
    def _rougematch(self):
        self.label = [0] * len(self.src_line)
        for summ in self.src_summary:
            scores = [self.rouge_calculator(x, summ) for x in self.src_line]
            self.label[scores.index(max(scores))] = 1

    # step 3.
    def _tokenize(self):
        def _preprocess_text(text):
            return text.replace(" ", "")  # for Juman

        for sentence in self.src_line:
            preprocessed_text = _preprocess_text(sentence)
            juman_tokens = self.juman_tokenizer(preprocessed_text)
            tokens = self.bert_tokenizer.tokenize(" ".join(juman_tokens))
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
            self.token += tokens
            self.tokenid += ids

    # step 4.
    def _prep_clss(self):
        self.clss = [
            i for i, x in enumerate(self.tokenid)
            if x == self.bert_tokenizer.vocab['[CLS]']
        ]

    # step 5.
    def _prep_segs(self):
        flag = 1
        for idx in self.tokenid:
            if idx == self.bert_tokenizer.vocab['[CLS]']:
                flag = not flag
            self.segs.append(int(flag))

    # step 6.
    def _set_length(self, n):
        self.__trim_data(n)
        self.__add_mask(n)

    def __trim_data(self, n):
        if len(self.tokenid) > n:
            # If last sentence starts after 512
            if self.clss[-1] > 512:
                for i, idx in enumerate(self.clss):
                    if idx > n:
                        # Index of last [SEP] in length=n
                        self.trim_input = self.clss[i - 1] - 1
                        # Index of last [CLS] index in clss
                        self.trim_clss = i - 2
                        break
            # If src longer than 512 but last sentence start < 512
            else:
                self.trim_input = self.clss[len(self.clss) - 1] - 1
                self.trim_clss = len(self.clss) - 2
        # Do nothing if length < n
        if self.trim_clss * self.trim_input == 0:
            return
        self.tokenid = self.tokenid[:(self.trim_input + 1)]
        self.segs = self.segs[:(self.trim_input + 1)]
        self.clss = self.clss[:(self.trim_clss + 1)]
        self.label = self.label[:(self.trim_clss + 1)]
        self.src_line = self.src_line[:(self.trim_clss + 1)]

    def __add_mask(self, n):
        # from index to len: +1
        pad_len = (n - len(self.tokenid))
        self.tokenid = self.tokenid + ([self.bert_tokenizer.vocab['[MASK]']] *
                                       pad_len)
        self.segs = self.segs + ([int(not self.segs[-1])] * pad_len)
Example 15
class text_dataset(Dataset):
    def __init__(self,
                 x_y_list,
                 vocab_path,
                 max_seq_length=256,
                 vocab='base-cased',
                 transform=None):
        self.max_seq_length = max_seq_length
        self.x_y_list = x_y_list
        self.vocab = vocab
        if self.vocab == 'base-cased':
            self.tokenizer = BertTokenizer.from_pretrained(
                'bert-base-cased', do_lower_case=False, do_basic_tokenize=True)
        elif self.vocab == 'finance-cased':
            self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                           do_lower_case=False,
                                           do_basic_tokenize=True)
        elif self.vocab == 'base-uncased':
            self.tokenizer = BertTokenizer.from_pretrained(
                'bert-base-uncased',
                do_lower_case=True,
                do_basic_tokenize=True)
        elif self.vocab == 'finance-uncased':
            self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                           do_lower_case=True,
                                           do_basic_tokenize=True)

    def __getitem__(self, index):
        tokenized_review = self.tokenizer.tokenize(self.x_y_list[0][index])

        if len(tokenized_review) > self.max_seq_length:
            tokenized_review = tokenized_review[:self.max_seq_length]

        ids_review = self.tokenizer.convert_tokens_to_ids(tokenized_review)

        mask_input = [1] * len(ids_review)

        padding = [0] * (self.max_seq_length - len(ids_review))
        ids_review += padding
        mask_input += padding

        input_type = [0] * self.max_seq_length

        assert len(ids_review) == self.max_seq_length
        assert len(mask_input) == self.max_seq_length
        assert len(input_type) == self.max_seq_length

        ids_review = torch.tensor(ids_review)
        mask_input = torch.tensor(mask_input)
        input_type = torch.tensor(input_type)

        sentiment = self.x_y_list[1][index]
        list_of_labels = [torch.from_numpy(np.array(sentiment))]

        input_feature = {
            "token_type_ids": input_type,
            "attention_mask": mask_input,
            "input_ids": ids_review
        }

        return input_feature, list_of_labels[0]

    def __len__(self):
        return len(self.x_y_list[0])
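A minimal usage sketch using the stock 'base-uncased' branch (so no custom vocab file is needed); the texts and labels are toy placeholders.

texts = ["rates were left unchanged", "the committee raised rates"]
labels = [1, 2]
dataset = text_dataset([texts, labels], vocab_path=None, vocab='base-uncased')
features, label = dataset[0]
print(features["input_ids"].shape)   # torch.Size([256]) after truncation and padding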
Example 16
class CAILDataset(Dataset):
    def __init__(
        self,
        data_path,
        max_seq_len,
        vocab_path,
        # tfidf_a_df,
        # tfidf_b_df,
        # tfidf_c_df,
        fts_flag=False,
        mode="test",
    ):
        self.data_path = data_path
        self.max_seq_len = max_seq_len
        self.vocab_path = vocab_path
        # self.exft_a_df = tfidf_a_df
        # self.exft_b_df = tfidf_b_df
        # self.exft_c_df = tfidf_c_df
        self.fts_flag = fts_flag
        self.mode = mode
        self.reset()

    def reset(self):
        self.tokenizer = BertTokenizer(vocab_file=self.vocab_path)
        self.build_examples()

    def read_data(self):
        print(self.data_path)
        xlist = []
        with open(self.data_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                x = json.loads(line)
                # xlist.append((x["A"], x["B"], x["C"]))
                if self.mode == "train" or self.mode == "valid":
                    if i % 2 == 0:
                        xlist.append((x["A"], x["B"], x["C"]))
                    else:
                        xlist.append((x["A"], x["C"], x["B"]))
                else:
                    xlist.append((x["A"], x["B"], x["C"]))
        return xlist

    def build_examples(self):
        xlist = self.read_data()
        self.examples = []
        list_text_a = []
        list_text_b = []
        list_text_c = []
        for idx, x in enumerate(xlist):
            guid = "%s-%d" % (self.mode, idx)
            text_a = x[0]
            text_b = x[1]
            text_c = x[2]
            example = InputExample(guid=guid,
                                   text_a=text_a,
                                   text_b=text_b,
                                   text_c=text_c)
            self.examples.append(example)
            list_text_a.append(text_a)
            list_text_b.append(text_b)
            list_text_c.append(text_c)
        if self.fts_flag:
            self.exft_a_df = self.build_ex_features(list_text_a)
            self.exft_b_df = self.build_ex_features(list_text_b)
            self.exft_c_df = self.build_ex_features(list_text_c)
            self.exft_a_df.fillna(0, inplace=True)
            self.exft_b_df.fillna(0, inplace=True)
            self.exft_c_df.fillna(0, inplace=True)

    def build_features(self, example):
        max_seq_len = self.max_seq_len - 2

        tokens_a = self.tokenizer.tokenize(example.text_a)
        tokens_b = self.tokenizer.tokenize(example.text_b)
        tokens_c = self.tokenizer.tokenize(example.text_c)

        if len(tokens_a) > max_seq_len:
            tokens_a = tokens_a[-max_seq_len:]
        if len(tokens_b) > max_seq_len:
            tokens_b = tokens_b[-max_seq_len:]
        if len(tokens_c) > max_seq_len:
            tokens_c = tokens_c[-max_seq_len:]

        input_ids_a = self.tokenizer.convert_tokens_to_ids(["[CLS]"] +
                                                           tokens_a +
                                                           ["[SEP]"])
        input_ids_b = self.tokenizer.convert_tokens_to_ids(["[CLS]"] +
                                                           tokens_b +
                                                           ["[SEP]"])
        input_ids_c = self.tokenizer.convert_tokens_to_ids(["[CLS]"] +
                                                           tokens_c +
                                                           ["[SEP]"])
        input_mask_a = [1] * len(input_ids_a)
        input_mask_b = [1] * len(input_ids_b)
        input_mask_c = [1] * len(input_ids_c)
        segment_ids_a = [0] * len(input_ids_a)
        segment_ids_b = [0] * len(input_ids_b)
        segment_ids_c = [0] * len(input_ids_c)

        padding_a = [0] * (max_seq_len - len(tokens_a))
        padding_b = [0] * (max_seq_len - len(tokens_b))
        padding_c = [0] * (max_seq_len - len(tokens_c))

        input_ids_a += padding_a
        segment_ids_a += padding_a
        input_mask_a += padding_a
        input_ids_b += padding_b
        segment_ids_b += padding_b
        input_mask_b += padding_b
        input_ids_c += padding_c
        segment_ids_c += padding_c
        input_mask_c += padding_c

        feature_a = InputFeature(
            input_ids=input_ids_a,
            segment_ids=segment_ids_a,
            input_mask=input_mask_a,
        )
        feature_b = InputFeature(
            input_ids=input_ids_b,
            segment_ids=segment_ids_b,
            input_mask=input_mask_b,
        )
        feature_c = InputFeature(
            input_ids=input_ids_c,
            segment_ids=segment_ids_c,
            input_mask=input_mask_c,
        )
        return feature_a, feature_b, feature_c

    def build_ex_features(self, list_text):
        return do_feature_engineering(list_text)

    def _preprocess_op(self, index):
        example = self.examples[index]
        if self.mode == "train" or self.mode == "valid":
            if index % 2 == 0:
                op = 1
            else:
                op = -1
        else:
            op = 1
        feature_a, feature_b, feature_c = self.build_features(example)
        return (
            op,
            np.array(feature_a.input_ids, dtype=np.int64),
            np.array(feature_a.segment_ids, dtype=np.int64),
            np.array(feature_a.input_mask, dtype=np.int64),
            np.array(feature_b.input_ids, dtype=np.int64),
            np.array(feature_b.segment_ids, dtype=np.int64),
            np.array(feature_b.input_mask, dtype=np.int64),
            np.array(feature_c.input_ids, dtype=np.int64),
            np.array(feature_c.segment_ids, dtype=np.int64),
            np.array(feature_c.input_mask, dtype=np.int64),
        )

    def _exft_preprocess_op(self, index):
        example = self.examples[index]
        if self.mode == "train" or self.mode == "valid":
            if index % 2 == 0:
                op = 1
            else:
                op = -1
        else:
            op = 1
        feature_a, feature_b, feature_c = self.build_features(example)
        return (
            op,
            np.array(feature_a.input_ids, dtype=np.int64),
            np.array(feature_a.segment_ids, dtype=np.int64),
            np.array(feature_a.input_mask, dtype=np.int64),
            np.array(feature_b.input_ids, dtype=np.int64),
            np.array(feature_b.segment_ids, dtype=np.int64),
            np.array(feature_b.input_mask, dtype=np.int64),
            np.array(feature_c.input_ids, dtype=np.int64),
            np.array(feature_c.segment_ids, dtype=np.int64),
            np.array(feature_c.input_mask, dtype=np.int64),
            torch.tensor(self.exft_a_df.iloc[index], dtype=torch.float32),
            torch.tensor(self.exft_b_df.iloc[index], dtype=torch.float32),
            torch.tensor(self.exft_c_df.iloc[index], dtype=torch.float32),
        )

    def __getitem__(self, index):
        if self.fts_flag:
            return self._exft_preprocess_op(index)
        else:
            return self._preprocess_op(index)

    def __len__(self):
        return len(self.examples)
Example 17
def evaluate(args:Dict):
    model_root = args['--model-root'] if args['--model-root'] else './models'
    print("load model from {}".format(model_root), file=sys.stderr)

    dataLoader = sentence.Sentence(args['--test-src'])

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")

    output_model_file = os.path.join(model_root, "model_file.bin")
    output_config_file = os.path.join(model_root, "config_file.bin")
    output_vocab_file = os.path.join(model_root, "vocab.txt")
    config = BertConfig.from_json_file(output_config_file)
    model = BertForTokenClassification(config,num_labels=len(dataLoader.tag2idx))
    state_dict = torch.load(output_model_file)
    model.load_state_dict(state_dict)
    tokenizer = BertTokenizer(output_vocab_file, do_lower_case=False)

    tokenized_texts = [tokenizer.tokenize(sent) for sent in dataLoader.sentences]

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    MAX_LEN = int(args['--max-len'])

    input_ids_test = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    tags_test = pad_sequences([[dataLoader.tag2idx.get(l) for l in lab] for lab in dataLoader.labels],
                         maxlen=MAX_LEN, value=dataLoader.tag2idx["O"], padding="post",
                         dtype="long", truncating="post")

    attention_masks_test = [[float(i > 0) for i in ii] for ii in input_ids_test]

    for i, inp in enumerate(input_ids_test):
        if 102 not in inp:  # 102 is the [SEP] token id in the BERT vocab
            inp[-1] = 102
            tags_test[i][-1] = dataLoader.tag2idx.get("O")

    te_inputs = torch.tensor(input_ids_test).to(torch.int64)
    te_tags = torch.tensor(tags_test).to(torch.int64)
    te_masks = torch.tensor(attention_masks_test)

    test_data = TensorDataset(te_inputs, te_masks, te_tags)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=int(args['--batch-size']))

    model.eval()
    predictions = []
    true_labels = []
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)

        logits = logits.detach().cpu().numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        label_ids = b_labels.to('cpu').numpy()
        true_labels.append(label_ids)
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    pred_tags = [[dataLoader.tags_vals[p_i] for p_i in p] for p in predictions]
    test_tags = [[dataLoader.tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l]

    tags_test_fin = list()
    for l in tags_test:
        temp_tag = list()
        for l_i in l:
            temp_tag.append(dataLoader.tags_vals[l_i])
        tags_test_fin.append(temp_tag)

    print("Test loss: {}".format(eval_loss / nb_eval_steps))
    print("Test Accuracy: {}".format(eval_accuracy / nb_eval_steps))
    print("Test F1-Score: {}".format(f1_score(tags_test_fin, pred_tags)))

    print(classification_report(tags_test_fin, pred_tags))

    print("Number of Test sentences: ", len(tags_test_fin))
Example 18
class mod_eventclass(BasePlugin):
    """ Web Scraping plugin: mod_eventclass
    For classifying news events.
    """
    minArticleLengthInChars = 400
    pluginType = Types.MODULE_DATA_PROCESSOR  # implies data post-processor

    dataFrame = None
    device = None
    model = None
    sentencesColList = [
        'url', 'sentence', 'sentence_no', 'neutral_prob', 'positive_prob',
        'negative_prob'
    ]
    sentencesRec = None

    def __init__(self):
        """ Initialize the object
        """
        super().__init__()

    def additionalConfig(self, sessionHistoryObj):
        """ Perform additional configuration that is specific to this plugin.

        :param sessionHistoryObj: The session history object to be used by this plugin
         for putting items into the data processing competed queue.
        :return:
        """
        self.workDir = self.app_config.data_dir
        self.sessionHistDB = sessionHistoryObj
        self.pretuned_modelfile = self.app_config.checkAndSanitizeConfigString(
            'plugins', 'mod_eventclass_modelfile')
        self.model_weights_path = self.app_config.checkAndSanitizeConfigString(
            'plugins', 'mod_eventclass_weightspath')
        self.vocab_path = self.app_config.checkAndSanitizeConfigString(
            'plugins', 'mod_eventclass_vocab_path')
        self.labels = {0: 'neutral', 1: 'positive', 2: 'negative'}
        # TODO: fix model load error:
        self.setupModel()
        self.sentencesRec = pd.DataFrame(np.zeros(
            (1, len(self.sentencesColList)), dtype=np.unicode_),
                                         columns=self.sentencesColList)
        # convert last 4 into float32 dtype
        for colname in [
                "sentence_no", "neutral_prob", "positive_prob", "negative_prob"
        ]:
            self.sentencesRec[colname] = pd.to_numeric(
                self.sentencesRec[colname])

    def setupModel(self):
        """ Load the classification model.
        """
        num_labels = len(self.labels)
        vocab_type = "finance-uncased"
        self.max_seq_length = 256
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        self.model = BertClassification(weight_path=self.model_weights_path,
                                        num_labels=num_labels,
                                        vocab=vocab_type)
        self.model.load_state_dict(
            torch.load(self.pretuned_modelfile, map_location=self.device))
        self.model.to(self.device)
        self.tokenizer = BertTokenizer(vocab_file=self.vocab_path,
                                       do_lower_case=True,
                                       do_basic_tokenize=True)

    def processDataObj(self, newsEventObj):
        """ Process given data object by this plugin.

        :param newsEventObj: The NewsEvent object to be classified.
        :type newsEventObj: NewsEvent
        """
        assert type(newsEventObj) == NewsEvent
        # Do not proceed if the articles has already been classified, i.e. contains scores
        if newsEventObj.getClassification() is None:
            # TODO: lock file to avoid conflicting writes, release lock at the end of the method
            logger.debug(
                f"Started news event classification for data in: {newsEventObj.getFileName()}"
            )
            classificationObj = self.classifyText(newsEventObj.getText(),
                                                  newsEventObj.getURL())
            # put classification field in NewsEvent document:
            newsEventObj.setClassification(classificationObj)
            # prepare filename:
            fileNameWOExt = newsEventObj.getFileName().replace('.json', '')
            # save document to file:
            newsEventObj.writeFiles(fileNameWOExt, '', saveHTMLFile=False)
            logger.info(
                f"Completed classifying news event in: {fileNameWOExt} as: {classificationObj}"
            )

    def classifyText(self, textValue, url):
        """
        Examine and classify the text from the document and return classification scores text.

        :param textValue: Text to be examined and classified.
        :type textValue: str
        :return: Classification scores
        :rtype: dict{str:float}
        """
        sentenceDF = None
        classificationScores = {
            'positive': 0.0,
            'neutral': 0.0,
            'negative': 0.0
        }
        try:
            logger.debug(
                f'Classifying using finbert model for text of length {len(textValue)}'
            )
            if len(textValue) > self.minArticleLengthInChars:
                thisRec = self.sentencesRec.copy(deep=True)
                thisRec['url'] = url
                sentences = sent_tokenize(textValue.lower())
                self.model.eval()
                for index, sent in enumerate(sentences):
                    thisRec['sentence'] = sent
                    thisRec['sentence_no'] = index
                    # apply model on the sentence to get classification scores
                    [neutralProb, positiveProb,
                     negativeProb] = self.classifySentences(sent)
                    thisRec['neutral_prob'] = neutralProb
                    thisRec['positive_prob'] = positiveProb
                    thisRec['negative_prob'] = negativeProb
                    if sentenceDF is None:
                        sentenceDF = thisRec
                    else:
                        sentenceDF = sentenceDF.append(thisRec)
                aggscores = sentenceDF.groupby('url').agg({
                    'neutral_prob':
                    'sum',
                    'positive_prob':
                    'sum',
                    'negative_prob':
                    'sum'
                })
                classificationScores = {
                    'positive': aggscores['positive_prob'][0],
                    'neutral': aggscores['neutral_prob'][0],
                    'negative': aggscores['negative_prob'][0]
                }
        except Exception as e:
            print("Error getting sentence classification:", e)
        return (classificationScores)

    def classifySentences(self, sent):
        """ Classify one text sentence at a time.
        """
        tokenized_sent = self.tokenizer.tokenize(sent)
        if len(tokenized_sent) > self.max_seq_length:
            tokenized_sent = tokenized_sent[:self.max_seq_length]
        ids_review = self.tokenizer.convert_tokens_to_ids(tokenized_sent)
        mask_input = [1] * len(ids_review)
        padding = [0] * (self.max_seq_length - len(ids_review))
        ids_review += padding
        mask_input += padding
        input_type = [0] * self.max_seq_length
        input_ids = torch.tensor(ids_review).to(self.device).reshape(-1, 256)
        attention_mask = torch.tensor(mask_input).to(self.device).reshape(
            -1, 256)
        token_type_ids = torch.tensor(input_type).to(self.device).reshape(
            -1, 256)
        with torch.set_grad_enabled(False):
            outputs = self.model(input_ids, token_type_ids, attention_mask)
            outputs = F.softmax(outputs, dim=1)
            # print('\n FinBERT predicted sentiment: ', labels[torch.argmax(outputs).item()])
            return ([i.item() for i in outputs.data[0]])
Example 19
    def __init__(self,
                 path: str,
                 fields: List[Tuple[str, tt.data.Field]],
                 tokenizer: BertTokenizer,
                 max_length: int = 512,
                 include_features=False,
                 **kwargs):
        max_length = max_length - 3  # Count without special tokens

        with open(path) as dataf:
            data_json = json.load(dataf)
            examples = []
            # BERT inputs normally allow at most 2 segments; here we build
            # [CLS] source post [SEP] previous post [SEP] target post [SEP]
            # and mark the three parts with segment ids 0 / 2 / 1

            for example in data_json["Examples"]:
                make_ids = lambda x: tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize(x))
                text = make_ids(example["spacy_processed_text"])
                prev = make_ids(example["spacy_processed_text_prev"])
                src = make_ids(example["spacy_processed_text_src"])
                segment_A = src
                segment_C = prev
                segment_B = text
                text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \
                           [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]]

                # truncate if exceeds max length
                if len(text_ids) > max_length:
                    # Truncate segment C (the previous post) first
                    segment_C = segment_C[:max_length // 2]
                    text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \
                               [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]]
                    if len(text_ids) > max_length:
                        # Truncate segment A
                        segment_A = segment_A[:max_length // 2]
                        text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \
                                   [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]]
                        if len(text_ids) > max_length:
                            # Truncate also segment B
                            segment_B = segment_B[:max_length // 2]
                            text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \
                                       [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]]

                segment_ids = [0] * (len(segment_A) + 2) + [2] * (
                    len(segment_C) + 1) + [1] * (len(segment_B) + 1)
                # example_list = list(example.values())[:-3] + [text_ids, segment_ids]
                if include_features:
                    example_list = list(
                        example.values()) + [text_ids, segment_ids]
                else:
                    example_list = [
                        example["id"], example["branch_id"],
                        example["tweet_id"], example["stance_label"],
                        example["veracity_label"], "\n-----------\n".join([
                            example["raw_text_src"], example["raw_text_prev"],
                            example["raw_text"]
                        ]), example["issource"]
                    ] + [text_ids, segment_ids]

                examples.append(Example.fromlist(example_list, fields))
            super(RumourEval2019Dataset_BERTTriplets_3Segments,
                  self).__init__(examples, fields, **kwargs)
Example 20
def bert_tokenize_with_spacy_meta(
        spacy_model: SpacyLanguage,
        bert_tokenizer: BertTokenizer,
        unique_id: int,
        words: Sequence[str],
        sentence_ids: Sequence[int],
        data_key: Optional[Union[str, Sequence[str]]],
        data_ids: Optional[Sequence[int]],
        start: int = 0,
        stop: Optional[int] = None,
        start_sequence_2: Optional[int] = None,
        stop_sequence_2: Optional[int] = None,
        start_sequence_3: Optional[int] = None,
        stop_sequence_3: Optional[int] = None,
        multipart_id: Optional[int] = None,
        span_ids: Optional[Sequence[int]] = None,
        is_apply_data_offset_entire_group: bool = False) -> InputFeatures:
    """
    Uses spacy to get information such as part of speech, probability of word, etc. and aligns the tokenization from
    spacy with the bert tokenization.
    Args:
        spacy_model: The spacy model to use for spacy tokenization, part of speech analysis, etc. Generally from
            make_tokenizer_model()
        bert_tokenizer: The bert tokenizer to use. Usually from corpus_loader.make_bert_tokenizer()
        unique_id: The unique id for this example
        words: The words in this example. Generally a sentence, but it doesn't have to be.
        sentence_ids: For each word, identifies which sentence the word belongs to. Used to compute
            index_word_in_sentence
        data_key: A key (or multiple keys) to designate which response data set(s) data_ids references
        data_ids: Sequence[Int]. Describes an indices into a separate data array for each word. For example, if the
            first word in words corresponds to fMRI image 17 in a separate data array, and the second word corresponds
            to image 19, then this parameter could start with [17, 19, ...].
        start: Offset where the actual input features should start. It is best to compute spacy meta on full sentences,
            then slice the resulting tokens. start and end are used to slice words, sentence_ids, data_key and data_ids
        stop: Exclusive end point for the actual input features. If None, the full length is used
        start_sequence_2: Used for bert to combine 2 sequences as a single input. Generally this is used for tasks
            like question answering where type_id=0 is the question and type_id=1 is the answer. If None, assumes
            the entire input is sequence 1.
        stop_sequence_2: Used for bert to combine 2 sequences as a single input. Generally this is used for tasks
            like question answering where type_id=0 is the question and type_id=1 is the answer. If None, assumes
            the entire input is sequence 1.
        start_sequence_3: Used for bert to combine 3 sequences as a single input. Generally this is used for tasks
            like question answering with a context. type_id=0 is the context and type_id=1 is the question and
            answer
        stop_sequence_3: Used for bert to combine 3 sequences as a single input. Generally this is used for tasks
            like question answering with a context. type_id=0 is the context and type_id=1 is the question and answer
        multipart_id: Used to express that this example needs to be in the same batch as other examples sharing the
            same multipart_id to be evaluated
        span_ids: Bit-encoded span identifiers which indicate which spans each word belongs to when spans are labeled
            in the input. If not given, no span ids will be set on the returned InputFeatures instance.
        is_apply_data_offset_entire_group: If a word is broken into multiple tokens, generally a single token is
            heuristically chosen as the 'main' token corresponding to that word. The data_id it is assigned is given
            by data offset, while all the tokens that are not the main token in the group are assigned -1. If this
            parameter is set to True, then all of the multiple tokens corresponding to a word are assigned the same
            data_id, and none are set to -1. This can be a better option for fMRI where the predictions are not at
            the word level, but rather at the level of an image containing multiple words.
    Returns:
        An InputFeatures instance
    """

    sent = ''
    cum_lengths = list()

    bert_token_groups = list()
    for w in words:

        if len(sent) > 0:
            sent += ' '
        sent += str(w)
        cum_lengths.append(len(sent))
        bert_token_groups.append(bert_tokenizer.tokenize(w))

    spacy_token_groups = group_by_cum_lengths(cum_lengths, spacy_model(sent))

    # the BERT tokenization (bert_erp_tokenization) does not seem to care whether we do word-by-word
    # or not; it is simple whitespace splitting etc., then sub-word tokens are created from that

    example_tokens = list()
    example_mask = list()
    example_is_stop = list()
    example_is_begin_word_pieces = list()
    example_lengths = list()
    example_probs = list()
    example_head_location = list()
    example_token_head = list()
    example_type_ids = list()
    example_data_ids = list()
    example_span_ids = list() if span_ids is not None else None
    example_index_word_in_example = list()
    example_index_token_in_sentence = list()

    def _append_special_token(special_token, index_word_in_example_,
                              index_token_in_sentence_, type_id_):
        example_tokens.append(special_token)
        example_mask.append(1)
        example_is_stop.append(1)
        example_is_begin_word_pieces.append(1)
        example_lengths.append(0)
        example_probs.append(-20.)
        example_head_location.append(np.nan)
        example_token_head.append('[PAD]')
        example_type_ids.append(type_id_)
        example_data_ids.append(-1)
        if span_ids is not None:
            example_span_ids.append(0)
        example_index_word_in_example.append(index_word_in_example_)
        example_index_token_in_sentence.append(index_token_in_sentence_)

    type_id = 0

    _append_special_token('[CLS]',
                          index_word_in_example_=0,
                          index_token_in_sentence_=0,
                          type_id_=type_id)

    index_token_in_sentence = 0
    index_word_in_example = 0
    last_sentence_id = None

    bert_token_groups_with_spacy = list()
    for spacy_token_group, bert_token_group, word in zip(
            spacy_token_groups, bert_token_groups, words):
        bert_token_groups_with_spacy.append(
            align_spacy_meta(spacy_token_group, bert_token_group, word,
                             bert_tokenizer))

    if start < 0:
        start = len(words) + start
    if stop is None:
        stop = len(words)
    elif stop < 0:
        stop = len(words) + stop

    sequences = [(start, stop)]

    if start_sequence_2 is not None and start_sequence_2 < 0:
        start_sequence_2 = len(words) + start_sequence_2
    if stop_sequence_2 is not None and stop_sequence_2 < 0:
        stop_sequence_2 = len(words) + stop_sequence_2

    if start_sequence_2 is not None:
        if start_sequence_2 < stop:
            raise ValueError('start_sequence_2 ({}) < stop ({})'.format(
                start_sequence_2, stop))
        if stop_sequence_2 is None:
            stop_sequence_2 = len(words)
        sequences.append((start_sequence_2, stop_sequence_2))

    if start_sequence_3 is not None and start_sequence_3 < 0:
        start_sequence_3 = len(words) + start_sequence_3
    if stop_sequence_3 is not None and stop_sequence_3 < 0:
        stop_sequence_3 = len(words) + stop_sequence_3

    if start_sequence_3 is not None:
        if stop_sequence_2 is None or start_sequence_3 < stop_sequence_2:
            raise ValueError(
                'start_sequence_3 ({}) < stop_sequence_2 ({})'.format(
                    start_sequence_3, stop_sequence_2))
        if stop_sequence_3 is None:
            stop_sequence_3 = len(words)
        sequences.append((start_sequence_3, stop_sequence_3))

    idx_sequence = 0
    for idx_group, bert_tokens_with_spacy in enumerate(
            bert_token_groups_with_spacy):
        if last_sentence_id is None or sentence_ids[
                idx_group] != last_sentence_id:
            index_token_in_sentence = -1
        last_sentence_id = sentence_ids[idx_group]
        if idx_group >= sequences[idx_sequence][1]:
            if idx_sequence + 1 < len(sequences):
                idx_sequence += 1
            else:
                break
        if idx_group < sequences[idx_sequence][0]:
            continue
        assert (sequences[idx_sequence][0] <= idx_group <
                sequences[idx_sequence][1])
        index_word_in_example += 1
        idx_data = get_data_token_index(bert_tokens_with_spacy)
        for idx_token, (t, length,
                        spacy_token) in enumerate(bert_tokens_with_spacy):
            index_token_in_sentence += 1
            idx_head_group = _get_syntactic_head_group(
                spacy_token, bert_token_groups_with_spacy)
            head_token = '[PAD]'
            head_location = np.nan
            if idx_head_group is not None:
                idx_head_data_token = get_data_token_index(
                    bert_token_groups_with_spacy[idx_head_group])
                head_token = bert_token_groups_with_spacy[idx_head_group][
                    idx_head_data_token][0]
                head_location = idx_head_group - idx_group
            example_tokens.append(t)
            example_mask.append(1)
            example_is_stop.append(1 if _is_stop(spacy_token) else 0)
            example_lengths.append(length)
            example_probs.append(
                -20. if spacy_token is None else spacy_token.prob)
            example_head_location.append(head_location)
            example_token_head.append(head_token)
            is_continue_word_piece = t.startswith('##')
            example_is_begin_word_pieces.append(
                0 if is_continue_word_piece else 1)
            example_type_ids.append(type_id)
            if span_ids is not None:
                example_span_ids.append(span_ids[idx_group])
            example_index_word_in_example.append(index_word_in_example)
            example_index_token_in_sentence.append(index_token_in_sentence)
            # we follow the BERT paper and always use the first word-piece as the labeled one
            data_id = -1
            if data_ids is not None and (idx_token == idx_data or is_apply_data_offset_entire_group):
                data_id = data_ids[idx_group]
            example_data_ids.append(data_id)
        if idx_group == sequences[idx_sequence][1]:
            _append_special_token('[SEP]', index_word_in_example + 1,
                                  index_token_in_sentence + 1, type_id)
            index_word_in_example += 1
            type_id = 1

    if data_key is None:
        data_key = dict()
    if isinstance(data_key, str):
        data_key = [data_key]

    def _readonly(arr):
        arr.setflags(write=False)
        return arr

    example_data_ids = _readonly(np.array(example_data_ids))

    return InputFeatures(
        unique_id=unique_id,
        tokens=tuple(example_tokens),
        token_ids=_readonly(
            np.asarray(bert_tokenizer.convert_tokens_to_ids(example_tokens))),
        mask=_readonly(np.array(example_mask)),
        is_stop=_readonly(np.array(example_is_stop)),
        is_begin_word_pieces=_readonly(np.array(example_is_begin_word_pieces)),
        token_lengths=_readonly(np.array(example_lengths)),
        token_probabilities=_readonly(np.array(example_probs)),
        type_ids=_readonly(np.array(example_type_ids)),
        head_location=_readonly(np.array(example_head_location)),
        head_tokens=tuple(example_token_head),
        head_token_ids=_readonly(
            np.array(
                bert_tokenizer.convert_tokens_to_ids(example_token_head))),
        index_word_in_example=_readonly(
            np.array(example_index_word_in_example)),
        index_token_in_sentence=_readonly(
            np.array(example_index_token_in_sentence)),
        multipart_id=multipart_id,
        span_ids=_readonly(np.array(example_span_ids))
        if example_span_ids is not None else None,
        data_ids=dict((k, example_data_ids) for k in data_key))
Esempio n. 21
0
    vocab_file='../input/torch-bert-weights/bert-base-uncased-vocab.txt')

# ## Make prediction

# In[ ]:

# let's tokenize some text (I intentionally misspelled 'plastic' to check BERT's subword handling)
text = 'hi my name is Dieter and I like wearing my yellow pglastic hat while coding.'
tokens = tokenizer.tokenize(text)
tokens

# In[ ]:

# add the start and end tokens and convert to ids
tokens = ["[CLS]"] + tokens + ["[SEP]"]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids

# In[ ]:

# put input on gpu and make prediction
bert_output = bert(torch.tensor([input_ids]).cuda())
bert_output

# ## (Optional) Convert model to fp16

# In[ ]:

import apex
bert.half()
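
# In[ ]:

# sanity check (added sketch, not part of the original notebook): after .half() the hidden
# states should come back as float16; the indexing is hedged so it works whether the model
# returns a list of encoder layers or a single hidden-state tensor first
with torch.no_grad():
    fp16_output = bert(torch.tensor([input_ids]).cuda())
print(fp16_output[0][-1].dtype)  # expected: torch.float16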
Esempio n. 22
0
def home():

    #   global model, BERT_FP, bert, tokenizer, nlp
    model = torch.load('model_sciBERT_CRF10.pth')
    BERT_FP = 'scibert_scivocab_uncased'
    bert = BertModel.from_pretrained(BERT_FP)
    tokenizer = BertTokenizer(vocab_file=BERT_FP + '/vocab.txt')
    nlp = en_core_web_sm.load()
    datatowrite = []
    result = ''
    if (request.method == 'POST'):
        token_indices = []
        file_raw = request.form.get('abstract')
        actual_file = open('abstract_str/abstract.txt', 'w')
        actual_file.write(file_raw)
        actual_file.close()
        file = file_raw.lower()
        tokens_list = tokenizer.tokenize(file)
        n = 0
        for i, item in enumerate(tokens_list):
            try:
                start_index = file.index(item.strip('#'))
            except ValueError:  # substring not found
                start_index = 100
            if ((start_index < 5 or unk == 1) and item != '[UNK]'):
                token_indices.append(
                    (start_index + n, n + start_index + len(item.strip('#'))))

                n = token_indices[-1][-1]
                file = file[start_index + len(item.strip('#')):]
            else:
                token_indices.append((-1, -1))

                if (item != '[UNK]'):
                    n += len(item.strip('#'))
                    file = file[len(item.strip('#')):]

        with torch.no_grad():
            inputs = tokenizer.convert_tokens_to_ids(tokens_list)
            inputs = bert(torch.tensor([inputs]))[0]
            for j in range(len(inputs)):
                inputs[j] = inputs[j].numpy()
            inputs = torch.tensor(np.array(inputs))
            prediction = model(inputs.permute(1, 2, 0, 3).squeeze(0))
            output = prediction[0]

        dic = {}
        dataarr = file_raw
        tagsarr = output
        indicesarr = token_indices

        indicesdata = []
        datatowrite = []
        for j in range(len(tagsarr)):
            if (tagsarr[j] == 0 or tagsarr[j] == 4):
                indicesdata.append(list(indicesarr[j]))
            if (tagsarr[j] == 1 or tagsarr[j] == 2):
                indicesdata[-1][1] = indicesarr[j][1]

        indicestowrite = indicesdata

        ind_temp = []
        data_temp = []
        for j in indicestowrite:
            ind_temp.append(j)
            data_temp.append(dataarr[j[0]:j[1]])

        indicestowrite = []
        datatowrite = []
        for j in range(len(ind_temp)):
            temp = nlp(data_temp[j])
            count = 0
            for k in temp:
                count += 1

            if (count == 1):
                ind = [
                    [k.start() + 1,
                     k.start() + 1 + len(data_temp[j])] for k in re.finditer(
                         '[^a-z]' + re.escape(data_temp[j].lower()) +
                         '[^a-z]', dataarr.lower())
                    if [k.start() +
                        1, k.start() + 1 + len(data_temp[j])] not in ind_temp
                    and [k.start() +
                         1, k.start() + 1 +
                         len(data_temp[j])] not in indicestowrite
                ]
                temp_ind = []
                dat = []
                for l in ind:
                    if (dataarr[l[0]:l[1]].lower() != dataarr[l[0]:l[1]]):
                        dat.append(dataarr[l[0]:l[1]])
                        temp_ind.append(l)
                indicestowrite += temp_ind
                datatowrite += dat

        ind_temp = ind_temp + indicestowrite
        data_temp = data_temp + datatowrite
        indicestowrite = []
        datatowrite = []

        for j in range(len(data_temp)):
            temp_2 = nlp(data_temp[j])
            temp = []
            for word in temp_2:
                temp.append((len(word.text), word.text))

            if (len(temp) == 1):
                if (str(temp[0][1]).lower() != str(temp[0][1])
                        or re.match('^[a-z]+$', temp[0][1]) == None
                        or len(temp[0][1]) > 3):
                    indicestowrite.append(ind_temp[j])
                    datatowrite.append(data_temp[j])
            else:
                indicestowrite.append(ind_temp[j])
                datatowrite.append(data_temp[j])
        indicestowrite = sorted(indicestowrite, key=lambda x: x[0])
        if (len(indicestowrite) == 0):
            return render_template("index.html", keyphrases=file_raw)
        print(indicestowrite)
        annotation_file = open('abstract_str/abstract.ann', 'w')
        for qwe in range(len(indicestowrite)):
            annotation_file.write(
                'T' + str(qwe + 1) + '\t' + 'Process ' +
                str(indicestowrite[qwe][0]) + ' ' +
                str(indicestowrite[qwe][1]) + '\t' +
                file_raw[indicestowrite[qwe][0]:indicestowrite[qwe][1]] + '\n')
        annotation_file.close()
        X_test, y_test_gold, _, test_entities = read_and_map(
            'abstract_str', mapper)
        loaded_model = pickle.load(open('finalized_model_joined.sav', 'rb'))
        predictions = loaded_model.predict(X_test)
        y_values = ['Process', 'Material', 'Task']
        document_abbr = {}
        asd = os.listdir('abstract_str')
        for i in range(len(asd)):
            document_abbr[asd[i][:-4]] = {}

        for i in range(len(predictions)):
            if (test_entities[i].string == test_entities[i].string.upper()
                    and len(test_entities[i].string) > 1):
                if (y_values[predictions[i]] == "Material"):
                    predictions[i] = y_values.index("Process")

            if (test_entities[i].string
                    == test_entities[i].string.capitalize()
                    and len(test_entities[i].string) == 2):
                predictions[i] = y_values.index("Material")

            tmp = test_entities[i].string.split(" ")
            if (len(tmp) == 1):
                if (test_entities[i].string == test_entities[i].string.upper()
                        and hasNumbers(test_entities[i].string)):
                    predictions[i] = y_values.index("Material")

            if (test_entities[i].string == test_entities[i].string.upper()):
                try:
                    predictions[i] = document_abbr[test_entities[i].docid][
                        test_entities[i].string]
                except:
                    obracket = test_entities[i].start - 1
                    cbracket = test_entities[i].end
                    file = open(
                        'abstract_str/' + test_entities[i].docid + '.txt',
                        'r').read()
                    if (file[obracket] == '(' and file[cbracket] == ')'):
                        if (test_entities[i].start -
                                test_entities[i - 1].end == 2):
                            # print(test_entities[i].string, '\t',test_entities[i-1].string ,'\t' ,test_entities[i].start, '\t',test_entities[i-1].end )
                            document_abbr[test_entities[i].docid][
                                test_entities[i].string] = predictions[i - 1]
                            predictions[i] = predictions[i - 1]

            for j in range(len(tmp)):
                if (len(tmp[j]) == 1 and tmp[j] == tmp[j].upper()):
                    predictions[i] = y_values.index("Material")

        # print(predictions)

        n = 0
        result = []
        last_closing = 0
        for i in range(len(indicestowrite)):
            qwe_temp = file_raw[n:indicestowrite[i][0]]
            if (qwe_temp != ''):
                result.append(qwe_temp)
            temp = ''
            if (predictions[i] == 0):
                temp = '<span style="background-color:rgba(152, 252, 3, 0.5);"><strong>' + file_raw[
                    indicestowrite[i][0]:indicestowrite[i]
                    [1]] + '</strong></span>'
            elif (predictions[i] == 1):
                temp = '<span style="background-color:rgba(252, 152, 3, 0.5);"><strong>' + file_raw[
                    indicestowrite[i][0]:indicestowrite[i]
                    [1]] + '</strong></span>'
            elif (predictions[i] == 2):
                temp = '<span style="background-color:rgba(3, 152, 252, 0.5);"><strong>' + file_raw[
                    indicestowrite[i][0]:indicestowrite[i]
                    [1]] + '</strong></span>'

            if (indicestowrite[i][1] > last_closing):
                result.append(temp)
                last_closing = indicestowrite[i][1]
                n = indicestowrite[i][1]
            # else:
            #     ov_string = file_raw[indicestowrite[i][0]:indicestowrite[i][1]]
            #     temp_start = result[-1].index(ov_string)
            #     result[-1] = result[-1][:temp_start] + temp + result[-1][ temp_start+indicestowrite[i][1] - indicestowrite[i][0]:]

            # result += '<span style="background-color:rgba(152, 252, 3, 0.5);"><strong>' +  file_raw[i[0]:i[1]] + '</strong></span>'

        result.append(file_raw[n:])
        # print(result)
        result = "".join(result)
    return render_template("index.html", keyphrases=result)
Esempio n. 23
0
class BertWithJumanModel():
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        # Load Juman so that Japanese text can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the pretrained BERT model
        self.model = BertModel.from_pretrained(bert_path)
        # Load the tokenizer for the pretrained BERT model
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        # Flag for whether to use a CUDA GPU
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        # Preprocessing: strip half-width spaces from the text
        try:
            return text.replace(" ", "")  # for Juman
        except AttributeError:  # non-string input (e.g. NaN)
            return ''

    def get_sentence_embedding(self,
                               text,
                               pooling_layer=-2,
                               pooling_strategy="REDUCE_MEAN"):
        # Strip half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # Segment the Japanese text with Juman and turn it into a list of tokens
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with spaces and run them through the BERT tokenizer
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # The sequence length is capped at 128, so build [CLS] + up to 126 tokens + [SEP]
        # and convert the tokens to ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        tokens_tensor = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            # Move the tensors and the model to the GPU
            tokens_tensor = tokens_tensor.to('cuda')
            self.model.to('cuda')

        # Switch the model to evaluation mode
        self.model.eval()
        with torch.no_grad():
            # Do not track gradients (saves memory and speeds things up)
            # Compute vector representations from the id sequence
            all_encoder_layers, _ = self.model(tokens_tensor)

            # Average-pools the vectors along the time axis, the same way SWEM does
            # Since the dimensionality would vary with sentence length, pool along the growing axis to fix it
            # https://yag-ays.github.io/project/swem/
            embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0]
            if pooling_strategy == "REDUCE_MEAN":
                return np.mean(embedding, axis=0)
            elif pooling_strategy == "REDUCE_MAX":
                return np.max(embedding, axis=0)
            elif pooling_strategy == "REDUCE_MEAN_MAX":
                return np.r_[np.max(embedding, axis=0),
                             np.mean(embedding, axis=0)]
            elif pooling_strategy == "CLS_TOKEN":
                return embedding[0]
            else:
                raise ValueError(
                    "specify valid pooling_strategy: {REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}"
                )
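
A minimal usage sketch for the class above (added; the model directory "./bert-jpn" and the input sentence are hypothetical):

model = BertWithJumanModel("./bert-jpn", use_cuda=False)
vec = model.get_sentence_embedding("今日はいい天気ですね。", pooling_strategy="REDUCE_MEAN")
print(vec.shape)  # e.g. (768,) for a base-size BERT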
Esempio n. 24
0
tokenizer = BertTokenizer(vocab_file='biobert_v1.0_pubmed_pmc/vocab.txt',
                          do_lower_case=False)
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, tags)
]

tokenized_texts = [
    token_label_pair[0] for token_label_pair in tokenized_texts_and_labels
]
labels = [
    token_label_pair[1] for token_label_pair in tokenized_texts_and_labels
]

input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
    maxlen=MAX_LEN,
    dtype="long",
    value=0.0,
    truncating="post",
    padding="post")

tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN,
                     value=tag2idx["PAD"],
                     padding="post",
                     dtype="long",
                     truncating="post")
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]
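# illustration (added; ids are made up): a padded row such as [31, 42, 7, 0, 0] produces the
# attention mask [1.0, 1.0, 1.0, 0.0, 0.0], so the padded positions are ignored by the model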
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids,
                                                            tags,
Esempio n. 25
0
class BertWithJumanModel:
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        self.juman_tokenizer = JumanTokenizer()
        self.model = BertModel.from_pretrained(bert_path)
        self.bert_tokenizer = BertTokenizer(
            Path(bert_path) / vocab_file_name,
            do_lower_case=False,
            do_basic_tokenize=False,
        )
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        return text.replace(" ", "")  # for Juman

    def get_sentence_embedding(self,
                               text,
                               pooling_layer=-2,
                               pooling_strategy="REDUCE_MEAN"):
        preprocessed_text = self._preprocess_text(text)
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        tokens_tensor = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            tokens_tensor = tokens_tensor.to("cuda")
            self.model.to("cuda")

        self.model.eval()
        with torch.no_grad():
            all_encoder_layers, _ = self.model(tokens_tensor)

        embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0]
        if pooling_strategy == "REDUCE_MEAN":
            return np.mean(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MAX":
            return np.max(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MEAN_MAX":
            return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)]
        elif pooling_strategy == "CLS_TOKEN":
            return embedding[0]
        else:
            raise ValueError(
                "specify valid pooling_strategy: {REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}"
            )

    # edited
    def sentence_list_to_vec(self, sentence_list: list):
        """
        Pass in a list of individual sentences.
        """
        vec_list = []
        vec_mean = 0
        if not sentence_list:
            return ([None], np.zeros(768))
        try:
            for s in sentence_list:
                tmp = self.get_sentence_embedding(s)
                vec_list.append(tmp)
                vec_mean += tmp
            vec_mean = vec_mean / len(vec_list)
            return vec_list, vec_mean

        except ValueError:
            return ([None], np.zeros(768))

    def sentence_list_to_vec_with_bug(self, sentence_list: list):
        """
        Pass in a list of individual sentences.
        """
        vec_list = []
        vec_mean = 0
        for s in sentence_list:
            tmp = self.get_sentence_embedding(s)
            vec_list.append(tmp)
            vec_mean += tmp
        vec_mean = vec_mean / len(vec_mean)

        return vec_list, vec_mean
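
A minimal usage sketch for the list variant above (added; the model directory and the sentences are hypothetical):

model = BertWithJumanModel("./bert-jpn")
vec_list, vec_mean = model.sentence_list_to_vec(["今日は晴れです。", "明日は雨らしい。"])
print(len(vec_list), vec_mean.shape)  # e.g. 2 (768,)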
Esempio n. 26
0
class Generater:
    def __init__(self, bert_path):
        vocab_file_name = 'vocab.txt'
        # Load Juman so that Japanese text can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the pretrained BERT model
        self.model = BertModel.from_pretrained(bert_path)
        # Load the tokenizer for the pretrained BERT model
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False, do_basic_tokenize=False)
        self.vocab_size = len(self.bert_tokenizer.vocab)

        # Load the MaskedLM head of the pretrained BERT model
        self.model = BertForMaskedLM.from_pretrained(bert_path)

        # Header/special tokens and punctuation to exclude
        except_tokens = [
            "[MASK]",
            # "[PAD]",
            "[UNK]", "[CLS]", "[SEP]",
            "(", ")", "・", "/", "、", "。", "!", "?", "「", "」", "…", "’", "』", "『", ":", "※",
        ]
        self.except_ids = [self.bert_tokenizer.vocab[token] for token in except_tokens]

        # Use every id in the vocabulary that is not in except_ids
        self.candidate_ids = [i for i in range(self.vocab_size)
                              if i not in self.except_ids]


    def _preprocess_text(self, text):
        # Preprocessing: strip half-width spaces from the text
        return text.replace(" ", "").replace('#', '')  # for Juman

    def text2tokens(self, text):
        # Strip half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # Segment the Japanese text with Juman and turn it into a list of tokens
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with spaces and run them through the BERT tokenizer
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # The sequence length is capped at 128, so build [CLS] + up to 126 tokens + [SEP]
        # and convert the tokens to ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(["[CLS]"] + bert_tokens[:126] + ["[SEP]"]) # max_seq_len-2
        generated_token_ids = torch.tensor(ids).reshape(1, -1)
        return generated_token_ids

    def tokens2text(self, tokens):
        sampled_sequence = [self.bert_tokenizer.ids_to_tokens[token_id]
                                        for token_id in tokens[0].cpu().numpy()]
        sampled_sequence = "".join(
            [
                token[2:] if token.startswith("##") else token
                for token in list(filter(lambda x: x != '[PAD]' and x != '[CLS]' and x != '[SEP]', sampled_sequence))
            ]
        )
        return sampled_sequence


    def likelihood(self, tokens):
        outputs = self.model(tokens)
        predictions = outputs[0]

        score_sum = 0.0
        for idx, scores in zip(tokens[0].tolist(), predictions[0].tolist()):
            score_sum += scores[idx]
        return score_sum

    def initialization_text(self, length=10):
        init_tokens = []
        # Header ([CLS])
        init_tokens.append(self.bert_tokenizer.vocab["[CLS]"])
        for _ in range(length):
            # Pick a token id at random
            init_tokens.append(random.choice(self.candidate_ids))
        # Footer ([SEP])
        init_tokens.append(self.bert_tokenizer.vocab["[SEP]"])

        return torch.tensor(init_tokens).reshape(1, -1)

    def scoring(self, tokens):
        text = self.tokens2text(tokens)
        return (self.likelihood(tokens)
                + self.juman_tokenizer.tanka_score_subsets(text)
                + self.juman_tokenizer.tanka_score_flow(text))

    def select(self, l_tokens, size=5):
        scores = list(map(self.scoring, l_tokens))
        print(sorted(scores, reverse=True)[:3])
        selected = list(map(
            lambda x: x[0],
            sorted(
                list(zip(l_tokens, scores)), 
                key=lambda x: x[1],
                reverse=True
            )
        ))

        return selected

    def crossover(self, tokens_0, tokens_1):
        l_tokens_0 = tokens_0.numpy().reshape(-1).tolist()
        l_tokens_1 = tokens_1.numpy().reshape(-1).tolist()

        start = random.randint(1, len(l_tokens_0) - 3)
        end = random.randint(start, len(l_tokens_0) - 2)

        for num in range(start, end):
            l_tokens_0[num] = l_tokens_1[num]

        return torch.tensor(l_tokens_0).reshape(1, -1)

    def mutation(self, tokens, N=3):
        l_tokens = tokens.numpy().reshape(-1).tolist()

        for num in range(N):
            num = random.randint(1, len(l_tokens) - 2)
            l_tokens[num] = self.bert_tokenizer.vocab["[MASK]"]
            
            outputs = self.model(torch.tensor(l_tokens).reshape(1, -1))
            predictions = outputs[0]
            _, predicted_indexes = torch.topk(predictions[0, num], k=10)

            # random_tokens = [random.choice(self.candidate_ids) for i in range(1)]
            random_tokens = []

            predicted_indexes = list(
                set(predicted_indexes.tolist() + random_tokens) - set(self.except_ids)
            )

            predicted_tokens = self.bert_tokenizer.convert_ids_to_tokens(predicted_indexes)
            predict_token = random.choice(predicted_indexes)

            l_tokens[num] = predict_token

        return torch.tensor(l_tokens).reshape(1, -1)
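
A rough sketch (added) of how the methods above could be combined into a simple genetic-algorithm loop; the model directory "./bert-jpn", the population size, and the number of generations are hypothetical:

import random

gen = Generater("./bert-jpn")
population = [gen.initialization_text(length=10) for _ in range(20)]
for step in range(30):
    parents = gen.select(population)[:5]  # keep the highest-scoring candidates
    children = [gen.mutation(gen.crossover(random.choice(parents), random.choice(parents)))
                for _ in range(15)]
    population = parents + children
print(gen.tokens2text(gen.select(population)[0]))  # best candidate found so far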
Esempio n. 27
0
class BertWithJumanModel():
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        # Load Juman so that Japanese text can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the MaskedLM head of the pretrained BERT model
        self.model = BertForMaskedLM.from_pretrained(bert_path)
        # Load the tokenizer for the pretrained BERT model
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        # Flag for whether to use a CUDA GPU
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        # Preprocessing: strip half-width spaces from the text
        return text.replace(" ", "")  # for Juman

    def paraphrase(self, text):
        # Strip half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # Segment the Japanese text with Juman and turn it into a list of tokens
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with spaces and run them through the BERT tokenizer
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # The sequence length is capped at 128, so build [CLS] + up to 126 tokens + [SEP]
        # and convert the tokens to ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        generated_token_ids = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            # Move the tensors and the model to the GPU
            generated_token_ids = generated_token_ids.to('cuda')
            self.model.to('cuda')

        # Switch the model to evaluation mode
        self.model.eval()
        with torch.no_grad():
            for i in range(10):
                for j, _ in enumerate(tokens):
                    # Replace one token of the sentence with [MASK]
                    # (skip the header, so start at index +1)
                    masked_index = j + 1

                    pre_token = generated_token_ids[0, masked_index].item()

                    generated_token_ids[
                        0, masked_index] = self.bert_tokenizer.vocab["[MASK]"]

                    outputs = self.model(generated_token_ids)
                    predictions = outputs[0]

                    _, predicted_indexes = torch.topk(
                        predictions[0, masked_index], k=5)
                    predicted_tokens = self.bert_tokenizer.convert_ids_to_tokens(
                        predicted_indexes.tolist())

                    print(predicted_tokens)

                    predict_token = predicted_indexes.tolist()[0]

                    # if pre_token == predict_token:
                    #     predict_token = predicted_indexes.tolist()[1]

                    generated_token_ids[0, masked_index] = predict_token

                    # Convert the ids back to strings and join them
                    sampled_sequence = [
                        self.bert_tokenizer.ids_to_tokens[token_id]
                        for token_id in generated_token_ids[0].cpu().numpy()
                    ]
                    sampled_sequence = "".join([
                        token[2:] if token.startswith("##") else token
                        for token in list(
                            filter(lambda x: x != '[PAD]', sampled_sequence))
                    ])

                    logger.info(
                        "sampled sequence: {}".format(sampled_sequence))