def __init__(self, device, tokenizer: BertTokenizer, token_rate: float, exclude_names: bool = False):
    super().__init__(device)
    self.tokenizer = tokenizer
    self.token_rate = token_rate
    self.exclude_names = exclude_names
    self.mask_id = tokenizer.convert_tokens_to_ids(["[MASK]"])[0]
    self.sep_id = tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
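A quick check of the two special-token lookups above; this is a minimal sketch assuming a pytorch_pretrained_bert-style `BertTokenizer` and the stock `bert-base-uncased` vocab:

# Minimal sketch: convert_tokens_to_ids maps a list of token strings to a
# list of vocab ids, so a single-token lookup indexes [0].
from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
mask_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
sep_id = tokenizer.convert_tokens_to_ids(['[SEP]'])[0]
print(mask_id, sep_id)  # 103 102 in the bert-base-uncased vocab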
class NERDataSet(Dataset):
    def __init__(self, data_path, config, add_cls=False, add_sep=False):
        self.config = config
        self.sents, self.tags = load_tsv(data_path, add_cls=add_cls, add_sep=add_sep)
        self.tokenizer = BertTokenizer(vocab_file=config.vocab_path, do_lower_case=False)
        self.tokenize()

    def __len__(self):
        return len(self.sents)

    def tokenize(self):
        alltok_sents, alltok_tags = [], []
        for sent_words, sent_tags in zip(self.sents, self.tags):
            tok_sent, tok_tag = [], []
            for w, t in zip(sent_words, sent_tags):
                # Tokenize each word into wordpieces and map them to vocab ids.
                tokens = self.tokenizer.tokenize(w)
                tok_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                # The first piece keeps the word's tag; continuation pieces
                # receive the filler piece tag.
                tok_tags = [t] + [self.config.piece_tag] * (len(tokens) - 1)
                ttags_ids = [self.config.tag2idx[tt] for tt in tok_tags]
                tok_sent.extend(tok_ids)
                tok_tag.extend(ttags_ids)
            alltok_sents.append(tok_sent)
            alltok_tags.append(tok_tag)
        self.tok_sents = alltok_sents
        self.tok_tags = alltok_tags

    def __getitem__(self, idx):
        return self.tok_sents[idx], self.tok_tags[idx]
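The tag-extension line inside `tokenize` is what keeps labels aligned with wordpieces: the first piece of a word keeps the word's tag, and every continuation piece receives `config.piece_tag`. A self-contained sketch of that rule (the 'X' filler tag and the sample pieces are illustrative assumptions, not values from the original config):

def align_tags(word_pieces, word_tag, piece_tag='X'):
    # First piece keeps the real tag; continuation pieces get the filler tag.
    return [word_tag] + [piece_tag] * (len(word_pieces) - 1)

print(align_tags(['wash', '##ing', '##ton'], 'B-LOC'))  # ['B-LOC', 'X', 'X']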
class JapaneseWorker:
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_basic_tokenize=False)
        self.cls_id = self.bert_tokenizer.vocab['[CLS]']
        self.mask_id = self.bert_tokenizer.vocab['[MASK]']

        self.bert_model = 'PATH_TO_BERTJPN'
        self.cp = 'checkpoint/jp/cp_step_710000.pt'
        self.opt = 'checkpoint/jp/opt_step_710000.pt'

    @staticmethod
    def linesplit(src):
        """
        :param src: type str, article as a single string
        :return: type list, punctuation-separated sentences
        """
        def remove_newline(x):
            return x.replace('\n', '')

        def remove_blank(x):
            return x.replace(' ', '')

        def remove_unknown(x):
            unknown = ['\u3000']
            for h in unknown:
                x = x.replace(h, '')
            return x

        src = remove_blank(src)
        src = remove_newline(src)
        src = remove_unknown(src)
        # Split on 。!? unless the punctuation is followed by a closing quote 」.
        src_line = re.split('。(?!」)|!(?!」)|?(?!」)', src)
        src_line = [x for x in src_line if x != '']
        return src_line

    def tokenizer(self, src):
        """
        :param src: type list, punctuation-separated sentences
        :return: token: type list, token strings
                 token_id: type list, vocab ids of the tokens
        """
        token = []
        token_id = []

        def _preprocess_text(text):
            return text.replace(" ", "")  # for Juman

        for sentence in src:
            preprocessed_text = _preprocess_text(sentence)
            juman_tokens = self.juman_tokenizer(preprocessed_text)
            tokens = self.bert_tokenizer.tokenize(" ".join(juman_tokens))
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
            token += tokens
            token_id += ids
        return token, token_id
class BertWithJumanModel(): """学習済みBertを使うやつ Fork:https://github.com/yagays/pytorch_bert_japanese""" def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False): self.juman_tokenizer = JumanTokenizer() self.model = BertModel.from_pretrained(bert_path) self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name, do_lower_case=False, do_basic_tokenize=False) self.use_cuda = use_cuda def _preprocess_text(self, text): return text.replace(" ", "") def get_sentence_embedding(self, text, pooling_layer=-2, pooling_strategy="REDUCE_MEAN"): preprocessed_text = self._preprocess_text(text) n = math.ceil(len(preprocessed_text) / 2048) result = [ preprocessed_text[idx:idx + n] for idx in range(0, len(preprocessed_text), n) ] tokens = [] for t in result: tokens += self.juman_tokenizer.tokenize(t) bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens)) ids = self.bert_tokenizer.convert_tokens_to_ids( ["[CLS]"] + bert_tokens[:126] + ["[SEP]"]) # max_seq_len-2 tokens_tensor = torch.tensor(ids).reshape(1, -1) if self.use_cuda: tokens_tensor = tokens_tensor.to('cuda') self.model.to('cuda') self.model.eval() with torch.no_grad(): all_encoder_layers, _ = self.model(tokens_tensor) embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0] if pooling_strategy == "REDUCE_MEAN": return np.mean(embedding, axis=0) elif pooling_strategy == "REDUCE_MAX": return np.max(embedding, axis=0) elif pooling_strategy == "REDUCE_MEAN_MAX": return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)] elif pooling_strategy == "CLS_TOKEN": return embedding[0] else: raise ValueError( "specify valid pooling_strategy: {REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}" )
def get_words_for_blank_slow_decode(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer):
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)

    mask_positions = []
    tokenized_text = tokenizer.tokenize(text)
    top_words_all = []
    # Replace every '_' placeholder with a [MASK] token.
    for i in range(len(tokenized_text)):
        if tokenized_text[i] == '_':
            tokenized_text[i] = '[MASK]'
            mask_positions.append(i)

    # Greedily fill one blank per iteration: always the position whose best
    # candidate scores highest.
    while mask_positions:
        top_words = []
        # Convert tokens to vocab indices
        token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([token_ids])

        # Call BERT to calculate unnormalized probabilities for all positions
        model.eval()
        predictions = model(tokens_tensor)  # get predictions

        candidates = []  # (word, prob)
        for mask_pos in mask_positions:
            mask_preds = predictions[0, mask_pos, :]
            top_idxs = mask_preds.detach().numpy().argsort()[::-1]
            top_idx = top_idxs[0]
            top_prob = mask_preds[top_idx]
            top_word = tokenizer.ids_to_tokens[top_idx]
            candidates.append((top_word, top_prob.detach().item()))

            top_words_pos = []
            for i in top_idxs[:20]:
                top_words_pos.append((tokenizer.ids_to_tokens[i],
                                      mask_preds[i].detach().item()))
            top_words.append(top_words_pos)

        best_candidate = max(candidates, key=lambda x: x[1])
        best_pos = mask_positions[candidates.index(best_candidate)]
        tokenized_text[best_pos] = best_candidate[0]
        top_words_all.append(top_words[candidates.index(best_candidate)])
        mask_positions = [i for i in mask_positions if i != best_pos]

    pred_sent = ' '.join(tokenized_text).replace(' ##', '')
    return (pred_sent, top_words_all)
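A usage sketch for the greedy decoder above, assuming the stock pretrained weights and the old pytorch_pretrained_bert API that the function itself relies on ('_' marks a blank):

from pytorch_pretrained_bert import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Blanks are filled one per iteration, most confident position first.
sent, top_words = get_words_for_blank_slow_decode('the _ sat on the _ .',
                                                  model, tokenizer)
print(sent)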
class FedPredictDataset(Dataset):
    def __init__(self, texts, vocab_path, max_seq_length=512, vocab='finance-uncased'):
        self.texts = texts
        self.dict_labels = {'lower': 0, 'maintain': 1, 'raise': 2}
        self.max_seq_length = max_seq_length
        self.vocab = vocab
        if self.vocab == 'finance-uncased':
            self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                           do_lower_case=True,
                                           do_basic_tokenize=True)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        tokenized_review = self.tokenizer.tokenize(self.texts[index])

        if len(tokenized_review) > self.max_seq_length:
            tokenized_review = tokenized_review[:self.max_seq_length]

        ids_review = self.tokenizer.convert_tokens_to_ids(tokenized_review)
        mask_input = [1] * len(ids_review)
        padding = [0] * (self.max_seq_length - len(ids_review))
        ids_review += padding
        mask_input += padding
        input_type = [0] * self.max_seq_length

        assert len(ids_review) == self.max_seq_length
        assert len(mask_input) == self.max_seq_length
        assert len(input_type) == self.max_seq_length

        ids_review = torch.tensor(ids_review)
        mask_input = torch.tensor(mask_input)
        input_type = torch.tensor(input_type)

        input_feature = {
            "token_type_ids": input_type,
            "attention_mask": mask_input,
            "input_ids": ids_review
        }
        return input_feature
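A usage sketch for FedPredictDataset with a standard DataLoader; the vocab path below is a placeholder assumption, not a file shipped with the snippet:

from torch.utils.data import DataLoader

dataset = FedPredictDataset(['the committee decided to hold rates steady.'],
                            vocab_path='finance_vocab/vocab.txt')
loader = DataLoader(dataset, batch_size=1)
batch = next(iter(loader))
print(batch['input_ids'].shape)  # torch.Size([1, 512])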
def _bert_embed_sentence(sentence, bert_model: BertModel, bert_tokenizer: BertTokenizer):
    text = "[CLS] {} [SEP]".format(sentence)
    tokenized_text = bert_tokenizer.tokenize(text)
    indexed_tokens = bert_tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_ids = [0] * len(indexed_tokens)
    segments_tensors = torch.tensor([segments_ids])

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokens_tensor = tokens_tensor.to(device)
    segments_tensors = segments_tensors.to(device)

    with torch.no_grad():
        encoded_layers, _ = bert_model(tokens_tensor, segments_tensors,
                                       output_all_encoded_layers=False)
    # Embedding of the [CLS] token
    return encoded_layers[0][0]
def get_sample_bert_token_id_seq(bert_tokenizer: BertTokenizer, left_seq_str,
                                 right_seq_str, max_seq_len):
    left_bert_token_seq = bert_tokenizer.tokenize(left_seq_str)
    right_bert_token_seq = bert_tokenizer.tokenize(right_seq_str)
    # Reserve 3 slots for [CLS] and the two [SEP] tokens; truncate the right
    # sequence first, then fit the left sequence into the remaining room.
    if len(right_bert_token_seq) + 3 > max_seq_len:
        right_bert_token_seq = right_bert_token_seq[:max_seq_len - 3]
    if len(right_bert_token_seq) + len(left_bert_token_seq) + 3 > max_seq_len:
        left_bert_token_seq = left_bert_token_seq[:max_seq_len - len(right_bert_token_seq) - 3]
    bert_token_seq = (['[CLS]'] + left_bert_token_seq + ['[SEP]']
                      + right_bert_token_seq + ['[SEP]'])
    bert_token_id_seq = bert_tokenizer.convert_tokens_to_ids(bert_token_seq)
    return bert_token_id_seq
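A length-accounting check for the helper above: however long the inputs, the output can never exceed max_seq_len, because the right side is clipped first and the left side only gets the leftover room. Sketch, assuming a pytorch_pretrained_bert-style tokenizer:

from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ids = get_sample_bert_token_id_seq(tokenizer,
                                   'a short left context',
                                   'a much longer right sequence ' * 20,
                                   max_seq_len=32)
assert len(ids) <= 32  # 3 specials + right tokens + whatever room is left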
def convert_data2(path1, path2, max_length, number, seq1, seq2):
    """Convert text pairs to ids, pad them, and add the [CLS]/[SEP] markers."""
    tokenizer = BertTokenizer('./model/bert-base-chinese/vocab.txt')
    input_id = []
    input_mask = []
    segment_id = []
    print(len(seq1))
    for i in range(number):
        tokens_a = tokenizer.tokenize(seq1[i])
        tokens_b = tokenizer.tokenize(seq2[i])
        # Shrink both sides proportionally until the pair fits in max_length - 3
        # (three slots are reserved for [CLS] and the two [SEP] tokens).
        # The shares must be computed from the total length *before* either
        # side is truncated, otherwise tokens_b gets the wrong share.
        while len(tokens_a) + len(tokens_b) > max_length - 3:
            total = len(tokens_a) + len(tokens_b)
            tokens_a = tokens_a[: int((max_length - 3) * len(tokens_a) / total)]
            tokens_b = tokens_b[: int((max_length - 3) * len(tokens_b) / total)]
        # Add [CLS] at the start and [SEP] between and after the two segments.
        tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
        tokens = tokens_a + tokens_b + ['[SEP]']
        input_id_ = tokenizer.convert_tokens_to_ids(tokens)
        # segment_id distinguishes tokens_a from tokens_b;
        # input_mask distinguishes real tokens from padding.
        segment_id_ = [0] * len(tokens_a) + [1] * (len(tokens_b) + 1)
        input_mask_ = [1] * len(tokens)
        # Every input fed to BERT must be padded out to max_length.
        padding_ = [0] * (max_length - len(tokens))
        input_id_ += padding_
        segment_id_ += padding_
        input_mask_ += padding_
        # Collect one row per sentence pair: [sentence_num, max_length].
        input_id.append(input_id_)
        input_mask.append(input_mask_)
        segment_id.append(segment_id_)
    return input_id, input_mask, segment_id
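The truncation loop above can be collapsed into a single proportional split of the max_length - 3 budget; a standalone sketch of the arithmetic (not part of the original function):

def proportional_truncate(tokens_a, tokens_b, max_length):
    budget = max_length - 3  # room left after [CLS] and the two [SEP]s
    total = len(tokens_a) + len(tokens_b)
    if total <= budget:
        return tokens_a, tokens_b
    keep_a = budget * len(tokens_a) // total
    keep_b = budget - keep_a  # hand the rounding slack to side b
    return tokens_a[:keep_a], tokens_b[:keep_b]

a, b = proportional_truncate(list(range(60)), list(range(40)), 51)
print(len(a), len(b))  # 28 20 -> exactly the 48-token budget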
def convert_tokens_to_features(tokens: List[str],
                               tokenizer: BertTokenizer,
                               do_lower_case: bool = True) -> Features:
    input_tokens: List[str] = ['[CLS]']
    ids_to_original: List[int] = [-1]
    for k, token in enumerate(tokens):
        # Wordpiece-tokenize each (optionally lower-cased) word and remember
        # which original token every piece came from.
        for wp in tokenizer.wordpiece_tokenizer.tokenize(
                token.lower() if do_lower_case else token):
            input_tokens.append(wp)
            ids_to_original.append(k)
    input_tokens.append('[SEP]')
    ids_to_original.append(-1)

    features: Features = Features(
        tokens=input_tokens,
        ids_to_original=torch.tensor(ids_to_original, dtype=torch.long),
        input_ids=torch.tensor(tokenizer.convert_tokens_to_ids(input_tokens),
                               dtype=torch.long),
    )
    return features
def predict_word(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer,
                 tgt_word: str, tgt_pos: int):
    mask_positions = []

    # Insert mask tokens wherever the template has a '_' placeholder.
    tokenized_text = tokenizer.tokenize(text)
    for i in range(len(tokenized_text)):
        if tokenized_text[i] == '_':
            tokenized_text[i] = '[MASK]'
            mask_positions.append(i)

    # Convert tokens to vocab indices
    token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([token_ids])

    # Call BERT to calculate unnormalized probabilities for all positions,
    # then normalize with softmax over the vocabulary.
    model.eval()
    predictions = model(tokens_tensor)
    predictions = F.softmax(predictions, dim=2)

    # For the target position, read off the probability of the word of interest.
    normalized = predictions[0, tgt_pos, :]
    out_prob = normalized[tokenizer.vocab[tgt_word]].item()

    # Also fill in all blanks with the max-probability token, for inspection.
    for mask_pos in mask_positions:
        predicted_index = torch.argmax(predictions[0, mask_pos, :]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        tokenized_text[mask_pos] = predicted_token
    for mask_pos in mask_positions:
        tokenized_text[mask_pos] = "_" + tokenized_text[mask_pos] + "_"
    pred_sent = ' '.join(tokenized_text).replace(' ##', '')
    return out_prob, pred_sent
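A usage sketch for predict_word; the template, target word, and position are made-up values, and the model/tokenizer are the stock pretrained ones from the old pytorch_pretrained_bert API:

from pytorch_pretrained_bert import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# tgt_pos=1 points at the '_' slot in the tokenized template.
prob, filled = predict_word('the _ chased the mouse .', model, tokenizer,
                            tgt_word='cat', tgt_pos=1)
print(prob, filled)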
def bert_sentence_pair_preprocessing(dataset: pd.DataFrame, tokenizer: BertTokenizer, max_sequence_length=64): max_bert_input_length = 70 dataset_input_ids = torch.empty((len(dataset), max_bert_input_length), dtype=torch.long) dataset_token_type_ids = torch.empty((len(dataset), max_bert_input_length), dtype=torch.long) dataset_attention_masks = torch.empty((len(dataset), max_bert_input_length), dtype=torch.long) dataset_lengths = torch.empty((len(dataset), 1), dtype=torch.long) dataset_labels = torch.empty((len(dataset), 1), dtype=torch.long) dataset_other_type_ids = torch.empty((len(dataset), 18), dtype=torch.long) # dataset_input_tensors = torch.empty(len(dataset), 4, max_bert_input_length, dtype=torch.float) for idx, data in dataset.iterrows(): tokens = [] input_type_ids = [] # other type 전처리 other_type_ids = [] other_type_ids.append(data['addr0']) other_type_ids.append(data['addr1']) other_type_ids.append(data['addr2']) other_type_ids.append(data['addr3']) other_type_ids.append(data['addr4']) other_type_ids.append(data['addr5']) other_type_ids.append(data['phone0']) other_type_ids.append(data['phone1']) other_type_ids.append(data['phone2']) other_type_ids.append(data['phone3']) other_type_ids.append(data['cate0']) other_type_ids.append(data['cate1']) other_type_ids.append(data['cate2']) other_type_ids.append(data['cate3']) other_type_ids.append(data['cate4']) other_type_ids.append(data['cname0']) other_type_ids.append(data['cname1']) other_type_ids.append(data['cname2']) dataset_other_type_ids[idx] = torch.tensor(other_type_ids, dtype=torch.long) sentence_1_tokenized, sentence_2_tokenized = tokenizer.tokenize(data['full_placename1']), tokenizer.tokenize(data['full_placename2']) tokens.append("[CLS]") input_type_ids.append(0) for token in sentence_1_tokenized: tokens.append(token) input_type_ids.append(0) tokens.append("[SEP]") input_type_ids.append(0) for token in sentence_2_tokenized: tokens.append(token) input_type_ids.append(1) tokens.append("[SEP]") input_type_ids.append(1) # 전처리한 token 바탕으로 인덱스값 얻음 input_ids = tokenizer.convert_tokens_to_ids(tokens) # attention mask 전처리 attention_masks = [1] * len(input_ids) # input_ids length 저장 dataset_lengths[idx] = torch.tensor(len(input_ids), dtype=torch.long) while len(input_ids) < max_bert_input_length: input_ids.append(0) attention_masks.append(0) input_type_ids.append(0) dataset_input_ids[idx] = torch.tensor(input_ids, dtype=torch.long) dataset_token_type_ids[idx] = torch.tensor(input_type_ids, dtype=torch.long) dataset_attention_masks[idx] = torch.tensor(attention_masks, dtype=torch.long) dataset_labels[idx] = torch.tensor(data['label'], dtype=torch.long) return dataset_input_ids, dataset_token_type_ids, dataset_attention_masks, dataset_other_type_ids, dataset_lengths, dataset_labels
class Preprocess:
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.rouge_calculator = RougeNCalc()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        self.trim_input = 0
        self.trim_clss = 0

    def __call__(self, data_dic, length):
        self.src_body = data_dic['body']
        self.src_summary = data_dic['summary'].split('<sep>')
        self._init_data()
        if self.src_body == '':
            raise ValueError('Empty data')
        # step 1. article to lines
        self._split_line()
        # step 2. pick extractive summary by rouge
        self._rougematch()
        # step 3. tokenize
        self._tokenize()
        # step 4. clss process
        self._prep_clss()
        # step 5. segs process
        self._prep_segs()
        # step 6. trim length for input
        self._set_length(length)
        return {'src': self.tokenid,
                'labels': self.label,
                'segs': self.segs,
                'mask': self.mask,
                'mask_cls': self.mask_cls,
                'clss': self.clss,
                'src_str': self.src_line}

    def _init_data(self):
        self.src_line = []
        self.label = []
        self.tokenid = []
        self.token = []
        self.clss = []
        self.segs = []
        self.mask = []
        self.mask_cls = []

    # step 1.
    def _split_line(self):
        # regex note: (?!...) Negative Lookahead
        # e.g. /foo(?!bar)/ on "foobar foobaz" matches the "foo" in "foobaz" only
        self.src_line = re.split('。(?!」)|!(?!」)|?(?!」)', self.src_body)
        self.src_line = [x for x in self.src_line if x != '']

    # step 2.
    def _rougematch(self):
        self.label = [0] * len(self.src_line)
        for summ in self.src_summary:
            scores = [self.rouge_calculator(x, summ) for x in self.src_line]
            self.label[scores.index(max(scores))] = 1

    # step 3.
    def _tokenize(self):
        def _preprocess_text(text):
            return text.replace(" ", "")  # for Juman

        for sentence in self.src_line:
            preprocessed_text = _preprocess_text(sentence)
            juman_tokens = self.juman_tokenizer(preprocessed_text)
            tokens = self.bert_tokenizer.tokenize(" ".join(juman_tokens))
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
            self.token += tokens
            self.tokenid += ids

    # step 4.
    def _prep_clss(self):
        self.clss = [i for i, x in enumerate(self.tokenid)
                     if x == self.bert_tokenizer.vocab['[CLS]']]

    # step 5.
    def _prep_segs(self):
        flag = 1
        for idx in self.tokenid:
            if idx == self.bert_tokenizer.vocab['[CLS]']:
                flag = not flag
            self.segs.append(int(flag))

    # step 6.
    def _set_length(self, n):
        self.__trim_data(n)
        self.__add_mask(n)

    def __trim_data(self, n):
        if len(self.tokenid) > n:
            # If the last sentence starts after position n
            # (the original hardcoded 512 here; n is the configured length)
            if self.clss[-1] > n:
                for i, idx in enumerate(self.clss):
                    if idx > n:
                        # Index of the last [SEP] within length n
                        self.trim_input = self.clss[i - 1] - 1
                        # Index of the last [CLS] kept in clss
                        self.trim_clss = i - 2
                        break
            # If src is longer than n but the last sentence starts before n
            else:
                self.trim_input = self.clss[len(self.clss) - 1] - 1
                self.trim_clss = len(self.clss) - 2
        # Do nothing if length < n
        if self.trim_clss * self.trim_input == 0:
            return
        self.tokenid = self.tokenid[:(self.trim_input + 1)]
        self.segs = self.segs[:(self.trim_input + 1)]
        self.clss = self.clss[:(self.trim_clss + 1)]
        self.label = self.label[:(self.trim_clss + 1)]
        self.src_line = self.src_line[:(self.trim_clss + 1)]

    def __add_mask(self, n):
        # from index to len: +1
        pad_len = (n - len(self.tokenid))
        self.tokenid = self.tokenid + ([self.bert_tokenizer.vocab['[MASK]']] * pad_len)
        self.segs = self.segs + ([int(not self.segs[-1])] * pad_len)
class text_dataset(Dataset):
    def __init__(self, x_y_list, vocab_path, max_seq_length=256,
                 vocab='base-cased', transform=None):
        self.max_seq_length = max_seq_length
        self.x_y_list = x_y_list
        self.vocab = vocab
        if self.vocab == 'base-cased':
            self.tokenizer = BertTokenizer.from_pretrained(
                'bert-base-cased', do_lower_case=False, do_basic_tokenize=True)
        elif self.vocab == 'finance-cased':
            self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                           do_lower_case=False,
                                           do_basic_tokenize=True)
        elif self.vocab == 'base-uncased':
            self.tokenizer = BertTokenizer.from_pretrained(
                'bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
        elif self.vocab == 'finance-uncased':
            self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                           do_lower_case=True,
                                           do_basic_tokenize=True)

    def __getitem__(self, index):
        tokenized_review = self.tokenizer.tokenize(self.x_y_list[0][index])

        if len(tokenized_review) > self.max_seq_length:
            tokenized_review = tokenized_review[:self.max_seq_length]

        ids_review = self.tokenizer.convert_tokens_to_ids(tokenized_review)
        mask_input = [1] * len(ids_review)
        padding = [0] * (self.max_seq_length - len(ids_review))
        ids_review += padding
        mask_input += padding
        input_type = [0] * self.max_seq_length

        assert len(ids_review) == self.max_seq_length
        assert len(mask_input) == self.max_seq_length
        assert len(input_type) == self.max_seq_length

        ids_review = torch.tensor(ids_review)
        mask_input = torch.tensor(mask_input)
        input_type = torch.tensor(input_type)

        sentiment = self.x_y_list[1][index]
        list_of_labels = [torch.from_numpy(np.array(sentiment))]

        input_feature = {
            "token_type_ids": input_type,
            "attention_mask": mask_input,
            "input_ids": ids_review
        }
        return input_feature, list_of_labels[0]

    def __len__(self):
        return len(self.x_y_list[0])
class CAILDataset(Dataset): def __init__( self, data_path, max_seq_len, vocab_path, # tfidf_a_df, # tfidf_b_df, # tfidf_c_df, fts_flag=False, mode="test", ): self.data_path = data_path self.max_seq_len = max_seq_len self.vocab_path = vocab_path # self.exft_a_df = tfidf_a_df # self.exft_b_df = tfidf_b_df # self.exft_c_df = tfidf_c_df self.fts_flag = fts_flag self.mode = mode self.reset() def reset(self): self.tokenizer = BertTokenizer(vocab_file=self.vocab_path) self.build_examples() def read_data(self): print(self.data_path) xlist = [] with open(self.data_path, "r", encoding="utf-8") as f: for i, line in enumerate(f): x = json.loads(line) # xlist.append((x["A"], x["B"], x["C"])) if self.mode == "train" or self.mode == "valid": if i % 2 == 0: xlist.append((x["A"], x["B"], x["C"])) else: xlist.append((x["A"], x["C"], x["B"])) else: xlist.append((x["A"], x["B"], x["C"])) return xlist def build_examples(self): xlist = self.read_data() self.examples = [] list_text_a = [] list_text_b = [] list_text_c = [] for idx, x in enumerate(xlist): guid = "%s-%d" % (self.mode, idx) text_a = x[0] text_b = x[1] text_c = x[2] example = InputExample(guid=guid, text_a=text_a, text_b=text_b, text_c=text_c) self.examples.append(example) list_text_a.append(text_a) list_text_b.append(text_b) list_text_c.append(text_c) if self.fts_flag: self.exft_a_df = self.build_ex_features(list_text_a) self.exft_b_df = self.build_ex_features(list_text_b) self.exft_c_df = self.build_ex_features(list_text_c) self.exft_a_df.fillna(0, inplace=True) self.exft_b_df.fillna(0, inplace=True) self.exft_c_df.fillna(0, inplace=True) def build_features(self, example): max_seq_len = self.max_seq_len - 2 tokens_a = self.tokenizer.tokenize(example.text_a) tokens_b = self.tokenizer.tokenize(example.text_b) tokens_c = self.tokenizer.tokenize(example.text_c) if len(tokens_a) > max_seq_len: tokens_a = tokens_a[-max_seq_len:] if len(tokens_b) > max_seq_len: tokens_b = tokens_b[-max_seq_len:] if len(tokens_c) > max_seq_len: tokens_c = tokens_c[-max_seq_len:] input_ids_a = self.tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"]) input_ids_b = self.tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_b + ["[SEP]"]) input_ids_c = self.tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_c + ["[SEP]"]) input_mask_a = [1] * len(input_ids_a) input_mask_b = [1] * len(input_ids_b) input_mask_c = [1] * len(input_ids_c) segment_ids_a = [0] * len(input_ids_a) segment_ids_b = [0] * len(input_ids_b) segment_ids_c = [0] * len(input_ids_c) padding_a = [0] * (max_seq_len - len(tokens_a)) padding_b = [0] * (max_seq_len - len(tokens_b)) padding_c = [0] * (max_seq_len - len(tokens_c)) input_ids_a += padding_a segment_ids_a += padding_a input_mask_a += padding_a input_ids_b += padding_b segment_ids_b += padding_b input_mask_b += padding_b input_ids_c += padding_c segment_ids_c += padding_c input_mask_c += padding_c feature_a = InputFeature( input_ids=input_ids_a, segment_ids=segment_ids_a, input_mask=input_mask_a, ) feature_b = InputFeature( input_ids=input_ids_b, segment_ids=segment_ids_b, input_mask=input_mask_b, ) feature_c = InputFeature( input_ids=input_ids_c, segment_ids=segment_ids_c, input_mask=input_mask_c, ) return feature_a, feature_b, feature_c def build_ex_features(self, list_text): return do_feature_engineering(list_text) def _preprocess_op(self, index): example = self.examples[index] if self.mode == "train" or self.mode == "valid": if index % 2 == 0: op = 1 else: op = -1 else: op = 1 feature_a, feature_b, feature_c = self.build_features(example) 
return ( op, np.array(feature_a.input_ids, dtype=np.int64), np.array(feature_a.segment_ids, dtype=np.int64), np.array(feature_a.input_mask, dtype=np.int64), np.array(feature_b.input_ids, dtype=np.int64), np.array(feature_b.segment_ids, dtype=np.int64), np.array(feature_b.input_mask, dtype=np.int64), np.array(feature_c.input_ids, dtype=np.int64), np.array(feature_c.segment_ids, dtype=np.int64), np.array(feature_c.input_mask, dtype=np.int64), ) def _exft_preprocess_op(self, index): example = self.examples[index] if self.mode == "train" or self.mode == "valid": if index % 2 == 0: op = 1 else: op = -1 else: op = 1 feature_a, feature_b, feature_c = self.build_features(example) return ( op, np.array(feature_a.input_ids, dtype=np.int64), np.array(feature_a.segment_ids, dtype=np.int64), np.array(feature_a.input_mask, dtype=np.int64), np.array(feature_b.input_ids, dtype=np.int64), np.array(feature_b.segment_ids, dtype=np.int64), np.array(feature_b.input_mask, dtype=np.int64), np.array(feature_c.input_ids, dtype=np.int64), np.array(feature_c.segment_ids, dtype=np.int64), np.array(feature_c.input_mask, dtype=np.int64), torch.tensor(self.exft_a_df.iloc[index], dtype=torch.float32), torch.tensor(self.exft_b_df.iloc[index], dtype=torch.float32), torch.tensor(self.exft_c_df.iloc[index], dtype=torch.float32), ) def __getitem__(self, index): if self.fts_flag: return self._exft_preprocess_op(index) else: return self._preprocess_op(index) def __len__(self): return len(self.examples)
def evaluate(args:Dict): model_root = args['--model-root'] if args['--model-root'] else './models' print("load model from {}".format(model_root), file=sys.stderr) dataLoader = sentence.Sentence(args['--test-src']) device = torch.device("cuda:0" if args['--cuda'] else "cpu") output_model_file = os.path.join(model_root, "model_file.bin") output_config_file = os.path.join(model_root, "config_file.bin") output_vocab_file = os.path.join(model_root, "vocab.txt") config = BertConfig.from_json_file(output_config_file) model = BertForTokenClassification(config,num_labels=len(dataLoader.tag2idx)) state_dict = torch.load(output_model_file) model.load_state_dict(state_dict) tokenizer = BertTokenizer(output_vocab_file, do_lower_case=False) tokenized_texts = [tokenizer.tokenize(sent) for sent in dataLoader.sentences] if args['--cuda']: model = model.to(torch.device("cuda:0")) MAX_LEN = int(args['--max-len']) input_ids_test = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") tags_test = pad_sequences([[dataLoader.tag2idx.get(l) for l in lab] for lab in dataLoader.labels], maxlen=MAX_LEN, value=dataLoader.tag2idx["O"], padding="post", dtype="long", truncating="post") attention_masks_test = [[float(i > 0) for i in ii] for ii in input_ids_test] for i, inp in enumerate(input_ids_test): if (102 not in inp): inp[-1] = 102 tags_test[i][-1] = dataLoader.tag2idx.get("O") te_inputs = torch.tensor(input_ids_test).to(torch.int64) te_tags = torch.tensor(tags_test).to(torch.int64) te_masks = torch.tensor(attention_masks_test) test_data = TensorDataset(te_inputs, te_masks, te_tags) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=int(args['--batch-size'])) model.eval() predictions = [] true_labels = [] eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in test_dataloader: batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch with torch.no_grad(): tmp_eval_loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) logits = logits.detach().cpu().numpy() predictions.extend([list(p) for p in np.argmax(logits, axis=2)]) label_ids = b_labels.to('cpu').numpy() true_labels.append(label_ids) tmp_eval_accuracy = flat_accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += b_input_ids.size(0) nb_eval_steps += 1 pred_tags = [[dataLoader.tags_vals[p_i] for p_i in p] for p in predictions] test_tags = [[dataLoader.tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l] tags_test_fin = list() for l in tags_test: temp_tag = list() for l_i in l: temp_tag.append(dataLoader.tags_vals[l_i]) tags_test_fin.append(temp_tag) print("Test loss: {}".format(eval_loss / nb_eval_steps)) print("Test Accuracy: {}".format(eval_accuracy / nb_eval_steps)) print("Test F1-Score: {}".format(f1_score(tags_test_fin, pred_tags))) print(classification_report(tags_test_fin, pred_tags)) print("Number of Test sentences: ", len(tags_test_fin))
class mod_eventclass(BasePlugin): """ Web Scraping plugin: mod_eventclass For classifying news events. """ minArticleLengthInChars = 400 pluginType = Types.MODULE_DATA_PROCESSOR # implies data post-processor dataFrame = None device = None model = None sentencesColList = [ 'url', 'sentence', 'sentence_no', 'neutral_prob', 'positive_prob', 'negative_prob' ] sentencesRec = None def __init__(self): """ Initialize the object """ super().__init__() def additionalConfig(self, sessionHistoryObj): """ Perform additional configuration that is specific to this plugin. :param sessionHistoryObj: The session history object to be used by this plugin for putting items into the data processing competed queue. :return: """ self.workDir = self.app_config.data_dir self.sessionHistDB = sessionHistoryObj self.pretuned_modelfile = self.app_config.checkAndSanitizeConfigString( 'plugins', 'mod_eventclass_modelfile') self.model_weights_path = self.app_config.checkAndSanitizeConfigString( 'plugins', 'mod_eventclass_weightspath') self.vocab_path = self.app_config.checkAndSanitizeConfigString( 'plugins', 'mod_eventclass_vocab_path') self.labels = {0: 'neutral', 1: 'positive', 2: 'negative'} # TODO: fix model load error: self.setupModel() self.sentencesRec = pd.DataFrame(np.zeros( (1, len(self.sentencesColList)), dtype=np.unicode_), columns=self.sentencesColList) # convert last 4 into float32 dtype for colname in [ "sentence_no", "neutral_prob", "positive_prob", "negative_prob" ]: self.sentencesRec[colname] = pd.to_numeric( self.sentencesRec[colname]) def setupModel(self): """ Load the classification model. """ num_labels = len(self.labels) vocab_type = "finance-uncased" self.max_seq_length = 256 if torch.cuda.is_available(): self.device = torch.device("cuda") else: self.device = torch.device("cpu") self.model = BertClassification(weight_path=self.model_weights_path, num_labels=num_labels, vocab=vocab_type) self.model.load_state_dict( torch.load(self.pretuned_modelfile, map_location=self.device)) self.model.to(self.device) self.tokenizer = BertTokenizer(vocab_file=self.vocab_path, do_lower_case=True, do_basic_tokenize=True) def processDataObj(self, newsEventObj): """ Process given data object by this plugin. :param newsEventObj: The NewsEvent object to be classified. :type newsEventObj: NewsEvent """ assert type(newsEventObj) == NewsEvent # Do not proceed if the articles has already been classified, i.e. contains scores if newsEventObj.getClassification() is None: # TODO: lock file to avoid conflicting writes, release lock at the end of the method logger.debug( f"Started news event classification for data in: {newsEventObj.getFileName()}" ) classificationObj = self.classifyText(newsEventObj.getText(), newsEventObj.getURL()) # put classification field in NewsEvent document: newsEventObj.setClassification(classificationObj) # prepare filename: fileNameWOExt = newsEventObj.getFileName().replace('.json', '') # save document to file: newsEventObj.writeFiles(fileNameWOExt, '', saveHTMLFile=False) logger.info( f"Completed classifying news event in: {fileNameWOExt} as: {classificationObj}" ) def classifyText(self, textValue, url): """ Examine and classify the text from the document and return classification scores text. :param textValue: Text to be examined and classified. 
:type textValue: str :return: Classification scores :rtype: dict{str:float} """ sentenceDF = None classificationScores = { 'positive': 0.0, 'neutral': 0.0, 'negative': 0.0 } try: logger.debug( f'Classifying using finbert model for text of length {len(textValue)}' ) if len(textValue) > self.minArticleLengthInChars: thisRec = self.sentencesRec.copy(deep=True) thisRec['url'] = url sentences = sent_tokenize(textValue.lower()) self.model.eval() for index, sent in enumerate(sentences): thisRec['sentence'] = sent thisRec['sentence_no'] = index # apply model on the sentence to get classification scores [neutralProb, positiveProb, negativeProb] = self.classifySentences(sent) thisRec['neutral_prob'] = neutralProb thisRec['positive_prob'] = positiveProb thisRec['negative_prob'] = negativeProb if sentenceDF is None: sentenceDF = thisRec else: sentenceDF = sentenceDF.append(thisRec) aggscores = sentenceDF.groupby('url').agg({ 'neutral_prob': 'sum', 'positive_prob': 'sum', 'negative_prob': 'sum' }) classificationScores = { 'positive': aggscores['positive_prob'][0], 'neutral': aggscores['neutral_prob'][0], 'negative': aggscores['negative_prob'][0] } except Exception as e: print("Error getting sentence classification:", e) return (classificationScores) def classifySentences(self, sent): """ Classify one text sentence at a time. """ tokenized_sent = self.tokenizer.tokenize(sent) if len(tokenized_sent) > self.max_seq_length: tokenized_sent = tokenized_sent[:self.max_seq_length] ids_review = self.tokenizer.convert_tokens_to_ids(tokenized_sent) mask_input = [1] * len(ids_review) padding = [0] * (self.max_seq_length - len(ids_review)) ids_review += padding mask_input += padding input_type = [0] * self.max_seq_length input_ids = torch.tensor(ids_review).to(self.device).reshape(-1, 256) attention_mask = torch.tensor(mask_input).to(self.device).reshape( -1, 256) token_type_ids = torch.tensor(input_type).to(self.device).reshape( -1, 256) with torch.set_grad_enabled(False): outputs = self.model(input_ids, token_type_ids, attention_mask) outputs = F.softmax(outputs, dim=1) # print('\n FinBERT predicted sentiment: ', labels[torch.argmax(outputs).item()]) return ([i.item() for i in outputs.data[0]])
def __init__(self, path: str, fields: List[Tuple[str, tt.data.Field]], tokenizer: BertTokenizer, max_length: int = 512, include_features=False, **kwargs): max_length = max_length - 3 # Count without special tokens with open(path) as dataf: data_json = json.load(dataf) examples = [] # Each input needs to have at most 2 segments # We will create following input # - [CLS] source post, previous post [SEP] choice_1 [SEP] for example in data_json["Examples"]: make_ids = lambda x: tokenizer.convert_tokens_to_ids( tokenizer.tokenize(x)) text = make_ids(example["spacy_processed_text"]) prev = make_ids(example["spacy_processed_text_prev"]) src = make_ids(example["spacy_processed_text_src"]) segment_A = src segment_C = prev segment_B = text text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \ [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]] # truncate if exceeds max length if len(text_ids) > max_length: # Truncate segment A segment_C = segment_C[:max_length // 2] text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \ [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]] if len(text_ids) > max_length: # Truncate segment A segment_A = segment_A[:max_length // 2] text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \ [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]] if len(text_ids) > max_length: # Truncate also segment B segment_B = segment_B[:max_length // 2] text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \ [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]] segment_ids = [0] * (len(segment_A) + 2) + [2] * ( len(segment_C) + 1) + [1] * (len(segment_B) + 1) # example_list = list(example.values())[:-3] + [text_ids, segment_ids] if include_features: example_list = list( example.values()) + [text_ids, segment_ids] else: example_list = [ example["id"], example["branch_id"], example["tweet_id"], example["stance_label"], example["veracity_label"], "\n-----------\n".join([ example["raw_text_src"], example["raw_text_prev"], example["raw_text"] ]), example["issource"] ] + [text_ids, segment_ids] examples.append(Example.fromlist(example_list, fields)) super(RumourEval2019Dataset_BERTTriplets_3Segments, self).__init__(examples, fields, **kwargs)
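The repeated re-packing above always follows the same [CLS] src [SEP] prev [SEP] text [SEP] layout with type ids 0/2/1. A compact standalone sketch of that layout (the ids below are made-up stand-ins, with 101/102 as the usual [CLS]/[SEP] ids):

def pack_three_segments(cls_id, sep_id, src, prev, text):
    ids = [cls_id] + src + [sep_id] + prev + [sep_id] + text + [sep_id]
    # type ids: 0 over [CLS]+src+[SEP], 2 over prev+[SEP], 1 over text+[SEP]
    segs = [0] * (len(src) + 2) + [2] * (len(prev) + 1) + [1] * (len(text) + 1)
    return ids, segs

ids, segs = pack_three_segments(101, 102, [7, 8], [9], [10, 11, 12])
print(len(ids) == len(segs), segs)  # True [0, 0, 0, 0, 2, 2, 1, 1, 1, 1]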
def bert_tokenize_with_spacy_meta( spacy_model: SpacyLanguage, bert_tokenizer: BertTokenizer, unique_id: int, words: Sequence[str], sentence_ids: Sequence[int], data_key: Optional[Union[str, Sequence[str]]], data_ids: Optional[Sequence[int]], start: int = 0, stop: Optional[int] = None, start_sequence_2: Optional[int] = None, stop_sequence_2: Optional[int] = None, start_sequence_3: Optional[int] = None, stop_sequence_3: Optional[int] = None, multipart_id: Optional[int] = None, span_ids: Optional[Sequence[int]] = None, is_apply_data_offset_entire_group: bool = False) -> InputFeatures: """ Uses spacy to get information such as part of speech, probability of word, etc. and aligns the tokenization from spacy with the bert tokenization. Args: spacy_model: The spacy model to use for spacy tokenization, part of speech analysis, etc. Generally from make_tokenizer_model() bert_tokenizer: The bert tokenizer to use. Usually from corpus_loader.make_bert_tokenizer() unique_id: The unique id for this example words: The words in this example. Generally a sentence, but it doesn't have to be. sentence_ids: For each word, identifies which sentence the word belongs to. Used to compute index_word_in_sentence data_key: A key (or multiple keys) to designate which response data set(s) data_ids references data_ids: Sequence[Int]. Describes an indices into a separate data array for each word. For example, if the first word in words corresponds to fMRI image 17 in a separate data array, and the second word corresponds to image 19, then this parameter could start with [17, 19, ...]. start: Offset where the actual input features should start. It is best to compute spacy meta on full sentences, then slice the resulting tokens. start and end are used to slice words, sentence_ids, data_key and data_ids stop: Exclusive end point for the actual input features. If None, the full length is used start_sequence_2: Used for bert to combine 2 sequences as a single input. Generally this is used for tasks like question answering where type_id=0 is the question and type_id=1 is the answer. If None, assumes the entire input is sequence 1. stop_sequence_2: Used for bert to combine 2 sequences as a single input. Generally this is used for tasks like question answering where type_id=0 is the question and type_id=1 is the answer. If None, assumes the entire input is sequence 1. start_sequence_3: Used for bert to combine 3 sequences as a single input. Generally this is used for tasks like question answering with a context. type_id=0 is the context and type_id=1 is the question and answer stop_sequence_3: Used for bert to combine 3 sequences as a single input. Generally this is used for tasks like question answering with a context. type_id=0 is the context and type_id=1 is the question and answer multipart_id: Used to express that this example needs to be in the same batch as other examples sharing the same multipart_id to be evaluated span_ids: Bit-encoded span identifiers which indicate which spans each word belongs to when spans are labeled in the input. If not given, no span ids will be set on the returned InputFeatures instance. is_apply_data_offset_entire_group: If a word is broken into multiple tokens, generally a single token is heuristically chosen as the 'main' token corresponding to that word. The data_id it is assigned is given by data offset, while all the tokens that are not the main token in the group are assigned -1. 
If this parameter is set to True, then all of the multiple tokens corresponding to a word are assigned the same data_id, and none are set to -1. This can be a better option for fMRI where the predictions are not at the word level, but rather at the level of an image containing multiple words. Returns: An InputFeatures instance """ sent = '' cum_lengths = list() bert_token_groups = list() for w in words: if len(sent) > 0: sent += ' ' sent += str(w) cum_lengths.append(len(sent)) bert_token_groups.append(bert_tokenizer.tokenize(w)) spacy_token_groups = group_by_cum_lengths(cum_lengths, spacy_model(sent)) # bert bert_erp_tokenization does not seem to care whether we do word-by-word or not; it is simple whitespace # splitting etc., then sub-word tokens are created from that example_tokens = list() example_mask = list() example_is_stop = list() example_is_begin_word_pieces = list() example_lengths = list() example_probs = list() example_head_location = list() example_token_head = list() example_type_ids = list() example_data_ids = list() example_span_ids = list() if span_ids is not None else None example_index_word_in_example = list() example_index_token_in_sentence = list() def _append_special_token(special_token, index_word_in_example_, index_token_in_sentence_, type_id_): example_tokens.append(special_token) example_mask.append(1) example_is_stop.append(1) example_is_begin_word_pieces.append(1) example_lengths.append(0) example_probs.append(-20.) example_head_location.append(np.nan) example_token_head.append('[PAD]') example_type_ids.append(type_id_) example_data_ids.append(-1) if span_ids is not None: example_span_ids.append(0) example_index_word_in_example.append(index_word_in_example_) example_index_token_in_sentence.append(index_token_in_sentence_) type_id = 0 _append_special_token('[CLS]', index_word_in_example_=0, index_token_in_sentence_=0, type_id_=type_id) index_token_in_sentence = 0 index_word_in_example = 0 last_sentence_id = None bert_token_groups_with_spacy = list() for spacy_token_group, bert_token_group, word in zip( spacy_token_groups, bert_token_groups, words): bert_token_groups_with_spacy.append( align_spacy_meta(spacy_token_group, bert_token_group, word, bert_tokenizer)) if start < 0: start = len(words) + start if stop is None: stop = len(words) elif stop < 0: stop = len(words) + stop sequences = [(start, stop)] if start_sequence_2 is not None and start_sequence_2 < 0: start_sequence_2 = len(words) + start_sequence_2 if stop_sequence_2 is not None and stop_sequence_2 < 0: stop_sequence_2 = len(words) + stop_sequence_2 if start_sequence_2 is not None: if start_sequence_2 < stop: raise ValueError('start_sequence_2 ({}) < stop ({})'.format( start_sequence_2, stop)) if stop_sequence_2 is None: stop_sequence_2 = len(words) sequences.append((start_sequence_2, stop_sequence_2)) if start_sequence_3 is not None and start_sequence_3 < 0: start_sequence_3 = len(words) + start_sequence_3 if stop_sequence_3 is not None and stop_sequence_3 < 0: stop_sequence_3 = len(words) + stop_sequence_3 if stop_sequence_3 is not None: if stop_sequence_2 is None or start_sequence_3 < stop_sequence_2: raise ValueError( 'start_sequence_3 ({}) < stop_sequence_2 ({})'.format( start_sequence_3, stop_sequence_2)) if stop_sequence_3 is None: stop_sequence_3 = len(words) sequences.append((start_sequence_3, stop_sequence_3)) idx_sequence = 0 for idx_group, bert_tokens_with_spacy in enumerate( bert_token_groups_with_spacy): if last_sentence_id is None or sentence_ids[ idx_group] != last_sentence_id: 
index_token_in_sentence = -1 last_sentence_id = sentence_ids[idx_group] if idx_group >= sequences[idx_sequence][1]: if idx_sequence + 1 < len(sequences): idx_sequence += 1 else: break if idx_group < sequences[idx_sequence][0]: continue assert (sequences[idx_sequence][0] <= idx_group < sequences[idx_sequence][1]) index_word_in_example += 1 idx_data = get_data_token_index(bert_tokens_with_spacy) for idx_token, (t, length, spacy_token) in enumerate(bert_tokens_with_spacy): index_token_in_sentence += 1 idx_head_group = _get_syntactic_head_group( spacy_token, bert_token_groups_with_spacy) head_token = '[PAD]' head_location = np.nan if idx_head_group is not None: idx_head_data_token = get_data_token_index( bert_token_groups_with_spacy[idx_head_group]) head_token = bert_token_groups_with_spacy[idx_head_group][ idx_head_data_token][0] head_location = idx_head_group - idx_group example_tokens.append(t) example_mask.append(1) example_is_stop.append(1 if _is_stop(spacy_token) else 0) example_lengths.append(length) example_probs.append( -20. if spacy_token is None else spacy_token.prob) example_head_location.append(head_location) example_token_head.append(head_token) is_continue_word_piece = t.startswith('##') example_is_begin_word_pieces.append( 0 if is_continue_word_piece else 1) example_type_ids.append(type_id) if span_ids is not None: example_span_ids.append(span_ids[idx_group]) example_index_word_in_example.append(index_word_in_example) example_index_token_in_sentence.append(index_token_in_sentence) # we follow the BERT paper and always use the first word-piece as the labeled one data_id = -1 if data_ids is not None and idx_token == idx_data or is_apply_data_offset_entire_group: data_id = data_ids[idx_group] example_data_ids.append(data_id) if idx_group == sequences[idx_sequence][1]: _append_special_token('[SEP]', index_word_in_example + 1, index_token_in_sentence + 1, type_id) index_word_in_example += 1 type_id = 1 if data_key is None: data_key = dict() if isinstance(data_key, str): data_key = [data_key] def _readonly(arr): arr.setflags(write=False) return arr example_data_ids = _readonly(np.array(example_data_ids)) return InputFeatures( unique_id=unique_id, tokens=tuple(example_tokens), token_ids=_readonly( np.asarray(bert_tokenizer.convert_tokens_to_ids(example_tokens))), mask=_readonly(np.array(example_mask)), is_stop=_readonly(np.array(example_is_stop)), is_begin_word_pieces=_readonly(np.array(example_is_begin_word_pieces)), token_lengths=_readonly(np.array(example_lengths)), token_probabilities=_readonly(np.array(example_probs)), type_ids=_readonly(np.array(example_type_ids)), head_location=_readonly(np.array(example_head_location)), head_tokens=tuple(example_token_head), head_token_ids=_readonly( np.array( bert_tokenizer.convert_tokens_to_ids(example_token_head))), index_word_in_example=_readonly( np.array(example_index_word_in_example)), index_token_in_sentence=_readonly( np.array(example_index_token_in_sentence)), multipart_id=multipart_id, span_ids=_readonly(np.array(example_span_ids)) if example_span_ids is not None else None, data_ids=dict((k, example_data_ids) for k in data_key))
tokenizer = BertTokenizer(
    vocab_file='../input/torch-bert-weights/bert-base-uncased-vocab.txt')

# ## Make prediction

# In[ ]:

# let's tokenize some text (I intentionally misspelled 'plastic' to check BERT's subword handling)
text = 'hi my name is Dieter and I like wearing my yellow pglastic hat while coding.'
tokens = tokenizer.tokenize(text)
tokens

# In[ ]:

# add start and end tokens and convert to ids
tokens = ["[CLS]"] + tokens + ["[SEP]"]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids

# In[ ]:

# put input on gpu and make prediction
bert_output = bert(torch.tensor([input_ids]).cuda())
bert_output

# ## (Optional) Convert model to fp16

# In[ ]:

import apex
bert.half()
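After bert.half() the weights and activations are fp16, but the input ids must stay integer tensors (the embedding lookup takes int64). A hedged inference sketch, assuming the pytorch_pretrained_bert return signature used earlier in this notebook:

# fp16 inference sketch: ids stay int64, weights/activations become half.
bert.half().cuda().eval()
with torch.no_grad():
    encoded_layers, pooled = bert(torch.tensor([input_ids]).cuda())
print(pooled.dtype)  # torch.float16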
def home(): # global model, BERT_FP, bert, tokenizer, nlp model = torch.load('model_sciBERT_CRF10.pth') BERT_FP = 'scibert_scivocab_uncased' bert = BertModel.from_pretrained(BERT_FP) tokenizer = BertTokenizer(vocab_file=BERT_FP + '/vocab.txt') nlp = en_core_web_sm.load() datatowrite = [] result = '' if (request.method == 'POST'): token_indices = [] file_raw = request.form.get('abstract') actual_file = open('abstract_str/abstract.txt', 'w') actual_file.write(file_raw) actual_file.close() file = file_raw.lower() tokens_list = tokenizer.tokenize(file) n = 0 for i, item in enumerate(tokens_list): try: start_index = file.index(item.strip('#')) except: start_index = 100 if ((start_index < 5 or unk == 1) and item != '[UNK]'): token_indices.append( (start_index + n, n + start_index + len(item.strip('#')))) n = token_indices[-1][-1] file = file[start_index + len(item.strip('#')):] else: token_indices.append((-1, -1)) if (item != '[UNK]'): n += len(item.strip('#')) file = file[len(item.strip('#')):] with torch.no_grad(): inputs = tokenizer.convert_tokens_to_ids(tokens_list) inputs = bert(torch.tensor([inputs]))[0] for j in range(len(inputs)): inputs[j] = inputs[j].numpy() inputs = torch.tensor(np.array(inputs)) prediction = model(inputs.permute(1, 2, 0, 3).squeeze(0)) output = prediction[0] dic = {} dataarr = file_raw tagsarr = output indicesarr = token_indices indicesdata = [] datatowrite = [] for j in range(len(tagsarr)): if (tagsarr[j] == 0 or tagsarr[j] == 4): indicesdata.append(list(indicesarr[j])) if (tagsarr[j] == 1 or tagsarr[j] == 2): indicesdata[-1][1] = indicesarr[j][1] indicestowrite = indicesdata ind_temp = [] data_temp = [] for j in indicestowrite: ind_temp.append(j) data_temp.append(dataarr[j[0]:j[1]]) indicestowrite = [] datatowrite = [] for j in range(len(ind_temp)): temp = nlp(data_temp[j]) count = 0 for k in temp: count += 1 if (count == 1): ind = [ [k.start() + 1, k.start() + 1 + len(data_temp[j])] for k in re.finditer( '[^a-z]' + re.escape(data_temp[j].lower()) + '[^a-z]', dataarr.lower()) if [k.start() + 1, k.start() + 1 + len(data_temp[j])] not in ind_temp and [k.start() + 1, k.start() + 1 + len(data_temp[j])] not in indicestowrite ] temp_ind = [] dat = [] for l in ind: if (dataarr[l[0]:l[1]].lower() != dataarr[l[0]:l[1]]): dat.append(dataarr[l[0]:l[1]]) temp_ind.append(l) indicestowrite += temp_ind datatowrite += dat ind_temp = ind_temp + indicestowrite data_temp = data_temp + datatowrite indicestowrite = [] datatowrite = [] for j in range(len(data_temp)): temp_2 = nlp(data_temp[j]) temp = [] for word in temp_2: temp.append((len(word.text), word.text)) if (len(temp) == 1): if (str(temp[0][1]).lower() != str(temp[0][1]) or re.match('^[a-z]+$', temp[0][1]) == None or len(temp[0][1]) > 3): indicestowrite.append(ind_temp[j]) datatowrite.append(data_temp[j]) else: indicestowrite.append(ind_temp[j]) datatowrite.append(data_temp[j]) indicestowrite = sorted(indicestowrite, key=lambda x: x[0]) if (len(indicestowrite) == 0): return render_template("index.html", keyphrases=file_raw) print(indicestowrite) annotation_file = open('abstract_str/abstract.ann', 'w') for qwe in range(len(indicestowrite)): annotation_file.write( 'T' + str(qwe + 1) + '\t' + 'Process ' + str(indicestowrite[qwe][0]) + ' ' + str(indicestowrite[qwe][1]) + '\t' + file_raw[indicestowrite[qwe][0]:indicestowrite[qwe][1]] + '\n') annotation_file.close() X_test, y_test_gold, _, test_entities = read_and_map( 'abstract_str', mapper) loaded_model = pickle.load(open('finalized_model_joined.sav', 'rb')) predictions = 
loaded_model.predict(X_test) y_values = ['Process', 'Material', 'Task'] document_abbr = {} asd = os.listdir('abstract_str') for i in range(len(asd)): document_abbr[asd[i][:-4]] = {} for i in range(len(predictions)): if (test_entities[i].string == test_entities[i].string.upper() and len(test_entities[i].string) > 1): if (y_values[predictions[i]] == "Material"): predictions[i] = y_values.index("Process") if (test_entities[i].string == test_entities[i].string.capitalize() and len(test_entities[i].string) == 2): predictions[i] = y_values.index("Material") tmp = test_entities[i].string.split(" ") if (len(tmp) == 1): if (test_entities[i].string == test_entities[i].string.upper() and hasNumbers(test_entities[i].string)): predictions[i] = y_values.index("Material") if (test_entities[i].string == test_entities[i].string.upper()): try: predictions[i] = document_abbr[test_entities[i].docid][ test_entities[i].string] except: obracket = test_entities[i].start - 1 cbracket = test_entities[i].end file = open( 'abstract_str/' + test_entities[i].docid + '.txt', 'r').read() if (file[obracket] == '(' and file[cbracket] == ')'): if (test_entities[i].start - test_entities[i - 1].end == 2): # print(test_entities[i].string, '\t',test_entities[i-1].string ,'\t' ,test_entities[i].start, '\t',test_entities[i-1].end ) document_abbr[test_entities[i].docid][ test_entities[i].string] = predictions[i - 1] predictions[i] = predictions[i - 1] for j in range(len(tmp)): if (len(tmp[j]) == 1 and tmp[j] == tmp[j].upper()): predictions[i] = y_values.index("Material") # print(predictions) n = 0 result = [] last_closing = 0 for i in range(len(indicestowrite)): qwe_temp = file_raw[n:indicestowrite[i][0]] if (qwe_temp != ''): result.append(qwe_temp) temp = '' if (predictions[i] == 0): temp = '<span style="background-color:rgba(152, 252, 3, 0.5);"><strong>' + file_raw[ indicestowrite[i][0]:indicestowrite[i] [1]] + '</strong></span>' elif (predictions[i] == 1): temp = '<span style="background-color:rgba(252, 152, 3, 0.5);"><strong>' + file_raw[ indicestowrite[i][0]:indicestowrite[i] [1]] + '</strong></span>' elif (predictions[i] == 2): temp = '<span style="background-color:rgba(3, 152, 252, 0.5);"><strong>' + file_raw[ indicestowrite[i][0]:indicestowrite[i] [1]] + '</strong></span>' if (indicestowrite[i][1] > last_closing): result.append(temp) last_closing = indicestowrite[i][1] n = indicestowrite[i][1] # else: # ov_string = file_raw[indicestowrite[i][0]:indicestowrite[i][1]] # temp_start = result[-1].index(ov_string) # result[-1] = result[-1][:temp_start] + temp + result[-1][ temp_start+indicestowrite[i][1] - indicestowrite[i][0]:] # result += '<span style="background-color:rgba(152, 252, 3, 0.5);"><strong>' + file_raw[i[0]:i[1]] + '</strong></span>' result += file_raw[n:] # print(result) result = "".join(result) return render_template("index.html", keyphrases=result)
class BertWithJumanModel():
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        # Load Juman so Japanese text can be pre-tokenized for BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the pretrained BERT model
        self.model = BertModel.from_pretrained(bert_path)
        # Load the tokenizer of the pretrained BERT model
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        # Flag controlling whether a CUDA GPU is used
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        # Preprocessing: strip half-width spaces from the text
        try:
            return text.replace(" ", "")  # for Juman
        except:
            return ''

    def get_sentence_embedding(self, text, pooling_layer=-2,
                               pooling_strategy="REDUCE_MEAN"):
        # Strip half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # Segment the Japanese text into a list of tokens
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with spaces and run BERT wordpiece tokenization
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # The input is capped at 128 tokens (header + 126 tokens + footer);
        # convert the tokens to vocab ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        tokens_tensor = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            # Move the input and the model onto the GPU
            tokens_tensor = tokens_tensor.to('cuda')
            self.model.to('cuda')

        # Put the model in evaluation mode
        self.model.eval()
        with torch.no_grad():  # disable autograd (saves memory, speeds inference)
            # Compute vector representations from the id sequence
            all_encoder_layers, _ = self.model(tokens_tensor)

        # As in SWEM, pool the token vectors along the time axis so the output
        # dimension stays fixed regardless of sentence length
        # https://yag-ays.github.io/project/swem/
        embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0]
        if pooling_strategy == "REDUCE_MEAN":
            return np.mean(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MAX":
            return np.max(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MEAN_MAX":
            return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)]
        elif pooling_strategy == "CLS_TOKEN":
            return embedding[0]
        else:
            raise ValueError("specify valid pooling_strategy: "
                             "{REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}")
tokenizer = BertTokenizer(vocab_file='biobert_v1.0_pubmed_pmc/vocab.txt',
                          do_lower_case=False)

tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, tags)
]

tokenized_texts = [
    token_label_pair[0] for token_label_pair in tokenized_texts_and_labels
]
labels = [
    token_label_pair[1] for token_label_pair in tokenized_texts_and_labels
]

input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
    maxlen=MAX_LEN, dtype="long", value=0.0, truncating="post", padding="post")

tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
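The attention mask above is derived purely from the padded id matrix: any nonzero id counts as a real token. A toy check of that rule (the ids are illustrative):

row = [101, 7592, 102, 0, 0]  # [CLS] hello [SEP] pad pad
mask = [float(i != 0.0) for i in row]
print(mask)  # [1.0, 1.0, 1.0, 0.0, 0.0]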
class BertWithJumanModel:
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        self.juman_tokenizer = JumanTokenizer()
        self.model = BertModel.from_pretrained(bert_path)
        self.bert_tokenizer = BertTokenizer(
            Path(bert_path) / vocab_file_name,
            do_lower_case=False,
            do_basic_tokenize=False,
        )
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        return text.replace(" ", "")  # for Juman

    def get_sentence_embedding(self, text, pooling_layer=-2,
                               pooling_strategy="REDUCE_MEAN"):
        preprocessed_text = self._preprocess_text(text)
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        tokens_tensor = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            tokens_tensor = tokens_tensor.to("cuda")
            self.model.to("cuda")

        self.model.eval()
        with torch.no_grad():
            all_encoder_layers, _ = self.model(tokens_tensor)

        embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0]
        if pooling_strategy == "REDUCE_MEAN":
            return np.mean(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MAX":
            return np.max(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MEAN_MAX":
            return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)]
        elif pooling_strategy == "CLS_TOKEN":
            return embedding[0]
        else:
            raise ValueError(
                "specify valid pooling_strategy: {REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}"
            )

    # edited
    def sentence_list_to_vec(self, sentence_list: list):
        """Takes a list of sentences, one sentence per element."""
        vec_list = []
        vec_mean = 0
        if not sentence_list:
            return ([None], np.zeros(768))
        try:
            for s in sentence_list:
                tmp = self.get_sentence_embedding(s)
                vec_list.append(tmp)
                vec_mean += tmp
            # Average over the number of sentences
            vec_mean = vec_mean / len(vec_list)
            return vec_list, vec_mean
        except ValueError:
            return ([None], np.zeros(768))

    def sentence_list_to_vec_with_bug(self, sentence_list: list):
        """Takes a list of sentences, one sentence per element.
        Buggy variant kept for reference, as the name indicates."""
        vec_list = []
        vec_mean = 0
        for s in sentence_list:
            tmp = self.get_sentence_embedding(s)
            vec_list.append(tmp)
            vec_mean += tmp
        # Bug: divides by the embedding dimension (len of the vector, 768)
        # instead of the number of sentences
        vec_mean = vec_mean / len(vec_mean)
        return vec_list, vec_mean
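# Usage sketch for sentence_list_to_vec() above (not from the original
# source). The model path is a placeholder assumption; the mean vector has
# the model's hidden size (768 for a base-size model).
model = BertWithJumanModel("PATH_TO_BERTJPN")
vec_list, vec_mean = model.sentence_list_to_vec(["今日は晴れだ。", "明日は雨らしい。"])
print(len(vec_list), vec_mean.shape)  # 2 (768,)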
class Generater:
    def __init__(self, bert_path):
        vocab_file_name = 'vocab.txt'
        # Load Juman so Japanese text can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the pretrained BERT model's tokenizer
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        self.vocab_size = len(self.bert_tokenizer.vocab)
        # Load the pretrained BERT model with its MaskedLM head (the original
        # also loaded a plain BertModel first, only to overwrite it here)
        self.model = BertForMaskedLM.from_pretrained(bert_path)
        # Tokens to exclude: special tokens and punctuation
        except_tokens = [
            "[MASK]",
            # "[PAD]",
            "[UNK]", "[CLS]", "[SEP]",
            "(", ")", "・", "/", "、", "。", "!", "?",
            "「", "」", "…", "’", "』", "『", ":", "※",
        ]
        self.except_ids = [self.bert_tokenizer.vocab[token] for token in except_tokens]
        # Every id in the vocab is usable except the ones in except_ids
        self.candidate_ids = [i for i in range(self.vocab_size)
                              if i not in self.except_ids]

    def _preprocess_text(self, text):
        # Preprocessing: strip half-width spaces and '#' characters
        return text.replace(" ", "").replace('#', '')  # for Juman

    def text2tokens(self, text):
        # Strip half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # Segment the Japanese text into a list of tokens
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with half-width spaces into a single string
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # Max sequence length is 128: header + 126 tokens + footer, as ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        generated_token_ids = torch.tensor(ids).reshape(1, -1)
        return generated_token_ids

    def tokens2text(self, tokens):
        sampled_sequence = [self.bert_tokenizer.ids_to_tokens[token_id]
                            for token_id in tokens[0].cpu().numpy()]
        sampled_sequence = "".join([
            token[2:] if token.startswith("##") else token
            for token in list(filter(
                lambda x: x != '[PAD]' and x != '[CLS]' and x != '[SEP]',
                sampled_sequence))
        ])
        return sampled_sequence

    def likelihood(self, tokens):
        outputs = self.model(tokens)
        predictions = outputs[0]
        score_sum = 0.0
        for idx, scores in zip(tokens[0].tolist(), predictions[0].tolist()):
            score_sum += scores[idx]
        return score_sum

    def initialization_text(self, length=10):
        init_tokens = []
        # Header
        init_tokens.append(self.bert_tokenizer.vocab["[CLS]"])
        for _ in range(length):
            # Pick a token at random
            init_tokens.append(random.choice(self.candidate_ids))
        # Footer
        init_tokens.append(self.bert_tokenizer.vocab["[SEP]"])
        return torch.tensor(init_tokens).reshape(1, -1)

    def scoring(self, tokens):
        return (self.likelihood(tokens)
                + self.juman_tokenizer.tanka_score_subsets(self.tokens2text(tokens))
                + self.juman_tokenizer.tanka_score_flow(self.tokens2text(tokens)))

    def select(self, l_tokens, size=5):
        scores = list(map(self.scoring, l_tokens))
        print(sorted(scores, reverse=True)[:3])
        selected = list(map(
            lambda x: x[0],
            sorted(list(zip(l_tokens, scores)), key=lambda x: x[1], reverse=True)
        ))
        # Keep the top `size` candidates (the original computed the ranking
        # but never applied `size`)
        return selected[:size]

    def crossover(self, tokens_0, tokens_1):
        l_tokens_0 = tokens_0.numpy().reshape(-1).tolist()
        l_tokens_1 = tokens_1.numpy().reshape(-1).tolist()
        # Swap a random contiguous slice, keeping [CLS]/[SEP] in place
        start = random.randint(1, len(l_tokens_0) - 3)
        end = random.randint(start, len(l_tokens_0) - 2)
        for num in range(start, end):
            l_tokens_0[num] = l_tokens_1[num]
        return torch.tensor(l_tokens_0).reshape(1, -1)

    def mutation(self, tokens, N=3):
        l_tokens = tokens.numpy().reshape(-1).tolist()
        for _ in range(N):
            # Mask a random position (never the [CLS]/[SEP] boundary tokens)
            num = random.randint(1, len(l_tokens) - 2)
            l_tokens[num] = self.bert_tokenizer.vocab["[MASK]"]
            outputs = self.model(torch.tensor(l_tokens).reshape(1, -1))
            predictions = outputs[0]
            _, predicted_indexes = torch.topk(predictions[0, num], k=10)
            # random_tokens = [random.choice(self.candidate_ids) for i in range(1)]
            random_tokens = []
            predicted_indexes = list(
                set(predicted_indexes.tolist() + random_tokens)
                - set(self.except_ids)
            )
            # predicted_tokens = self.bert_tokenizer.convert_ids_to_tokens(predicted_indexes)  # debugging aid, unused
            # Fill the mask with one of the model's top predictions
            predict_token = random.choice(predicted_indexes)
            l_tokens[num] = predict_token
        return torch.tensor(l_tokens).reshape(1, -1)
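# The Generater class above defines the genetic operators (initialization,
# selection, crossover, mutation) but no driver loop. A sketch of how they
# could be wired together; the population size, generation count, and model
# path are assumptions, not values from the original code.
import random

generater = Generater("PATH_TO_BERTJPN")
population = [generater.initialization_text(length=10) for _ in range(10)]
for generation in range(20):
    # Keep the highest-scoring candidates
    population = generater.select(population, size=5)
    children = []
    while len(children) + len(population) < 10:
        # Breed two random parents, then mutate the child
        parent_0, parent_1 = random.sample(population, 2)
        child = generater.crossover(parent_0, parent_1)
        children.append(generater.mutation(child))
    population += children
print(generater.tokens2text(population[0]))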
class BertWithJumanModel():
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        # Load Juman so Japanese text can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the pretrained BERT model with its MaskedLM head
        self.model = BertForMaskedLM.from_pretrained(bert_path)
        # Load the pretrained BERT model's tokenizer
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        # Flag for whether to use a CUDA GPU
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        # Preprocessing: strip half-width spaces from the text
        return text.replace(" ", "")  # for Juman

    def paraphrase(self, text):
        # Strip half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # Segment the Japanese text into a list of tokens
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with half-width spaces into a single string
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # Max sequence length is 128: header + 126 tokens + footer, as ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        generated_token_ids = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            # Move the input and the model to the GPU
            generated_token_ids = generated_token_ids.to('cuda')
            self.model.to('cuda')

        # Switch the model to evaluation mode
        self.model.eval()
        with torch.no_grad():
            for i in range(10):
                for j, _ in enumerate(tokens):
                    # Replace one token of the sentence with [MASK];
                    # offset by +1 to skip the [CLS] header
                    masked_index = j + 1
                    pre_token = generated_token_ids[0, masked_index].item()
                    generated_token_ids[0, masked_index] = \
                        self.bert_tokenizer.vocab["[MASK]"]
                    outputs = self.model(generated_token_ids)
                    predictions = outputs[0]
                    _, predicted_indexes = torch.topk(
                        predictions[0, masked_index], k=5)
                    predicted_tokens = self.bert_tokenizer.convert_ids_to_tokens(
                        predicted_indexes.tolist())
                    print(predicted_tokens)
                    predict_token = predicted_indexes.tolist()[0]
                    # if pre_token == predict_token:
                    #     predict_token = predicted_indexes.tolist()[1]
                    generated_token_ids[0, masked_index] = predict_token

                # Convert the ids back to strings and join them
                sampled_sequence = [
                    self.bert_tokenizer.ids_to_tokens[token_id]
                    for token_id in generated_token_ids[0].cpu().numpy()
                ]
                sampled_sequence = "".join([
                    token[2:] if token.startswith("##") else token
                    for token in list(
                        filter(lambda x: x != '[PAD]', sampled_sequence))
                ])
                logger.info("sampled sequence: {}".format(sampled_sequence))
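# Usage sketch for the paraphrase variant above (not from the original
# source; the model path is a placeholder assumption). Each of the 10 passes
# re-predicts every token in turn, so the text drifts progressively away
# from the input, with intermediate results written to the logger.
model = BertWithJumanModel("PATH_TO_BERTJPN", use_cuda=False)
model.paraphrase("今日はいい天気ですね。")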