def convert_data_to_feature(filepath):
    DRCD = LoadJson(filepath)
    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')
    token_embeddings = []
    segement_embeddings = []
    attention_mask = []
    masked_lm_labels = []
    max_seq_len = 0
    Context_count = 0
    # Training BertForMaskedLM needs the special '[MASK]' token and the ids of the masked-out tokens.
    # Max context length 450, max question length 42, max answer length 16, plus 4 special tokens
    # (1 [CLS], 3 [SEP]); the total therefore never exceeds 512.
    for data in DRCD["data"]:
        for paragraph in data["paragraphs"]:
            context = paragraph["context"]
            word_piece_list = tokenizer.tokenize(context)
            if len(word_piece_list) <= 450:
                for qa in paragraph["qas"]:
                    question = qa["question"]
                    word_piece_list = tokenizer.tokenize(question)
                    if len(word_piece_list) <= 42:
                        answer = qa["answers"][0]["text"]
                        answer = answer + "[SEP]"
                        word_piece_list = tokenizer.tokenize(answer)
                        if len(word_piece_list) <= 16:
                            max_seq_len = create_input_features(tokenizer, context, question, answer,
                                                                token_embeddings, segement_embeddings,
                                                                attention_mask, masked_lm_labels,
                                                                max_seq_len)
                Context_count += 1
    print("Max length:", max_seq_len)
    print("Contexts matching the criteria: " + str(Context_count))
    print("Total examples generated: " + str(len(token_embeddings)))
    assert max_seq_len <= 512  # within the BERT-base length limit
    max_seq_len = 512  # pad everything to 512 (so train and test data share the same max length)
    # pad to max_seq_len
    for c in token_embeddings:
        while len(c) < max_seq_len:
            c.append(0)
    for c in segement_embeddings:
        while len(c) < max_seq_len:
            c.append(0)
    for c in attention_mask:
        while len(c) < max_seq_len:
            c.append(0)
    for c_l in masked_lm_labels:
        while len(c_l) < max_seq_len:
            c_l.append(-1)
    # BERT input embeddings
    assert len(token_embeddings) == len(segement_embeddings) \
        and len(token_embeddings) == len(attention_mask) \
        and len(token_embeddings) == len(masked_lm_labels)
    data_features = {'token_embeddings': token_embeddings,
                     'segement_embeddings': segement_embeddings,
                     'attention_mask': attention_mask,
                     'masked_lm_labels': masked_lm_labels}
    return data_features
def find_paragraph(tokenizer: BertTokenizer,
                   model: BertForNextSentencePrediction,
                   question: str,
                   context: str,
                   max_len=256,
                   batch_size=16):
    q_len = len(tokenizer.tokenize(question))
    context_tokens = tokenizer.tokenize(context)
    part_len = max_len - q_len - 3
    parts = []
    n = 0
    while n < len(context_tokens):
        parts += [context_tokens[n:n + part_len]]
        n += part_len // 2
    results = []
    all_parts = parts[:]
    while len(parts) > 0:
        batch = tokenizer.batch_encode_plus(
            list(zip([question] * batch_size, parts[:batch_size])),
            max_length=max_len,
            truncation=True,
            pad_to_max_length=True,
            return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = model(**batch)[0]
        results += [a - b for a, b in output.cpu().tolist()]
        parts = parts[batch_size:]
    return np.array(results), [
        tokenizer.decode(tokenizer.encode(part), skip_special_tokens=True)
        for part in all_parts
    ]
def str2id(tokenizer: BertTokenizer, sys_utter: str, usr_utter: str,
           source: str) -> Tuple[List[int], List[int]]:
    """Convert system utterance, user utterance and source tokens to ids based on BertTokenizer.

    Args:
        tokenizer: BertTokenizer
        sys_utter: system utterance
        usr_utter: user utterance
        source: slot + value

    Returns:
        input_ids and token_type_ids
    """
    sys_utter_tokens = tokenizer.tokenize(sys_utter)
    usr_utter_tokens = tokenizer.tokenize(usr_utter)
    source_tokens = tokenizer.tokenize(source)
    sys_utter_ids = tokenizer.convert_tokens_to_ids(sys_utter_tokens)
    usr_utter_ids = tokenizer.convert_tokens_to_ids(usr_utter_tokens)
    source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
    input_ids = ([tokenizer.cls_token_id] + sys_utter_ids +
                 [tokenizer.sep_token_id] + usr_utter_ids +
                 [tokenizer.sep_token_id] + source_ids +
                 [tokenizer.sep_token_id])
    token_type_ids = ([0] + [0] * (len(sys_utter_ids) + 1) +
                      [1] * (len(usr_utter_ids) + 1) +
                      [0] * (len(source_ids) + 1))
    return input_ids, token_type_ids
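# Hedged usage sketch for str2id above (the utterances and the slot-value string
# are invented; it assumes the `transformers` package and the public
# 'bert-base-uncased' checkpoint, which the function itself does not mandate).
from transformers import BertTokenizer

_tok = BertTokenizer.from_pretrained('bert-base-uncased')
_ids, _type_ids = str2id(_tok,
                         sys_utter='Which area are you looking for?',
                         usr_utter='Somewhere in the centre, please.',
                         source='area centre')
assert len(_ids) == len(_type_ids)
# Layout: [CLS] sys [SEP] usr [SEP] source [SEP], with token_type_ids 0 / 1 / 0.
print(_tok.convert_ids_to_tokens(_ids))
print(_type_ids)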
class Tokenizer:
    def __init__(self, tokenizer_name="komoran"):
        assert (tokenizer_name.lower() == "komoran") or (tokenizer_name.lower() == "mecab") \
            or (tokenizer_name.lower() == "subword"), \
            "Only 'komoran', 'mecab', and 'subword' are acceptable."
        if tokenizer_name == "komoran":
            self.tokenizer = Komoran("STABLE")
        elif tokenizer_name == "mecab":
            self.tokenizer = Mecab()
        elif tokenizer_name == "subword":
            self.tokenizer = BertTokenizer(resource_filename(__package__, "vocab_noised.txt"),
                                           do_lower_case=False)
        self.tokenizer_name = tokenizer_name

    def tokenize(self, text):
        if self.tokenizer_name == "komoran":
            return self.tokenizer.get_morphes_by_tags(text)
        elif self.tokenizer_name == "mecab":
            return self.tokenizer.morphs(text)
        else:  # self.tokenizer_name is "subword"
            return self.tokenizer.tokenize(text)

    def post_process(self, tokens):
        if self.tokenizer_name == "komoran":
            return " ".join(tokens)
        elif self.tokenizer_name == "mecab":
            return " ".join(tokens)
        else:  # self.tokenizer_name is "subword"
            return self.tokenizer.convert_tokens_to_string(tokens)
def chat(folder_bert, voc, testing=False):
    tf.random.set_seed(1)
    tokenizer = BertTokenizer(vocab_file=folder_bert + voc)
    if testing:
        tokens = tokenizer.tokenize("jeg tror det skal regne")
        print(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        print(ids)
        print("Vocab size:", len(tokenizer.vocab))
    config = BertConfig.from_json_file(folder_bert + "/config.json")
    model = BertLMHeadModel.from_pretrained(folder_bert, config=config)
    while (1):
        text = input(">>User: ")
        # ****** (redacted in the source: the code that encodes `text` into
        # `input_ids` and generates `sample_output` with the model is missing here)
        print("Bot: {}".format(tokenizer.decode(sample_output[0])))
        print("Bot: {}".format(
            tokenizer.decode(sample_output[:, input_ids.shape[-1]:][0],
                             skip_special_tokens=True)))
def read_data(filename: str, tokenizer: BertTokenizer, args: TrainingArguments) -> List:
    with open(filename, encoding="utf-8") as f:
        data = f.readlines()
    items = [
        item for item in json.loads(data[0])['data']
        if 'russian' in item['paragraphs'][0]['qas'][0]['id']
    ]
    ds = []
    for item in items:
        paragraph = item['paragraphs'][0]
        context = paragraph['context']
        qas = paragraph['qas'][0]
        question = qas['question']
        answer = qas['answers'][0]
        answer_start = answer['answer_start']
        answer_text = answer['text']
        ids = tokenizer.encode(question, context[:answer_start])
        start = len(ids) - 1
        end = start + len(tokenizer.tokenize(answer_text))
        if end < args.block_size:
            ds += [{
                "question": question,
                "context": context,
                "start": start,
                "end": end
            }]
    return ds
def featurize(self, df):
    bert_model = BertModel.from_pretrained(self.data_path)
    bert_tokenizer = BertTokenizer(self.data_path + "/vocab.txt",
                                   do_lower_case=False,
                                   do_basic_tokenize=False)
    mecab = MeCab.Tagger('-Ochasen')
    data_list = df.rdd.collect()
    label_list = []
    vec_list = []
    for data in data_list:
        tmp_list = []
        node_list = data[1]
        for word in node_list:
            tmp_list.append(word)
        if len(tmp_list) != 0:
            label_list.append(float(data[0]))
            bert_tokens = bert_tokenizer.tokenize(
                " ".join(["[CLS]"] + tmp_list + ["[SEP]"]))
            token_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
            tokens_tensor = torch.tensor(token_ids).unsqueeze(0)
            all_outputs = bert_model(tokens_tensor)
            embedding = all_outputs[-2].detach().numpy()[0]
            vec = np.mean(embedding, axis=0).tolist()
            vec_list.append(Vectors.dense(vec))
    zip_list = zip(label_list, vec_list)
    new_df = self.spark.createDataFrame(zip_list, ("label", "features"))
    return new_df
def generate_embedding(
    self,
    model: transformers.BertModel,
    tokenizer: transformers.BertTokenizer,
    product: pd.Series,
    feature_columns: List[str],
) -> torch.Tensor:
    model.eval()
    if (Project.exported_objects_dir / f"{product['product_id']}.obj").exists():
        return self.load_already_geneated_embedding(product=product)
    product_description = self.generate_product_description(
        product=product, feature_columns=feature_columns)
    marked_text = "[CLS] " + product_description + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]
    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    torch.save(
        sentence_embedding,
        Project.exported_objects_dir / f"{product['product_id']}.obj",
    )
    return sentence_embedding
def createVocabulary(reciepts):
    vocab = set()
    for reciept in reciepts:
        words = reciept.dataWords
        for word in words:
            vocab.add(word)
    path = './data/prod_vocab.txt'
    with open(path, 'r') as f:
        for line in f:
            vocab.add(line[:-1])
    tokenizer = BertTokenizer(vocab_file=path, do_lower_case=False)
    new_set = set()
    for word in vocab:
        token_list = tokenizer.tokenize(word)
        if '[UNK]' in token_list:
            print(word)
            t = re.split(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', word)
            for i, v in enumerate(token_list):
                if v == '[UNK]' and i < len(t):
                    for x in t:
                        new_set.add(x)
    with open('./data/prod_vocab.txt', 'w+') as f:
        for word in (vocab.union(new_set)):
            f.write(word + '\n')
    return vocab
class BertBPE(object):
    def __init__(self, cfg):
        try:
            from transformers import BertTokenizer
        except ImportError:
            raise ImportError(
                "Please install transformers with: pip install transformers")
        if cfg.bpe_vocab_file:
            self.bert_tokenizer = BertTokenizer(
                cfg.bpe_vocab_file, do_lower_case=not cfg.bpe_cased)
        else:
            vocab_file_name = ("bert-base-cased"
                               if cfg.bpe_cased else "bert-base-uncased")
            self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_file_name)

    def encode(self, x: str) -> str:
        return " ".join(self.bert_tokenizer.tokenize(x))

    def decode(self, x: str) -> str:
        return self.bert_tokenizer.clean_up_tokenization(
            self.bert_tokenizer.convert_tokens_to_string(x.split(" ")))

    def is_beginning_of_word(self, x: str) -> bool:
        return not x.startswith("##")
def get_indices_and_masks(sent_tokens: List[str],
                          in_sent_start: int,
                          in_sent_end: int,
                          tokenizer: BertTokenizer,
                          mask_mention: bool = False) \
        -> Tuple[List[int], List[float], int, int]:
    if in_sent_start not in range(len(sent_tokens)) or \
            in_sent_end not in range(1, len(sent_tokens) + 1):
        raise ValueError(
            f'wrong input: tokens {sent_tokens} don\'t contain pos'
            f' ({in_sent_start}, {in_sent_end}).')
    if mask_mention:
        for n in range(in_sent_start, in_sent_end):
            sent_tokens[n] = tokenizer.mask_token
    sent_subword_idxs = []
    sent_subwords = []
    sent_hypo_mask = []
    new_in_sent_start, new_in_sent_end = None, None
    for n, tok in enumerate(sent_tokens):
        if n == in_sent_start:
            new_in_sent_start = len(sent_subwords)
        subtokens = tokenizer.tokenize(tok)
        sent_subwords.extend(subtokens)
        subtok_idxs = tokenizer.convert_tokens_to_ids(subtokens)
        sent_subword_idxs.extend(subtok_idxs)
        # NOTE: no "+ 1" here because there is no [CLS] token at the beginning
        mask_value = float(in_sent_start <= n < in_sent_end)
        sent_hypo_mask.extend([mask_value] * len(subtok_idxs))
        if n == in_sent_end - 1:
            new_in_sent_end = len(sent_subwords) + 1
    return sent_subword_idxs, sent_hypo_mask, new_in_sent_start, new_in_sent_end
def get_embedding(phrases: List[str],
                  emb_mat: torch.Tensor,
                  tokenizer: BertTokenizer,
                  debug: bool = False) -> torch.Tensor:
    # emb_mat: [vocab_size, emb_size]
    # returns: [num_phrases, emb_size]
    subtok_ids, subtok_masks = [], []
    max_len = 0
    for w in phrases:
        subtok_toks = tokenizer.tokenize(w)
        subtok_ids.append(tokenizer.convert_tokens_to_ids(subtok_toks))
        num_subtoks = len(subtok_ids[-1])
        subtok_masks.append([1.] * num_subtoks)
        if debug:
            print(f"subtok_ids('{w}') = {subtok_ids[-1]}")
            print(f'{[tokenizer._convert_id_to_token(s) for s in subtok_ids[-1]]}')
        max_len = max_len if max_len > num_subtoks else num_subtoks
    # subtok_ids, subtok_masks: [num_phrases, max_len]
    subtok_ids = torch.tensor(
        [sw_list + [-1] * (max_len - len(sw_list)) for sw_list in subtok_ids])
    subtok_masks = torch.tensor(
        [m + [0.] * (max_len - len(m)) for m in subtok_masks])
    # subtok_sizes: [num_phrases]
    subtok_sizes = torch.sum(subtok_masks, 1)
    if debug:
        print(subtok_sizes)
    # emb_mat[subtok_ids]: [num_phrases, max_len, emb_size]
    return torch.sum(emb_mat[subtok_ids] * subtok_masks.unsqueeze(2), axis=1) \
        / subtok_sizes.unsqueeze(1)
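# A minimal sketch of calling get_embedding with BERT's input word-embedding
# matrix as `emb_mat` (the phrases are invented; 'bert-base-uncased' is an
# assumption, any BERT checkpoint with a matching tokenizer would do).
import torch
from transformers import BertModel, BertTokenizer

_tok = BertTokenizer.from_pretrained('bert-base-uncased')
_bert = BertModel.from_pretrained('bert-base-uncased')
_emb_mat = _bert.embeddings.word_embeddings.weight.detach()  # [vocab_size, emb_size]
_vecs = get_embedding(['hot dog', 'animal'], _emb_mat, _tok)
print(_vecs.shape)  # one averaged subword embedding per phrase, e.g. torch.Size([2, 768]) for BERT-base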
def _ner_bert_tokenize(tokens: List[str],
                       tags: List[str],
                       tokenizer: BertTokenizer,
                       max_subword_len: int = None,
                       mode: str = None,
                       subword_mask_mode: str = "first",
                       token_masking_prob: float = None) -> Tuple[List[str], List[int], List[str]]:
    do_masking = (mode == 'train') and (token_masking_prob is not None)
    do_cutting = (max_subword_len is not None)
    tokens_subword = ['[CLS]']
    startofword_markers = [0]
    tags_subword = ['X']
    for token, tag in zip(tokens, tags):
        token_marker = int(tag != 'X')
        subwords = tokenizer.tokenize(token)
        if not subwords or (do_cutting and (len(subwords) > max_subword_len)):
            tokens_subword.append('[UNK]')
            startofword_markers.append(token_marker)
            tags_subword.append(tag)
        else:
            if do_masking and (random.random() < token_masking_prob):
                tokens_subword.extend(['[MASK]'] * len(subwords))
            else:
                tokens_subword.extend(subwords)
            if subword_mask_mode == "last":
                startofword_markers.extend([0] * (len(subwords) - 1) + [token_marker])
            else:
                startofword_markers.extend([token_marker] + [0] * (len(subwords) - 1))
            tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))
    tokens_subword.append('[SEP]')
    startofword_markers.append(0)
    tags_subword.append('X')
    return tokens_subword, startofword_markers, tags_subword
class JapaneseWorker:
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_basic_tokenize=False)
        self.cls_id = self.bert_tokenizer.vocab['[CLS]']
        self.mask_id = self.bert_tokenizer.vocab['[MASK]']
        self.bert_model = 'model/Japanese/'
        self.cp = 'checkpoint/jp/cp_step_1200000.pt'
        self.opt = 'checkpoint/jp/opt_step_1200000.pt'

    @staticmethod
    def linesplit(src):
        """
        :param src: type str, article as a single string
        :return: type list, punctuation-separated sentences
        """
        def remove_newline(x):
            x = x.replace('\n', '')
            return x

        def remove_blank(x):
            x = x.replace(' ', '')
            return x

        def remove_unknown(x):
            unknown = ['\u3000']
            for h in unknown:
                x = x.replace(h, '')
            return x

        src = remove_blank(src)
        src = remove_newline(src)
        src = remove_unknown(src)
        src_line = re.split('。(?<!」)|!(?<!」)|?(?!」)', src)
        src_line = [x for x in src_line if x != '']
        return src_line

    def tokenizer(self, src):
        """
        :param src: type list, punctuation-separated sentences
        :return: token: type list, tokens
                 token_id: type list, numberized token ids
        """
        token = []
        token_id = []

        def _preprocess_text(text):
            return text.replace(" ", "")  # for Juman

        for sentence in src:
            preprocessed_text = _preprocess_text(sentence)
            juman_tokens = self.juman_tokenizer(preprocessed_text)
            tokens = self.bert_tokenizer.tokenize(" ".join(juman_tokens))
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
            token += tokens
            token_id += ids
        return token, token_id
def generate_template(tokenizer: BertTokenizer, first_name: str,
                      last_name: str, mode: str) -> str:
    """Generate a template given the information given.

    @param tokenizer is the tokenizer for the model.
    @param first_name is the patient's first name.
    @param last_name is the patient's last name.
    @param mode will determine if we mask out first or last name.
    @return the template to be encoded (with MASKs).
    """
    if mode == "mask_first":
        tok_name = tokenizer.tokenize(first_name)
        mask_string = "[MASK] " * len(tok_name)
        return f"[CLS] {mask_string.strip()} {last_name} [SEP]"
    elif mode == "mask_last":
        tok_name = tokenizer.tokenize(last_name)
        mask_string = "[MASK] " * len(tok_name)
        return f"[CLS] {first_name} {mask_string.strip()} [SEP]"
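# Hedged example for generate_template (the names are made up; the tokenizer is
# the public 'bert-base-uncased' checkpoint, an assumption rather than something
# the function requires). The number of [MASK]s equals the number of word pieces
# the masked name tokenizes into.
from transformers import BertTokenizer

_tok = BertTokenizer.from_pretrained('bert-base-uncased')
print(generate_template(_tok, first_name='Maria', last_name='Oyelaran', mode='mask_first'))
# e.g. "[CLS] [MASK] Oyelaran [SEP]" if "maria" is a single word piece
print(generate_template(_tok, first_name='Maria', last_name='Oyelaran', mode='mask_last'))
# e.g. "[CLS] Maria [MASK] [MASK] ... [SEP]" with one [MASK] per word piece of "Oyelaran"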
class NemoBertTokenizer(TokenizerSpec):
    def __init__(
        self,
        pretrained_model=None,
        vocab_file=None,
        do_lower_case=True,
        max_len=None,
        do_basic_tokenize=True,
        never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"),
    ):
        if pretrained_model:
            self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
            if "uncased" not in pretrained_model:
                self.tokenizer.basic_tokenizer.do_lower_case = False
        else:
            self.tokenizer = BertTokenizer(vocab_file, do_lower_case,
                                           do_basic_tokenize)
        self.vocab_size = len(self.tokenizer.vocab)
        self.never_split = never_split

    def text_to_tokens(self, text):
        tokens = self.tokenizer.tokenize(text)
        return tokens

    def tokens_to_text(self, tokens):
        text = self.tokenizer.convert_tokens_to_string(tokens)
        return remove_spaces(handle_quotes(text.strip()))

    def token_to_id(self, token):
        return self.tokens_to_ids([token])[0]

    def tokens_to_ids(self, tokens):
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        return ids

    def ids_to_tokens(self, ids):
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        return tokens

    def text_to_ids(self, text):
        tokens = self.text_to_tokens(text)
        ids = self.tokens_to_ids(tokens)
        return ids

    def ids_to_text(self, ids):
        tokens = self.ids_to_tokens(ids)
        tokens_clean = [t for t in tokens if t not in self.never_split]
        text = self.tokens_to_text(tokens_clean)
        return text

    def pad_id(self):
        return self.tokens_to_ids(["[PAD]"])[0]

    def bos_id(self):
        return self.tokens_to_ids(["[CLS]"])[0]

    def eos_id(self):
        return self.tokens_to_ids(["[SEP]"])[0]
def example_to_input(lemma_list: List[str], tags_list: List[int],
                     tok: BertTokenizer):
    subword_list, tags_map = tok.convert_tokens_to_ids(tok.tokenize('[CLS]')), []
    for w in lemma_list:
        tags_map.append(len(subword_list))
        subword_list += tok.convert_tokens_to_ids(tok.tokenize(w))
    subword_list += tok.convert_tokens_to_ids(tok.tokenize('[SEP]'))
    mapped_tags = [0] * len(subword_list)
    # mapped_pos = [0] * len(subword_list)
    # mapped_lemmas = ["[UNK]"] * len(subword_list)
    # mapped_altern = [[]] * len(subword_list)
    for i, j in enumerate(tags_map):
        mapped_tags[j] = tags_list[i]
        # mapped_pos[j] = example['pos'][i]
        # mapped_lemmas[j] = example['lemmas'][i]
        # mapped_altern[j] = example['alternatives'][i]
    return subword_list, mapped_tags
def convert_data_to_feature():
    # load the question dataset
    q = open('Dataset/Query_Train/Final_question.txt', "r", encoding="utf-8")
    questions = q.readlines()
    q.close()
    # load the answer dataset
    a = open('Dataset/Train_Label/FinalDomainLabel.txt', "r", encoding="utf-8")
    answers = a.readlines()
    a.close()
    assert len(answers) == len(questions)
    # ans_dic holds the answer classes
    ans_dic = make_ans_dic(answers)
    # question_dic holds the question classes
    question_dic = make_question_dic(questions)
    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')
    q_tokens = []
    max_seq_len = 0
    for q in question_dic.data:
        bert_ids = tokenizer.build_inputs_with_special_tokens(
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))
        if (len(bert_ids) > max_seq_len):
            max_seq_len = len(bert_ids)
        q_tokens.append(bert_ids)
    print("Longest question length:", max_seq_len)
    assert max_seq_len <= 512  # within the BERT-base length limit
    # pad to max_seq_len
    for q in q_tokens:
        while len(q) < max_seq_len:
            q.append(0)
    a_labels = []
    for a in ans_dic.data:
        a_labels.append(ans_dic.to_id(a))
    # BERT input embeddings
    answer_lables = a_labels
    input_ids = q_tokens
    input_masks = [[1] * max_seq_len for i in range(len(question_dic))]
    input_segment_ids = [[0] * max_seq_len for i in range(len(question_dic))]
    assert len(input_ids) == len(question_dic) and len(input_ids) == len(
        input_masks) and len(input_ids) == len(input_segment_ids)
    data_features = {
        'input_ids': input_ids,
        'input_masks': input_masks,
        'input_segment_ids': input_segment_ids,
        'answer_lables': answer_lables,
        'question_dic': question_dic,
        'answer_dic': ans_dic
    }
    output = open('Dataset/data_features_domain.pkl', 'wb')
    pickle.dump(data_features, output)
    return data_features
def __init__(self, bert_tokenizer: BertTokenizer, jp_tokenizer: JumanTokenizer,
             args, file_path='train', block_size=512):
    assert os.path.isfile(file_path)
    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory, 'cached_lm_' + str(block_size) + '_' + filename)
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, 'rb') as handle:
            self.examples = pickle.load(handle)
    else:
        logger.info("Creating features from dataset file at %s", directory)
        self.examples = []
        with open(file_path, encoding="utf-8") as f:
            docs = f.readlines()
        exsamples = []
        for _, line in enumerate(docs):
            text = line.rstrip(os.linesep)
            # split the text into tokens
            tokenized_text = bert_tokenizer.convert_tokens_to_ids(
                bert_tokenizer.tokenize(" ".join(jp_tokenizer.tokenize(text))))
            # add special tokens: [CLS] and [SEP]
            added_special = bert_tokenizer.build_inputs_with_special_tokens(tokenized_text)
            # zero-pad up to the sequence length
            diff = block_size - len(added_special)
            if diff < 0:
                added_special = added_special[:diff]
            else:
                # padding value changed from 0 to -1
                padding = [-1] * (block_size - len(added_special))
                added_special += padding
            assert len(added_special) == block_size
            self.examples.append(added_special)
        logger.info("Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, 'wb') as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
def generate_masked_sent(tokenizer: BertTokenizer, sent: str, mask_token: str,
                         mask_rate: float = 0.15):
    words = tokenizer.tokenize(sent)
    input_list = list(words)   # copy so masking the input does not alter the target
    output_list = list(words)
    len_sent = len(words)
    num_mask_token = int(len_sent * mask_rate)
    masked_idx = random.sample(range(len_sent), num_mask_token)
    for idx in masked_idx:
        # TODO: randomly masks / replaces / keeps the token
        input_list[idx] = mask_token
    return input_list, output_list
def bert_text_preparation(text: str, tokenizer: BertTokenizer):
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(indexed_tokens)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segments_ids])
    return tokenized_text, tokens_tensor, segments_tensor
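# Sketch of how bert_text_preparation's outputs are typically fed to a BERT
# encoder (assumes `torch` and `transformers` with the public 'bert-base-uncased'
# weights; the sentence is invented).
import torch
from transformers import BertModel, BertTokenizer

_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
_model.eval()
_tokens, _tokens_tensor, _segments_tensor = bert_text_preparation(
    'The bank raised interest rates.', _tokenizer)
with torch.no_grad():
    _outputs = _model(_tokens_tensor, token_type_ids=_segments_tensor)
# hidden_states holds one tensor per layer (embeddings + 12 encoder layers for BERT-base)
print(len(_outputs.hidden_states), _outputs.hidden_states[-1].shape)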
class testAnswerGeneration():
    def __init__(self):
        self.tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')
        self.config = BertConfig.from_pretrained('trained_model/1/config.json')
        self.model = BertForMaskedLM.from_pretrained(
            'trained_model/1/pytorch_model.bin',
            from_tf=bool('.ckpt' in 'bert-base-chinese'),
            config=self.config)
        self.model.eval()

    def to_input_id(self, sentence_input):
        return self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.tokenize(sentence_input))

    def getAnswer(self, context, question):
        input_id = self.to_input_id("[CLS] " + context + " [SEP] " + question + " [SEP]")
        count = 0
        answer = ""
        maskpos = len(input_id)  # mark the position where the answer will be predicted
        input_id.append(103)
        # pad to length 512
        while len(input_id) < 512:
            input_id.append(0)
        # limit the answer to at most 10 tokens
        while (count < 10):
            input_id_tensor = torch.LongTensor([input_id])
            outputs = self.model(input_id_tensor)
            predictions = outputs[0]
            predicted_index = torch.argmax(predictions[0, maskpos]).item()  # take the most likely token id
            predicted_token = self.tokenizer.convert_ids_to_tokens(predicted_index)  # id to token
            # stop generating the answer once [SEP] is predicted
            if predicted_token == '[SEP]':
                break
            answer = answer + predicted_token  # concatenate the generated tokens
            input_id[maskpos] = predicted_index  # replace the current [MASK] id with the generated token id
            maskpos += 1
            if maskpos < 512:
                input_id[maskpos] = 103  # mark the next [MASK] position
            else:
                break
            count += 1
        return answer
def subword_tokenize(tokenizer: BertTokenizer,
                     tokens: List[str]) -> List[Tuple[int, str]]:
    """
    Returns:
        List of (index, subword) pairs mapping each subword token to the real token it came from.
    """
    subtokens = [tokenizer.tokenize(t) for t in tokens]
    indexed_subtokens = []
    for idx, subtoks in enumerate(subtokens):
        for subtok in subtoks:
            indexed_subtokens.append((idx, subtok))
    return indexed_subtokens
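# Minimal demonstration of subword_tokenize (the word list is invented and
# 'bert-base-cased' is an assumed checkpoint). Each element pairs the index of
# the original token with one of its word pieces.
from transformers import BertTokenizer

_tok = BertTokenizer.from_pretrained('bert-base-cased')
print(subword_tokenize(_tok, ['Tokenization', 'splits', 'rare', 'words']))
# e.g. [(0, 'Token'), (0, '##ization'), (1, 'splits'), (2, 'rare'), (3, 'words')]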
def split_kc(input_dir: Path, output_dir: Path, max_subword_length: int,
             tokenizer: BertTokenizer):
    """
    Split each document into multiple documents so that each one's tokenized length
    does not exceed max_subword_length.
    Sentences that are longer than max_subword_length even on their own are output as-is.
    """
    did2sids: Dict[str, List[str]] = defaultdict(list)
    did2cumlens: Dict[str, List[int]] = {}
    sid2knp: Dict[str, str] = {}
    for knp_file in input_dir.glob('*.knp'):
        with knp_file.open() as fin:
            did = knp_file.stem
            did2cumlens[did] = [0]
            buff = ''
            for line in fin:
                buff += line
                if line.strip() == 'EOS':
                    blist = BList(buff)
                    did2sids[did].append(blist.sid)
                    did2cumlens[did].append(did2cumlens[did][-1] + len(
                        tokenizer.tokenize(' '.join(m.midasi for m in blist.mrph_list()))))
                    sid2knp[blist.sid] = buff
                    buff = ''
    for did, sids in did2sids.items():
        cum: List[int] = did2cumlens[did]
        end = 1
        # search forward for end
        while end < len(sids) and cum[end + 1] - cum[0] <= max_subword_length:
            end += 1
        idx = 0
        while end < len(sids) + 1:
            start = 0
            # search forward for start
            while cum[end] - cum[start] > max_subword_length:
                start += 1
                if start == end - 1:
                    break
            with output_dir.joinpath(f'{did}-{idx:02}.knp').open(mode='w') as fout:
                # write out the sentences from start to end
                fout.write(''.join(sid2knp[sid] for sid in sids[start:end]))
            idx += 1
            end += 1
def map_labels_to_wordpiece(words: list, labels: list, tokenizer: BertTokenizer):
    """
    Maps labels from the original sentence to labels per BERT wordpiece token.

    @param words: words
    @param labels: labels per word
    @param tokenizer: wordpiece tokenizer
    """
    assert len(words) == len(labels)
    wp_labels = []
    for word, label in zip(words, labels):
        wp_labels += [label] * len(tokenizer.tokenize(word))
    wp_labels = ['O'] + wp_labels + ['O']
    return wp_labels
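# Hedged example for map_labels_to_wordpiece (words and labels are invented;
# 'bert-base-cased' is an assumption). A word that splits into k word pieces
# contributes its label k times, and an 'O' is prepended/appended for the
# [CLS]/[SEP] positions.
from transformers import BertTokenizer

_tok = BertTokenizer.from_pretrained('bert-base-cased')
_words = ['Angela', 'Merkel', 'visited', 'Reykjavik']
_labels = ['B-PER', 'I-PER', 'O', 'B-LOC']
print(map_labels_to_wordpiece(_words, _labels, _tok))
# e.g. ['O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'B-LOC', 'B-LOC', 'O'] if 'Reykjavik' splits into three pieces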
def get_pos_embedding(semantics_list, ori_syntactic_list, syntactic_list_in_dict, nlp):
    pos_embedding_list = []
    # pos_encoder_dict_temp = convert_tuple_to_dict(nlp.get_pipe("tagger").labels)
    # pos_encoder_dict = {}
    # for key_num, pos_tag in pos_encoder_dict_temp.items():
    #     pos_encoder_dict[key_num + 1] = pos_tag
    reduced_pos_encoder_dict = get_reduced_pos_encoder_dict()
    # print(pos_encoder_dict)
    tokenizer = BertTokenizer(vocab_file='bert-base-uncased-vocab.txt')
    for i in tqdm(range(len(ori_syntactic_list))):
        # first build the POS-tag dicts for the two sentences
        semantics_tag_dict = get_sentence_tag_dict(nlp, semantics_list[i])
        syntactic_tag_dict = get_sentence_tag_dict(nlp, ori_syntactic_list[i])
        semantics_token = []
        semantics_token.append("[CLS]")
        semantics_token.extend(tokenizer.tokenize(semantics_list[i]))
        semantics_token.append("[SEP]")
        # semantics_pos = pos_match(semantics_tag_dict, semantics_token, pos_encoder_dict)
        semantics_pos = reduced_pos_match(semantics_tag_dict, semantics_token,
                                          reduced_pos_encoder_dict)
        for j, syntactic_sentence in enumerate(syntactic_list_in_dict[i]):
            input_pos = []
            input_token = []
            syntactic_token = []
            # [CLS] + semantics_sentence(X2) + [SEP] + syntactic_sentence(X1)
            input_token = semantics_token.copy()
            syntactic_token = syntactic_sentence
            input_token.extend(syntactic_sentence)
            # syntactic_pos = pos_match(syntactic_tag_dict, syntactic_token, pos_encoder_dict)
            syntactic_pos = reduced_pos_match(syntactic_tag_dict, syntactic_token,
                                              reduced_pos_encoder_dict)
            input_pos.extend(semantics_pos)
            input_pos.extend(syntactic_pos)
            assert len(input_token) == len(input_pos)
            # print("input_token", input_token)
            # print("input_pos", input_pos)
            pos_embedding_list.append(input_pos.copy())
    return pos_embedding_list
def get_encoder_embedding(phrases: List[str], bert: BertModel,
                          tokenizer: BertTokenizer,
                          embed_wo_special_tokens: bool) -> torch.Tensor:
    subtok_ids_list, hypo_mask_list = [], []
    for phr in phrases:
        subtok_ids_list.append(
            tokenizer.convert_tokens_to_ids(['[CLS]'] + tokenizer.tokenize(phr) + ['[SEP]']))
        hypo_mask_list.append([1.0] * len(subtok_ids_list[-1]))
        if embed_wo_special_tokens:
            hypo_mask_list[-1][0] = 0.0
            hypo_mask_list[-1][-1] = 0.0
    batch = HypoDataset.torchify_and_pad(subtok_ids_list, hypo_mask_list)
    subtok_ids_batch, hypo_mask_batch, attn_mask_batch = to_device(*batch)
    h = bert(subtok_ids_batch, attention_mask=attn_mask_batch)[0]
    m = hypo_mask_batch.unsqueeze(2)
    phrase_representations = torch.sum(h * m, 1) / torch.sum(m, 1)
    return phrase_representations
def bert_tokenize_sentence(self, tokens: List[str],
                           tokenizer: BertTokenizer = None) -> List[str]:
    """
    Auxiliary function that tokenizes a given context into subwords.

    Args:
        tokens: list of unsplit tokens.
        tokenizer: tokenizer to be used to split words into subwords.

    Returns:
        list of newly acquired tokens
    """
    if tokenizer is None:
        tokenizer = self.tokenizer
    bert_tokens = list()
    for token in tokens:
        bert_tokens.extend(tokenizer.tokenize(token))
    return bert_tokens
def process_trees_and_bpes(sst_dir='.data/sst/trees/',
                           tokenizer: BertTokenizer = None,
                           save_dir='data/sst/fine-tune',
                           save_bpe=True,
                           **kwargs):
    train_matrices, train_texts = trees2matrices_and_texts(
        os.path.join(sst_dir, 'train.txt'), **kwargs)
    torch.save(train_matrices, os.path.join(save_dir, 'trees'))
    if save_bpe:
        all_bpe_indices = []
        for text in train_texts:
            # text is a list of tokens
            tokens = tokenizer.tokenize(' '.join(text))
            bpe_indices = []
            for i, t in enumerate(tokens):
                if '##' in t:
                    bpe_indices.append(i)
            all_bpe_indices.append(bpe_indices)
            assert len(text) == len(tokens) - len(bpe_indices)
        torch.save(all_bpe_indices, os.path.join(save_dir, 'bpe'))
def to_action_tuple(
        word: str,
        grammar: state_tree.NQCFG,
        tokenizer: transformers.BertTokenizer,
        valid_actions: Optional[Collection[int]] = None) -> Tuple[int, ...]:
    # If you want to programmatically exclude certain kinds of tokens from the
    # tries, add the logic here and return an empty tuple for `word`s you do not
    # want to see included.
    if (common_flags.EXCLUDE_PUNCTUATION_FROM_TRIE.value == 1
            and word in string.punctuation):
        return ()
    # We need to skip `unks` that aren't even covered by word pieces.
    tokens = tokenizer.tokenize(word)
    actions = []
    for i, token in enumerate(tokens):
        # The vocabulary might contain items like `toxin’s` which the BERT tokenizer
        # tokenizes into several initial word pieces. Presence of such a sequence in
        # the trie would be inconsistent with our insistence on deriving words as
        # exactly one initial word piece followed by several non-initial ones. We can
        # still return the "word" assembled so far, as this will match the grammar,
        # but we cannot accumulate any more tokens.
        if i > 0 and not token.startswith('##'):
            break
        token = state_tree.NQStateTree.clean_escape_characters(token)
        if token not in grammar.terminal_to_action:
            return ()
        token_action = grammar.terminal_to_action[token]
        # If we are in a "constrained" setting, we need to ensure that every token
        # of a word is actually present. This situation can arise if the state
        # truncation cuts off the final word piece(s) of a multi-piece sequence.
        # Implementing the constraint here allows us not to worry about it in the
        # transition model; otherwise, we would not just need to check for sub-trie
        # presence for a prefix to determine recursion, but check explicitly that at
        # least one path along the subtrie is "constructible".
        if valid_actions is not None:
            if token_action not in valid_actions:
                return ()
        actions.append(token_action)
    return tuple(actions)