class MonoBERT_Numericalise_Transform():
    def __init__(self, vocab_txt_file="saved_models/monoBERT/vocab.txt", **kwargs):
        self.numericalizer = BertTokenizer(vocab_txt_file)

    def __call__(self, samples):
        '''
        samples: [dict]: [{'query':"text and more", 'doc':"doc text" ...}]
        returns: [dict]: [{'input_ids':[34,2,8...], 'type_ids':[0,0,1,1], 'input_text':"text and more", ...}]
        '''
        for sample_obj in samples:
            query_text = sample_obj['query']
            # [CLS] + up to 62 query tokens + [SEP]
            query_ids = [self.numericalizer.cls_token_id] + self.numericalizer.encode(
                query_text, add_special_tokens=False)[:62] + [self.numericalizer.sep_token_id]
            query_token_type_ids = [0] * len(query_ids)
            doc_text = sample_obj['doc']
            # up to 445 document tokens + [SEP], so the pair fits within BERT's 512 positions
            doc_ids = self.numericalizer.encode(
                doc_text, add_special_tokens=False)[:445] + [self.numericalizer.sep_token_id]
            doc_token_type_ids = [1] * len(doc_ids)
            sample_obj["input_ids"] = query_ids + doc_ids
            sample_obj["type_ids"] = query_token_type_ids + doc_token_type_ids
        return samples
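# Usage sketch for the transform above (hedged: the default vocab path from the
# constructor must exist on disk; the sample texts are illustrative only).
numericalise = MonoBERT_Numericalise_Transform()
samples = numericalise([{'query': "what is bert", 'doc': "BERT is a transformer encoder."}])
print(samples[0]['input_ids'][:5], samples[0]['type_ids'][:5])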
def _preprocess(
    data: List[Dict],
    slot_meta: List[str],
    gate_list: Dict[str, int],
    tokenizer: BertTokenizer,
) -> List[Dict]:
    """Add features to the raw data.

    Args:
        data (List[Dict]): the raw data
        slot_meta (List[str]): full list of slots (features are generated in this slot order)
        gate_list (Dict[str, int]): mapping of the defined gates
        tokenizer (BertTokenizer): tokenizer used to convert the text

    Returns:
        List[Dict]: the modified data (note: the input is mutated in place and
            the same reference is returned)
    """
    none_value = tokenizer.encode(
        'none', add_special_tokens=False) + [tokenizer.sep_token_id]
    empty_slot_values = {slot: none_value for slot in slot_meta}
    empty_gates = {slot: gate_list['none'] for slot in slot_meta}

    for example in tqdm(data):
        dialogue: List[Dict] = example['dialogue']
        user_count = 0
        for turn in dialogue:
            turn['text_idx'] = tokenizer.encode(turn['text'])
            slot_values = deepcopy(empty_slot_values)
            gates = deepcopy(empty_gates)
            if turn['role'] == 'user':
                user_count = user_count + 1
                for state_raw in turn['state']:
                    state_raw: List[str] = state_raw.split('-')
                    slot = state_raw[0] + '-' + state_raw[1]
                    value = state_raw[2]
                    slot_values[slot] = tokenizer.encode(
                        value, add_special_tokens=False) + [tokenizer.sep_token_id]
                    gates[slot] = gate_list.get(value, gate_list['ptr'])
            turn['slot'] = slot_values
            turn['gate'] = gates
        example['count'] = user_count
    return data
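# Usage sketch for _preprocess (hedged: 'none' and 'ptr' are the gate keys the
# function body relies on; the 'dontcare' gate, the slot names, and raw_data
# are assumptions added for illustration).
gate_list = {'none': 0, 'dontcare': 1, 'ptr': 2}
slot_meta = ['restaurant-pricerange', 'restaurant-area']
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
processed = _preprocess(raw_data, slot_meta, gate_list, tokenizer)  # raw_data: loaded dialogues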
def row_to_tensor(tokenizer: BertTokenizer,
                  row: pd.Series) -> Tuple[torch.LongTensor, torch.LongTensor]:
    tokens = tokenizer.encode(row["sent"], add_special_tokens=True)
    if len(tokens) > 120:
        tokens = tokens[:119] + [tokens[-1]]
    x = torch.LongTensor(tokens)
    y = torch.FloatTensor(row[["CAG", "NAG", "OAG"]])
    return x, y
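# Usage sketch (hedged: the column names mirror the function body; the row
# values are made up, and tokenizer is assumed to be in scope).
row = pd.Series({"sent": "some aggressive comment", "CAG": 1.0, "NAG": 0.0, "OAG": 0.0})
x, y = row_to_tensor(tokenizer, row)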
def _tokenize_bert_sentence(text: str, tokenizer: BertTokenizer) -> Tuple:
    """
    Given a sentence and a BertTokenizer, tokenizes the text, maps it to BERT
    vocab indices, and makes the segment IDs for the tokens before returning
    the tensors on GPU (add flag for GPU disable soon).

    :param text: The sentence being tokenized. A single sentence as str.
    :param tokenizer: The BertTokenizer object instantiated.
    :return token_tensor, segments_tensor: The tensors containing the segment
        IDs and the tokens themselves. On GPU.
    """
    # encode() both splits the sentence into tokens and maps them to their
    # vocabulary indices, so no separate convert_tokens_to_ids call is needed.
    indexed_tokens = tokenizer.encode(text, add_special_tokens=True)

    # Mark each of the tokens as belonging to sentence "1" (single sentence).
    segments_ids = [1] * len(indexed_tokens)

    # Convert inputs to PyTorch tensors, place on GPU.
    token_tensor = torch.tensor([indexed_tokens]).to('cuda:0')
    segments_tensor = torch.tensor([segments_ids]).to('cuda:0')
    return token_tensor, segments_tensor
def __init__(self, file_path, tokenizer: BertTokenizer, max_length=512, device=None):
    news_type = []
    news_content = []
    news_atten_mask = []
    seq_typ_ids = []
    with open(file_path, mode='r', encoding='utf8') as f:
        for line in tqdm(f.readlines()):
            line = line.strip()
            line = line.split('\t')
            news_type.append(news_type2id_dict[line[0]])
            token_ids = tokenizer.encode(ILLEGAL_CHARACTERS_RE.sub(r'', line[1]),
                                         max_length=max_length,
                                         pad_to_max_length=True)
            news_content.append(token_ids)
            news_atten_mask.append(get_atten_mask(token_ids))
            seq_typ_ids.append(
                tokenizer.create_token_type_ids_from_sequences(
                    token_ids_0=token_ids[1:-1]))
    self.label = torch.from_numpy(np.array(news_type)).unsqueeze(1).long()
    self.token_ids = torch.from_numpy(np.array(news_content)).long()
    self.seq_type_ids = torch.from_numpy(np.array(seq_typ_ids)).long()
    self.atten_masks = torch.from_numpy(np.array(news_atten_mask)).long()
    if device is not None:
        self.label = self.label.to(device)
        self.token_ids = self.token_ids.to(device)
        self.seq_type_ids = self.seq_type_ids.to(device)
        self.atten_masks = self.atten_masks.to(device)
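# The dataset above calls a get_atten_mask helper that is not shown in this
# snippet. A minimal sketch, assuming the pad id is 0 (as in standard BERT
# vocabularies):
def get_atten_mask(token_ids):
    # 1 for real tokens, 0 for padding
    return [1 if token_id != 0 else 0 for token_id in token_ids]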
class MLMModel:
    def __init__(self):
        self.model: BertForMaskedLM = BertForMaskedLM.from_pretrained(
            pretrained_model_name_or_path='Foodbert/foodbert/data/mlm_output/checkpoint-final')
        with open('Foodbert/foodbert/data/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        self.tokenizer = BertTokenizer(vocab_file='Foodbert/foodbert/data/bert-base-cased-vocab.txt',
                                       do_lower_case=False,
                                       max_len=128,
                                       never_split=used_ingredients)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)

    def predict_substitutes(self, sentence, ingredient_name, with_masking=True):
        search_id = self.tokenizer.mask_token_id if with_masking else \
            self.tokenizer.convert_tokens_to_ids([ingredient_name])[0]
        sentence = sentence.replace('!', ' !').replace('?', ' ?').replace('.', ' .').replace(':', ' :').replace(',', ' ,')
        sentence = ' ' + sentence + ' '
        all_ordered_substitutes = []
        masked_sentence = sentence.replace(f' {ingredient_name} ', ' [MASK] ')
        input_ids = torch.tensor(self.tokenizer.encode(
            masked_sentence, add_special_tokens=True)).unsqueeze(0).to(device=self.device)
        prediction_scores = self.model(input_ids, masked_lm_labels=input_ids)[1][0]
        ingredient_scores = prediction_scores[input_ids[0] == search_id]
        for i in range(len(ingredient_scores)):
            ingredient_score = ingredient_scores[i]
            softmax_scores = ingredient_score.softmax(dim=0)
            indices = torch.sort(ingredient_score, descending=True).indices
            ordered_substitutes = self.tokenizer.convert_ids_to_tokens(indices)
            softmax_scores = softmax_scores[indices].tolist()
            all_ordered_substitutes.append((ordered_substitutes, softmax_scores))
        return all_ordered_substitutes
def find_answer(tokenizer: BertTokenizer, answer_model: BertForQuestionAnswering,
                query: str, text: str) -> str:
    with torch.no_grad():
        start, end = answer_model(**tokenizer.encode_plus(
            query, text, max_length=256, truncation=True, return_tensors="pt"))
    start_pos = torch.argmax(start).item()
    end_pos = torch.argmax(end).item()
    if start_pos >= end_pos:
        # The argmax span is inverted; fall back to the next-best start/end
        # candidates and keep whichever repair yields the higher joint score.
        start = torch.softmax(start, dim=1)
        end = torch.softmax(end, dim=1)
        k = -2  # argsort is ascending, so -1 is the argmax; start at the runner-up
        start_args = torch.argsort(start).tolist()[0]
        end_args = torch.argsort(end).tolist()[0]
        calc_score = lambda start_pos, end_pos: start[0][start_pos] * end[0][end_pos]
        s_score, e_score = 0, 0
        s_pos, e_pos = start_pos, end_pos
        while s_score == 0 or e_score == 0:
            s_pos = start_args[k]
            e_pos = end_args[k]
            s_score = 0 if s_pos > end_pos else calc_score(s_pos, end_pos)
            e_score = 0 if e_pos < start_pos else calc_score(start_pos, e_pos)
            k -= 1
        if s_score > e_score:
            start_pos = s_pos
        else:
            end_pos = e_pos
    return tokenizer.decode(tokenizer.encode(query, text)[start_pos:end_pos])
def find_paragraph(tokenizer: BertTokenizer, model: BertForNextSentencePrediction,
                   question: str, context: str, max_len=256, batch_size=16):
    q_len = len(tokenizer.tokenize(question))
    context_tokens = tokenizer.tokenize(context)
    part_len = max_len - q_len - 3
    # split the context into half-overlapping windows of part_len tokens
    parts = []
    n = 0
    while n < len(context_tokens):
        parts += [context_tokens[n:n + part_len]]
        n += part_len // 2
    results = []
    all_parts = parts[:]
    while len(parts) > 0:
        batch = tokenizer.batch_encode_plus(
            list(zip([question] * batch_size, parts[:batch_size])),
            max_length=max_len,
            truncation=True,
            pad_to_max_length=True,
            return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = model(**batch)[0]
        # score each window by the margin between the two NSP logits
        results += [a - b for a, b in output.cpu().tolist()]
        parts = parts[batch_size:]
    return np.array(results), [
        tokenizer.decode(tokenizer.encode(part), skip_special_tokens=True)
        for part in all_parts
    ]
def explain_handle(self, model_wrapper, text, target=1):  # pylint: disable=too-many-locals,unused-argument,arguments-differ
    """Captum explanations handler.

    Args:
        model_wrapper: wrapper around the model (unused; a fresh wrapper is built below)
        text (str): the raw input text to explain
        target (int): the target class for the attributions

    Returns:
        dict: A dictionary response with the explanations response.
    """
    model_wrapper = AGNewsmodelWrapper(self.model)
    tokenizer = BertTokenizer(self.vocab_file)
    model_wrapper.eval()
    model_wrapper.zero_grad()
    input_ids = torch.tensor(
        [tokenizer.encode(self.text, add_special_tokens=True)])
    input_embedding_test = model_wrapper.model.bert_model.embeddings(input_ids)
    preds = model_wrapper(input_embedding_test)
    out = np.argmax(preds.cpu().detach(), axis=1)
    out = out.item()
    ig_1 = IntegratedGradients(model_wrapper)
    attributions, delta = ig_1.attribute(  # pylint: disable=no-member
        input_embedding_test,
        n_steps=500,
        return_convergence_delta=True,
        target=target,
    )
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].numpy().tolist())
    feature_imp_dict = {}
    feature_imp_dict["words"] = tokens
    attributions_sum = self.summarize_attributions(attributions)
    feature_imp_dict["importances"] = attributions_sum.tolist()
    feature_imp_dict["delta"] = delta[0].tolist()
    return [feature_imp_dict]
def read_data(filename: str, tokenizer: BertTokenizer,
              args: TrainingArguments) -> List:
    with open(filename, encoding="utf-8") as f:
        data = f.readlines()
    items = [
        item for item in json.loads(data[0])['data']
        if 'russian' in item['paragraphs'][0]['qas'][0]['id']
    ]
    ds = []
    for item in items:
        paragraph = item['paragraphs'][0]
        context = paragraph['context']
        qas = paragraph['qas'][0]
        question = qas['question']
        answer = qas['answers'][0]
        answer_start = answer['answer_start']
        answer_text = answer['text']
        # encode the question plus the context up to the answer; the answer's
        # first token takes the place of the trailing [SEP], giving its start index
        ids = tokenizer.encode(question, context[:answer_start])
        start = len(ids) - 1
        end = start + len(tokenizer.tokenize(answer_text))
        if end < args.block_size:
            ds += [{
                "question": question,
                "context": context,
                "start": start,
                "end": end
            }]
    return ds
def predict(inp: str, model: BertForMaskedLM, tokenizer: BertTokenizer,
            k: int = 3) -> List[str]:
    """
    Predict the top-k substitutes for an input text containing a single MASK token.

    :param inp: the input text
    :param model: a masked language model
    :param tokenizer: the tokenizer corresponding to the model
    :param k: the number of predictions
    :return: the list of top-k substitutes for the MASK token
    """
    kwargs = {
        'add_prefix_space': True
    } if isinstance(tokenizer, GPT2Tokenizer) else {}
    input_ids = tokenizer.encode(inp, add_special_tokens=True, **kwargs)
    mask_idx = input_ids.index(tokenizer.mask_token_id)
    input_ids = torch.tensor([input_ids])

    with torch.no_grad():
        (predictions, ) = model(input_ids)

    predicted_tokens = []
    _, predicted_indices = torch.topk(predictions[0, mask_idx], k)
    for predicted_index in predicted_indices:
        predicted_token = tokenizer.convert_ids_to_tokens(
            [predicted_index.item()])[0]
        predicted_tokens.append(predicted_token)
    return predicted_tokens
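# Usage sketch (hedged: assumes a transformers version where the model call
# returns a tuple, as the snippet above expects; bert-base-uncased is an
# illustrative checkpoint, not necessarily the one used here).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()
print(predict("The capital of France is [MASK].", model, tokenizer, k=3))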
def chat(folder_bert, voc, testing=False):
    tf.random.set_seed(1)
    tokenizer = BertTokenizer(vocab_file=folder_bert + voc)
    if testing:
        tokens = tokenizer.tokenize("jeg tror det skal regne")
        print(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        print(ids)
        print("Vocab size:", len(tokenizer.vocab))
    config = BertConfig.from_json_file(folder_bert + "/config.json")
    model = BertLMHeadModel.from_pretrained(folder_bert, config=config)
    while (1):
        text = input(">>User: ")
        # the encoding and generation steps were redacted in the source; the
        # two lines below are a plausible reconstruction, not the original code
        input_ids = tokenizer.encode(text, return_tensors="pt")
        sample_output = model.generate(input_ids)
        print("Bot: {}".format(tokenizer.decode(sample_output[0])))
        print("Bot: {}".format(
            tokenizer.decode(sample_output[:, input_ids.shape[-1]:][0],
                             skip_special_tokens=True)))
def get_question_type(tokenizer: BertTokenizer,
                      question_model: BertForSequenceClassification,
                      question: str) -> str:
    input_ids = tokenizer.encode(question, return_tensors="pt")
    with torch.no_grad():
        out = question_model(input_ids)[0]
    a, b = out.tolist()[0]
    return "YESNO" if b > a else "SPAN"
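# Usage sketch (hedged: question_model must be a two-class checkpoint
# fine-tuned for the SPAN/YESNO decision; the question is illustrative).
qtype = get_question_type(tokenizer, question_model, "Is Moscow the capital of Russia?")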
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when the user asks for it and it is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    # args.cuda = False
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()
    print('***********************Summary model start************************')
    while True:
        try:
            text = input()
            for i in range(5):
                if len(text):
                    text = text[:1000]
                input_ids = [tokenizer.cls_token_id]  # every input starts with [CLS]
                input_ids.extend(tokenizer.encode(text))
                input_ids.append(tokenizer.sep_token_id)
                curr_input_tensor = torch.tensor(input_ids).long().to(device)
                generated = []
                # generate at most max_len tokens
                for _ in range(args.max_len):
                    outputs = model(input_ids=curr_input_tensor)
                    next_token_logits = outputs[0][-1, :]
                    # penalize every token already in `generated`, lowering its
                    # probability of being produced again
                    for id in set(generated):
                        next_token_logits[id] /= args.repetition_penalty
                    next_token_logits = next_token_logits / args.temperature
                    # set the [UNK] logit to -inf so the model can never predict [UNK]
                    next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
                    filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                            top_k=args.topk,
                                                            top_p=args.topp)
                    # torch.multinomial draws num_samples elements from the candidates
                    # without replacement (the higher the weight, the more likely the
                    # draw) and returns their indices
                    next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1),
                                                   num_samples=1)
                    if next_token == tokenizer.sep_token_id:
                        # a [SEP] token marks the end of the generated response
                        break
                    generated.append(next_token.item())
                    curr_input_tensor = torch.cat(
                        (curr_input_tensor, next_token), dim=0)
                text = tokenizer.convert_ids_to_tokens(generated)
                print("summary:" + "".join(text))
        except KeyboardInterrupt:
            break
def find_answer(tokenizer: BertTokenizer, model: BertForQuestionAnswering,
                context: str, question: str):
    input_data = tokenizer.encode_plus(question, context, return_tensors="pt")
    with torch.no_grad():
        out = model(**input_data)
    start, end = out[0], out[1]
    start = torch.argmax(start).item()
    end = torch.argmax(end).item()
    return tokenizer.decode(tokenizer.encode(question, context)[start:end])
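# Usage sketch (hedged: the SQuAD-fine-tuned checkpoint below is one public
# option, not necessarily the one this code was written for).
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
print(find_answer(tokenizer, model, "BERT was released in 2018.", "When was BERT released?"))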
def get_decoder_input(tokenizer: BertTokenizer, input_str: list,
                      config: ModelConfig):
    # encode() with return_tensors='pt' already returns a tensor, so no extra
    # torch.tensor() wrapper is needed
    decoder_input = tokenizer.encode(input_str,
                                     add_special_tokens=False,
                                     max_length=config.max_seq_len,
                                     return_tensors='pt',
                                     truncation=True)
    return decoder_input
def load_data(file: str, pickle_file: str, tokenizer: BertTokenizer = None):
    if not tokenizer:
        tokenizer = BertTokenizer.from_pretrained(model_name)

    if os.path.exists(pickle_file):
        return pickle.load(open(pickle_file, 'rb'))

    with open(file, 'r', encoding='utf-8') as f:
        all_results, all_labels = [], []
        for line in tqdm(f.readlines()):
            # 1. load the raw record
            data = json.loads(line)
            text: str = data['text']
            if len(text) > 512:
                text = text[:512]

            # 2. tokenize, producing token_ids, mask and token_type ids
            inputs: BatchEncoding = tokenizer(text,
                                              return_token_type_ids=True,
                                              return_tensors='tf')

            # 3. build the labels
            sequence_length = tf.shape(inputs['input_ids']).numpy()[-1]
            labels = ['O'] * sequence_length
            input_ids = inputs['input_ids']

            # 4. linearly search for the entity spans
            global_index = 0
            # sort all entities by start position
            for entity_obj in sorted(data['entities'], key=lambda x: x['start']):
                entity: str = entity_obj['entity']
                # 4.1 encode the entity into token ids, then search for it globally
                tokenized_output: tf.Tensor = tokenizer.encode(entity,
                                                               add_special_tokens=False,
                                                               return_tensors='tf')
                entity_token_length: int = tf.shape(tokenized_output).numpy()[-1]
                # 4.2 global linear search
                for search_index in range(global_index, sequence_length - entity_token_length):
                    # if the ids match at this offset
                    if tf.reduce_all(
                            tokenized_output == input_ids[:, search_index:
                                                          search_index + entity_token_length]
                    ).numpy():
                        labels[search_index] = "B-" + entity_obj['type']
                        for index in range(search_index + 1,
                                           search_index + entity_token_length):
                            labels[index] = "I-" + entity_obj['type']
                        # start the next search at the end of this match
                        global_index = search_index + entity_token_length
                        break
            inputs['labels'] = labels
            all_labels.append(labels)
            all_results.append(inputs)
        pickle.dump(all_results, open(pickle_file, 'wb'))
    # return the collected labels alongside the encoded inputs
    return all_results, all_labels
class SimpleBertEmbeddings(WordEmbeddings):
    tokenizer: BertTokenizer
    model: BertModel
    special_tokens = []

    def __init__(self, bert_model_path: str):
        self.tokenizer = BertTokenizer(vocab_file=bert_model_path + '/vocab.txt')
        config = BertConfig.from_pretrained(bert_model_path + '/config.json',
                                            output_hidden_states=True)
        self.model = BertModel.from_pretrained(bert_model_path, config=config)
        self.model.eval()

    def convert(self, text: str) -> Dict[Word, List[float]]:
        print("[bert embeddings] analyze text:", text)
        lower_text = text.lower().replace("й", "и").replace("ё", "е").replace("́", "")
        token_ids = self.tokenizer.encode(lower_text)
        encoded_layers = self.model(input_ids=torch.tensor([token_ids]))
        hidden_layers = encoded_layers[2][1:]
        token_embeddings = torch.stack(hidden_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1, 0, 2)

        result: Dict[Word, List[float]] = {}
        text_pos = 0
        prev = None
        for i, token_vec in enumerate(token_embeddings):
            # todo: try only -12 layer: https://github.com/hanxiao/bert-as-service#q-so-which-layer-and-which-pooling-strategy-is-the-best
            # combine last 4 layers (best F1 score)
            cat_vec = torch.cat(
                (token_vec[-1], token_vec[-2], token_vec[-3], token_vec[-4]),
                dim=0)
            if token_ids[i] in self.tokenizer.all_special_ids:
                continue
            token: str = self.tokenizer.convert_ids_to_tokens(token_ids[i])
            if token.startswith("##") and prev is not None:
                clear_token = token.replace("##", "")
                word = Word(prev.text + clear_token, prev.start,
                            prev.end + len(clear_token))
                result.update(
                    {word: np.add(result[prev], cat_vec.tolist()).tolist()})
                del result[prev]
                prev = word
                continue
            start = lower_text.find(token, text_pos)
            if start == -1:
                continue
            end = start + len(token)
            word = Word(token, start, end)
            text_pos = end
            prev = word
            result.update({word: cat_vec.tolist()})
        return result
def make_predictions(
        sentence_array: np.array, model: BertForSequenceClassification,
        tokenizer: BertTokenizer, device: torch.device,
        hyperparameter_dict: dict) -> typing.Tuple[np.array, np.array]:
    """
    Make predictions on an array of sentences with the given model

    :param sentence_array: NumPy array of input sentences
    :param model: Torch model
    :param tokenizer: BERT-base tokenizer
    :param device: Torch device
    :param hyperparameter_dict: Dictionary of model hyperparameters (must
        contain 'max_length' for padding and 'batch_size')
    :return: NumPy arrays of logits and label probabilities
    """
    # Prepare data
    encoded_sentences = []
    for sentence in sentence_array:
        enc_sent_as_list = tokenizer.encode(sentence, add_special_tokens=True)
        encoded_sentences.append(enc_sent_as_list)

    input_array, input_attention_mask_array = _create_sentence_input_arrays(
        encoded_sentences, hyperparameter_dict['max_length'])
    input_tensor = torch.tensor(input_array)
    input_attention_mask_tensor = torch.tensor(input_attention_mask_array)
    input_dataset = TensorDataset(input_tensor, input_attention_mask_tensor)
    input_data_loader = DataLoader(
        input_dataset, batch_size=hyperparameter_dict['batch_size'])

    # Run model
    model.eval()
    logit_list = []
    for batch in input_data_loader:
        batch_input_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        with torch.no_grad():
            outputs = model(input_ids=batch_input_ids,
                            token_type_ids=None,
                            attention_mask=batch_attention_mask)
            logits = outputs[0]
            logit_list.append(logits)
    logits_tensor = torch.cat(logit_list, dim=0)
    prob_tensor = torch.softmax(logits_tensor, dim=1)
    return np.array(logits_tensor.cpu()), np.array(prob_tensor.cpu())
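# make_predictions relies on a _create_sentence_input_arrays helper that is not
# shown. A minimal sketch, assuming pad id 0 and truncation to max_length:
def _create_sentence_input_arrays(encoded_sentences, max_length):
    input_array = np.zeros((len(encoded_sentences), max_length), dtype=np.int64)
    mask_array = np.zeros((len(encoded_sentences), max_length), dtype=np.int64)
    for i, ids in enumerate(encoded_sentences):
        ids = ids[:max_length]
        input_array[i, :len(ids)] = ids   # token ids, zero-padded on the right
        mask_array[i, :len(ids)] = 1      # attention mask: 1 marks a real token
    return input_array, mask_array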
def get_encoder_input(tokenizer: BertTokenizer, input_str: list,
                      config: ModelConfig):
    # encode() with return_tensors='pt' already returns a tensor, so no extra
    # torch.tensor() wrapper is needed
    encoder_input = tokenizer.encode(input_str,
                                     add_special_tokens=False,
                                     pad_to_max_length=True,
                                     max_length=config.max_seq_len,
                                     return_tensors='pt',
                                     truncation=True)
    encoder_input_mask = encoder_input != 0
    return encoder_input, encoder_input_mask
def row_to_tensor(
        tokenizer: BertTokenizer,
        row: pd.Series) -> Tuple[torch.LongTensor, torch.LongTensor]:
    tokens = tokenizer.encode(row["text"], add_special_tokens=True)
    if len(tokens) > config.MAX_LEN:
        tokens = tokens[:config.MAX_LEN - 1] + [tokens[-1]]
    x = torch.LongTensor(tokens)
    y = torch.FloatTensor(
        row[["Depression", "Alcohol", "Suicide", "Drugs"]])
    return x, y
def generate_samples(
        model: PreTrainedModel,
        tokenizer: BertTokenizer,
        prompt_text: str,
        max_length=args['max_length'],
        temperature=args['temperature'],
        top_k=args['k'],
        top_p=args['p'],
        repetition_penalty=args['repetition_penalty'],
        num_return_sequences=args['num_return_sequences'],
        stop_token=args['stop']):
    encoded_prompt = tokenizer.encode(prompt_text,
                                      add_special_tokens=True,
                                      return_tensors='pt')
    encoded_prompt = encoded_prompt.to(model.device)
    input_ids = encoded_prompt if encoded_prompt.shape[-1] > 0 else None

    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        num_return_sequences=num_return_sequences,
    )

    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

        # Remove all text after the stop token
        text = text[:text.find(stop_token) if stop_token else None]

        # Add the prompt at the beginning of the sequence. Remove the excess
        # text that was used for pre-processing
        total_sequence = (
            prompt_text +
            text[len(tokenizer.decode(encoded_prompt[0],
                                      clean_up_tokenization_spaces=True)):])

        generated_sequences.append(total_sequence)
        print(total_sequence)

    return generated_sequences
def bert_collate_fn(tokenizer: BertTokenizer, max_len: int,
                    pairs: Iterable[Tuple[str, int]]):
    pairs = [(text.split()[:max_len], label) for text, label in pairs]
    texts, labels = zip(*pairs)
    labels = torch.LongTensor(labels)
    # +1 for [CLS] token
    text_lens = torch.LongTensor([len(text) + 1 for text in texts])
    max_len = text_lens.max().item()
    ids = torch.ones(len(texts), max_len).long() * tokenizer.pad_token_id
    for i, text in enumerate(texts):
        ids[i][:len(text) + 1] = torch.LongTensor(
            tokenizer.encode(text, add_special_tokens=True)[:-1])
    return ids, text_lens, labels
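# Usage sketch (hedged: pairs_dataset is any dataset yielding (text, label)
# pairs). Binding tokenizer and max_len with functools.partial turns the
# three-argument function above into the one-argument collate_fn DataLoader expects.
from functools import partial
from torch.utils.data import DataLoader
loader = DataLoader(pairs_dataset, batch_size=32,
                    collate_fn=partial(bert_collate_fn, tokenizer, 100))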
def __init__(self, file_path, ent_path, tokenizer: BertTokenizer,
             batch_size=32, max_len=80):
    self.batch_size = batch_size
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.num_examples_per_record = 10  # number of negative examples generated per record
    # load the entities
    with open(ent_path, "r", encoding="utf8") as fr:
        self.ents = json.load(fr)
    # convert entity info to strings
    self.ent_id2str = None
    self.ent_to_string()
    self.all_ent_id = list(self.ents.keys())
    # training data
    self.titles = []
    with open(file_path, "r", encoding="utf8") as fr:
        for line in fr:
            ss = line.strip().split("\t")
            self.titles.append(ss)
    # convert both titles and training data to token ids
    for i in self.titles:
        i.append(tokenizer.encode(i[0]))
    # drop [CLS] since these ids are concatenated after another sequence
    self.ent_id2token_id = {
        k: tokenizer.encode(v[0:self.max_len])[1:]
        for k, v in self.ent_id2str.items()
    }
    self.pos_input_ids, self.pos_attention_mask, self.pos_token_type_ids = None, None, None
    self.neg_input_ids, self.neg_attention_mask, self.neg_token_type_ids = None, None, None
    self.idx = None
    self.start, self.end = -1, -1
    # number of steps per epoch
    self.steps = len(self.titles) * self.num_examples_per_record // self.batch_size
def predict_fn(input_data, model):
    vocab_path = '/opt/ml/model/vocab.txt'
    tokenizer = BertTokenizer(vocab_path, do_lower_case=True)
    question, context = input_data['question'], input_data['context']
    input_ids = tokenizer.encode(question, context)
    # 102 is the [SEP] id: tokens up to the first [SEP] belong to segment 0,
    # the rest to segment 1
    token_type_ids = [
        0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))
    ]
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     token_type_ids=torch.tensor([token_type_ids]))
    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = ' '.join(
        all_tokens[torch.argmax(start_scores):torch.argmax(end_scores) + 1])
    return answer
def prediction(sentence):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config = BertConfig.from_json_file('model/bert_config.json')
    model = BERTClassifierModel(config).to(device)
    model.load_state_dict(torch.load('model/model_bert.bin'))
    tokenizer = BertTokenizer('model/vocab.txt')
    input_id = torch.tensor(
        tokenizer.encode(sentence,
                         add_special_tokens=True,
                         truncation=True,
                         max_length=256,
                         pad_to_max_length=True))
    attention_mask = torch.tensor([float(i > 0) for i in input_id])
    input_id = input_id.unsqueeze(0).to(device)
    attention_mask = attention_mask.unsqueeze(0).to(device)
    result = model((input_id, attention_mask)).detach()
    return result
def turn2example(
    tokenizer: BertTokenizer,
    domain: str,
    slot: str,
    value: str,
    context_ids: List[int],
    triple_labels: Optional[Set[tuple]] = None,
    belief_state: Optional[List[Tuple[str, str, str]]] = None,
    dial_id: str = None,
    turn_id: int = None,
) -> tuple:
    """Convert turn data to example based on ontology.

    Args:
        tokenizer: BertTokenizer, see https://huggingface.co/transformers/model_doc/bert.html#berttokenizer
        domain: domain of current example
        slot: slot of current example
        value: value of current example
        context_ids: context token's id in bert vocab
        triple_labels: set of (domain, slot, value)
        belief_state: list of (domain, slot, value)
        dial_id: current dialogue id
        turn_id: current turn id

    Returns:
        example, (input_ids, token_type_ids, domain, slot, value, ...)
    """
    candidate = domain + "-" + slot + " = " + value
    candidate_ids = tokenizer.encode(candidate, add_special_tokens=False)
    input_ids = (
        [tokenizer.cls_token_id]
        + context_ids
        + [tokenizer.sep_token_id]
        + candidate_ids
        + [tokenizer.sep_token_id]
    )
    token_type_ids = [0] + [0] * len(context_ids) + [0] + [1] * len(candidate_ids) + [1]
    example = (input_ids, token_type_ids, domain, slot, value)
    if dial_id is not None:
        label = int((domain, slot, value) in triple_labels)
        example += (belief_state, label, dial_id, str(turn_id))
    return example
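# Usage sketch (hedged: the domain/slot/value triple and the context text are
# illustrative ontology entries, and tokenizer is assumed to be in scope).
context_ids = tokenizer.encode("i need a cheap hotel", add_special_tokens=False)
input_ids, token_type_ids, domain, slot, value = turn2example(
    tokenizer, "hotel", "pricerange", "cheap", context_ids)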
def row_to_tensor(
    tokenizer: BertTokenizer, row: pd.Series
) -> Tuple[torch.LongTensor, torch.LongTensor]:
    tokens = tokenizer.encode(row["comment_text"], add_special_tokens=True)
    if len(tokens) > 120:
        tokens = tokens[:119] + [tokens[-1]]
    x = torch.LongTensor(tokens)
    y = torch.FloatTensor(
        row[
            [
                "toxic",
                "severe_toxic",
                "obscene",
                "threat",
                "insult",
                "identity_hate",
            ]
        ]
    )
    return x, y
def word_embedding_tf_2d(tokenizer: BertTokenizer, model: TFBertModel,
                         text: str) -> np.ndarray:
    """
    Get a single vector representation for the sentence.

    :param tokenizer: tokenizer matching the model
    :param model: a TensorFlow BERT model
    :param text: the input sentence
    :return: the mean of the last-layer token embeddings, as a 1D vector
    """
    # tokenize and get the input ids for the tokens
    input_ids = tf.constant(tokenizer.encode(text))[None, :]
    # get the embedding vectors
    outputs = model(input_ids)
    hidden_states = outputs[0]
    # calculate the mean over the token axis to reduce the output to 1D
    output_vector = np.mean(hidden_states[0], axis=0)
    # print(output_vector.shape)
    return output_vector
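# Usage sketch (hedged: bert-base-uncased is an illustrative checkpoint). The
# pooled vectors can be compared with cosine similarity.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')
v1 = word_embedding_tf_2d(tokenizer, model, "a cat sat on the mat")
v2 = word_embedding_tf_2d(tokenizer, model, "a dog lay on the rug")
cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))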
def build_bert_input(text_data, max_text_length=150):
    tokens, segments, input_masks = [], [], []
    tokenizer = BertTokenizer(
        vocab_file=os.path.join(bert_path, "vocab.txt")
    )  # initialize the tokenizer
    for text in text_data:
        indexed_tokens = tokenizer.encode(text)  # list of vocab indices
        if len(indexed_tokens) > max_text_length:
            # keep the head and tail halves of an over-long text
            indexed_tokens = (
                indexed_tokens[: max_text_length // 2]
                + indexed_tokens[-max_text_length // 2:]
            )
        tokens.append(indexed_tokens)
        segments.append([0] * len(indexed_tokens))
        input_masks.append([1] * len(indexed_tokens))

    # pad every sequence to max_text_length
    for j in range(len(tokens)):
        padding = [0] * (max_text_length - len(tokens[j]))
        tokens[j] += padding
        segments[j] += padding
        input_masks[j] += padding
    return tokens, segments, input_masks
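# Usage sketch (hedged: bert_path is a module-level global in the original
# snippet and must point at a directory containing vocab.txt).
tokens, segments, masks = build_bert_input(["first headline", "second headline"])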