def predict(inp: str, model: BertForMaskedLM, tokenizer: BertTokenizer, k: int = 3) -> List[str]:
    """
    Predict the top-k substitutes for an input text containing a single MASK token.

    :param inp: the input text
    :param model: a masked language model
    :param tokenizer: the tokenizer corresponding to the model
    :param k: the number of predictions
    :return: the list of top-k substitutes for the MASK token
    """
    kwargs = {'add_prefix_space': True} if isinstance(tokenizer, GPT2Tokenizer) else {}
    input_ids = tokenizer.encode(inp, add_special_tokens=True, **kwargs)
    mask_idx = input_ids.index(tokenizer.mask_token_id)
    input_ids = torch.tensor([input_ids])

    with torch.no_grad():
        (predictions,) = model(input_ids)

    predicted_tokens = []
    _, predicted_indices = torch.topk(predictions[0, mask_idx], k)
    for predicted_index in predicted_indices:
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index.item()])[0]
        predicted_tokens.append(predicted_token)
    return predicted_tokens
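# Usage sketch (not from the source): bert-base-uncased stands in for the
# checkpoint, and return_dict=False is passed because predict() above unpacks
# the older tuple-style model output.
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=False)
model.eval()

# Expect completions such as 'paris' near the top, depending on the checkpoint.
print(predict(f'The capital of France is {tokenizer.mask_token}.', model, tokenizer, k=3))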
class MLMModel:

    def __init__(self):
        self.model: BertForMaskedLM = BertForMaskedLM.from_pretrained(
            pretrained_model_name_or_path='Foodbert/foodbert/data/mlm_output/checkpoint-final')
        with open('Foodbert/foodbert/data/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        self.tokenizer = BertTokenizer(vocab_file='Foodbert/foodbert/data/bert-base-cased-vocab.txt',
                                       do_lower_case=False,
                                       max_len=128,
                                       never_split=used_ingredients)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)

    def predict_substitutes(self, sentence, ingredient_name, with_masking=True):
        search_id = self.tokenizer.mask_token_id if with_masking else \
            self.tokenizer.convert_tokens_to_ids([ingredient_name])[0]

        # Separate punctuation with spaces so it tokenizes as standalone tokens.
        sentence = sentence.replace('!', ' !').replace('?', ' ?').replace('.', ' .').replace(':', ' :').replace(',', ' ,')
        sentence = ' ' + sentence + ' '

        all_ordered_substitutes = []
        masked_sentence = sentence.replace(f' {ingredient_name} ', ' [MASK] ')
        input_ids = torch.tensor(self.tokenizer.encode(masked_sentence, add_special_tokens=True)).unsqueeze(0).to(device=self.device)
        prediction_scores = self.model(input_ids, masked_lm_labels=input_ids)[1][0]
        ingredient_scores = prediction_scores[input_ids[0] == search_id]
        for i in range(len(ingredient_scores)):
            ingredient_score = ingredient_scores[i]
            softmax_scores = ingredient_score.softmax(dim=0)
            indices = torch.sort(ingredient_score, descending=True).indices
            ordered_substitutes = self.tokenizer.convert_ids_to_tokens(indices)
            softmax_scores = softmax_scores[indices].tolist()
            all_ordered_substitutes.append((ordered_substitutes, softmax_scores))

        return all_ordered_substitutes
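# Usage sketch (not from the source): this needs the FoodBERT checkpoint paths
# hard-coded above and an older transformers release in which BertForMaskedLM
# still accepts the masked_lm_labels keyword. 'butter' is a hypothetical
# single-token ingredient from the model vocabulary.
mlm = MLMModel()
substitutes = mlm.predict_substitutes(
    sentence='Melt the butter in a large pan.',
    ingredient_name='butter',
    with_masking=True)
tokens, scores = substitutes[0]
print(list(zip(tokens[:5], scores[:5])))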
class NemoBertTokenizer(TokenizerSpec):
    def __init__(
        self,
        pretrained_model=None,
        vocab_file=None,
        do_lower_case=True,
        max_len=None,
        do_basic_tokenize=True,
        never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"),
    ):
        if pretrained_model:
            self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
            if "uncased" not in pretrained_model:
                self.tokenizer.basic_tokenizer.do_lower_case = False
        else:
            self.tokenizer = BertTokenizer(vocab_file, do_lower_case, do_basic_tokenize)
        self.vocab_size = len(self.tokenizer.vocab)
        self.never_split = never_split

    def text_to_tokens(self, text):
        tokens = self.tokenizer.tokenize(text)
        return tokens

    def tokens_to_text(self, tokens):
        text = self.tokenizer.convert_tokens_to_string(tokens)
        return remove_spaces(handle_quotes(text.strip()))

    def token_to_id(self, token):
        return self.tokens_to_ids([token])[0]

    def tokens_to_ids(self, tokens):
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        return ids

    def ids_to_tokens(self, ids):
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        return tokens

    def text_to_ids(self, text):
        tokens = self.text_to_tokens(text)
        ids = self.tokens_to_ids(tokens)
        return ids

    def ids_to_text(self, ids):
        tokens = self.ids_to_tokens(ids)
        tokens_clean = [t for t in tokens if t not in self.never_split]
        text = self.tokens_to_text(tokens_clean)
        return text

    def pad_id(self):
        return self.tokens_to_ids(["[PAD]"])[0]

    def bos_id(self):
        return self.tokens_to_ids(["[CLS]"])[0]

    def eos_id(self):
        return self.tokens_to_ids(["[SEP]"])[0]
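# Usage sketch (not from the source): round-trip text through the wrapper.
# ids_to_text additionally needs the remove_spaces/handle_quotes helpers defined
# elsewhere in the original module, so only the id/token methods are shown.
tok = NemoBertTokenizer(pretrained_model='bert-base-cased')
ids = tok.text_to_ids('Hello, world!')
print(ids)
print(tok.ids_to_tokens(ids))
print(tok.pad_id(), tok.bos_id(), tok.eos_id())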
class CustomBertVocab(object):
    def __init__(self, lang='en'):
        """Basic Vocabulary object"""
        self.lang = lang
        self.vocab_size = 0
        self.tokenizer = None

    def load(self, bert_vocab_path):
        """Load the vocabulary."""
        self.tokenizer = BertTokenizer(
            vocab_file=bert_vocab_path,
            never_split=['<num>', '<url>', '<img>', '</s>'])
        self.vocab_size = self.tokenizer.vocab_size

    def encode(self, words: list):
        """Encode words into ids."""
        ids = []
        for word in words:
            ids.append(self.tokenizer.convert_tokens_to_ids(word))
        return ids

    def decode(self, ids, decode_type: str):
        """Decode ids back into a sentence."""
        sentence = []
        for id in ids:
            if isinstance(id, torch.Tensor):
                word = self.tokenizer.convert_ids_to_tokens(id.item())
            else:
                word = self.tokenizer.convert_ids_to_tokens(id)
            if decode_type == 'predict':
                if word not in [EOS_TOKEN, SOS_TOKEN, PAD_TOKEN, IMG_TOKEN, MSP_TOKEN]:
                    sentence.append(word)
                if word == PAD_TOKEN or word == EOS_TOKEN:
                    break
            else:  # context question
                sentence.append(word)
                if word == PAD_TOKEN:
                    break
        if self.lang == 'zh':
            return ''.join(sentence)
        return ' '.join(sentence)
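# Usage sketch (not from the source): the special-token constants live elsewhere
# in the original module; the values and the vocab path below are assumptions
# for illustration only.
PAD_TOKEN, SOS_TOKEN, EOS_TOKEN = '[PAD]', '[CLS]', '[SEP]'
IMG_TOKEN, MSP_TOKEN = '<img>', '<num>'

vocab = CustomBertVocab(lang='en')
vocab.load('bert-base-uncased-vocab.txt')  # hypothetical local vocab file
ids = vocab.encode(['hello', 'world'])
print(vocab.decode(ids, decode_type='predict'))  # -> 'hello world'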
def _find_best_answer(self, input_ids: Optional[torch.FloatTensor],
                      start_logits: Optional[torch.FloatTensor],
                      end_logits: Optional[torch.FloatTensor],
                      tokenizer: BertTokenizer) -> List[str]:
    start_ids = torch.argmax(start_logits, dim=-1)
    end_ids = torch.argmax(end_logits, dim=-1)
    # Decode each predicted span and strip WordPiece '##' continuation markers.
    return [''.join(tokenizer.convert_ids_to_tokens(input_id[start:end].numpy())).replace(" ##", "").replace("##", "")
            for input_id, start, end in zip(input_ids, start_ids, end_ids)]
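# Usage sketch (not from the source): `self` is unused above, so the function can
# be exercised directly with a tuple-output QA checkpoint. Note the span slice is
# end-exclusive, so the token at the predicted end index is dropped.
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

qa_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
qa_tokenizer = BertTokenizer.from_pretrained(qa_name)
qa_model = BertForQuestionAnswering.from_pretrained(qa_name, return_dict=False)
qa_model.eval()

inputs = qa_tokenizer('Who wrote Hamlet?',
                      'Hamlet was written by William Shakespeare.',
                      return_tensors='pt')
with torch.no_grad():
    start_logits, end_logits = qa_model(**inputs)
print(_find_best_answer(None, inputs['input_ids'], start_logits, end_logits, qa_tokenizer))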
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # Use the GPU when the user requests it and one is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    # args.cuda = False
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()
    print('***********************Summary model start************************')

    while True:
        try:
            text = input()
            # re-summarize the output up to 5 times, feeding each summary back in as input
            for i in range(5):
                if len(text):
                    text = text[:1000]
                input_ids = [tokenizer.cls_token_id]  # every input starts with [CLS]
                input_ids.extend(tokenizer.encode(text))
                input_ids.append(tokenizer.sep_token_id)
                curr_input_tensor = torch.tensor(input_ids).long().to(device)
                generated = []
                # generate at most max_len tokens
                for _ in range(args.max_len):
                    outputs = model(input_ids=curr_input_tensor)
                    next_token_logits = outputs[0][-1, :]
                    # Penalize every token already generated so it is less likely to repeat
                    for id in set(generated):
                        next_token_logits[id] /= args.repetition_penalty
                    next_token_logits = next_token_logits / args.temperature
                    # Set the [UNK] logit to -inf so the model can never predict [UNK]
                    next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
                    filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                            top_k=args.topk,
                                                            top_p=args.topp)
                    # torch.multinomial draws num_samples elements from the candidates
                    # without replacement (higher weights are drawn more often) and
                    # returns the indices of the drawn elements
                    next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1),
                                                   num_samples=1)
                    if next_token == tokenizer.sep_token_id:  # [SEP] marks the end of the response
                        break
                    generated.append(next_token.item())
                    curr_input_tensor = torch.cat((curr_input_tensor, next_token), dim=0)
                text = tokenizer.convert_ids_to_tokens(generated)
                print("summary:" + "".join(text))
        except KeyboardInterrupt:
            break
class SimpleBertEmbeddings(WordEmbeddings):
    tokenizer: BertTokenizer
    model: BertModel
    special_tokens = []

    def __init__(self, bert_model_path: str):
        self.tokenizer = BertTokenizer(vocab_file=bert_model_path + '/vocab.txt')
        config = BertConfig.from_pretrained(bert_model_path + '/config.json',
                                            output_hidden_states=True)
        self.model = BertModel.from_pretrained(bert_model_path, config=config)
        self.model.eval()

    def convert(self, text: str) -> Dict[Word, List[float]]:
        print("[bert embeddings] analyze text:", text)
        lower_text = text.lower().replace("й", "и").replace("ё", "е").replace("́", "")
        token_ids = self.tokenizer.encode(lower_text)
        encoded_layers = self.model(input_ids=torch.tensor([token_ids]))
        hidden_layers = encoded_layers[2][1:]
        token_embeddings = torch.stack(hidden_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1, 0, 2)

        result: Dict[Word, List[float]] = {}
        text_pos = 0
        prev = None
        for i, token_vec in enumerate(token_embeddings):
            # todo: try only -12 layer: https://github.com/hanxiao/bert-as-service#q-so-which-layer-and-which-pooling-strategy-is-the-best
            # combine last 4 layers (best F1 score)
            cat_vec = torch.cat((token_vec[-1], token_vec[-2], token_vec[-3], token_vec[-4]), dim=0)
            if token_ids[i] in self.tokenizer.all_special_ids:
                continue
            token: str = self.tokenizer.convert_ids_to_tokens(token_ids[i])
            if token.startswith("##") and prev is not None:
                # Merge WordPiece continuations back into the preceding word,
                # summing their embedding vectors.
                clear_token = token.replace("##", "")
                word = Word(prev.text + clear_token, prev.start, prev.end + len(clear_token))
                result.update({word: np.add(result[prev], cat_vec.tolist()).tolist()})
                del result[prev]
                prev = word
                continue
            start = lower_text.find(token, text_pos)
            if start == -1:
                continue
            end = start + len(token)
            word = Word(token, start, end)
            text_pos = end
            prev = word
            result.update({word: cat_vec.tolist()})
        return result
def create_masked_lm_predictions(input_ids, masked_lm_prob, max_predictions_per_seq,
                                 tokenizer: BertTokenizer, rng):
    cand_indexes = []
    for (i, input_id) in enumerate(input_ids):
        token = tokenizer.convert_ids_to_tokens(input_id)
        if token == "[CLS]" or token == "[SEP]":
            continue
        cand_indexes.append(i)

    rng.shuffle(cand_indexes)
    output = list(input_ids)
    num_to_predict = min(max_predictions_per_seq,
                         max(1, int(round(len(input_ids) * masked_lm_prob))))

    MaskedLmInstance = collections.namedtuple("MaskedLmInstance", ["index", "id"])
    masked_lms = []
    covered_indexes = set()
    for index in cand_indexes:
        if len(masked_lms) >= num_to_predict:
            break
        if index in covered_indexes:
            continue
        covered_indexes.add(index)

        masked_token_id = None
        # 80% of the time, replace with [MASK]
        if rng.random() < 0.8:
            masked_token_id = tokenizer.mask_token_id
        else:
            # 10% of the time, keep original
            if rng.random() < 0.5:
                masked_token_id = input_ids[index]
            # 10% of the time, replace with random word
            else:
                masked_token_id = rng.randint(0, tokenizer.vocab_size - 1)
        output[index] = masked_token_id
        masked_lms.append(MaskedLmInstance(index=index, id=input_ids[index]))

    masked_lms = sorted(masked_lms, key=lambda x: x.index)
    # size=[num_to_predict]
    masked_lm_positions = []
    masked_lm_ids = []
    for p in masked_lms:
        masked_lm_positions.append(p.index)
        masked_lm_ids.append(p.id)

    return (output, masked_lm_positions, masked_lm_ids)
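# Usage sketch (not from the source): apply the BERT 80/10/10 masking scheme
# to a single encoded sentence with a seeded RNG.
import random

from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained('bert-base-uncased')
ids = tok.encode('The quick brown fox jumps over the lazy dog.', add_special_tokens=True)
masked, positions, label_ids = create_masked_lm_predictions(
    ids, masked_lm_prob=0.15, max_predictions_per_seq=3, tokenizer=tok,
    rng=random.Random(42))
print(tok.convert_ids_to_tokens(masked))
print(positions, tok.convert_ids_to_tokens(label_ids))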
class testAnswerGeneration():
    def __init__(self):
        self.tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')
        self.config = BertConfig.from_pretrained('trained_model/1/config.json')
        self.model = BertForMaskedLM.from_pretrained(
            'trained_model/1/pytorch_model.bin',
            from_tf=bool('.ckpt' in 'bert-base-chinese'),
            config=self.config)
        self.model.eval()

    def to_input_id(self, sentence_input):
        return self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.tokenize(sentence_input))

    def getAnswer(self, context, question):
        input_id = self.to_input_id("[CLS] " + context + " [SEP] " + question + " [SEP]")
        count = 0
        answer = ""
        maskpos = len(input_id)  # mark the position where the answer token will be predicted
        input_id.append(103)  # 103 is the [MASK] token id

        # pad to full length
        while len(input_id) < 512:
            input_id.append(0)

        # limit the answer to at most 10 tokens
        while count < 10:
            input_id_tensor = torch.LongTensor([input_id])
            outputs = self.model(input_id_tensor)
            predictions = outputs[0]
            predicted_index = torch.argmax(predictions[0, maskpos]).item()  # most likely token id
            predicted_token = self.tokenizer.convert_ids_to_tokens(predicted_index)  # id to token
            # stop generating once [SEP] is predicted
            if predicted_token == '[SEP]':
                break
            answer = answer + predicted_token  # concatenate the generated tokens
            input_id[maskpos] = predicted_index  # replace the current [MASK] id with the generated token id
            maskpos += 1
            if maskpos < 512:
                input_id[maskpos] = 103  # place the next [MASK] to predict
            else:
                break
            count += 1
        return answer
def main():
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)

    # bert tokenizer
    if LANG == 'en':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    elif LANG == 'jp':
        tokenizer = BertTokenizer(
            'vocab.txt',
            do_lower_case=False,
            do_basic_tokenize=False
        )

    # prepare data
    dummy_input = np.ones((1, MAX_SEQ_LEN), dtype=np.int64)
    tokens_ts, segments_ts, masked_index = text2token(
        SENTENCE, tokenizer, lang=LANG
    )
    input_data = np.array([tokens_ts, segments_ts])

    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=env_id)

    # compute execution time
    for i in range(5):
        start = int(round(time.time() * 1000))

        input_blobs = net.get_input_blob_list()
        for j, idx in enumerate(input_blobs):
            if j < len(input_data):
                net.set_input_blob_data(input_data[j], idx)
            else:
                net.set_input_blob_data(dummy_input, idx)
        net.update()
        preds_ailia = net.get_results()
        # preds_ailia = net.predict(dummy_input)[0]

        end = int(round(time.time() * 1000))
        print("ailia processing time {} ms".format(end - start))

    # masked word prediction
    predicted_indices = np.argsort(
        preds_ailia[0][0][masked_index]
    )[-NUM_PREDICT:][::-1]
    predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_indices)

    print('Input sentence: ' + SENTENCE)
    print(f'predicted top {NUM_PREDICT} words: {predicted_tokens}')
    print('Script finished successfully.')
def load_word2id(tokenizer: BertTokenizer) -> Dict[str, int]:
    """
    Loads model vocabulary in the form of mapping from words to their indexes.

    Args:
        tokenizer: `transformers.BertTokenizer` tokenizer

    Returns:
        model vocabulary
    """
    word2id = dict()
    for word_idx in range(tokenizer.vocab_size):
        word = tokenizer.convert_ids_to_tokens([word_idx])[0]
        word2id[word] = word_idx
    return word2id
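# Usage sketch (not from the source): build the reverse vocabulary once and
# reuse it for constant-time word-to-id lookups.
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained('bert-base-uncased')
word2id = load_word2id(tok)
print(word2id['[MASK]'])  # 103 for bert-base-uncased
print(word2id.get('hello'))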
def predict_fn(input_data, model):
    vocab_path = '/opt/ml/model/vocab.txt'
    tokenizer = BertTokenizer(vocab_path, do_lower_case=True)
    question, context = input_data['question'], input_data['context']
    input_ids = tokenizer.encode(question, context)
    # Token id 102 is [SEP]: everything up to the first [SEP] is the question
    # (segment 0), everything after it is the context (segment 1).
    token_type_ids = [
        0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))
    ]
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     token_type_ids=torch.tensor([token_type_ids]))
    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = ' '.join(
        all_tokens[torch.argmax(start_scores):torch.argmax(end_scores) + 1])
    return answer
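# Usage sketch (not from the source): predict_fn assumes the SageMaker layout
# hard-coded above, so /opt/ml/model/vocab.txt must exist. A tuple-output SQuAD
# checkpoint stands in for the deployed model.
from transformers import BertForQuestionAnswering

qa = BertForQuestionAnswering.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad', return_dict=False)
qa.eval()
answer = predict_fn({'question': 'Who wrote Hamlet?',
                     'context': 'Hamlet was written by Shakespeare.'}, qa)
print(answer)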
def explain_handle(self, model_wrapper, text, target=1):
    """Captum explanations handler.

    Args:
        model_wrapper: wrapped model (unused; a fresh wrapper is built from self.model)
        text (str): the unprocessed text (unused; self.text is explained instead)
        target (int): the target class for the attributions

    Returns:
        dict: A dictionary response with the explanations response.
    """
    vis_data_records_base = []
    model_wrapper = AGNewsmodelWrapper(self.model)
    tokenizer = BertTokenizer(self.VOCAB_FILE)
    model_wrapper.eval()
    model_wrapper.zero_grad()
    encoding = tokenizer.encode_plus(self.text,
                                     return_attention_mask=True,
                                     return_tensors="pt",
                                     add_special_tokens=False)
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    input_ids = input_ids.to(self.device)
    attention_mask = attention_mask.to(self.device)
    input_embedding_test = model_wrapper.model.bert_model.embeddings(input_ids)
    preds = model_wrapper(input_embedding_test, attention_mask)
    out = np.argmax(preds.cpu().detach(), axis=1)
    out = out.item()
    ig_1 = IntegratedGradients(model_wrapper)
    attributions, delta = ig_1.attribute(  # pylint: disable=no-member
        input_embedding_test,
        n_steps=500,
        return_convergence_delta=True,
        target=1,
    )
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy().tolist())
    feature_imp_dict = {}
    feature_imp_dict["words"] = tokens
    attributions_sum = self.summarize_attributions(attributions)
    feature_imp_dict["importances"] = attributions_sum.tolist()
    feature_imp_dict["delta"] = delta[0].tolist()
    self.add_attributions_to_visualizer(attributions, tokens,
                                        self.score_func(preds), out, 2, 1,
                                        delta, vis_data_records_base)
    return [feature_imp_dict]
@classmethod
def from_model_predictions(cls, probs: np.ndarray, tokenizer: BertTokenizer,
                           max_len: int = 1000):
    probs_with_indices = list(enumerate(probs))
    best_probs_with_indices = sorted(probs_with_indices,
                                     key=lambda x: x[1],
                                     reverse=True)
    best_words = {}
    for i, prob in best_probs_with_indices:
        word = tokenizer.convert_ids_to_tokens(i).lower()
        # Skip punctuation, stopwords, and duplicates.
        if word in punctuation | stopwords or word in best_words:
            continue
        best_words[word] = prob
        if len(best_words) == max_len:
            break
    best_probs = list(best_words.values())
    best_words = list(best_words.keys())
    return cls(best_words, best_probs)
class TrainLoop_BERT():
    def __init__(self, opt, args):
        self.opt = opt
        self.args = args

        self.batch_size = self.opt['batch_size']
        self.epoch = self.opt['epoch']
        self.use_cuda = opt['use_cuda']
        self.device = "cuda:{}".format(self.args.gpu) if self.use_cuda else 'cpu'
        self.args.device = self.device

        self.build_data()
        self.build_model()
        self.init_optim()

    def build_data(self):
        self.tokenizer = BertTokenizer(vocab_file=self.opt['vocab_path'])  # initialize the tokenizer
        # build and save dataset
        self.dataset = {'train': None, 'valid': None, 'test': None}
        self.dataset_loader = {'train': None, 'valid': None, 'test': None}
        for subset in self.dataset:
            self.dataset[subset] = CRSdataset(logger, subset,
                                              self.opt[f'{subset}_data_file'],
                                              self.args, self.tokenizer)
            self.dataset_loader[subset] = torch.utils.data.DataLoader(
                dataset=self.dataset[subset],
                batch_size=self.batch_size,
                shuffle=True)
        self.movie_num = self.dataset['train'].movie_num

    def build_model(self):
        self.model = BERTModel(self.args, self.movie_num)
        if self.use_cuda:
            self.model.to(self.device)

    def train(self):
        losses = []  # cleared every time the running loss is reported
        best_val_NDCG = 0.0
        gen_stop = False
        patience = 0
        max_patience = 5
        for i in range(self.epoch):
            train_loss = []
            for batch_idx, batch_data in tqdm(enumerate(self.dataset_loader['train'])):
                self.model.train()
                self.zero_grad()

                contexts, types, masks, y, _, _, _, _ = (data.to(self.device)
                                                         for data in batch_data)
                # sanity-check inputs and outputs
                # logger.info("[Context] ", batch_data[0])
                # logger.info("[Context] ", '\n'.join(self.vector2sentence(contexts.cpu())))
                # logger.info("[GT] ", y)
                # ipdb.set_trace()

                logit = self.model([contexts, types, masks], raw_return=False)
                # logger.info(logit[y])
                loss = self.model.compute_loss(logit, y, 'train')
                train_loss.append(loss.item())
                losses.append(loss.item())

                loss.backward()
                self.optimizer.step()
                # logger.info('loss = ', loss)

                if (batch_idx + 1) % 50 == 0:
                    # mean loss since the last report; report every 50 batches
                    loss = sum(losses) / len(losses)
                    logger.info('loss is %.4f' % (loss))
                    losses = []

            logger.info(f'Epoch {i}, train loss = {sum(train_loss)/len(train_loss)}')

            # metrics_test = self.val('train')
            metrics_test = self.val('valid')
            _ = self.val('test')

            if best_val_NDCG > metrics_test["NDCG50"]:
                patience += 1
                logger.info(f"[Patience = {patience}]")
                if patience >= max_patience:
                    gen_stop = True
            else:
                patience = 0
                best_val_NDCG = metrics_test["NDCG50"]
                self.model.save_model(self.opt['model_save_path'])
                logger.info("[Model saved in {}]".format(self.opt['model_save_path']))

            if gen_stop:
                break

    def val(self, subset):
        assert subset in ['train', 'test', 'valid']
        self.model.eval()
        val_dataset_loader = self.dataset_loader[subset]

        metrics_test = {
            "Loss": 0,
            "NDCG1": 0,
            "NDCG10": 0,
            "NDCG50": 0,
            "MRR1": 0,
            "MRR10": 0,
            "MRR50": 0,
            "count": 0
        }
        losses = []

        for batch_idx, batch_data in enumerate(val_dataset_loader):
            with torch.no_grad():
                contexts, types, masks, y, _, _, _, _ = (data.to(self.device)
                                                         for data in batch_data)
                logit = self.model([contexts, types, masks], raw_return=False)
                # ipdb.set_trace()
                loss = self.model.compute_loss(logit, y)
                self.compute_metrics(logit, y, metrics_test)
                losses.append(loss.item())

        metrics_test['Loss'] = sum(losses) / len(losses)
        for key in metrics_test:
            if 'NDCG' in key or 'MRR' in key:
                metrics_test[key] = round(metrics_test[key] / metrics_test['count'], 4)

        logger.info(f"{subset} set's metrics = {metrics_test}")
        return metrics_test

    def compute_metrics(self, logit, y, metrics):
        for K in [1, 10, 50]:
            # pred = logit.max(-1, keepdim=True)[1]
            # acc += pred.eq(y.view_as(pred)).sum().item()  # remember to call item()
            pred, pred_id = torch.topk(logit, K, dim=1)  # id = [bs, K]
            for i, gt in enumerate(y):
                gt = gt.item()
                cand_ids = pred_id[i].tolist()
                if gt in cand_ids:
                    rank = cand_ids.index(gt)
                    metrics['NDCG' + str(K)] += 1.0 / math.log(rank + 2.0, 2)
                    metrics['MRR' + str(K)] += 1.0 / (rank + 1.0)
        # metrics['count'] += 1
        # metrics['count'] = int(metrics['count']/3)
        assert len(y.shape) == 1
        metrics['count'] += y.shape[0]

    def vector2sentence(self, batch_sen, compat=True):
        # convert a batch of sentences from ids back to tokens
        sentences = []
        # for sen in batch_sen.numpy():
        #     sentences.append(self.tokenizer.convert_ids_to_tokens(sen))
        for sen in batch_sen.numpy().tolist():
            sentence = []
            for word in sen:
                if word != 0:
                    sentence.append(self.tokenizer.convert_ids_to_tokens(word))
                # elif word == 3:
                #     sentence.append('_UNK_')
            if compat:
                sentence = ''.join(sentence)
            sentences.append(sentence)
        return sentences

    @classmethod
    def optim_opts(self):
        """
        Fetch optimizer selection.

        By default, collects everything in torch.optim, as well as importing:
        - qhm / qhmadam if installed from github.com/facebookresearch/qhoptim

        Override this (and probably call super()) to add your own optimizers.
        """
        # first pull torch.optim in
        optims = {
            k.lower(): v
            for k, v in optim.__dict__.items()
            if not k.startswith('__') and k[0].isupper()
        }
        try:
            import apex.optimizers.fused_adam as fused_adam
            optims['fused_adam'] = fused_adam.FusedAdam
        except ImportError:
            pass

        try:
            # https://openreview.net/pdf?id=S1fUpoR5FQ
            from qhoptim.pyt import QHM, QHAdam
            optims['qhm'] = QHM
            optims['qhadam'] = QHAdam
        except ImportError:
            # no QHM installed
            pass
        logger.info(optims)
        return optims

    def init_optim(self):
        param_optimizer = list(self.model.bert.named_parameters())  # list of named model parameters
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer]
        }]

        fc_optimizer = list(self.model.fc.named_parameters())  # list of named model parameters
        optimizer_grouped_parameters += [{
            'params': [p for n, p in fc_optimizer],
            'lr': self.opt['lr_sasrec']
        }]

        # self.optimizer = transformers.AdamW(self.model.parameters(), lr=self.opt['lr'])
        self.optimizer = transformers.AdamW(optimizer_grouped_parameters,
                                            lr=self.opt['lr_bert'])
        # self.scheduler = transformers.WarmupLinearSchedule(
        #     self.optimizer, warmup_steps=self.opt['warmup_steps'],
        #     t_total=len(self.dataset_loader['train']) * self.epoch)

    def zero_grad(self):
        """
        Zero out optimizer.

        It is recommended you call this in train_step. It automatically handles
        gradient accumulation if agent is called with --update-freq.
        """
        self.optimizer.zero_grad()
class TrainLoop_SASRec():
    def __init__(self, opt, args):
        self.opt = opt
        self.args = args

        self.batch_size = self.args.batch_size
        self.epoch = self.args.epoch
        self.use_cuda = self.args.use_cuda
        self.device = "cuda:{}".format(self.args.gpu) if self.use_cuda else 'cpu'
        self.args.device = self.device

        self.build_data()
        # bs, item_num+1: [gt, all_item_id]
        self.default_neg_sampled = torch.tensor(
            [0] + [i for i in range(1, self.args.item_size)],
            dtype=torch.long).repeat(self.args.batch_size, 1).to(self.device)

        self.build_model()
        self.init_optim()

    def build_data(self):
        # initialize the tokenizer
        self.tokenizer = BertTokenizer(vocab_file=self.args.vocab_path)
        # build and save self.dataset
        self.dataset = {'train': None, 'valid': None, 'test': None}
        self.dataset_loader = {'train': None, 'valid': None, 'test': None}
        for subset in self.dataset:
            self.dataset[subset] = CRSdataset(logger, subset,
                                              self.opt[f'{subset}_data_file'],
                                              self.args, self.tokenizer)
            self.dataset_loader[subset] = torch.utils.data.DataLoader(
                dataset=self.dataset[subset],
                batch_size=self.batch_size,
                shuffle=True)
        # self.dataset['train'].movie_num already counts the extra unk movie;
        # +1 shifts all ids up by one so that id 0 can be reserved
        self.item_size = self.dataset['train'].movie_num + 1
        self.args.item_size = self.item_size

    def build_model(self):
        self.model = SASRecModel(args=self.args)
        if self.args.load_model:
            self.model.load_model(self.args.sasrec_load_path)
        if self.use_cuda:
            self.model.to(self.device)

    def train(self):
        losses = []  # cleared every time the running loss is reported
        best_val_NDCG = 0.0
        gen_stop = False
        patience = 0
        max_patience = 5
        for i in range(self.epoch):
            train_loss = []
            # for batch_idx, batch_data in tqdm(enumerate(self.rec_train_dataloader)):
            for batch_idx, batch_data in enumerate(self.dataset_loader['train']):
                self.model.train()
                self.zero_grad()

                batch_data = [data.to(self.device) for data in batch_data]
                input_ids, target_pos, input_mask, sample_negs = batch_data[-4:]
                # print(input_ids)
                # print(target_pos)

                sequence_output = self.model(input_ids, input_mask, self.args.use_cuda)
                loss = self.model.cross_entropy(sequence_output, target_pos,
                                                sample_negs, self.use_cuda)
                train_loss.append(loss.item())
                losses.append(loss.item())

                loss.backward()
                self.optimizer.step()

                if (batch_idx + 1) % 1000000000000000 == 0:
                    loss = sum(losses) / len(losses)
                    logger.info('loss is %.4f' % (loss))
                    losses = []

            logger.info(f'Epoch {i}, train loss = {sum(train_loss)/len(train_loss)}')

            # metrics_test = self.val('train')
            metrics_test = self.val('valid')
            _ = self.val('test')

            # what the hell is this False?
            if best_val_NDCG > metrics_test["NDCG50"]:
                patience += 1
                logger.info(f"[Patience = {patience}]")
                if patience >= max_patience:
                    gen_stop = True
            else:
                patience = 0
                best_val_NDCG = metrics_test["NDCG50"]
                self.model.save_model(self.args.sasrec_save_path)
                logger.info(f"[Model saved in {self.args.sasrec_save_path}]")

            if gen_stop:
                break

        # metrics_test = self.val('test')

    def val(self, subset):
        assert subset in ['train', 'test', 'valid']
        self.model.eval()
        val_dataset_loader = self.dataset_loader[subset]

        metrics_test = {
            "Loss": 0,
            "NDCG1": 0,
            "NDCG10": 0,
            "NDCG50": 0,
            "MRR1": 0,
            "MRR10": 0,
            "MRR50": 0,
            "count": 0
        }
        losses = []

        for batch_idx, batch_data in enumerate(val_dataset_loader):
            with torch.no_grad():
                batch_data = [data.to(self.device) for data in batch_data]
                _, _, _, predict_ids, input_ids, target_pos, input_mask, sample_negs = batch_data
                # print(input_ids)
                # print(target_pos)
                # print(predict_ids)

                # bs, max_len, hidden_size2
                sequence_output = self.model(input_ids, input_mask, self.args.use_cuda)
                loss = self.model.cross_entropy(sequence_output, target_pos,
                                                sample_negs, self.use_cuda)

                # bs, item_num
                for i in range(predict_ids.shape[0]):
                    self.default_neg_sampled[i][0] = predict_ids[i]
                # recommendation results
                test_logits = self.predict(
                    sequence_output,
                    self.default_neg_sampled[:predict_ids.shape[0]],
                    self.use_cuda)
                self.compute_metrics(test_logits, metrics_test)
                losses.append(loss.item())

        # end of evaluation
        metrics_test['Loss'] = sum(losses) / len(losses)
        for key in metrics_test:
            if 'NDCG' in key or 'MRR' in key:
                metrics_test[key] = round(metrics_test[key] / metrics_test['count'], 4)

        logger.info(f"{subset} set's metrics = {metrics_test}")
        return metrics_test

    def predict(self, seq_out, test_neg_sample, use_cuda=True):
        # shortcut: only the last item's representation in each sequence is
        # dotted with all candidate representations
        # [batch item_num hidden_size]
        test_item_emb = self.model.embeddings.item_embeddings(test_neg_sample)
        # [batch 1 hidden]
        seq_out = seq_out[:, -1, :].unsqueeze(1)
        # [batch 1 item_num]
        test_logits = torch.matmul(seq_out, test_item_emb.transpose(1, 2))
        # print(test_logits.shape)
        # [batch item_num]
        test_logits = test_logits[:, -1, :]
        return test_logits

    def compute_metrics(self, logit, metrics):
        MRR1, NDCG1 = self.get_metric(logit, topk=1)
        # ipdb.set_trace()
        metrics['MRR1'] += MRR1
        metrics['NDCG1'] += NDCG1

        MRR10, NDCG10 = self.get_metric(logit, topk=10)
        metrics['MRR10'] += MRR10
        metrics['NDCG10'] += NDCG10

        MRR50, NDCG50 = self.get_metric(logit, topk=50)
        metrics['MRR50'] += MRR50
        metrics['NDCG50'] += NDCG50

        metrics['count'] += 1

    def get_metric(self, test_logits, topk=10):
        NDCG = 0.0
        MRR = 0.0
        # [batch] final rank of the correct answer in each example
        ranks = test_logits.argsort(descending=True).argsort()[:, 0].cpu()
        ranks_size = int(ranks.size(0))
        for rank in ranks:
            if rank < topk:
                NDCG += float(1.0 / np.log2(rank + 2.0))
                MRR += float(1.0 / np.array(rank + 1.0))
        return MRR / ranks_size, NDCG / ranks_size

    def save_embed(self):
        torch.save(self.model.embeddings.item_embeddings.state_dict(),
                   self.args.sasrec_emb_save_path)

    def vector2sentence(self, batch_sen, compat=True):
        # convert a batch of sentences from ids back to tokens
        sentences = []
        # for sen in batch_sen.numpy():
        #     sentences.append(self.tokenizer.convert_ids_to_tokens(sen))
        for sen in batch_sen.numpy().tolist():
            sentence = []
            for word in sen:
                if word != 0:
                    sentence.append(self.tokenizer.convert_ids_to_tokens(word))
                # elif word == 3:
                #     sentence.append('_UNK_')
            if compat:
                sentence = ''.join(sentence)
            sentences.append(sentence)
        return sentences

    @classmethod
    def optim_opts(self):
        """
        Fetch optimizer selection.

        By default, collects everything in torch.optim, as well as importing:
        - qhm / qhmadam if installed from github.com/facebookresearch/qhoptim

        Override this (and probably call super()) to add your own optimizers.
        """
        # first pull torch.optim in
        optims = {
            k.lower(): v
            for k, v in optim.__dict__.items()
            if not k.startswith('__') and k[0].isupper()
        }
        try:
            import apex.optimizers.fused_adam as fused_adam
            optims['fused_adam'] = fused_adam.FusedAdam
        except ImportError:
            pass

        try:
            # https://openreview.net/pdf?id=S1fUpoR5FQ
            from qhoptim.pyt import QHM, QHAdam
            optims['qhm'] = QHM
            optims['qhadam'] = QHAdam
        except ImportError:
            # no QHM installed
            pass
        logger.info(optims)
        return optims

    def init_optim(self):
        betas = (self.args.adam_beta1, self.args.adam_beta2)
        self.optimizer = Adam(self.model.parameters(),
                              lr=self.args.lr_sasrec,
                              betas=betas,
                              weight_decay=self.args.weight_decay)
        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))

    def zero_grad(self):
        """
        Zero out optimizer.

        It is recommended you call this in train_step. It automatically handles
        gradient accumulation if agent is called with --update-freq.
        """
        self.optimizer.zero_grad()
def main():
    ## parser settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--file_path', '-fp', type=str)
    parser.add_argument('--model_path', '-mdp', type=str)
    parser.add_argument('--evaluate_file_name', '-evaflnm', type=str)
    parser.add_argument('--analysis_file_name', '-aysflnm', type=str)
    parser.add_argument('--gpu_num', '-gpun', type=int)
    args = parser.parse_args()

    ## defaults
    tokenizer = BertTokenizer(vocab_file='bert-base-uncased-vocab.txt')
    nlp = spacy.load(
        "model/spacy/en_core_web_md-2.3.1/en_core_web_md/en_core_web_md-2.3.1")
    accepted_pos_list = get_accepted_pos_list()

    ## for debug
    if args.file_path is None:
        args.file_path = 'mingda_chen_dataset/test_input.txt'
    if args.model_path is None:
        args.model_path = 'trained_model/sequential_6000/4/pytorch_model.bin'
    if args.evaluate_file_name is None:
        args.evaluate_file_name = "sequence"
    if args.analysis_file_name is None:
        args.analysis_file_name = "analysis_" + args.evaluate_file_name
    if args.gpu_num is None:
        args.gpu_num = torch.cuda.device_count()

    ## GPU setting
    if torch.cuda.device_count() > 1:
        device = torch.device('cuda')
        os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    else:
        device = torch.device('cpu')
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    # load model
    config = BertConfig.from_pretrained('bert-base-uncased')
    model = BertForMaskedLM.from_pretrained(args.model_path, config=config)
    model.to(device)
    model.eval()

    # read data
    semantic_list, syntactic_list = get_dataset_list(args.file_path)
    all_syntactic_keyword_list = get_all_syntactic_keyword_list(
        syntactic_list, accepted_pos_list, tokenizer, nlp)
    # write_syntactic_keyword(all_syntactic_keyword_list)
    all_syntactic_keyword_with_sep_list = insert_sep_token(all_syntactic_keyword_list)

    predict_sentence_list = []
    for index, semantic_sentence in enumerate(tqdm(semantic_list)):
        predict_sentence = ["[MASK]"]
        repeat_flag = False
        while "[MASK]" in predict_sentence:
            input_ids_list, input_segment_list, input_attention_list = data_preprocess(
                semantic_sentence, all_syntactic_keyword_with_sep_list[index],
                predict_sentence, tokenizer)
            input_id_tensor, input_segment_tensor, input_attention_tensor = convert_to_tensor(
                input_ids_list, input_segment_list, input_attention_list, device)
            outputs = model(return_dict=True,
                            input_ids=input_id_tensor,
                            token_type_ids=input_segment_tensor,
                            attention_mask=input_attention_tensor)
            logits = outputs[0]
            maskpos = input_ids_list.index(103)  # 103 is the [MASK] token id
            predicted_index = torch.argmax(logits[0, maskpos]).item()
            predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
            predict_sentence.remove("[MASK]")

            ## check for repetition
            count_dict = dict(Counter(predict_sentence))
            for key in count_dict:
                if count_dict[key] > 3:
                    repeat_flag = True
            if repeat_flag:
                predict_sentence_list.append(extract_sentence_from_list(predict_sentence))
                break

            if predicted_token != "[SEP]":
                predict_sentence.append(predicted_token)
                predict_sentence.append("[MASK]")
            else:
                predict_sentence_list.append(extract_sentence_from_list(predict_sentence))
                break

    output_evaluate(ref=predict_sentence_list, filename=args.evaluate_file_name)
    produce_analysis_file(all_syntactic_keyword_list, predict_sentence_list,
                          args.analysis_file_name)
    return 0
class Scoring(object):
    def __init__(self, BERT_PATH):
        self.config = BertConfig.from_json_file(BERT_PATH + "/bert_config.json")
        self.model = BertForPreTraining.from_pretrained(BERT_PATH + "/bert_model.ckpt",
                                                        from_tf=True,
                                                        config=self.config)
        self.tokenizer = BertTokenizer(BERT_PATH + "/vocab.txt")
        self.model.eval()
        self.model.cuda(args.gpu_id)

    def sentence_preprocese(self, text):
        tokenized_text = np.array(self.tokenizer.tokenize(text))
        find_sep = np.argwhere(tokenized_text == '[SEP]')
        segments_ids = np.zeros(tokenized_text.shape, dtype=int)
        if find_sep.size == 1:
            start_point = 1
        else:
            start_point = find_sep[0, 0] + 1
            segments_ids[start_point:] = 1
        end_point = tokenized_text.size - 1

        tokenized_text = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        masked_texts = []
        # mask each position in turn, left to right
        for masked_index in range(start_point, end_point):
            new_tokenized_text = np.array(tokenized_text, dtype=int)
            new_tokenized_text[masked_index] = self.tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
            masked_texts.append(new_tokenized_text)
        # copy the segments_ids for every masked variant
        segments_ids = np.tile(segments_ids, (end_point - start_point, 1))

        return masked_texts, segments_ids, start_point, end_point, tokenized_text[start_point:end_point]

    def metric(self, text):
        indexed_tokens, segments_ids, start_point, end_point, real_indexs = \
            self.sentence_preprocese(text)

        tokens_tensor = torch.tensor(indexed_tokens)
        segments_tensors = torch.tensor(segments_ids)

        tokens_tensor = tokens_tensor.cuda(args.gpu_id)
        segments_tensors = segments_tensors.cuda(args.gpu_id)

        # model return: tuple()
        # 1. prediction_scores (batch_size X sequence_length X config.vocab_size);
        # 2. seq_relationship_scores (batch_size X 2)
        with torch.no_grad():
            outputs = self.model(tokens_tensor, token_type_ids=segments_tensors)
            predictions = torch.softmax(outputs[0], -1)

        log_likelihood = 0  # cumulative log2-likelihood of the real tokens
        for i, step in enumerate(range(start_point, end_point)):
            predicted_index = torch.argmax(predictions[i, step]).item()
            predicted_token = self.tokenizer.convert_ids_to_tokens([predicted_index])
            real_pos_prob = predictions[i, step, real_indexs[i]].item()
            real_token = self.tokenizer.convert_ids_to_tokens([real_indexs[i]])
            if args.prob_token:
                print("The", i + 1, "th position: {pred_token}", predicted_token,
                      round(predictions[i, step, predicted_index].item(), 4),
                      "\t\t\t {golden_token}", real_token, round(real_pos_prob, 4))
            log_likelihood += np.log2(real_pos_prob)

        prob = np.exp2(log_likelihood)  # probability of the whole masked span
        nll = -log_likelihood / (end_point - start_point)
        ppl = np.exp2(nll)
        return nll, ppl
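# Usage sketch (not from the source): score a sentence's pseudo-perplexity with
# a local TensorFlow BERT checkpoint directory containing bert_config.json,
# bert_model.ckpt, and vocab.txt. The path below is hypothetical; `args` with
# gpu_id / prob_token is a module-level namespace in the original script, and a
# CUDA device is required.
scorer = Scoring('uncased_L-12_H-768_A-12')
nll, ppl = scorer.metric('[CLS] the cat sat on the mat . [SEP]')
print(f'nll = {nll:.4f}, ppl = {ppl:.4f}')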