from typing import List

import torch
from transformers import BertForMaskedLM, BertTokenizer, GPT2Tokenizer


def predict(inp: str,
            model: BertForMaskedLM,
            tokenizer: BertTokenizer,
            k: int = 3) -> List[str]:
    """
    Predict the top-k substitutes for an input text containing a single MASK token.
    :param inp: the input text
    :param model: a masked language model
    :param tokenizer: the tokenizer corresponding to the model
    :param k: the number of predictions
    :return: the list of top-k substitutes for the MASK token
    """
    kwargs = {
        'add_prefix_space': True
    } if isinstance(tokenizer, GPT2Tokenizer) else {}
    input_ids = tokenizer.encode(inp, add_special_tokens=True, **kwargs)
    mask_idx = input_ids.index(tokenizer.mask_token_id)
    input_ids = torch.tensor([input_ids])

    with torch.no_grad():
        (predictions, ) = model(input_ids)

    predicted_tokens = []
    _, predicted_indices = torch.topk(predictions[0, mask_idx], k)

    for predicted_index in predicted_indices:
        predicted_token = tokenizer.convert_ids_to_tokens(
            [predicted_index.item()])[0]
        predicted_tokens.append(predicted_token)
    return predicted_tokens
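
A minimal usage sketch for predict() above. It assumes the public
'bert-base-uncased' checkpoint and a transformers release whose models
return plain tuples (on transformers 4.x, pass return_dict=False inside
predict instead):

model = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model.eval()

# prints the top-3 candidate tokens for the masked position
print(predict('The capital of France is [MASK].', model, tokenizer, k=3))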
Example #2
class MLMModel:

    def __init__(self):
        self.model: BertForMaskedLM = BertForMaskedLM.from_pretrained(
            pretrained_model_name_or_path='Foodbert/foodbert/data/mlm_output/checkpoint-final')
        with open('Foodbert/foodbert/data/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        self.tokenizer = BertTokenizer(vocab_file='Foodbert/foodbert/data/bert-base-cased-vocab.txt', do_lower_case=False,
                                       max_len=128, never_split=used_ingredients)

        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)

    def predict_substitutes(self, sentence, ingredient_name, with_masking=True):
        search_id = self.tokenizer.mask_token_id if with_masking else \
            self.tokenizer.convert_tokens_to_ids([ingredient_name])[0]
        sentence = sentence.replace('!', ' !').replace('?', ' ?').replace('.', ' .').replace(':', ' :').replace(',', ' ,')
        sentence = ' ' + sentence + ' '

        all_ordered_substitutes = []

        masked_sentence = sentence.replace(f' {ingredient_name} ', ' [MASK] ')
        input_ids = torch.tensor(self.tokenizer.encode(masked_sentence, add_special_tokens=True)).unsqueeze(0).to(device=self.device)
        prediction_scores = self.model(input_ids, masked_lm_labels=input_ids)[1][0]
        ingredient_scores = prediction_scores[input_ids[0] == search_id]

        for i in range(len(ingredient_scores)):
            ingredient_score = ingredient_scores[i]
            softmax_scores = ingredient_score.softmax(dim=0)
            indices = torch.sort(ingredient_score, descending=True).indices
            ordered_substitutes = self.tokenizer.convert_ids_to_tokens(indices)
            softmax_scores = softmax_scores[indices].tolist()
            all_ordered_substitutes.append((ordered_substitutes, softmax_scores))

        return all_ordered_substitutes
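
A hedged usage sketch for MLMModel above; the sentence and ingredient are
illustrative, the hard-coded Foodbert checkpoint paths must exist, and the
masked_lm_labels argument ties the class to an older transformers release:

mlm = MLMModel()
substitutes = mlm.predict_substitutes(
    sentence='Mix the flour with butter and sugar.',
    ingredient_name='butter')
for ordered_substitutes, softmax_scores in substitutes:
    print(ordered_substitutes[:5], softmax_scores[:5])  # top-5 per position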
Example #3
class NemoBertTokenizer(TokenizerSpec):
    def __init__(
            self,
            pretrained_model=None,
            vocab_file=None,
            do_lower_case=True,
            max_len=None,
            do_basic_tokenize=True,
            never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"),
    ):
        if pretrained_model:
            self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
            if "uncased" not in pretrained_model:
                self.tokenizer.basic_tokenizer.do_lower_case = False
        else:
            self.tokenizer = BertTokenizer(vocab_file, do_lower_case,
                                           do_basic_tokenize)
        self.vocab_size = len(self.tokenizer.vocab)
        self.never_split = never_split

    def text_to_tokens(self, text):
        tokens = self.tokenizer.tokenize(text)
        return tokens

    def tokens_to_text(self, tokens):
        text = self.tokenizer.convert_tokens_to_string(tokens)
        return remove_spaces(handle_quotes(text.strip()))

    def token_to_id(self, token):
        return self.tokens_to_ids([token])[0]

    def tokens_to_ids(self, tokens):
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        return ids

    def ids_to_tokens(self, ids):
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        return tokens

    def text_to_ids(self, text):
        tokens = self.text_to_tokens(text)
        ids = self.tokens_to_ids(tokens)
        return ids

    def ids_to_text(self, ids):
        tokens = self.ids_to_tokens(ids)
        tokens_clean = [t for t in tokens if t not in self.never_split]
        text = self.tokens_to_text(tokens_clean)
        return text

    def pad_id(self):
        return self.tokens_to_ids(["[PAD]"])[0]

    def bos_id(self):
        return self.tokens_to_ids(["[CLS]"])[0]

    def eos_id(self):
        return self.tokens_to_ids(["[SEP]"])[0]
Example #4
class CustomBertVocab(object):
    def __init__(self, lang='en'):
        """Basic Vocabulary object"""
        self.lang = lang
        self.vocab_size = 0
        self.tokenizer = None

    def load(self, bert_vocab_path):
        """load 词汇表"""
        self.tokenizer = BertTokenizer(
            vocab_file=bert_vocab_path,
            never_split=['<num>', '<url>', '<img>', '</s>'])
        self.vocab_size = self.tokenizer.vocab_size

    def encode(self, words: list):
        """words 编码"""
        ids = []
        for word in words:
            ids.append(self.tokenizer.convert_tokens_to_ids(word))

        return ids

    def decode(self, ids, decode_type: str):
        """ids 解码"""
        sentence = []
        for id in ids:
            if isinstance(id, torch.Tensor):
                word = self.tokenizer.convert_ids_to_tokens(id.item())
            else:
                word = self.tokenizer.convert_ids_to_tokens(id)
            if decode_type == 'predict':
                if word not in [
                        EOS_TOKEN, SOS_TOKEN, PAD_TOKEN, IMG_TOKEN, MSP_TOKEN
                ]:
                    sentence.append(word)
                if word == PAD_TOKEN or word == EOS_TOKEN:
                    break
            else:  # context question
                sentence.append(word)
                if word == PAD_TOKEN:
                    break
        if self.lang == 'zh':
            return ''.join(sentence)

        return ' '.join(sentence)
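
A hedged usage sketch for CustomBertVocab; 'vocab.txt' is a placeholder path
to a BERT vocabulary file, and decode() additionally needs the PAD_TOKEN,
EOS_TOKEN, etc. constants from the surrounding module, so only encode() is
exercised here:

vocab = CustomBertVocab(lang='en')
vocab.load('vocab.txt')  # placeholder vocabulary path
print(vocab.vocab_size)
print(vocab.encode(['hello', 'world']))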
Example #5
 def _find_best_answer(self,
                       input_ids: Optional[torch.FloatTensor],
                       start_logits: Optional[torch.FloatTensor],
                       end_logits: Optional[torch.FloatTensor],
                       tokenizer: BertTokenizer) -> List[str]:
     start_ids = torch.argmax(start_logits, dim=-1)
     end_ids = torch.argmax(end_logits, dim=-1)
     return [''.join(tokenizer.convert_ids_to_tokens(input_id[start:end].numpy())).replace(" ##", "").replace("##", "")
             for input_id, start, end in zip(input_ids, start_ids, end_ids)]
Example #6
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when the user requests it and one is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    # args.cuda = False
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()

    print('***********************Summary model start************************')

    while True:
        try:

            text = input()
            for i in range(5):
                if text:
                    text = text[:1000]  # cap the input length at 1000 characters
                input_ids = [tokenizer.cls_token_id]  # every input starts with [CLS]
                input_ids.extend(tokenizer.encode(text))
                input_ids.append(tokenizer.sep_token_id)
                curr_input_tensor = torch.tensor(input_ids).long().to(device)

                generated = []
                # generate at most max_len tokens
                for _ in range(args.max_len):
                    outputs = model(input_ids=curr_input_tensor)
                    next_token_logits = outputs[0][-1, :]
                    # apply a repetition penalty to every token already in generated, lowering its probability
                    for id in set(generated):
                        next_token_logits[id] /= args.repetition_penalty
                    next_token_logits = next_token_logits / args.temperature
                    # set the [UNK] logit to -inf so the model can never predict [UNK]
                    next_token_logits[tokenizer.convert_tokens_to_ids(
                        '[UNK]')] = -float('Inf')
                    filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                            top_k=args.topk,
                                                            top_p=args.topp)
                    # torch.multinomial draws num_samples elements without replacement, weighted by probability, and returns their indices
                    next_token = torch.multinomial(F.softmax(filtered_logits,
                                                             dim=-1),
                                                   num_samples=1)
                    if next_token == tokenizer.sep_token_id:  # [SEP] marks the end of the generated response
                        break
                    generated.append(next_token.item())
                    curr_input_tensor = torch.cat(
                        (curr_input_tensor, next_token), dim=0)

                text = tokenizer.convert_ids_to_tokens(generated)
                print("summary:" + "".join(text))

        except KeyboardInterrupt:
            break
Example #7
class SimpleBertEmbeddings(WordEmbeddings):
    tokenizer: BertTokenizer
    model: BertModel
    special_tokens = []

    def __init__(self, bert_model_path: str):
        self.tokenizer = BertTokenizer(vocab_file=bert_model_path +
                                       '/vocab.txt')
        config = BertConfig.from_pretrained(bert_model_path + '/config.json',
                                            output_hidden_states=True)
        self.model = BertModel.from_pretrained(bert_model_path, config=config)
        self.model.eval()

    def convert(self, text: str) -> Dict[Word, List[float]]:
        print("[bert embeddings] analyze text:", text)
        lower_text = text.lower().replace("й",
                                          "и").replace("ё",
                                                       "е").replace("́", "")
        token_ids = self.tokenizer.encode(lower_text)

        encoded_layers = self.model(input_ids=torch.tensor([token_ids]))
        hidden_layers = encoded_layers[2][1:]
        token_embeddings = torch.stack(hidden_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1, 0, 2)
        result: Dict[Word, List[float]] = {}
        text_pos = 0
        prev = None
        for i, token_vec in enumerate(token_embeddings):
            # todo: try only -12 layer: https://github.com/hanxiao/bert-as-service#q-so-which-layer-and-which-pooling-strategy-is-the-best
            # combine last 4 layers (best F1 score)
            cat_vec = torch.cat(
                (token_vec[-1], token_vec[-2], token_vec[-3], token_vec[-4]),
                dim=0)
            if token_ids[i] in self.tokenizer.all_special_ids:
                continue
            token: str = self.tokenizer.convert_ids_to_tokens(token_ids[i])
            if token.startswith("##") and prev is not None:
                clear_token = token.replace("##", "")
                word = Word(prev.text + clear_token, prev.start,
                            prev.end + len(clear_token))
                result.update(
                    {word: np.add(result[prev], cat_vec.tolist()).tolist()})
                del result[prev]
                prev = word
                continue
            start = lower_text.find(token, text_pos)
            if start == -1:
                continue
            end = start + len(token)
            word = Word(token, start, end)
            text_pos = end
            prev = word
            result.update({word: cat_vec.tolist()})
        return result
Example #8
def create_masked_lm_predictions(input_ids, masked_lm_prob,
                                 max_predictions_per_seq,
                                 tokenizer: BertTokenizer, rng):
    cand_indexes = []
    for (i, input_id) in enumerate(input_ids):
        token = tokenizer.convert_ids_to_tokens(input_id)
        if token == "[CLS]" or token == "[SEP]":
            continue
        cand_indexes.append(i)

    rng.shuffle(cand_indexes)

    output = list(input_ids)

    num_to_predict = min(max_predictions_per_seq,
                         max(1, int(round(len(input_ids) * masked_lm_prob))))

    masked_lms = []
    covered_indexes = set()
    for index in cand_indexes:
        if len(masked_lms) >= num_to_predict:
            break
        if index in covered_indexes:
            continue
        covered_indexes.add(index)

        masked_token_id = None
        # 80% of the time, replace with [MASK]
        if rng.random() < 0.8:
            masked_token_id = tokenizer.mask_token_id
        else:
            # 10% of the time, keep original
            if rng.random() < 0.5:
                masked_token_id = input_ids[index]
            # 10% of the time, replace with random word
            else:
                masked_token_id = rng.randint(0, tokenizer.vocab_size - 1)

        output[index] = masked_token_id

        MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                                  ["index", "id"])

        masked_lms.append(MaskedLmInstance(index=index, id=input_ids[index]))

    masked_lms = sorted(masked_lms,
                        key=lambda x: x.index)  ## size=[num_to_predict]

    masked_lm_positions = []
    masked_lm_ids = []
    for p in masked_lms:
        masked_lm_positions.append(p.index)
        masked_lm_ids.append(p.id)

    return (output, masked_lm_positions, masked_lm_ids)
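
A hedged usage sketch for create_masked_lm_predictions, assuming the public
'bert-base-uncased' vocabulary (the function itself also needs the standard
collections import for its namedtuple):

import random

from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained('bert-base-uncased')
ids = tok.encode('masked language modeling pretrains bert',
                 add_special_tokens=True)
masked, positions, label_ids = create_masked_lm_predictions(
    ids, masked_lm_prob=0.15, max_predictions_per_seq=3,
    tokenizer=tok, rng=random.Random(42))
print(masked)                # ids with some positions masked or replaced
print(positions, label_ids)  # masked positions and their original ids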
Example #9
class testAnswerGeneration():
    def __init__(self):
        self.tokenizer = BertTokenizer(
            vocab_file='bert-base-chinese-vocab.txt')
        self.config = BertConfig.from_pretrained('trained_model/1/config.json')
        self.model = BertForMaskedLM.from_pretrained(
            'trained_model/1/pytorch_model.bin',
            from_tf=bool('.ckpt' in 'bert-base-chinese'),
            config=self.config)
        self.model.eval()

    def to_input_id(self, sentence_input):
        return self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.tokenize(sentence_input))

    def getAnswer(self, context, question):
        input_id = self.to_input_id("[CLS] " + context + " [SEP] " + question +
                                    " [SEP]")

        count = 0
        answer = ""
        maskpos = len(input_id)  # position of the answer token to predict
        input_id.append(103)  # 103 is the [MASK] token id
        # pad the input up to length 512
        while len(input_id) < 512:
            input_id.append(0)

        # cap the answer length at 10 tokens
        while (count < 10):
            input_id_tensor = torch.LongTensor([input_id])
            outputs = self.model(input_id_tensor)
            predictions = outputs[0]
            predicted_index = torch.argmax(
                predictions[0, maskpos]).item()  # most likely token id
            predicted_token = self.tokenizer.convert_ids_to_tokens(
                predicted_index)  # convert the id back to a token

            # stop generating once [SEP] is predicted
            if predicted_token == '[SEP]':
                break

            answer = answer + predicted_token  # append the generated token to the answer
            input_id[maskpos] = predicted_index  # replace the current [MASK] id with the generated id
            maskpos += 1
            if maskpos < 512:
                input_id[maskpos] = 103  # place the next [MASK] id
            else:
                break

            count += 1

        return answer
Example #10
def main():
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)
    
    # bert tokenizer
    if LANG == 'en':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    elif LANG == 'jp':
        tokenizer = BertTokenizer(
            'vocab.txt',
            do_lower_case=False,
            do_basic_tokenize=False
        )

    # prepare data
    dummy_input = np.ones((1, MAX_SEQ_LEN), dtype=np.int64)
    tokens_ts, segments_ts, masked_index = text2token(
        SENTENCE, tokenizer, lang=LANG
    )
    input_data = np.array([tokens_ts, segments_ts])

    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=env_id)

    # compute execution time
    for i in range(5):
        start = int(round(time.time() * 1000))
        input_blobs = net.get_input_blob_list()
        for i, idx in enumerate(input_blobs):
            if i < len(input_data):
                net.set_input_blob_data(input_data[i], idx)
            else:
                net.set_input_blob_data(dummy_input, idx)
        net.update()
        preds_ailia = net.get_results()
        # preds_ailia = net.predict(dummy_input)[0]
        end = int(round(time.time() * 1000))
        print("ailia processing time {} ms".format(end-start))

    # masked word prediction
    predicted_indices = np.argsort(
        preds_ailia[0][0][masked_index]
    )[-NUM_PREDICT:][::-1]

    predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_indices)

    print('Input sentence: ' + SENTENCE)
    print(f'predicted top {NUM_PREDICT} words: {predicted_tokens}')
    print('Script finished successfully.')
Example #11
    def load_word2id(tokenizer: BertTokenizer) -> Dict[str, int]:
        """
        Loads model vocabulary in the form of mapping from words to their indexes.

        Args:
            tokenizer: `transformers.BertTokenizer` tokenizer

        Returns:
            model vocabulary
        """
        word2id = dict()
        for word_idx in range(tokenizer.vocab_size):
            word = tokenizer.convert_ids_to_tokens([word_idx])[0]
            word2id[word] = word_idx
        return word2id
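
A hedged usage sketch for load_word2id (shown above as a method, presumably
a @staticmethod on its class), assuming 'bert-base-uncased':

from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained('bert-base-uncased')
word2id = load_word2id(tok)
print(len(word2id))       # 30522 for bert-base-uncased
print(word2id['[MASK]'])  # 103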
Example #12
def predict_fn(input_data, model):
    vocab_path = '/opt/ml/model/vocab.txt'
    tokenizer = BertTokenizer(vocab_path, do_lower_case=True)

    question, context = input_data['question'], input_data['context']

    input_ids = tokenizer.encode(question, context)
    # positions up to and including the first [SEP] (id 102) are segment 0
    token_type_ids = [
        0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))
    ]
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     token_type_ids=torch.tensor(
                                         [token_type_ids]))
    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = ' '.join(
        all_tokens[torch.argmax(start_scores):torch.argmax(end_scores) + 1])
    return answer
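
A hedged usage sketch for predict_fn; it assumes an extractive-QA model whose
forward pass returns a (start_scores, end_scores) tuple (older transformers
API, or return_dict=False) and that the hard-coded /opt/ml/model/vocab.txt
exists, as it would inside a SageMaker container:

from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad')
payload = {'question': 'Who wrote Hamlet?',
           'context': 'Hamlet is a tragedy written by William Shakespeare.'}
print(predict_fn(payload, model))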
Example #13
 def explain_handle(self, model_wrapper, text, target=1):
     """Captum explanations handler.

     Args:
         model_wrapper: the wrapped model to explain (unused; a fresh
         wrapper is built from self.model below)
         text (str): the raw input text
         target (int): the target class index
     Returns:
         list: a list containing the feature-importance dictionary.
     """
     vis_data_records_base = []
     model_wrapper = AGNewsmodelWrapper(self.model)
     tokenizer = BertTokenizer(self.VOCAB_FILE)
     model_wrapper.eval()
     model_wrapper.zero_grad()
     encoding = tokenizer.encode_plus(text,
                                      return_attention_mask=True,
                                      return_tensors="pt",
                                      add_special_tokens=False)
     input_ids = encoding["input_ids"]
     attention_mask = encoding["attention_mask"]
     input_ids = input_ids.to(self.device)
     attention_mask = attention_mask.to(self.device)
     input_embedding_test = model_wrapper.model.bert_model.embeddings(
         input_ids)
     preds = model_wrapper(input_embedding_test, attention_mask)
     out = np.argmax(preds.cpu().detach(), axis=1)
     out = out.item()
     ig_1 = IntegratedGradients(model_wrapper)
     attributions, delta = ig_1.attribute(  # pylint: disable=no-member
         input_embedding_test,
         n_steps=500,
         return_convergence_delta=True,
          target=target,
     )
     tokens = tokenizer.convert_ids_to_tokens(
         input_ids[0].cpu().numpy().tolist())
     feature_imp_dict = {}
     feature_imp_dict["words"] = tokens
     attributions_sum = self.summarize_attributions(attributions)
     feature_imp_dict["importances"] = attributions_sum.tolist()
     feature_imp_dict["delta"] = delta[0].tolist()
     self.add_attributions_to_visualizer(attributions, tokens,
                                         self.score_func(preds), out, 2, 1,
                                         delta, vis_data_records_base)
     return [feature_imp_dict]
Example #14
    def from_model_predictions(cls,
                               probs: np.ndarray,
                               tokenizer: BertTokenizer,
                               max_len: int = 1000):
        probs_with_indices = list(enumerate(probs))
        best_probs_with_indices = sorted(probs_with_indices,
                                         key=lambda x: x[1],
                                         reverse=True)

        best_words = {}
        for i, prob in best_probs_with_indices:
            word = tokenizer.convert_ids_to_tokens(i).lower()
            if word in punctuation | stopwords or word in best_words:
                continue

            best_words[word] = prob

            if len(best_words) == max_len:
                break

        best_probs = list(best_words.values())
        best_words = list(best_words.keys())

        return cls(best_words, best_probs)
Example #15
class TrainLoop_BERT():
    def __init__(self, opt, args):
        self.opt = opt
        self.args = args

        self.batch_size = self.opt['batch_size']
        self.epoch = self.opt['epoch']
        self.use_cuda = opt['use_cuda']

        self.device = "cuda:{}".format(
            self.args.gpu) if self.use_cuda else 'cpu'
        self.args.device = self.device

        self.build_data()
        self.build_model()
        self.init_optim()

    def build_data(self):
        self.tokenizer = BertTokenizer(
            vocab_file=self.opt['vocab_path'])  # initialize the tokenizer

        # build and save dataset
        self.dataset = {'train': None, 'valid': None, 'test': None}
        self.dataset_loader = {'train': None, 'valid': None, 'test': None}
        for subset in self.dataset:
            self.dataset[subset] = CRSdataset(logger, subset,
                                              self.opt[f'{subset}_data_file'],
                                              self.args, self.tokenizer)
            self.dataset_loader[subset] = torch.utils.data.DataLoader(
                dataset=self.dataset[subset],
                batch_size=self.batch_size,
                shuffle=True)
        self.movie_num = self.dataset['train'].movie_num

    def build_model(self):
        self.model = BERTModel(self.args, self.movie_num)
        if self.use_cuda:
            self.model.to(self.device)

    def train(self):
        losses = []  # cleared after every loss report
        best_val_NDCG = 0.0
        gen_stop = False
        patience = 0
        max_patience = 5

        for i in range(self.epoch):
            train_loss = []
            for batch_idx, batch_data in tqdm(
                    enumerate(self.dataset_loader['train'])):
                self.model.train()
                self.zero_grad()

                contexts, types, masks, y, _, _, _, _ = (data.to(
                    self.device) for data in batch_data)
                # sanity-check inputs and outputs (debug)
                # logger.info("[Context] ", batch_data[0])
                # logger.info("[Context] ", '\n'.join(self.vector2sentence(contexts.cpu())))
                # logger.info("[GT] ", y)
                # ipdb.set_trace()

                logit = self.model([contexts, types, masks], raw_return=False)
                # logger.info(logit[y])

                loss = self.model.compute_loss(logit, y, 'train')
                train_loss.append(loss.item())
                losses.append(loss.item())

                loss.backward()
                self.optimizer.step()

                # logger.info('loss = ', loss)

                if (batch_idx + 1) % 50 == 0:
                    # mean loss since the last report, logged every 50 batches
                    loss = sum(losses) / len(losses)
                    logger.info('loss is %.4f' % (loss))
                    losses = []

            logger.info(
                f'Epoch {i}, train loss = {sum(train_loss)/len(train_loss)}')

            # metrics_test = self.val('train')
            metrics_test = self.val('valid')
            _ = self.val('test')

            if best_val_NDCG > metrics_test["NDCG50"]:
                patience += 1
                logger.info(f"[Patience = {patience}]")
                if patience >= max_patience:
                    gen_stop = True
            else:
                patience = 0
                best_val_NDCG = metrics_test["NDCG50"]
                self.model.save_model(self.opt['model_save_path'])
                logger.info("[Model saved in {}]".format(
                    self.opt['model_save_path']))

            if gen_stop:
                break

    def val(self, subset):
        assert subset in ['train', 'test', 'valid']
        self.model.eval()
        val_dataset_loader = self.dataset_loader[subset]

        metrics_test = {
            "Loss": 0,
            "NDCG1": 0,
            "NDCG10": 0,
            "NDCG50": 0,
            "MRR1": 0,
            "MRR10": 0,
            "MRR50": 0,
            "count": 0
        }
        losses = []
        for batch_idx, batch_data in enumerate(val_dataset_loader):
            with torch.no_grad():
                contexts, types, masks, y, _, _, _, _ = (data.to(
                    self.device) for data in batch_data)
                logit = self.model([contexts, types, masks], raw_return=False)
                # ipdb.set_trace()
                loss = self.model.compute_loss(logit, y)

                self.compute_metircs(logit, y, metrics_test)
                losses.append(loss.item())

        metrics_test['Loss'] = sum(losses) / len(losses)

        for key in metrics_test:
            if 'NDCG' in key or 'MRR' in key:
                metrics_test[key] = round(
                    metrics_test[key] / metrics_test['count'], 4)

        logger.info(f"{subset} set's metrics = {metrics_test}")

        return metrics_test

    def compute_metircs(self, logit, y, metrics):
        for K in [1, 10, 50]:
            # pred = logit.max(-1, keepdim=True)[1]
            # acc += pred.eq(y.view_as(pred)).sum().item()    # remember to call item()
            pred, pred_id = torch.topk(logit, K, dim=1)  # id=[bs, K]
            for i, gt in enumerate(y):
                gt = gt.item()
                cand_ids = pred_id[i].tolist()
                if gt in cand_ids:
                    rank = cand_ids.index(gt)
                    metrics['NDCG' + str(K)] += 1.0 / math.log(rank + 2.0, 2)
                    metrics['MRR' + str(K)] += 1.0 / (rank + 1.0)
                # metrics['count'] += 1
        # metrics['count'] = int(metrics['count']/3)
        assert len(y.shape) == 1
        metrics['count'] += y.shape[0]

    def vector2sentence(self, batch_sen, compat=True):
        # convert a batch of sentences from ids back to tokens
        sentences = []
        # for sen in batch_sen.numpy():
        #     sentences.append(self.tokenizer.convert_ids_to_tokens(sen))
        for sen in batch_sen.numpy().tolist():
            sentence = []
            for word in sen:
                if word != 0:
                    sentence.append(self.tokenizer.convert_ids_to_tokens(word))
                # elif word==3:
                #     sentence.append('_UNK_')
            if compat:
                sentence = ''.join(sentence)
            sentences.append(sentence)
        return sentences

    @classmethod
    def optim_opts(self):
        """
        Fetch optimizer selection.

        By default, collects everything in torch.optim, as well as importing:
        - qhm / qhmadam if installed from github.com/facebookresearch/qhoptim

        Override this (and probably call super()) to add your own optimizers.
        """
        # first pull torch.optim in
        optims = {
            k.lower(): v
            for k, v in optim.__dict__.items()
            if not k.startswith('__') and k[0].isupper()
        }
        try:
            import apex.optimizers.fused_adam as fused_adam
            optims['fused_adam'] = fused_adam.FusedAdam
        except ImportError:
            pass

        try:
            # https://openreview.net/pdf?id=S1fUpoR5FQ
            from qhoptim.pyt import QHM, QHAdam
            optims['qhm'] = QHM
            optims['qhadam'] = QHAdam
        except ImportError:
            # no QHM installed
            pass
        logger.info(optims)
        return optims

    def init_optim(self):
        param_optimizer = list(self.model.bert.named_parameters())  # named BERT parameters
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer]
        }]

        fc_optimizer = list(self.model.fc.named_parameters())  # named FC-head parameters
        optimizer_grouped_parameters += [{
            'params': [p for n, p in fc_optimizer],
            'lr': self.opt['lr_sasrec']
        }]

        # self.optimizer = transformers.AdamW(self.model.parameters(), lr=self.opt['lr'])
        self.optimizer = transformers.AdamW(optimizer_grouped_parameters,
                                            lr=self.opt['lr_bert'])
        # self.scheduler = transformers.WarmupLinearSchedule(
        #     self.optimizer, warmup_steps=self.opt['warmup_steps'], t_total=len(self.dataset_loader['train']) * self.epoch)

    def zero_grad(self):
        """
        Zero out optimizer.

        It is recommended you call this in train_step. It automatically handles
        gradient accumulation if agent is called with --update-freq.
        """
        self.optimizer.zero_grad()
Example #16
class TrainLoop_SASRec():
    def __init__(self, opt, args):
        self.opt = opt
        self.args = args

        self.batch_size = self.args.batch_size
        self.epoch = self.args.epoch
        self.use_cuda = self.args.use_cuda

        self.device = "cuda:{}".format(
            self.args.gpu) if self.use_cuda else 'cpu'
        self.args.device = self.device

        self.build_data()
        # bs, item_num+1: [gt, all_item_id]
        self.default_neg_sampled = torch.tensor(
            [0] + [i for i in range(1, self.args.item_size)],
            dtype=torch.long).repeat(self.args.batch_size, 1).to(self.device)

        self.build_model()
        self.init_optim()

    def build_data(self):
        # initialize the tokenizer
        self.tokenizer = BertTokenizer(
            vocab_file=self.args.vocab_path)
        # build and save self.dataset
        self.dataset = {'train': None, 'valid': None, 'test': None}
        self.dataset_loader = {'train': None, 'valid': None, 'test': None}
        for subset in self.dataset:
            self.dataset[subset] = CRSdataset(logger, subset,
                                              self.opt[f'{subset}_data_file'],
                                              self.args, self.tokenizer)
            self.dataset_loader[subset] = torch.utils.data.DataLoader(
                dataset=self.dataset[subset],
                batch_size=self.batch_size,
                shuffle=True)

        # self.dataset['train'].movie_num already includes the unk entry; +1 shifts all ids up by one and reserves id 0
        self.item_size = self.dataset['train'].movie_num + 1
        self.args.item_size = self.item_size

    def build_model(self):
        self.model = SASRecModel(args=self.args)
        if self.args.load_model:
            self.model.load_model(self.args.sasrec_load_path)
        if self.use_cuda:
            self.model.to(self.device)

    def train(self):
        losses = []  # cleared after every loss report
        best_val_NDCG = 0.0
        gen_stop = False
        patience = 0
        max_patience = 5

        for i in range(self.epoch):
            train_loss = []
            # for batch_idx, batch_data in tqdm(enumerate(self.rec_train_dataloader)):
            for batch_idx, batch_data in enumerate(
                    self.dataset_loader['train']):
                self.model.train()
                self.zero_grad()
                batch_data = [data.to(self.device) for data in batch_data]

                input_ids, target_pos, input_mask, sample_negs = batch_data[
                    -4:]
                # print(input_ids)
                # print(target_pos)

                sequence_output = self.model(input_ids, input_mask,
                                             self.args.use_cuda)

                loss = self.model.cross_entropy(sequence_output, target_pos,
                                                sample_negs, self.use_cuda)

                train_loss.append(loss.item())
                losses.append(loss.item())

                loss.backward()
                self.optimizer.step()

                if (batch_idx + 1) % 1000000000000000 == 0:  # effectively never fires; intermediate logging disabled
                    loss = sum(losses) / len(losses)
                    logger.info('loss is %.4f' % (loss))
                    losses = []

            logger.info(
                f'Epoch {i}, train loss = {sum(train_loss)/len(train_loss)}')

            # metrics_test = self.val('train')
            metrics_test = self.val('valid')
            _ = self.val('test')
            # what is this False about?
            if best_val_NDCG > metrics_test["NDCG50"]:
                patience += 1
                logger.info(f"[Patience = {patience}]")
                if patience >= max_patience:
                    gen_stop = True
            else:
                patience = 0
                best_val_NDCG = metrics_test["NDCG50"]
                self.model.save_model(self.args.sasrec_save_path)
                logger.info(f"[Model saved in {self.args.sasrec_save_path}]")

            if gen_stop:
                break
        # metrics_test = self.val('test')

    def val(self, subset):
        assert subset in ['train', 'test', 'valid']
        self.model.eval()
        val_dataset_loader = self.dataset_loader[subset]

        metrics_test = {
            "Loss": 0,
            "NDCG1": 0,
            "NDCG10": 0,
            "NDCG50": 0,
            "MRR1": 0,
            "MRR10": 0,
            "MRR50": 0,
            "count": 0
        }
        losses = []
        for batch_idx, batch_data in enumerate(val_dataset_loader):
            with torch.no_grad():
                batch_data = [data.to(self.device) for data in batch_data]
                _, _, _, predict_ids, input_ids, target_pos, input_mask, sample_negs = batch_data
                # print(input_ids)
                # print(target_pos)
                # print(predict_ids)

                # bs, max_len, hidden_size2
                sequence_output = self.model(input_ids, input_mask,
                                             self.args.use_cuda)

                loss = self.model.cross_entropy(sequence_output, target_pos,
                                                sample_negs, self.use_cuda)
                # bs, item_num
                for i in range(predict_ids.shape[0]):
                    self.default_neg_sampled[i][0] = predict_ids[i]
                # recommendation scores
                test_logits = self.predict(
                    sequence_output,
                    self.default_neg_sampled[:predict_ids.shape[0]],
                    self.use_cuda)

                self.compute_metircs(test_logits, metrics_test)

                losses.append(loss.item())
        # end of evaluation
        metrics_test['Loss'] = sum(losses) / len(losses)

        for key in metrics_test:
            if 'NDCG' in key or 'MRR' in key:
                metrics_test[key] = round(
                    metrics_test[key] / metrics_test['count'], 4)

        logger.info(f"{subset} set's metrics = {metrics_test}")

        return metrics_test

    def predict(self, seq_out, test_neg_sample, use_cuda=True):
        # shortcut: only the dot product of each sequence's last item representation with every candidate representation is needed
        # [batch item_num hidden_size]
        test_item_emb = self.model.embeddings.item_embeddings(test_neg_sample)
        # [batch 1 hidden]
        seq_out = seq_out[:, -1, :].unsqueeze(1)
        # [batch 1 item_num]
        test_logits = torch.matmul(seq_out, test_item_emb.transpose(1, 2))
        # print(test_logits.shape) #p
        # [batch item_num]
        test_logits = test_logits[:, -1, :]

        return test_logits

    def compute_metircs(self, logit, metrics):
        MRR1, NDCG1 = self.get_metric(logit, topk=1)
        # ipdb.set_trace()
        metrics['MRR1'] += MRR1
        metrics['NDCG1'] += NDCG1

        MRR10, NDCG10 = self.get_metric(logit, topk=10)
        metrics['MRR10'] += MRR10
        metrics['NDCG10'] += NDCG10

        MRR50, NDCG50 = self.get_metric(logit, topk=50)
        metrics['MRR50'] += MRR50
        metrics['NDCG50'] += NDCG50

        metrics['count'] += 1

    def get_metric(self, test_logits, topk=10):
        NDCG = 0.0
        MRR = 0.0
        # [batch] final rank of the ground-truth item in each example
        ranks = test_logits.argsort(descending=True).argsort()[:, 0].cpu()
        ranks_size = int(ranks.size(0))
        for rank in ranks:
            if rank < topk:
                NDCG += float(1.0 / np.log2(rank + 2.0))
                MRR += float(1.0 / np.array(rank + 1.0))

        return MRR / ranks_size, NDCG / ranks_size

    def save_embed(self):
        torch.save(self.model.embeddings.item_embeddings.state_dict(),
                   self.args.sasrec_emb_save_path)

    def vector2sentence(self, batch_sen, compat=True):
        # convert a batch of sentences from ids back to tokens
        sentences = []
        # for sen in batch_sen.numpy():
        #     sentences.append(self.tokenizer.convert_ids_to_tokens(sen))
        for sen in batch_sen.numpy().tolist():
            sentence = []
            for word in sen:
                if word != 0:
                    sentence.append(self.tokenizer.convert_ids_to_tokens(word))
                # elif word==3:
                #     sentence.append('_UNK_')
            if compat:
                sentence = ''.join(sentence)
            sentences.append(sentence)
        return sentences

    @classmethod
    def optim_opts(self):
        """
        Fetch optimizer selection.

        By default, collects everything in torch.optim, as well as importing:
        - qhm / qhmadam if installed from github.com/facebookresearch/qhoptim

        Override this (and probably call super()) to add your own optimizers.
        """
        # first pull torch.optim in
        optims = {
            k.lower(): v
            for k, v in optim.__dict__.items()
            if not k.startswith('__') and k[0].isupper()
        }
        try:
            import apex.optimizers.fused_adam as fused_adam
            optims['fused_adam'] = fused_adam.FusedAdam
        except ImportError:
            pass

        try:
            # https://openreview.net/pdf?id=S1fUpoR5FQ
            from qhoptim.pyt import QHM, QHAdam
            optims['qhm'] = QHM
            optims['qhadam'] = QHAdam
        except ImportError:
            # no QHM installed
            pass
        logger.info(optims)
        return optims

    def init_optim(self):
        betas = (self.args.adam_beta1, self.args.adam_beta2)
        self.optimizer = Adam(self.model.parameters(),
                              lr=self.args.lr_sasrec,
                              betas=betas,
                              weight_decay=self.args.weight_decay)
        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))

    def zero_grad(self):
        """
        Zero out optimizer.

        It is recommended you call this in train_step. It automatically handles
        gradient accumulation if agent is called with --update-freq.
        """
        self.optimizer.zero_grad()
Example #17
def main():

    ## parser settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--file_path', '-fp', type=str)
    parser.add_argument('--model_path', '-mdp', type=str)
    parser.add_argument('--evaluate_file_name', '-evaflnm', type=str)
    parser.add_argument('--analysis_file_name', '-aysflnm', type=str)
    parser.add_argument('--gpu_num', '-gpun', type=int)
    args = parser.parse_args()

    ## defaults
    tokenizer = BertTokenizer(vocab_file='bert-base-uncased-vocab.txt')
    nlp = spacy.load(
        "model/spacy/en_core_web_md-2.3.1/en_core_web_md/en_core_web_md-2.3.1")
    accepted_pos_list = get_accepted_pos_list()

    ## for debug
    if args.file_path == None:
        args.file_path = 'mingda_chen_dataset/test_input.txt'
    if args.model_path == None:
        args.model_path = 'trained_model/sequential_6000/4/pytorch_model.bin'
    if args.evaluate_file_name == None:
        args.evaluate_file_name = "sequence"
    if args.analysis_file_name == None:
        args.analysis_file_name = "analysis_" + args.evaluate_file_name
    if args.gpu_num == None:
        args.gpu_num = torch.cuda.device_count()
    ##

    ## GPU setting
    if torch.cuda.device_count() > 1:
        device = torch.device('cuda')
        os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    else:
        device = torch.device('cpu')
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    ##

    ## load model
    config = BertConfig.from_pretrained('bert-base-uncased')
    model = BertForMaskedLM.from_pretrained(args.model_path, config=config)
    model.to(device)
    model.eval()

    # read data
    semantic_list, syntactic_list = get_dataset_list(args.file_path)
    all_syntactic_keyword_list = get_all_syntactic_keyword_list(
        syntactic_list, accepted_pos_list, tokenizer, nlp)
    # write_syntactic_keyword(all_syntactic_keyword_list)
    all_syntactic_keyword_with_sep_list = insert_sep_token(
        all_syntactic_keyword_list)

    predict_sentence_list = []
    for index, semantic_sentence in enumerate(tqdm(semantic_list)):
        predict_sentence = ["[MASK]"]
        repeat_flag = False
        while ("[MASK]" in predict_sentence):
            input_ids_list, input_segment_list, input_attention_list = data_preprocess(
                semantic_sentence, all_syntactic_keyword_with_sep_list[index],
                predict_sentence, tokenizer)
            input_id_tensor, input_segment_tensor, input_attention_tensor = convert_to_tensor(
                input_ids_list, input_segment_list, input_attention_list,
                device)

            outputs = model(return_dict=True,
                            input_ids=input_id_tensor,
                            token_type_ids=input_segment_tensor,
                            attention_mask=input_attention_tensor)
            logits = outputs[0]
            maskpos = input_ids_list.index(103)  # 103 is the [MASK] token id
            predicted_index = torch.argmax(logits[0, maskpos]).item()
            predicted_token = tokenizer.convert_ids_to_tokens(
                [predicted_index])[0]

            predict_sentence.remove("[MASK]")

            ## check for runaway repetition
            count_dict = dict(Counter(predict_sentence))
            for key in count_dict:
                if count_dict[key] > 3:
                    repeat_flag = True
            if repeat_flag:
                predict_sentence_list.append(
                    extract_sentence_from_list(predict_sentence))
                break

            if predicted_token != "[SEP]":
                predict_sentence.append(predicted_token)
                predict_sentence.append("[MASK]")
            else:
                predict_sentence_list.append(
                    extract_sentence_from_list(predict_sentence))
                break

    output_evaluate(ref=predict_sentence_list,
                    filename=args.evaluate_file_name)
    produce_analysis_file(all_syntactic_keyword_list, predict_sentence_list,
                          args.analysis_file_name)
    return 0
Example #18
class Scoring(object):
    def __init__(self, BERT_PATH):
        self.config = BertConfig.from_json_file(BERT_PATH +
                                                "/bert_config.json")
        self.model = BertForPreTraining.from_pretrained(BERT_PATH +
                                                        "/bert_model.ckpt",
                                                        from_tf=True,
                                                        config=self.config)
        self.tokenizer = BertTokenizer(BERT_PATH + "/vocab.txt")
        self.model.eval()
        self.model.cuda(args.gpu_id)

    def sentence_preprocese(self, text):
        tokenized_text = np.array(self.tokenizer.tokenize(text))
        find_sep = np.argwhere(tokenized_text == '[SEP]')
        segments_ids = np.zeros(tokenized_text.shape, dtype=int)
        if find_sep.size == 1:
            start_point = 1
        else:
            start_point = find_sep[0, 0] + 1
            segments_ids[start_point:] = 1

        end_point = tokenized_text.size - 1

        tokenized_text = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        masked_texts = []

        # mask with l2r fashion
        for masked_index in range(start_point, end_point):
            new_tokenized_text = np.array(tokenized_text, dtype=int)
            new_tokenized_text[
                masked_index] = self.tokenizer.convert_tokens_to_ids(
                    ['[MASK]'])[0]
            masked_texts.append(new_tokenized_text)

        # copy the segments_ids
        segments_ids = np.tile(segments_ids, (end_point - start_point, 1))

        return masked_texts, segments_ids, start_point, end_point, tokenized_text[
            start_point:end_point]

    def metric(self, text):
        indexed_tokens, segments_ids, start_point, end_point, real_indexs = self.sentence_preprocese(
            text)

        tokens_tensor = torch.tensor(indexed_tokens)
        segments_tensors = torch.tensor(segments_ids)

        tokens_tensor = tokens_tensor.cuda(args.gpu_id)
        segments_tensors = segments_tensors.cuda(args.gpu_id)

        # model return: tuple()
        # 1. prediction_scores (batch_size X sequence_length X config.vocab_size);
        # 2. seq_relationship_scores (batch_size X 2)

        with torch.no_grad():
            outputs = self.model(tokens_tensor,
                                 token_type_ids=segments_tensors)
            predictions = torch.softmax(outputs[0], -1)

        log_likelihood = 0

        # cumulated negative log likelihood

        for i, step in enumerate(range(start_point, end_point)):
            predicted_index = torch.argmax(predictions[i, step]).item()
            predicted_token = self.tokenizer.convert_ids_to_tokens(
                [predicted_index])

            real_pos_prob = predictions[i, step, real_indexs[i]].item()
            real_token = self.tokenizer.convert_ids_to_tokens([real_indexs[i]])

            if args.prob_token:
                print("position", i + 1, ": predicted token", predicted_token,
                      round(predictions[i, step, predicted_index].item(), 4),
                      "\t\t\t golden token", real_token,
                      round(real_pos_prob, 4))

            log_likelihood += np.log2(real_pos_prob)

        prob = np.exp2(log_likelihood)
        nll = -log_likelihood / (end_point - start_point)
        ppl = np.exp2(nll)

        return nll, ppl
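
A hedged usage sketch for Scoring above; BERT_PATH must point at a
Google-style TF checkpoint directory (bert_model.ckpt, bert_config.json,
vocab.txt), a CUDA device is required, and args.gpu_id / args.prob_token are
read from a module-level args object, as in the original script:

scorer = Scoring('uncased_L-12_H-768_A-12')  # placeholder checkpoint dir
nll, ppl = scorer.metric('the cat sat on the mat . [SEP]')
print('avg NLL: %.4f, perplexity: %.4f' % (nll, ppl))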