Example #1
def str2id(tokenizer: BertTokenizer, sys_utter: str, usr_utter: str,
           source: str) -> Tuple[List[int], List[int]]:
    """Convert system, user utterance and source tokens to ids based on BertTokenizer.

    Args:
        tokenizer: BertTokenizer
        sys_utter: system utterance
        usr_utter: user utterance
        source: slot + value

    Returns:
        input_ids and token_type_ids
    """
    sys_utter_tokens = tokenizer.tokenize(sys_utter)
    usr_utter_tokens = tokenizer.tokenize(usr_utter)
    source_tokens = tokenizer.tokenize(source)
    sys_utter_ids = tokenizer.convert_tokens_to_ids(sys_utter_tokens)
    usr_utter_ids = tokenizer.convert_tokens_to_ids(usr_utter_tokens)
    source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
    input_ids = ([tokenizer.cls_token_id] + sys_utter_ids +
                 [tokenizer.sep_token_id] + usr_utter_ids +
                 [tokenizer.sep_token_id] + source_ids +
                 [tokenizer.sep_token_id])
    token_type_ids = ([0] + [0] * (len(sys_utter_ids) + 1) + [1] *
                      (len(usr_utter_ids) + 1) + [0] * (len(source_ids) + 1))
    return input_ids, token_type_ids
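A minimal usage sketch for the function above; the 'bert-base-uncased' checkpoint and the utterances are only illustrative and are not taken from the original project:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_ids, token_type_ids = str2id(tokenizer,
                                   sys_utter='what price range do you want?',
                                   usr_utter='something cheap please',
                                   source='price range cheap')
# input_ids layout: [CLS] sys [SEP] usr [SEP] source [SEP]
# token_type_ids marks only the user-utterance span (and its [SEP]) with 1
assert len(input_ids) == len(token_type_ids)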
Example #2
class MLMModel:

    def __init__(self):
        self.model: BertForMaskedLM = BertForMaskedLM.from_pretrained(
            pretrained_model_name_or_path='Foodbert/foodbert/data/mlm_output/checkpoint-final')
        with open('Foodbert/foodbert/data/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        self.tokenizer = BertTokenizer(vocab_file='Foodbert/foodbert/data/bert-base-cased-vocab.txt', do_lower_case=False,
                                       max_len=128, never_split=used_ingredients)

        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)

    def predict_substitutes(self, sentence, ingredient_name, with_masking=True):
        search_id = self.tokenizer.mask_token_id if with_masking else \
            self.tokenizer.convert_tokens_to_ids([ingredient_name])[0]
        sentence = sentence.replace('!', ' !').replace('?', ' ?').replace('.', ' .').replace(':', ' :').replace(',', ' ,')
        sentence = ' ' + sentence + ' '

        all_ordered_substitutes = []

        masked_sentence = sentence.replace(f' {ingredient_name} ', ' [MASK] ')
        input_ids = torch.tensor(self.tokenizer.encode(masked_sentence, add_special_tokens=True)).unsqueeze(0).to(device=self.device)
        prediction_scores = self.model(input_ids, masked_lm_labels=input_ids)[1][0]
        ingredient_scores = prediction_scores[input_ids[0] == search_id]

        for i in range(len(ingredient_scores)):
            ingredient_score = ingredient_scores[i]
            softmax_scores = ingredient_score.softmax(dim=0)
            indices = torch.sort(ingredient_score, descending=True).indices
            ordered_substitutes = self.tokenizer.convert_ids_to_tokens(indices)
            softmax_scores = softmax_scores[indices].tolist()
            all_ordered_substitutes.append((ordered_substitutes, softmax_scores))

        return all_ordered_substitutes
    def generate_embedding(
        self,
        model: transformers.BertModel,
        tokenizer: transformers.BertTokenizer,
        product: pd.Series,
        feature_columns: List[str],
    ) -> torch.Tensor:
        model.eval()
        if (Project.exported_objects_dir /
                f"{product['product_id']}.obj").exists():
            return self.load_already_geneated_embedding(product=product)
        product_description = self.generate_product_description(
            product=product, feature_columns=feature_columns)
        marked_text = "[CLS] " + product_description + " [SEP]"
        tokenized_text = tokenizer.tokenize(marked_text)

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

        segments_ids = [1] * len(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        with torch.no_grad():
            outputs = model(tokens_tensor, segments_tensors)
            hidden_states = outputs[2]
        token_vecs = hidden_states[-2][0]
        sentence_embedding = torch.mean(token_vecs, dim=0)
        torch.save(
            sentence_embedding,
            Project.exported_objects_dir / f"{product['product_id']}.obj",
        )
        return sentence_embedding
Example #4
def _tokenize_bert_sentence(text: str, tokenizer: BertTokenizer) -> Tuple:
    """
    Given a sentence and a BertTokenizer, tokenizes the text, maps to
    BERT vocab indices, and makes the segment IDs for the tokens before
    returning the tensors on GPU (add flag for GPU disable soon).

    :param text: The sentence being tokenized. A single sentence as str.
    :param tokenizer: The BertTokenizer object instantiated.
    :return token_tensor, segments_tensor: The tensors containing the
                    segment IDs and the tokens themselves. On GPU.
    """
    # Split the sentence into tokens and add the [CLS]/[SEP] special tokens.
    tokenized_text = [tokenizer.cls_token] + tokenizer.tokenize(text) + [tokenizer.sep_token]

    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Mark each of the tokens as belonging to sentence "1" (single
    #  sentence).
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors, place on GPU.
    token_tensor = torch.tensor([indexed_tokens]).to('cuda:0')
    segments_tensor = torch.tensor([segments_ids]).to('cuda:0')

    return token_tensor, segments_tensor
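A hedged sketch of how these tensors might be consumed; the 'bert-base-uncased' checkpoint is an assumption, and the model is placed on the same hard-coded 'cuda:0' device as the tensors above:

import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to('cuda:0').eval()

token_tensor, segments_tensor = _tokenize_bert_sentence(
    "BERT produces contextual embeddings.", tokenizer)
with torch.no_grad():
    last_hidden_state = model(token_tensor, token_type_ids=segments_tensor)[0]
# last_hidden_state: [1, seq_len, hidden_size]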
def main():
    args = config_parse()
    device = 'cuda' if torch.cuda.is_available() and args.use_cuda else 'cpu'
    if not os.path.exists(args.output_path):
        os.mkdir(args.output_path)
    model, n_ctx = load_pretrained_model(args)
    if args.vocab_path:
        tokenizer = BertTokenizer(vocab_file=args.vocab_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(
            args.pretrained_tokenizer_model)
    global pad_id
    pad_id = tokenizer.convert_tokens_to_ids('[PAD]')
    if args.seed:
        set_random_seed(args)
    if args.raw_file_path:
        logger.info("start processing raw data....")
        process_raw_data(args, tokenizer, n_ctx)
    check_model_parameters(model)
    raw_token = load_train_data(args)
    train_data, dev_data = train_test_split(raw_token, test_size=.2)
    logger.info(
        f"raw data: {len(raw_token)}, train_data: {len(train_data)}, dev_data: {len(dev_data)}"
    )
    if args.do_train:
        train(model, device, train_data, args)
    if args.do_eval:
        evaluate(model, device, dev_data, args)
Example #6
def bert_pretraining(dataset, config):
    bert_tokenizer = BertTokenizer('./bert-base-chinese' + '/vocab.txt')
    model = BertModel.from_pretrained('./bert-base-chinese')
    model.eval()
    model.to(config.device)

    for batch in batch_slice(dataset, config.train_batch_size):
        tokens_tensor = []

        for instance in batch:
            instance.ids = bert_tokenizer.convert_tokens_to_ids(instance.chars)
            tokens_tensor.append(torch.tensor(instance.ids))

        tokens_tensor = pad_sequence(tokens_tensor).T
        attention_mask = torch.ne(tokens_tensor,
                                  torch.zeros_like(tokens_tensor))

        tokens_tensor = tokens_tensor.to(config.device)
        attention_mask = attention_mask.to(config.device)

        with torch.no_grad():
            outputs = model(tokens_tensor, attention_mask=attention_mask)
            encoded_layers = outputs[0]

        for index, instance in enumerate(batch):
            instance.embeddings = encoded_layers[
                index, 0:len(instance.ids), :].cpu().numpy()
Example #7
def load_and_cache_examples(
    task: str,
    tokenizer: BertTokenizer,
    model_path: str,
    data_dir: str,
    overwrite_cache: bool,
    max_seq_length: int,
    model_type: str,
    cache_root: str = "../../data/preprocessed",
    product: bool = False,
):
    processor = processors[task]()
    output_mode = output_modes[task]
    # if args.active_learning:

    # Load data features from cache or dataset file
    # if active learning, the train data will be saved inside each learning iteration directory
    cached_features_file = os.path.join(
        cache_root,
        "cached_{}_{}_{}".format(
            "inference",
            list(filter(None, model_path.split("/"))).pop(),
            str(task),
        ),
    )

    # if os.path.exists(cached_features_file) and not overwrite_cache:
    #    logger.info("Loading features from cached file %s", cached_features_file)
    #    features = torch.load(cached_features_file)
    #    examples = None
    # else:
    logger.info("Creating features from dataset file at %s", data_dir)
    label_list = processor.get_labels()
    examples = processor.get_examples(data_dir=data_dir, product=product)
    log_param("  Num examples training", len(examples))

    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=max_seq_length,
        output_mode=output_mode,
        pad_on_left=bool(model_type in ["xlnet"]),
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if model_type in ["xlnet"] else 0,
    )
    logger.info("Saving features into cached file %s", cached_features_file)
    torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.as_tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.as_tensor(
        [f.attention_mask for f in features], dtype=torch.long
    )
    all_token_type_ids = torch.as_tensor(
        [f.token_type_ids for f in features], dtype=torch.long
    )
    #    if output_mode == "classification":
    #        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    #    elif output_mode == "regression":
    #        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

    dataset = TensorDataset(
        all_input_ids, all_attention_mask, all_token_type_ids
    )
    return dataset, examples
Example #8
def data_augment(tokenizer: BertTokenizer, augument_size=3):
    assert Text_Example
    aug_data = []

    entity_data = pickle.load(open(statist_entity_data_path, "rb"))
    all_train_data = pickle.load(open(pickle_all_train_data_path, "rb"))
    for data in all_train_data:
        text = data.text
        anns = data.anns
        if len(anns) < 3:
            aug_data.append(data)
            continue
        new_text = text
        for aug in range(augument_size):
            sample_anns_idx = random.sample(range(len(anns)), 2)
            for idx in sample_anns_idx:
                _tag, ann = anns[idx]
                ann_replace_list = entity_data[_tag][len(ann)]
                if len(ann_replace_list) < 2:
                    break
                replace_ann = random.choice(ann_replace_list)
                while replace_ann == ann:
                    replace_ann = random.choice(ann_replace_list)
                new_text = new_text.replace(ann, replace_ann, 1)

        new_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in new_text]
        new_text_example = data
        new_text_example.text = new_text
        new_text_example.token_ids = new_token_ids
Example #9
 def featurize(self, df):
     bert_model = BertModel.from_pretrained(self.data_path)
     bert_tokenizer = BertTokenizer(self.data_path + "/vocab.txt",
                                    do_lower_case=False,
                                    do_basic_tokenize=False)
     mecab = MeCab.Tagger('-Ochasen')
     data_list = df.rdd.collect()
     label_list = []
     vec_list = []
     for data in data_list:
         tmp_list = []
         node_list = data[1]
         for word in node_list:
             tmp_list.append(word)
         if len(tmp_list) != 0:
             label_list.append(float(data[0]))
             bert_tokens = bert_tokenizer.tokenize(
                 " ".join(["[CLS]"] + tmp_list + ["[SEP]"]))
             token_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
             tokens_tensor = torch.tensor(token_ids).unsqueeze(0)
             all_outputs = bert_model(tokens_tensor)
             embedding = all_outputs[-2].detach().numpy()[0]
             vec = np.mean(embedding, axis=0).tolist()
             vec_list.append(Vectors.dense(vec))
     zip_list = zip(label_list, vec_list)
     new_df = self.spark.createDataFrame(zip_list, ("label", "features"))
     return new_df
Example #10
def get_indices_and_masks(sent_tokens: List[str],
                          in_sent_start: int,
                          in_sent_end: int,
                          tokenizer: BertTokenizer,
                          mask_mention: bool = False) \
        -> Tuple[List[int], List[float], int, int]:
    if in_sent_start not in range(len(sent_tokens)) or\
            in_sent_end not in range(1, len(sent_tokens) + 1):
        raise ValueError(
            f'wrong input: tokens {sent_tokens} don\'t contain pos'
            f' ({in_sent_start}, {in_sent_end}).')
    if mask_mention:
        for n in range(in_sent_start, in_sent_end):
            sent_tokens[n] = tokenizer.mask_token
    sent_subword_idxs = []
    sent_subwords = []
    sent_hypo_mask = []
    new_in_sent_start, new_in_sent_end = None, None
    for n, tok in enumerate(sent_tokens):
        if n == in_sent_start:
            new_in_sent_start = len(sent_subwords)
        subtokens = tokenizer.tokenize(tok)
        sent_subwords.extend(subtokens)
        subtok_idxs = tokenizer.convert_tokens_to_ids(subtokens)
        sent_subword_idxs.extend(subtok_idxs)
        # NOTE: no "+ 1" offset here because there is no [CLS] token at the beginning
        mask_value = float(in_sent_start <= n < in_sent_end)
        sent_hypo_mask.extend([mask_value] * len(subtok_idxs))
        if n == in_sent_end - 1:
            new_in_sent_end = len(sent_subwords) + 1
    return sent_subword_idxs, sent_hypo_mask, new_in_sent_start, new_in_sent_end
Example #11
def get_embedding(phrases: List[str],
                  emb_mat: torch.Tensor,
                  tokenizer: BertTokenizer,
                  debug: bool = False) -> torch.Tensor:
    # emb_mat: [vocab_size, emb_size]
    # returns: [num_phrases, emb_size]
    subtok_ids, subtok_masks = [], []
    max_len = 0
    for w in phrases:
        subtok_toks = tokenizer.tokenize(w)
        subtok_ids.append(tokenizer.convert_tokens_to_ids(subtok_toks))
        num_subtoks = len(subtok_ids[-1])
        subtok_masks.append([1.] * num_subtoks)
        if debug:
            print(f"subtok_ids('{w}') = {subtok_ids[-1]}")
            print(
                f'{[tokenizer._convert_id_to_token(s) for s in subtok_ids[-1]]}'
            )
        max_len = max_len if max_len > num_subtoks else num_subtoks
    # subtok_ids, subtok_masks: [num_phrases, max_len]
    subtok_ids = torch.tensor(
        [sw_list + [-1] * (max_len - len(sw_list)) for sw_list in subtok_ids])
    subtok_masks = torch.tensor(
        [m + [0.] * (max_len - len(m)) for m in subtok_masks])
    # subtok_sizes: [num_phrases]
    subtok_sizes = torch.sum(subtok_masks, 1)
    if debug:
        print(subtok_sizes)
    # emb_mat[subtok_ids]: [num_phrases, max_len, emb_size]
    return torch.sum(emb_mat[subtok_ids] * subtok_masks.unsqueeze(2), axis=1) \
        / subtok_sizes.unsqueeze(1)
def tokenize_and_pad_samples(genes, labels):
    k = len(genes[0][0])
    if k == 4:
        kmer_filepath = '/home/brian/Downloads/fourmers.txt'
    elif k == 6:
        kmer_filepath = '/home/brian/Downloads/hexamers.txt'
    elif k == 8:
        kmer_filepath = '/home/brian/Downloads/octamers.txt'
    formatted_samples = [['[CLS]'] + sample + ['[SEP]'] for sample in genes]
    formatted_labels = [[0] + l + [0] for l in labels]
    tokenizer = BertTokenizer(kmer_filepath, max_len=MAX_LEN)
    print("TOKENIZER LENGTH", len(tokenizer))
    attention_masks = [
        np.concatenate([np.ones(len(l)),
                        np.zeros(MAX_LEN - len(l))]) for l in formatted_labels
    ]
    #seq_ids = tokenizer.convert_tokens_to_ids(formatted_samples)
    seq_ids = [
        tokenizer.convert_tokens_to_ids(sample) for sample in formatted_samples
    ]
    seq_ids = pad_sequences(seq_ids,
                            maxlen=MAX_LEN,
                            truncating='post',
                            padding='post')

    return seq_ids, attention_masks, formatted_labels
def chat(folder_bert, voc, testing=False):
    tf.random.set_seed(1)
    tokenizer = BertTokenizer(vocab_file=folder_bert + voc)
    if testing:
        tokens = tokenizer.tokenize("jeg tror det skal regne")
        print(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        print(ids)
        print("Vocab size:", len(tokenizer.vocab))

    config = BertConfig.from_json_file(folder_bert + "/config.json")
    model = BertLMHeadModel.from_pretrained(folder_bert, config=config)
    while True:
        # NOTE: garbled in the source; a plausible reconstruction of the loop:
        text = input(">>User: ")
        input_ids = torch.tensor(
            tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)
        sample_output = model.generate(
            input_ids, do_sample=True, max_length=input_ids.shape[-1] + 40)
        print("Bot: {}".format(
            tokenizer.decode(sample_output[:, input_ids.shape[-1]:][0],
                             skip_special_tokens=True)))
Example #14
class JapaneseWorker:
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_basic_tokenize=False)
        self.cls_id = self.bert_tokenizer.vocab['[CLS]']
        self.mask_id = self.bert_tokenizer.vocab['[MASK]']
        self.bert_model = 'model/Japanese/'

        self.cp = 'checkpoint/jp/cp_step_1200000.pt'
        self.opt = 'checkpoint/jp/opt_step_1200000.pt'

    @staticmethod
    def linesplit(src):
        """
        :param src: type str, String type article
        :return: type list, punctuation separated sentences
        """
        def remove_newline(x):
            x = x.replace('\n', '')
            return x

        def remove_blank(x):
            x = x.replace(' ', '')
            return x

        def remove_unknown(x):
            unknown = ['\u3000']
            for h in unknown:
                x = x.replace(h, '')
            return x
        src = remove_blank(src)
        src = remove_newline(src)
        src = remove_unknown(src)
        src_line = re.split('。(?<!」)|!(?<!」)|?(?!」)', src)
        src_line = [x for x in src_line if x != '']
        return src_line

    def tokenizer(self, src):
        """
        :param src: type list, punctuation separated sentences
        :return: token: type list, tokens
                 token_id: type list, numberized token ids
        """
        token = []
        token_id = []

        def _preprocess_text(text):
            return text.replace(" ", "")  # for Juman

        for sentence in src:
            preprocessed_text = _preprocess_text(sentence)
            juman_tokens = self.juman_tokenizer(preprocessed_text)
            tokens = self.bert_tokenizer.tokenize(" ".join(juman_tokens))
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
            token += tokens
            token_id += ids
        return token, token_id
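A hedged usage sketch for the worker above; it assumes the vocabulary referenced by config['DEFAULT']['vocab_path'] and a working JumanTokenizer are available, and the article text is only illustrative:

worker = JapaneseWorker()
sentences = JapaneseWorker.linesplit('今日は晴れです。明日は雨が降りますか?')
tokens, token_ids = worker.tokenizer(sentences)
# tokens: flat list of WordPiece tokens with [CLS]/[SEP] around each sentence
# token_ids: the matching vocabulary ids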
Example #15
class NemoBertTokenizer(TokenizerSpec):
    def __init__(
            self,
            pretrained_model=None,
            vocab_file=None,
            do_lower_case=True,
            max_len=None,
            do_basic_tokenize=True,
            never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"),
    ):
        if pretrained_model:
            self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
            if "uncased" not in pretrained_model:
                self.tokenizer.basic_tokenizer.do_lower_case = False
        else:
            self.tokenizer = BertTokenizer(vocab_file, do_lower_case,
                                           do_basic_tokenize)
        self.vocab_size = len(self.tokenizer.vocab)
        self.never_split = never_split

    def text_to_tokens(self, text):
        tokens = self.tokenizer.tokenize(text)
        return tokens

    def tokens_to_text(self, tokens):
        text = self.tokenizer.convert_tokens_to_string(tokens)
        return remove_spaces(handle_quotes(text.strip()))

    def token_to_id(self, token):
        return self.tokens_to_ids([token])[0]

    def tokens_to_ids(self, tokens):
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        return ids

    def ids_to_tokens(self, ids):
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        return tokens

    def text_to_ids(self, text):
        tokens = self.text_to_tokens(text)
        ids = self.tokens_to_ids(tokens)
        return ids

    def ids_to_text(self, ids):
        tokens = self.ids_to_tokens(ids)
        tokens_clean = [t for t in tokens if t not in self.never_split]
        text = self.tokens_to_text(tokens_clean)
        return text

    def pad_id(self):
        return self.tokens_to_ids(["[PAD]"])[0]

    def bos_id(self):
        return self.tokens_to_ids(["[CLS]"])[0]

    def eos_id(self):
        return self.tokens_to_ids(["[SEP]"])[0]
Example #16
def main():
    args = setup_train_args()
    # Log to both a file and the console
    global logger
    logger = create_logger(args)
    # Use CUDA when the user requests it and a GPU is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    # Seed the CPU RNG so that results are deterministic
    # Seed the current GPU as well; with multiple GPUs, use torch.cuda.manual_seed_all() to seed them all
    # When a good result is obtained we usually want it to be reproducible
    if args.seed:
        set_random_seed(args)

    # Select which GPUs to use for training
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # Initialize the tokenizer
    tokenizer = BertTokenizer(vocab_file=args.vocab_path)
    # Vocabulary size of the tokenizer
    vocab_size = len(tokenizer)

    global pad_id
    pad_id = tokenizer.convert_tokens_to_ids(PAD)

    # Create the model output directory
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    # Load the dialogue GPT2 model
    model, n_ctx = create_model(args, vocab_size)
    model.to(device)
    # Preprocess the raw data, converting the raw corpus into token ids
    if args.raw:
        preprocess_raw_data(args, tokenizer, n_ctx)
    # Whether to run in parallel on multiple GPUs
    multi_gpu = False
    if args.cuda and torch.cuda.device_count() > 1:
        logger.info("Let's use GPUs to train")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    # Log the number of model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))

    # Load the data
    logger.info("loading training data")
    with open(args.train_tokenized_path, "r", encoding="utf8") as f:
        data = f.read()
    data_list = data.split("\n")
    train_list, test_list = train_test_split(data_list, test_size=0.2, random_state=1)
    # Start training
    train(model, device, train_list, multi_gpu, args)
    # Evaluate the model
    evaluate(model, device, test_list, multi_gpu, args)
Example #17
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # Use CUDA when the user requests it and a GPU is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    # args.cuda = False
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()

    print('***********************Summary model start************************')

    while True:
        try:

            text = input()
            for i in range(5):
                if len(text): text = text[:1000]
                input_ids = [tokenizer.cls_token_id]  # every input starts with [CLS]
                input_ids.extend(tokenizer.encode(text))
                input_ids.append(tokenizer.sep_token_id)
                curr_input_tensor = torch.tensor(input_ids).long().to(device)

                generated = []
                # generate at most max_len tokens
                for _ in range(args.max_len):
                    outputs = model(input_ids=curr_input_tensor)
                    next_token_logits = outputs[0][-1, :]
                    # apply a repetition penalty to every token already in generated, lowering its probability
                    for id in set(generated):
                        next_token_logits[id] /= args.repetition_penalty
                    next_token_logits = next_token_logits / args.temperature
                    # set the [UNK] logit to -inf so the model can never predict [UNK]
                    next_token_logits[tokenizer.convert_tokens_to_ids(
                        '[UNK]')] = -float('Inf')
                    filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                            top_k=args.topk,
                                                            top_p=args.topp)
                    # torch.multinomial samples num_samples elements from the candidates without replacement; higher weights are more likely to be drawn; returns the indices
                    next_token = torch.multinomial(F.softmax(filtered_logits,
                                                             dim=-1),
                                                   num_samples=1)
                    if next_token == tokenizer.sep_token_id:  # [SEP] marks the end of the generated response
                        break
                    generated.append(next_token.item())
                    curr_input_tensor = torch.cat(
                        (curr_input_tensor, next_token), dim=0)

                text = tokenizer.convert_ids_to_tokens(generated)
                print("summary:" + "".join(text))

        except KeyboardInterrupt:
            break
Example #18
def example_to_input(lemma_list: List[str],
                     tags_list: List[int],
                     tok: BertTokenizer):
    subword_list, tags_map = tok.convert_tokens_to_ids(tok.tokenize('[CLS]')), []
    for w in lemma_list:
        tags_map.append(len(subword_list))
        subword_list += tok.convert_tokens_to_ids(tok.tokenize(w))
    subword_list += tok.convert_tokens_to_ids(tok.tokenize('[SEP]'))
    mapped_tags = [0] * len(subword_list)
    # mapped_pos = [0] * len(subword_list)
    # mapped_lemmas = ["[UNK]"] * len(subword_list)
    # mapped_altern = [[]] * len(subword_list)
    for i, j in enumerate(tags_map):
        mapped_tags[j] = tags_list[i]
        # mapped_pos[j] = example['pos'][i]
        # mapped_lemmas[j] = example['lemmas'][i]
        # mapped_altern[j] = example['alternatives'][i]
    return subword_list, mapped_tags
Example #19
def create_model(model_class: BertPreTrainedModel,
                 encoder_config: BertConfig,
                 tokenizer: BertTokenizer,
                 encoder_path=None,
                 entity_types: dict = None,
                 relation_types: dict = None,
                 prop_drop: float = 0.1,
                 meta_embedding_size: int = 25,
                 size_embeddings_count: int = 10,
                 ed_embeddings_count: int = 300,
                 token_dist_embeddings_count: int = 700,
                 sentence_dist_embeddings_count: int = 50,
                 mention_threshold: float = 0.5,
                 coref_threshold: float = 0.5,
                 rel_threshold: float = 0.5,
                 position_embeddings_count: int = 700,
                 cache_path=None):
    params = dict(
        config=encoder_config,
        # JEREX model parameters
        cls_token=tokenizer.convert_tokens_to_ids('[CLS]'),
        entity_types=len(entity_types),
        relation_types=len(relation_types),
        prop_drop=prop_drop,
        meta_embedding_size=meta_embedding_size,
        size_embeddings_count=size_embeddings_count,
        ed_embeddings_count=ed_embeddings_count,
        token_dist_embeddings_count=token_dist_embeddings_count,
        sentence_dist_embeddings_count=sentence_dist_embeddings_count,
        mention_threshold=mention_threshold,
        coref_threshold=coref_threshold,
        rel_threshold=rel_threshold,
        tokenizer=tokenizer,
        cache_dir=cache_path,
    )

    if encoder_path is not None:
        model = model_class.from_pretrained(encoder_path, **params)
    else:
        model = model_class(**params)

    # conditionally increase position embedding count
    if encoder_config.max_position_embeddings < position_embeddings_count:
        old = model.bert.embeddings.position_embeddings

        new = nn.Embedding(position_embeddings_count,
                           encoder_config.hidden_size)
        new.weight.data[:encoder_config.
                        max_position_embeddings, :] = old.weight.data
        model.bert.embeddings.position_embeddings = new
        model.bert.embeddings.register_buffer(
            "position_ids",
            torch.arange(position_embeddings_count).expand((1, -1)))

        encoder_config.max_position_embeddings = position_embeddings_count

    return model
def transformer_preprocess(src_path,
                           tgt_path,
                           tokenized_file,
                           vocab_file='./config/vocab_en.txt',
                           ctx=200):
    '''
    tokenize the dataset for NLG (GPT2) and write the tokenized ids into tokenized_file.
    more details can be found in https://github.com/yangjianxin1/GPT2-chitchat
    '''
    def clean_inside(s):
        s = s.replace('<user0>', '')
        s = s.replace('<user1>', '')
        s = s.strip()
        s = clean(s)
        return s

    # create the Bert tokenizer of the GPT2 model
    tokenizer = BertTokenizer(vocab_file=vocab_file)

    src_data, tgt_data = read_file(src_path), read_file(tgt_path)
    src_data = [' '.join(i) for i in src_data]
    tgt_data = [' '.join(i) for i in tgt_data]
    assert len(src_data) == len(
        tgt_data
    ), f'[!] length of src and tgt: {len(src_data)}/{len(tgt_data)}'

    # combine them
    corpus = []
    longest = 0
    for s, t in tqdm(list(zip(src_data, tgt_data))):
        item = [tokenizer.cls_token_id
                ]  # [CLS] for each dialogue at the beginning
        s = s + ' __eou__ ' + t
        s = clean_inside(s)
        utterances = s.split('__eou__')
        for utterance in utterances:
            words = nltk.word_tokenize(utterance)
            item.extend(
                [tokenizer.convert_tokens_to_ids(word) for word in words])
            item.append(tokenizer.sep_token_id)
        if len(item) > longest:
            longest = len(item)
        item = item[:ctx]
        corpus.append(item)

    # write into the file
    with open(tokenized_file, 'w') as f:
        for i in range(len(corpus)):
            words = [str(word) for word in corpus[i]]
            f.write(f'{" ".join(words)}')
            if i < len(corpus) - 1:
                f.write('\n')

    print(
        f'[!] Preprocess the data for the transformers(GPT2), the longest sentence :{longest}, write the data into {tokenized_file}.'
    )
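A sketch of a possible call site; all file paths here are placeholders rather than values taken from the snippet:

transformer_preprocess(src_path='data/src-train.txt',
                       tgt_path='data/tgt-train.txt',
                       tokenized_file='data/train_tokenized.txt',
                       vocab_file='./config/vocab_en.txt',
                       ctx=200)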
def convert_data_to_feature():
    # Load the question dataset
    q = open('Dataset/Query_Train/Final_question.txt', "r", encoding="utf-8")
    questions = q.readlines()
    q.close()
    # Load the answer dataset
    a = open('Dataset/Train_Label/FinalDomainLabel.txt', "r", encoding="utf-8")
    answers = a.readlines()
    a.close()
    assert len(answers) == len(questions)
    # ans_dic holds the answer classes
    ans_dic = make_ans_dic(answers)
    # question_dic holds the question classes
    question_dic = make_question_dic(questions)

    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')
    q_tokens = []
    max_seq_len = 0

    for q in question_dic.data:
        bert_ids = tokenizer.build_inputs_with_special_tokens(
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))
        if (len(bert_ids) > max_seq_len):
            max_seq_len = len(bert_ids)
        q_tokens.append(bert_ids)

    print("最長問句長度:", max_seq_len)
    assert max_seq_len <= 512  # 小於BERT-base長度限制
    # 補齊長度
    for q in q_tokens:
        while len(q) < max_seq_len:
            q.append(0)
    a_labels = []
    for a in ans_dic.data:
        a_labels.append(ans_dic.to_id(a))
    # BERT input embedding
    answer_lables = a_labels
    input_ids = q_tokens
    input_masks = [[1] * max_seq_len for i in range(len(question_dic))]
    input_segment_ids = [[0] * max_seq_len for i in range(len(question_dic))]
    assert len(input_ids) == len(question_dic) and len(input_ids) == len(
        input_masks) and len(input_ids) == len(input_segment_ids)

    data_features = {
        'input_ids': input_ids,
        'input_masks': input_masks,
        'input_segment_ids': input_segment_ids,
        'answer_lables': answer_lables,
        'question_dic': question_dic,
        'answer_dic': ans_dic
    }

    output = open('Dataset/data_features_domain.pkl', 'wb')
    pickle.dump(data_features, output)
    return data_features
Example #22
    def __init__(self,
                 bert_tokenizer: BertTokenizer,
                 jp_tokenizer: JumanTokenizer,
                 args,
                 file_path='train',
                 block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, 'cached_lm_' + str(block_size) + '_' + filename)

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            with open(cached_features_file, 'rb') as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                docs = f.readlines()

            exsamples = []
            for _, line in enumerate(docs):
                text = line.rstrip(os.linesep)

                # separate text into tokens
                tokenized_text = bert_tokenizer.convert_tokens_to_ids(
                    bert_tokenizer.tokenize(" ".join(
                        jp_tokenizer.tokenize(text))))

                # add special tokens: [CLS] and [SEP]
                added_special = bert_tokenizer.build_inputs_with_special_tokens(
                    tokenized_text)

                # Zero-pad up to the sequence length.
                diff = block_size - len(added_special)
                if diff < 0:
                    added_special = added_special[:diff]
                else:
                    # change the padding value from 0 to -1
                    padding = [-1] * (block_size - len(added_special))
                    added_special += padding

                assert len(added_special) == block_size

                self.examples.append(added_special)

            logger.info("Saving features into cached file %s",
                        cached_features_file)
            with open(cached_features_file, 'wb') as handle:
                pickle.dump(self.examples,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
def bert_text_preparation(text: str, tokenizer: BertTokenizer):
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(indexed_tokens)

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensor
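The returned tensors are typically fed to a BertModel that exposes its hidden states; a minimal sketch, assuming the stock 'bert-base-uncased' weights:

import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True).eval()

tokens, tokens_tensor, segments_tensor = bert_text_preparation(
    "peanut butter", tokenizer)
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensor)
hidden_states = outputs[2]           # embedding layer plus one tensor per encoder layer
word_vectors = hidden_states[-2][0]  # second-to-last layer, [seq_len, hidden_size]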
Example #24
def build_feature(tokenizer: transformers.BertTokenizer,
                  examples: list,
                  max_length: int = None):
    '''
    @param tokenizer (transformers.BertTokenizer): tokenizer used to convert tokens to ids

    @param examples (list): input examples

    @param max_length (int): max length at which to cut off example sequences

    @return examples (list): new examples with input feature
    '''

    if max_length is not None:
        length = max_length
    else:
        length = 1000

    for example in examples:
        context = tokenizer.convert_tokens_to_ids(
            example['context'][:min(length, len(example['context']))])
        # print(context)
        question = tokenizer.convert_tokens_to_ids(
            example['question'][:min(length, len(example['question']))])
        # print(question)
        out = tokenizer.prepare_for_model(context,
                                          question,
                                          return_token_type_ids=True,
                                          return_attention_mask=True)
        inputs = out['input_ids']
        token_type_ids = out['token_type_ids']
        attention_mask = out['attention_mask']
        # print(inputs)
        # print(token_type_ids)
        # print(attention_mask)

        example['input_feature'] = inputs
        example['token_type_ids'] = token_type_ids
        example['attention_mask'] = attention_mask

    return examples
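A hedged usage sketch for build_feature; the 'bert-base-chinese' checkpoint and the example texts are illustrative, and note that 'context' and 'question' are expected to be already tokenized:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
examples = [{
    'context': tokenizer.tokenize('巴黎是法国的首都。'),
    'question': tokenizer.tokenize('法国的首都是哪里?'),
}]
examples = build_feature(tokenizer, examples, max_length=64)
# each example now carries 'input_feature', 'token_type_ids' and 'attention_mask'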
Example #25
class testAnswerGeneration():
    def __init__(self):
        self.tokenizer = BertTokenizer(
            vocab_file='bert-base-chinese-vocab.txt')
        self.config = BertConfig.from_pretrained('trained_model/1/config.json')
        self.model = BertForMaskedLM.from_pretrained(
            'trained_model/1/pytorch_model.bin',
            from_tf=bool('.ckpt' in 'bert-base-chinese'),
            config=self.config)
        self.model.eval()

    def to_input_id(self, sentence_input):
        return self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.tokenize(sentence_input))

    def getAnswer(self, context, question):
        input_id = self.to_input_id("[CLS] " + context + " [SEP] " + question +
                                    " [SEP]")

        count = 0
        answer = ""
        maskpos = len(input_id)  # mark the position where the answer will be predicted
        input_id.append(103)
        # Pad to the full length
        while len(input_id) < 512:
            input_id.append(0)

        # Limit the answer to at most 10 tokens
        while (count < 10):
            input_id_tensor = torch.LongTensor([input_id])
            outputs = self.model(input_id_tensor)
            predictions = outputs[0]
            predicted_index = torch.argmax(
                predictions[0, maskpos]).item()  # take the most likely token id
            predicted_token = self.tokenizer.convert_ids_to_tokens(
                predicted_index)  # convert the id back to a token

            # Stop generating once [SEP] is predicted
            if predicted_token == '[SEP]':
                break

            answer = answer + predicted_token  # concatenate the generated tokens
            input_id[maskpos] = predicted_index  # replace the current [MASK] id with the generated token id
            maskpos += 1
            if maskpos < 512:
                input_id[maskpos] = 103  # mark the next [MASK] position to predict
            else:
                break

            count += 1

        return answer
Example #26
def tensorize_example(example: dict, config: dict, tokenizer: BertTokenizer,
                      genres: dict) -> CoNLLCorefResolution:
    clusters = example["clusters"]
    gold_mentions = sorted(tuple(m) for m in util.flatten(clusters))
    gold_mention_map = {m: i for i, m in enumerate(gold_mentions)}
    cluster_ids = [0] * len(gold_mentions)
    for cluster_id, cluster in enumerate(clusters):
        for mention in cluster:
            cluster_ids[gold_mention_map[tuple(mention)]] = cluster_id + 1
    cluster_ids = torch.tensor(cluster_ids, dtype=torch.int64)

    sentences = example["sentences"]
    num_words = sum(len(s) + 2 for s in sentences)
    speakers = example["speakers"]
    speaker_dict = util.get_speaker_dict(util.flatten(speakers),
                                         config['max_num_speakers'])

    max_sentence_length = config['max_segment_len']
    text_len = torch.tensor([len(s) for s in sentences], dtype=torch.int64)

    input_ids, input_mask, speaker_ids = [], [], []
    for i, (sentence, speaker) in enumerate(zip(sentences, speakers)):
        sentence = ['[CLS]'] + sentence + ['[SEP]']
        sent_input_ids = tokenizer.convert_tokens_to_ids(sentence)
        sent_input_mask = [-1] + [1] * (len(sent_input_ids) - 2) + [-1]
        sent_speaker_ids = [1] + [speaker_dict.get(s, 3)
                                  for s in speaker] + [1]
        while len(sent_input_ids) < max_sentence_length:
            sent_input_ids.append(0)
            sent_input_mask.append(0)
            sent_speaker_ids.append(0)
        input_ids.append(sent_input_ids)
        speaker_ids.append(sent_speaker_ids)
        input_mask.append(sent_input_mask)
    input_ids = torch.tensor(input_ids, dtype=torch.int64)
    input_mask = torch.tensor(input_mask, dtype=torch.int64)
    speaker_ids = torch.tensor(speaker_ids, dtype=torch.int64)
    assert num_words == torch.sum(
        torch.abs(input_mask)), (num_words, torch.sum(torch.abs(input_mask)))

    doc_key = example["doc_key"]
    subtoken_map = torch.tensor(example.get("subtoken_map", None),
                                dtype=torch.int64)
    sentence_map = torch.tensor(example['sentence_map'], dtype=torch.int64)
    genre = genres.get(doc_key[:2], 0)
    genre = torch.tensor([genre], dtype=torch.int64)
    gold_starts, gold_ends = tensorize_mentions(gold_mentions)

    return CoNLLCorefResolution(doc_key, input_ids, input_mask, text_len,
                                speaker_ids, genre, gold_starts, gold_ends,
                                cluster_ids, sentence_map, subtoken_map)
Example #27
class CustomBertVocab(object):
    def __init__(self, lang='en'):
        """Basic Vocabulary object"""
        self.lang = lang
        self.vocab_size = 0
        self.tokenizer = None

    def load(self, bert_vocab_path):
        """load 词汇表"""
        self.tokenizer = BertTokenizer(
            vocab_file=bert_vocab_path,
            never_split=['<num>', '<url>', '<img>', '</s>'])
        self.vocab_size = self.tokenizer.vocab_size

    def encode(self, words: list):
        """words 编码"""
        ids = []
        for word in words:
            ids.append(self.tokenizer.convert_tokens_to_ids(word))

        return ids

    def decode(self, ids, decode_type: str):
        """ids 解码"""
        sentence = []
        for id in ids:
            if isinstance(id, torch.Tensor):
                word = self.tokenizer.convert_ids_to_tokens(id.item())
            else:
                word = self.tokenizer.convert_ids_to_tokens(id)
            if decode_type == 'predict':
                if word not in [
                        EOS_TOKEN, SOS_TOKEN, PAD_TOKEN, IMG_TOKEN, MSP_TOKEN
                ]:
                    sentence.append(word)
                if word == PAD_TOKEN or word == EOS_TOKEN:
                    break
            else:  # context question
                sentence.append(word)
                if word == PAD_TOKEN:
                    break
        if self.lang == 'zh':
            return ''.join(sentence)

        return ' '.join(sentence)
Example #28
 def __init__(self, conf: GPT2ChatbotConf, tokenizer: BertTokenizer):
     self.conf = conf
     self.tokenizer = tokenizer
     self.speaker_ids = tokenizer.convert_tokens_to_ids(
         ["[speaker1]", "[speaker2]"])
     self.pool = Pool(1)
     # get all chatlog
     logger.info("read raw data...")
     self.chat_log = self._get_chatlog()
     logger.info("num data:{}".format(len(self.chat_log)))
     self.data_iter = iter(self.chat_log)
     self.steps = len(self.chat_log) // self.conf.batch_size
     # Spawn a worker process for data preparation
     if self.conf.use_multi_proc:
         batch_examples = self._get_batch_examples()
         self.proc = self.pool.apply_async(
             func=LCCCDataGenerator.get_batch_data,
             args=(batch_examples, tokenizer, self.speaker_ids))
Example #29
def get_encoder_embedding(phrases: List[str], bert: BertModel,
                          tokenizer: BertTokenizer,
                          embed_wo_special_tokens: bool) -> torch.Tensor:
    subtok_ids_list, hypo_mask_list = [], []
    for phr in phrases:
        subtok_ids_list.append(
            tokenizer.convert_tokens_to_ids(['[CLS]'] +
                                            tokenizer.tokenize(phr) +
                                            ['[SEP]']))
        hypo_mask_list.append([1.0] * len(subtok_ids_list[-1]))
        if embed_wo_special_tokens:
            hypo_mask_list[-1][0] = 0.0
            hypo_mask_list[-1][-1] = 0.0
    batch = HypoDataset.torchify_and_pad(subtok_ids_list, hypo_mask_list)
    subtok_ids_batch, hypo_mask_batch, attn_mask_batch = to_device(*batch)
    h = bert(subtok_ids_batch, attention_mask=attn_mask_batch)[0]
    m = hypo_mask_batch.unsqueeze(2)
    phrase_representations = torch.sum(h * m, 1) / torch.sum(m, 1)
    return phrase_representations
class PredictionModel:
    def __init__(self):
        self.model: BertModel = BertModel.from_pretrained(
            pretrained_model_name_or_path=
            'foodbert/data/mlm_output/checkpoint-final')
        with open('foodbert/data/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        self.tokenizer = BertTokenizer(
            vocab_file='foodbert/data/bert-base-cased-vocab.txt',
            do_lower_case=False,
            max_len=128,
            never_split=used_ingredients)

        self.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')

        self.model.to(self.device)

    def predict_embeddings(self, sentences):
        dataset = InstructionsDataset(tokenizer=self.tokenizer,
                                      sentences=sentences)
        dataloader = DataLoader(dataset, batch_size=100, pin_memory=True)

        embeddings = []
        ingredient_ids = []
        for batch in dataloader:
            batch = batch.to(self.device)
            with torch.no_grad():
                embeddings_batch = self.model(batch)
                embeddings.extend(embeddings_batch[0])
                ingredient_ids.extend(batch)

        return torch.stack(embeddings), ingredient_ids

    def compute_embedding_for_ingredient(self, sentence, ingredient_name):
        embeddings, ingredient_ids = self.predict_embeddings([sentence])
        embeddings_flat = embeddings.view((-1, 768))
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = self.tokenizer.convert_tokens_to_ids(ingredient_name)
        food_embedding = embeddings_flat[ingredient_ids_flat ==
                                         food_id].cpu().numpy()

        return food_embedding[0]