Example #1
def convert_data_to_feature(filepath):
    DRCD = LoadJson(filepath)
    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')

    token_embeddings = []
    segement_embeddings = []
    attention_mask = []
    masked_lm_labels = []
    max_seq_len = 0
    Context_count = 0

    # Training BertForMaskedLM requires the special '[MASK]' token and the ids of the masked-out tokens
    # Max lengths: context 450, question 42, answer 16, plus 4 special tokens (1 [CLS], 3 [SEP]); the total never exceeds 512
    for data in DRCD["data"]:
        for paragraph in data["paragraphs"]:
            context = paragraph["context"]
            word_piece_list = tokenizer.tokenize(context)
            if len(word_piece_list) <= 450:
                for qa in paragraph["qas"]:
                    question = qa["question"]
                    word_piece_list = tokenizer.tokenize(question)
                    if len(word_piece_list) <= 42:
                        answer = qa["answers"][0]["text"]
                        answer = answer + "[SEP]"
                        word_piece_list = tokenizer.tokenize(answer)
                        if len(word_piece_list) <= 16:
                            max_seq_len = create_input_features(tokenizer, context, question, answer, token_embeddings, segement_embeddings, attention_mask, masked_lm_labels, max_seq_len)
                            Context_count += 1
    
    print("最大長度:",max_seq_len)
    print("符合條件的context有" + str(Context_count) + "筆資料")
    print("總共產生" + str(len(token_embeddings)) + "筆資料")
    assert max_seq_len <= 512 # 小於BERT-base長度限制
    max_seq_len = 512         # 將長度統一補齊至512(避免traindata和testdata最大長度不一致)

    # Pad to max_seq_len
    for c in token_embeddings:
        while len(c)<max_seq_len:
            c.append(0)

    for c in segement_embeddings:
        while len(c)<max_seq_len:
            c.append(0)

    for c in attention_mask:
        while len(c)<max_seq_len:
            c.append(0)

    for c_l in masked_lm_labels:
        while len(c_l)<max_seq_len:
            c_l.append(-1)
    
    # BERT input embedding
    assert len(token_embeddings) == len(segement_embeddings) and len(token_embeddings) == len(attention_mask) and len(token_embeddings) == len(masked_lm_labels)
    data_features = {'token_embeddings':token_embeddings,
                    'segement_embeddings':segement_embeddings,
                    'attention_mask':attention_mask,
                    'masked_lm_labels':masked_lm_labels}

    return data_features
Example #2
def find_paragraph(tokenizer: BertTokenizer,
                   model: BertForNextSentencePrediction,
                   question: str,
                   context: str,
                   max_len=256,
                   batch_size=16):
    q_len = len(tokenizer.tokenize(question))
    context_tokens = tokenizer.tokenize(context)
    part_len = max_len - q_len - 3
    parts = []
    n = 0
    while n < len(context_tokens):
        parts += [context_tokens[n:n + part_len]]
        n += part_len // 2
    results = []
    all_parts = parts[:]
    while len(parts) > 0:
        batch = tokenizer.batch_encode_plus(list(
            zip([question] * batch_size, parts[:batch_size])),
                                            max_length=max_len,
                                            truncation=True,
                                            pad_to_max_length=True,
                                            return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = model(**batch)[0]
        results += [a - b for a, b in output.cpu().tolist()]
        parts = parts[batch_size:]
    return np.array(results), [
        tokenizer.decode(tokenizer.encode(part), skip_special_tokens=True)
        for part in all_parts
    ]
Example #3
def str2id(tokenizer: BertTokenizer, sys_utter: str, usr_utter: str,
           source: str) -> Tuple[List[int], List[int]]:
    """Convert system, user utterance and source tokens to ids based on BertTokenizer.

    Args:
        tokenizer: BertTokenizer
        sys_utter: system utterance
        usr_utter: user utterance
        source: slot + value

    Returns:
        input_ids and token_type_ids
    """
    sys_utter_tokens = tokenizer.tokenize(sys_utter)
    usr_utter_tokens = tokenizer.tokenize(usr_utter)
    source_tokens = tokenizer.tokenize(source)
    sys_utter_ids = tokenizer.convert_tokens_to_ids(sys_utter_tokens)
    usr_utter_ids = tokenizer.convert_tokens_to_ids(usr_utter_tokens)
    source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
    input_ids = ([tokenizer.cls_token_id] + sys_utter_ids +
                 [tokenizer.sep_token_id] + usr_utter_ids +
                 [tokenizer.sep_token_id] + source_ids +
                 [tokenizer.sep_token_id])
    token_type_ids = ([0] + [0] * (len(sys_utter_ids) + 1) + [1] *
                      (len(usr_utter_ids) + 1) + [0] * (len(source_ids) + 1))
    return input_ids, token_type_ids
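
A minimal usage sketch for str2id; the pretrained model name is illustrative and is downloaded on first use:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
input_ids, token_type_ids = str2id(
    tokenizer,
    sys_utter="Which area are you looking for?",
    usr_utter="Somewhere in the city centre.",
    source="area centre",
)
# Layout: [CLS] system ... [SEP] user ... [SEP] source ... [SEP]
# token_type_ids are 0 for the [CLS]+system segment, 1 for the user segment, 0 for the source segment.
assert len(input_ids) == len(token_type_ids)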
Example #4
class Tokenizer:
    def __init__(self, tokenizer_name="komoran"):
        assert (tokenizer_name.lower() == "komoran") or (tokenizer_name.lower() == "mecab")\
            or (tokenizer_name.lower() == "subword"), "Only 'komoran', 'mecab', and 'subword' are acceptable."
        if tokenizer_name == "komoran":
            self.tokenizer = Komoran("STABLE")
        elif tokenizer_name == "mecab":
            self.tokenizer = Mecab()
        elif tokenizer_name == "subword":
            self.tokenizer = BertTokenizer(resource_filename(__package__, "vocab_noised.txt"), do_lower_case=False)
        self.tokenizer_name = tokenizer_name

    def tokenize(self, text):
        if self.tokenizer_name == "komoran":
            return self.tokenizer.get_morphes_by_tags(text)
        elif self.tokenizer_name == "mecab":
            return self.tokenizer.morphs(text)
        else:  # self.tokenizer_name is "subword"
            return self.tokenizer.tokenize(text)

    def post_process(self, tokens):
        if self.tokenizer_name == "komoran":
            return " ".join(tokens)
        elif self.tokenizer_name == "mecab":
            return " ".join(tokens)
        else:  # self.tokenizer_name is "subword" or a module type
            return self.tokenizer.convert_tokens_to_string(tokens)
Example #5
def chat(folder_bert, voc, testing=False):
    tf.random.set_seed(1)
    tokenizer = BertTokenizer(vocab_file=folder_bert + voc)
    if testing:
        tokens = tokenizer.tokenize("jeg tror det skal regne")
        print(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        print(ids)
        print("Vocab size:", len(tokenizer.vocab))

    config = BertConfig.from_json_file(folder_bert + "/config.json")
    model = BertLMHeadModel.from_pretrained(folder_bert, config=config)
    while (1):
        text = input(">>User: "******"Bot: {}".format(tokenizer.decode(sample_output[0])))
        print("Bot: {}".format(
            tokenizer.decode(sample_output[:, input_ids.shape[-1]:][0],
                             skip_special_tokens=True)))
Example #6
def read_data(filename: str, tokenizer: BertTokenizer,
              args: TrainingArguments) -> List:
    with open(filename, encoding="utf-8") as f:
        data = f.readlines()
    items = [
        item for item in json.loads(data[0])['data']
        if 'russian' in item['paragraphs'][0]['qas'][0]['id']
    ]
    ds = []
    for item in items:
        paragraph = item['paragraphs'][0]
        context = paragraph['context']
        qas = paragraph['qas'][0]
        question = qas['question']
        answer = qas['answers'][0]
        answer_start = answer['answer_start']
        answer_text = answer['text']
        ids = tokenizer.encode(question, context[:answer_start])
        start = len(ids) - 1
        end = start + len(tokenizer.tokenize(answer_text))
        if end < args.block_size:
            ds += [{
                "question": question,
                "context": context,
                "start": start,
                "end": end
            }]
    return ds
Example #7
    def featurize(self, df):
        bert_model = BertModel.from_pretrained(self.data_path)
        bert_tokenizer = BertTokenizer(self.data_path + "/vocab.txt",
                                       do_lower_case=False,
                                       do_basic_tokenize=False)
        mecab = MeCab.Tagger('-Ochasen')
        data_list = df.rdd.collect()
        label_list = []
        vec_list = []
        for data in data_list:
            tmp_list = []
            node_list = data[1]
            for word in node_list:
                tmp_list.append(word)
            if len(tmp_list) != 0:
                label_list.append(float(data[0]))
                bert_tokens = bert_tokenizer.tokenize(
                    " ".join(["[CLS]"] + tmp_list + ["[SEP]"]))
                token_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
                tokens_tensor = torch.tensor(token_ids).unsqueeze(0)
                all_outputs = bert_model(tokens_tensor)
                embedding = all_outputs[-2].detach().numpy()[0]
                vec = np.mean(embedding, axis=0).tolist()
                vec_list.append(Vectors.dense(vec))
        zip_list = zip(label_list, vec_list)
        new_df = self.spark.createDataFrame(zip_list, ("label", "features"))
        return new_df
Example #8
    def generate_embedding(
        self,
        model: transformers.BertModel,
        tokenizer: transformers.BertTokenizer,
        product: pd.Series,
        feature_columns: List[str],
    ) -> torch.Tensor:
        model.eval()
        if (Project.exported_objects_dir /
                f"{product['product_id']}.obj").exists():
            return self.load_already_geneated_embedding(product=product)
        product_description = self.generate_product_description(
            product=product, feature_columns=feature_columns)
        marked_text = "[CLS] " + product_description + " [SEP]"
        tokenized_text = tokenizer.tokenize(marked_text)

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

        segments_ids = [1] * len(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        with torch.no_grad():
            outputs = model(tokens_tensor, segments_tensors)
            hidden_states = outputs[2]
        token_vecs = hidden_states[-2][0]
        sentence_embedding = torch.mean(token_vecs, dim=0)
        torch.save(
            sentence_embedding,
            Project.exported_objects_dir / f"{product['product_id']}.obj",
        )
        return sentence_embedding
Example #9
def createVocabulary(reciepts):
  vocab = set()
  for reciept in reciepts:
    words = reciept.dataWords
    for word in words:
      vocab.add(word)
  path = './data/prod_vocab.txt'
  with open(path, 'r') as f:
    for line in f:
      vocab.add(line[:-1])
  tokenizer=BertTokenizer(vocab_file=path,do_lower_case=False)
  new_set = set()
  for word in vocab:
    token_list = tokenizer.tokenize(word)
    if '[UNK]' in token_list:
      print(word)
      t = re.split(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', word)
      for i, v in enumerate(token_list):
        if v == '[UNK]' and i < len(t):
          for x in t:
            new_set.add(x)
  with open('./data/prod_vocab.txt', 'w+') as f:
    for word in (vocab.union(new_set)):
      f.write(word  + '\n')
  return vocab
Example #10
class BertBPE(object):
    def __init__(self, cfg):
        try:
            from transformers import BertTokenizer
        except ImportError:
            raise ImportError(
                "Please install transformers with: pip install transformers")

        if cfg.bpe_vocab_file:
            self.bert_tokenizer = BertTokenizer(
                cfg.bpe_vocab_file, do_lower_case=not cfg.bpe_cased)
        else:
            vocab_file_name = ("bert-base-cased"
                               if cfg.bpe_cased else "bert-base-uncased")
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                vocab_file_name)

    def encode(self, x: str) -> str:
        return " ".join(self.bert_tokenizer.tokenize(x))

    def decode(self, x: str) -> str:
        return self.bert_tokenizer.clean_up_tokenization(
            self.bert_tokenizer.convert_tokens_to_string(x.split(" ")))

    def is_beginning_of_word(self, x: str) -> bool:
        return not x.startswith("##")
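
A quick round-trip sketch for BertBPE, using a plain namespace in place of the fairseq-style cfg object (attribute names follow the constructor above; the uncased model is downloaded on first use):

from types import SimpleNamespace

cfg = SimpleNamespace(bpe_vocab_file=None, bpe_cased=False)   # no local vocab -> falls back to bert-base-uncased
bpe = BertBPE(cfg)
encoded = bpe.encode("Subword tokenization handles rare words.")
print(encoded)                                # space-joined word pieces
print(bpe.decode(encoded))                    # detokenized (lowercased) text
print(bpe.is_beginning_of_word("##ization"))  # False: continuation pieces start with "##"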
Example #11
def get_indices_and_masks(sent_tokens: List[str],
                          in_sent_start: int,
                          in_sent_end: int,
                          tokenizer: BertTokenizer,
                          mask_mention: bool = False) \
        -> Tuple[List[int], List[float], int, int]:
    if in_sent_start not in range(len(sent_tokens)) or\
            in_sent_end not in range(1, len(sent_tokens) + 1):
        raise ValueError(
            f'wrong input: tokens {sent_tokens} don\'t contain pos'
            f' ({in_sent_start}, {in_sent_end}).')
    if mask_mention:
        for n in range(in_sent_start, in_sent_end):
            sent_tokens[n] = tokenizer.mask_token
    sent_subword_idxs = []
    sent_subwords = []
    sent_hypo_mask = []
    new_in_sent_start, new_in_sent_end = None, None
    for n, tok in enumerate(sent_tokens):
        if n == in_sent_start:
            new_in_sent_start = len(sent_subwords)
        subtokens = tokenizer.tokenize(tok)
        sent_subwords.extend(subtokens)
        subtok_idxs = tokenizer.convert_tokens_to_ids(subtokens)
        sent_subword_idxs.extend(subtok_idxs)
        # NOTE: no +1 offset here because no [CLS] token is prepended
        mask_value = float(in_sent_start <= n < in_sent_end)
        sent_hypo_mask.extend([mask_value] * len(subtok_idxs))
        if n == in_sent_end - 1:
            new_in_sent_end = len(sent_subwords) + 1
    return sent_subword_idxs, sent_hypo_mask, new_in_sent_start, new_in_sent_end
Example #12
def get_embedding(phrases: List[str],
                  emb_mat: torch.Tensor,
                  tokenizer: BertTokenizer,
                  debug: bool = False) -> torch.Tensor:
    # emb_mat: [vocab_size, emb_size]
    # returns: [num_phrases, emb_size]
    subtok_ids, subtok_masks = [], []
    max_len = 0
    for w in phrases:
        subtok_toks = tokenizer.tokenize(w)
        subtok_ids.append(tokenizer.convert_tokens_to_ids(subtok_toks))
        num_subtoks = len(subtok_ids[-1])
        subtok_masks.append([1.] * num_subtoks)
        if debug:
            print(f"subtok_ids('{w}') = {subtok_ids[-1]}")
            print(
                f'{[tokenizer._convert_id_to_token(s) for s in subtok_ids[-1]]}'
            )
        max_len = max_len if max_len > num_subtoks else num_subtoks
    # subtok_ids, subtok_masks: [num_phrases, max_len]
    subtok_ids = torch.tensor(
        [sw_list + [-1] * (max_len - len(sw_list)) for sw_list in subtok_ids])
    subtok_masks = torch.tensor(
        [m + [0.] * (max_len - len(m)) for m in subtok_masks])
    # subtok_sizes: [num_phrases]
    subtok_sizes = torch.sum(subtok_masks, 1)
    if debug:
        print(subtok_sizes)
    # emb_mat[subtok_ids]: [num_phrases, max_len, emb_size]
    return torch.sum(emb_mat[subtok_ids] * subtok_masks.unsqueeze(2), axis=1) \
        / subtok_sizes.unsqueeze(1)
Example #13
    def _ner_bert_tokenize(tokens: List[str],
                           tags: List[str],
                           tokenizer: BertTokenizer,
                           max_subword_len: int = None,
                           mode: str = None,
                           subword_mask_mode: str = "first",
                           token_masking_prob: float = None) -> Tuple[List[str], List[int], List[str]]:
        do_masking = (mode == 'train') and (token_masking_prob is not None)
        do_cutting = (max_subword_len is not None)
        tokens_subword = ['[CLS]']
        startofword_markers = [0]
        tags_subword = ['X']
        for token, tag in zip(tokens, tags):
            token_marker = int(tag != 'X')
            subwords = tokenizer.tokenize(token)
            if not subwords or (do_cutting and (len(subwords) > max_subword_len)):
                tokens_subword.append('[UNK]')
                startofword_markers.append(token_marker)
                tags_subword.append(tag)
            else:
                if do_masking and (random.random() < token_masking_prob):
                    tokens_subword.extend(['[MASK]'] * len(subwords))
                else:
                    tokens_subword.extend(subwords)
                if subword_mask_mode == "last":
                    startofword_markers.extend([0] * (len(subwords) - 1) + [token_marker])
                else:
                    startofword_markers.extend([token_marker] + [0] * (len(subwords) - 1))
                tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))

        tokens_subword.append('[SEP]')
        startofword_markers.append(0)
        tags_subword.append('X')
        return tokens_subword, startofword_markers, tags_subword
Example #14
class JapaneseWorker:
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_basic_tokenize=False)
        self.cls_id = self.bert_tokenizer.vocab['[CLS]']
        self.mask_id = self.bert_tokenizer.vocab['[MASK]']
        self.bert_model = 'model/Japanese/'

        self.cp = 'checkpoint/jp/cp_step_1200000.pt'
        self.opt = 'checkpoint/jp/opt_step_1200000.pt'

    @staticmethod
    def linesplit(src):
        """
        :param src: type str, String type article
        :return: type list, punctuation seperated sentences
        """
        def remove_newline(x):
            x = x.replace('\n', '')
            return x

        def remove_blank(x):
            x = x.replace(' ', '')
            return x

        def remove_unknown(x):
            unknown = ['\u3000']
            for h in unknown:
                x = x.replace(h, '')
            return x
        src = remove_blank(src)
        src = remove_newline(src)
        src = remove_unknown(src)
        src_line = re.split('。(?<!」)|！(?<!」)|？(?!」)', src)
        src_line = [x for x in src_line if x != '']
        return src_line

    def tokenizer(self, src):
        """
        :param src: type list, punctuation-separated sentences
        :return: token: type list, tokens
                 token_id: type list, token ids
        """
        token = []
        token_id = []

        def _preprocess_text(text):
            return text.replace(" ", "")  # for Juman

        for sentence in src:
            preprocessed_text = _preprocess_text(sentence)
            juman_tokens = self.juman_tokenizer(preprocessed_text)
            tokens = self.bert_tokenizer.tokenize(" ".join(juman_tokens))
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
            token += tokens
            token_id += ids
        return token, token_id
Example #15
def generate_template(tokenizer: BertTokenizer, first_name: str,
                      last_name: str, mode: str) -> str:
    """Generate a template given the information given.
    @param tokenizer is the tokenizer for the model.
    @param first_name is the patient's first name
    @param last_name is the patient's last name.
    @param mode will determine if we mask out first or last name.
    @return the template to be encoded (with MASKs).
    """
    if mode == "mask_first":
        tok_name = tokenizer.tokenize(first_name)
        mask_string = "[MASK] " * len(tok_name)
        return f"[CLS] {mask_string.strip()} {last_name} [SEP]"
    elif mode == "mask_last":
        tok_name = tokenizer.tokenize(last_name)
        mask_string = "[MASK] " * len(tok_name)
        return f"[CLS] {first_name} {mask_string.strip()} [SEP]"
Example #16
class NemoBertTokenizer(TokenizerSpec):
    def __init__(
            self,
            pretrained_model=None,
            vocab_file=None,
            do_lower_case=True,
            max_len=None,
            do_basic_tokenize=True,
            never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"),
    ):
        if pretrained_model:
            self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
            if "uncased" not in pretrained_model:
                self.tokenizer.basic_tokenizer.do_lower_case = False
        else:
            self.tokenizer = BertTokenizer(vocab_file, do_lower_case,
                                           do_basic_tokenize)
        self.vocab_size = len(self.tokenizer.vocab)
        self.never_split = never_split

    def text_to_tokens(self, text):
        tokens = self.tokenizer.tokenize(text)
        return tokens

    def tokens_to_text(self, tokens):
        text = self.tokenizer.convert_tokens_to_string(tokens)
        return remove_spaces(handle_quotes(text.strip()))

    def token_to_id(self, token):
        return self.tokens_to_ids([token])[0]

    def tokens_to_ids(self, tokens):
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        return ids

    def ids_to_tokens(self, ids):
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        return tokens

    def text_to_ids(self, text):
        tokens = self.text_to_tokens(text)
        ids = self.tokens_to_ids(tokens)
        return ids

    def ids_to_text(self, ids):
        tokens = self.ids_to_tokens(ids)
        tokens_clean = [t for t in tokens if t not in self.never_split]
        text = self.tokens_to_text(tokens_clean)
        return text

    def pad_id(self):
        return self.tokens_to_ids(["[PAD]"])[0]

    def bos_id(self):
        return self.tokens_to_ids(["[CLS]"])[0]

    def eos_id(self):
        return self.tokens_to_ids(["[SEP]"])[0]
Example #17
def example_to_input(lemma_list: List[str],
                     tags_list: List[int],
                     tok: BertTokenizer):
    subword_list, tags_map = tok.convert_tokens_to_ids(tok.tokenize('[CLS]')), []
    for w in lemma_list:
        tags_map.append(len(subword_list))
        subword_list += tok.convert_tokens_to_ids(tok.tokenize(w))
    subword_list += tok.convert_tokens_to_ids(tok.tokenize('[SEP]'))
    mapped_tags = [0] * len(subword_list)
    # mapped_pos = [0] * len(subword_list)
    # mapped_lemmas = ["[UNK]"] * len(subword_list)
    # mapped_altern = [[]] * len(subword_list)
    for i, j in enumerate(tags_map):
        mapped_tags[j] = tags_list[i]
        # mapped_pos[j] = example['pos'][i]
        # mapped_lemmas[j] = example['lemmas'][i]
        # mapped_altern[j] = example['alternatives'][i]
    return subword_list, mapped_tags
Example #18
def convert_data_to_feature():
    # Load the question dataset
    q = open('Dataset/Query_Train/Final_question.txt', "r", encoding="utf-8")
    questions = q.readlines()
    q.close()
    # Load the answer dataset
    a = open('Dataset/Train_Label/FinalDomainLabel.txt', "r", encoding="utf-8")
    answers = a.readlines()
    a.close()
    assert len(answers) == len(questions)
    # ans_dic holds the answer categories
    ans_dic = make_ans_dic(answers)
    # question_dic holds the question categories
    question_dic = make_question_dic(questions)

    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')
    q_tokens = []
    max_seq_len = 0

    for q in question_dic.data:
        bert_ids = tokenizer.build_inputs_with_special_tokens(
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))
        if (len(bert_ids) > max_seq_len):
            max_seq_len = len(bert_ids)
        q_tokens.append(bert_ids)

    print("最長問句長度:", max_seq_len)
    assert max_seq_len <= 512  # 小於BERT-base長度限制
    # 補齊長度
    for q in q_tokens:
        while len(q) < max_seq_len:
            q.append(0)
    a_labels = []
    for a in ans_dic.data:
        a_labels.append(ans_dic.to_id(a))
    # BERT input embedding
    answer_lables = a_labels
    input_ids = q_tokens
    input_masks = [[1] * max_seq_len for i in range(len(question_dic))]
    input_segment_ids = [[0] * max_seq_len for i in range(len(question_dic))]
    assert len(input_ids) == len(question_dic) and len(input_ids) == len(
        input_masks) and len(input_ids) == len(input_segment_ids)

    data_features = {
        'input_ids': input_ids,
        'input_masks': input_masks,
        'input_segment_ids': input_segment_ids,
        'answer_lables': answer_lables,
        'question_dic': question_dic,
        'answer_dic': ans_dic
    }

    output = open('Dataset/data_features_domain.pkl', 'wb')
    pickle.dump(data_features, output)
    return data_features
Example #19
    def __init__(self,
                 bert_tokenizer: BertTokenizer,
                 jp_tokenizer: JumanTokenizer,
                 args,
                 file_path='train',
                 block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, 'cached_lm_' + str(block_size) + '_' + filename)

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            with open(cached_features_file, 'rb') as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                docs = f.readlines()

            exsamples = []
            for _, line in enumerate(docs):
                text = line.rstrip(os.linesep)

                # separate text into tokens
                tokenized_text = bert_tokenizer.convert_tokens_to_ids(
                    bert_tokenizer.tokenize(" ".join(
                        jp_tokenizer.tokenize(text))))

                # add special tokens: [CLS] and [SEP]
                added_special = bert_tokenizer.build_inputs_with_special_tokens(
                    tokenized_text)

                # Zero-pad up to the sequence length.
                diff = block_size - len(added_special)
                if diff < 0:
                    added_special = added_special[:diff]
                else:
                    # change the padding value from 0 to -1
                    padding = [-1] * (block_size - len(added_special))
                    added_special += padding

                assert len(added_special) == block_size

                self.examples.append(added_special)

            logger.info("Saving features into cached file %s",
                        cached_features_file)
            with open(cached_features_file, 'wb') as handle:
                pickle.dump(self.examples,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
Example #20
def generate_masked_sent(tokenizer: BertTokenizer, sent: str, mask_token: str, mask_rate:float=0.15):
    words = tokenizer.tokenize(sent)
    input_list = list(words)
    output_list = list(words)
    len_sent = len(words)
    num_mask_token = int(len_sent*mask_rate)
    masked_idx = random.sample(range(len_sent), num_mask_token)
    for idx in masked_idx:
        # TODO: randomly masks / replaces / keeps the token
        input_list[idx] = mask_token
    return input_list, output_list
Example #21
def bert_text_preparation(text: str, tokenizer: BertTokenizer):
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(indexed_tokens)

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensor
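
The tensors returned by bert_text_preparation can be fed directly to a BertModel; a minimal sketch, assuming output_hidden_states is enabled so all layers are returned:

import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model.eval()

tokens, tokens_tensor, segments_tensor = bert_text_preparation("the bank of the river", tokenizer)
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensor)
hidden_states = outputs[-1]                   # tuple: embedding layer + 12 encoder layers
print(len(tokens), hidden_states[-1].shape)   # seq_len and [1, seq_len, 768]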
Example #22
class testAnswerGeneration():
    def __init__(self):
        self.tokenizer = BertTokenizer(
            vocab_file='bert-base-chinese-vocab.txt')
        self.config = BertConfig.from_pretrained('trained_model/1/config.json')
        self.model = BertForMaskedLM.from_pretrained(
            'trained_model/1/pytorch_model.bin',
            from_tf=bool('.ckpt' in 'bert-base-chinese'),
            config=self.config)
        self.model.eval()

    def to_input_id(self, sentence_input):
        return self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.tokenize(sentence_input))

    def getAnswer(self, context, question):
        input_id = self.to_input_id("[CLS] " + context + " [SEP] " + question +
                                    " [SEP]")

        count = 0
        answer = ""
        maskpos = len(input_id)  # position where the answer token will be predicted
        input_id.append(103)     # 103 is the [MASK] token id in the bert-base-chinese vocab
        # Pad up to length 512
        while len(input_id) < 512:
            input_id.append(0)

        # Limit the generated answer to at most 10 tokens
        while (count < 10):
            input_id_tensor = torch.LongTensor([input_id])
            outputs = self.model(input_id_tensor)
            predictions = outputs[0]
            predicted_index = torch.argmax(
                predictions[0, maskpos]).item()  # most likely token id at the mask position
            predicted_token = self.tokenizer.convert_ids_to_tokens(
                predicted_index)  # convert the id back to a token

            # Stop generating once [SEP] is predicted
            if predicted_token == '[SEP]':
                break

            answer = answer + predicted_token  # append the generated token to the answer
            input_id[maskpos] = predicted_index  # replace the current [MASK] with the predicted token id
            maskpos += 1
            if maskpos < 512:
                input_id[maskpos] = 103  # place the next [MASK] to be predicted
            else:
                break

            count += 1

        return answer
Example #23
def subword_tokenize(tokenizer: BertTokenizer,
                     tokens: List[str]) -> List[Tuple[int, str]]:
    """
    Returns: List of (word index, subword token) pairs mapping each subword token back to its original token.
    """
    subtokens = [tokenizer.tokenize(t) for t in tokens]

    indexed_subtokens = []
    for idx, subtoks in enumerate(subtokens):
        for subtok in subtoks:
            indexed_subtokens.append((idx, subtok))

    return indexed_subtokens
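
A small usage sketch; the word index in each pair makes it straightforward to pool subword outputs back to word level (the exact pieces depend on the vocabulary):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
pairs = subword_tokenize(tokenizer, ["hyperparameters", "matter"])
print(pairs)  # e.g. [(0, 'hyper'), (0, '##para'), (0, '##meters'), (1, 'matter')]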
Example #24
def split_kc(input_dir: Path, output_dir: Path, max_subword_length: int,
             tokenizer: BertTokenizer):
    """
    Split each document into several documents so that each one's tokenized length is at most max_subword_length.
    A sentence that exceeds max_subword_length even on its own is written out as-is.
    """
    did2sids: Dict[str, List[str]] = defaultdict(list)
    did2cumlens: Dict[str, List[int]] = {}
    sid2knp: Dict[str, str] = {}

    for knp_file in input_dir.glob('*.knp'):
        with knp_file.open() as fin:
            did = knp_file.stem
            did2cumlens[did] = [0]
            buff = ''
            for line in fin:
                buff += line
                if line.strip() == 'EOS':
                    blist = BList(buff)
                    did2sids[did].append(blist.sid)
                    did2cumlens[did].append(did2cumlens[did][-1] + len(
                        tokenizer.tokenize(' '.join(
                            m.midasi for m in blist.mrph_list()))))
                    sid2knp[blist.sid] = buff
                    buff = ''

    for did, sids in did2sids.items():
        cum: List[int] = did2cumlens[did]
        end = 1
        # search for the initial end index
        while end < len(sids) and cum[end + 1] - cum[0] <= max_subword_length:
            end += 1

        idx = 0
        while end < len(sids) + 1:
            start = 0
            # search for the start index
            while cum[end] - cum[start] > max_subword_length:
                start += 1
                if start == end - 1:
                    break
            with output_dir.joinpath(f'{did}-{idx:02}.knp').open(
                    mode='w') as fout:
                fout.write(''.join(
                    sid2knp[sid]
                    for sid in sids[start:end]))  # write out sentences from start to end
            idx += 1
            end += 1
Example #25
def map_labels_to_wordpiece(words: list, labels: list,
                            tokenizer: BertTokenizer):
    """
    Maps labels from the original sentence to one label per BERT wordpiece token.
    @param words:     words
    @param labels:    labels per word
    @param tokenizer: BERT wordpiece tokenizer
    """
    assert len(words) == len(labels)
    wp_labels = []

    for word, label in zip(words, labels):
        wp_labels += [label] * len(tokenizer.tokenize(word))
    wp_labels = ['O'] + wp_labels + ['O']

    return wp_labels
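
A usage sketch for map_labels_to_wordpiece; the leading and trailing 'O' correspond to the [CLS] and [SEP] tokens the encoder adds, and a cased model is assumed here since the inputs are named entities:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
words = ["Angela", "Merkel", "visited", "Washington"]
labels = ["B-PER", "I-PER", "O", "B-LOC"]
wp_labels = map_labels_to_wordpiece(words, labels, tokenizer)
# Every word piece inherits its word's label, so len(wp_labels) matches the
# wordpiece-tokenized sentence length plus 2 for [CLS]/[SEP].
print(wp_labels)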
Example #26
def get_pos_embedding(semantics_list, ori_syntactic_list,
                      syntactic_list_in_dict, nlp):
    pos_embedding_list = []
    # pos_encoder_dict_temp=convert_tuple_to_dict(nlp.get_pipe("tagger").labels)

    # pos_encoder_dict={}
    # for key_num,pos_tag in pos_encoder_dict_temp.items():
    #     pos_encoder_dict[key_num+1]=pos_tag
    reduced_pos_encoder_dict = get_reduced_pos_encoder_dict()
    # print(pos_encoder_dict)
    tokenizer = BertTokenizer(vocab_file='bert-base-uncased-vocab.txt')
    for i in tqdm(range(len(ori_syntactic_list))):
        # First build POS-tag dicts for the two sentences
        semantics_tag_dict = get_sentence_tag_dict(nlp, semantics_list[i])
        syntactic_tag_dict = get_sentence_tag_dict(nlp, ori_syntactic_list[i])

        semantics_token = []
        semantics_token.append("[CLS]")
        semantics_token.extend(tokenizer.tokenize(semantics_list[i]))
        semantics_token.append("[SEP]")
        # semantics_pos=pos_match(semantics_tag_dict,semantics_token,pos_encoder_dict)
        semantics_pos = reduced_pos_match(semantics_tag_dict, semantics_token,
                                          reduced_pos_encoder_dict)
        for j, syntactic_sentence in enumerate(syntactic_list_in_dict[i]):
            input_pos = []
            input_token = []
            syntactic_token = []
            # [CLS] + semantics_sentence(X2) + [SEP] + syntactic_sentence(X1)
            input_token = semantics_token.copy()
            syntactic_token = syntactic_sentence
            input_token.extend(syntactic_sentence)

            # syntactic_pos=pos_match(syntactic_tag_dict,syntactic_token,pos_encoder_dict)
            syntactic_pos = reduced_pos_match(syntactic_tag_dict,
                                              syntactic_token,
                                              reduced_pos_encoder_dict)

            input_pos.extend(semantics_pos)
            input_pos.extend(syntactic_pos)
            assert len(input_token) == len(input_pos)
            # print("input_token",input_token)
            # print("input_pos",input_pos)
            pos_embedding_list.append(input_pos.copy())

    return pos_embedding_list
Example #27
def get_encoder_embedding(phrases: List[str], bert: BertModel,
                          tokenizer: BertTokenizer,
                          embed_wo_special_tokens: bool) -> torch.Tensor:
    subtok_ids_list, hypo_mask_list = [], []
    for phr in phrases:
        subtok_ids_list.append(
            tokenizer.convert_tokens_to_ids(['[CLS]'] +
                                            tokenizer.tokenize(phr) +
                                            ['[SEP]']))
        hypo_mask_list.append([1.0] * len(subtok_ids_list[-1]))
        if embed_wo_special_tokens:
            hypo_mask_list[-1][0] = 0.0
            hypo_mask_list[-1][-1] = 0.0
    batch = HypoDataset.torchify_and_pad(subtok_ids_list, hypo_mask_list)
    subtok_ids_batch, hypo_mask_batch, attn_mask_batch = to_device(*batch)
    h = bert(subtok_ids_batch, attention_mask=attn_mask_batch)[0]
    m = hypo_mask_batch.unsqueeze(2)
    phrase_representations = torch.sum(h * m, 1) / torch.sum(m, 1)
    return phrase_representations
Example #28
    def bert_tokenize_sentence(self,
                               tokens: List[str],
                               tokenizer: BertTokenizer = None) -> List[str]:
        """
        Auxiliary function that tokenize given context into subwords.

        Args:
            tokens: list of unsplit tokens.
            tokenizer: tokenizer to be used for words tokenization into subwords.

        Returns:
            list of newly acquired tokens
        """
        if tokenizer is None:
            tokenizer = self.tokenizer
        bert_tokens = list()
        for token in tokens:
            bert_tokens.extend(tokenizer.tokenize(token))
        return bert_tokens
Example #29
def process_trees_and_bpes(sst_dir='.data/sst/trees/',
                           tokenizer: BertTokenizer = None,
                           save_dir='data/sst/fine-tune',
                           save_bpe=True,
                           **kwargs):
    train_matrices, train_texts = trees2matrices_and_texts(
        os.path.join(sst_dir, 'train.txt'), **kwargs)
    torch.save(train_matrices, os.path.join(save_dir, 'trees'))
    if save_bpe:
        all_bpe_indices = []
        for text in train_texts:  # text is a list of tokens
            tokens = tokenizer.tokenize(' '.join(text))
            bpe_indices = []
            for i, t in enumerate(tokens):
                if '##' in t:
                    bpe_indices.append(i)
            all_bpe_indices.append(bpe_indices)
            assert len(text) == len(tokens) - len(bpe_indices)
        torch.save(all_bpe_indices, os.path.join(save_dir, 'bpe'))
Example #30
def to_action_tuple(
        word: str,
        grammar: state_tree.NQCFG,
        tokenizer: transformers.BertTokenizer,
        valid_actions: Optional[Collection[int]] = None) -> Tuple[int, ...]:

    # If you want to programmatically exclude certain kinds of tokens from the
    # tries, add the logic here and return an empty tuple for `word`s you do not
    # want to see included.
    if (common_flags.EXCLUDE_PUNCTUATION_FROM_TRIE.value == 1
            and word in string.punctuation):
        return ()

    # We need to skip `unks` that aren't even covered by word pieces.
    tokens = tokenizer.tokenize(word)
    actions = []
    for i, token in enumerate(tokens):
        # The vocabulary might contain items like `toxin’s` which the BERT tokenizer
        # does tokenize into several initial word pieces.  Presence of such a
        # sequence in the trie would be inconsistent with our insistence of deriving
        # words as exactly one initial word piece followed by several non-initial
        # ones.  We can still return the "word" assembled so far, as this will match
        # the grammar, but cannot accumulate any more tokens.
        if i > 0 and not token.startswith('##'):
            break
        token = state_tree.NQStateTree.clean_escape_characters(token)
        if token not in grammar.terminal_to_action:
            return ()
        token_action = grammar.terminal_to_action[token]

        # If we are in a "constrained" setting, we need to ensure that every token
        # of a word is actually present.  This situation can arise if the state
        # truncation cuts off the final word piece(s) of a multi-piece sequence.
        # Implementing the constraint here allows us to not worry about it in the
        # transition model; otherwise, we would not just need to check for sub-trie
        # presence for a prefix to determine recursion, but check explicitly that at
        # least one path along the subtrie is "constructible".
        if valid_actions is not None:
            if token_action not in valid_actions:
                return ()

        actions.append(token_action)
    return tuple(actions)