class BertTokenizer:
    def __init__(self, bert_path, tokenizer_cls=FullTokenizer, maxlen=512):
        self.maxlen = maxlen
        # with tf.compat.v1.Session() as sess:
        #     bert = hub.Module(bert_path)
        #     tk_info = bert(signature='tokenization_info', as_dict=True)
        #     tk_info = [tk_info['vocab_file'], tk_info['do_lower_case']]
        #     vocab_file, do_lower_case = sess.run(tk_info)
        #     self.tokenizer = tokenizer_cls(vocab_file, do_lower_case)
        bert_layer = hub.KerasLayer(bert_path, trainable=True)
        vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
        do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
        self.tokenizer = tokenizer_cls(vocab_file, do_lower_case)

    def convert_sentences_to_ids(self, sentences):
        ids = list(map(self.convert_single_sentence_to_ids, sentences))
        return np.array(ids)

    def convert_single_sentence_to_ids(self, sentence):
        tokens = self.tokenize(sentence)
        # truncate so the tokens plus [CLS]/[SEP] fit within maxlen
        tokens = tokens[:self.maxlen - 2]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        tokens += (self.maxlen - len(tokens)) * ['[PAD]']
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def convert_two_sentence_to_ids(self,
                                    sent1,
                                    sent2,
                                    maxlen=None,
                                    return_tokens=False):
        if not maxlen:
            maxlen = self.maxlen
        tokens1 = self.tokenize(sent1)
        tokens2 = self.tokenize(sent2)
        if len(tokens1) + len(tokens2) > maxlen - 3:
            tokens2 = tokens2[:maxlen - 3 - len(tokens1)]
        tokens = ['[CLS]'] + tokens1 + ['[SEP]'] + tokens2 + ['[SEP]']
        tokens += (maxlen - len(tokens)) * ['[PAD]']
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if return_tokens:
            return tokens1, tokens2, ids
        return ids

    def convert_sentence_to_features(self, sent1, sent2, maxlen=None):
        if not maxlen:
            maxlen = self.maxlen

        tokens1, tokens2, token_ids = self.convert_two_sentence_to_ids(
            sent1, sent2, maxlen, return_tokens=True)
        segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
        input_mask = [1] * len(segment_ids)
        segment_ids += (maxlen - len(segment_ids)) * [0]
        input_mask += (maxlen - len(input_mask)) * [0]

        return token_ids, input_mask, segment_ids

    def tokenize(self, sent):
        return self.tokenizer.tokenize(sent)
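
A minimal usage sketch for the wrapper above (not part of the original example; the Hub handle and sentences are illustrative, and TF 2.x with tensorflow_hub, numpy and FullTokenizer imported as in the class is assumed):

bert_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
tokenizer = BertTokenizer(bert_handle, maxlen=128)
# single sentences -> padded id matrix
ids = tokenizer.convert_sentences_to_ids(["how old are you?", "i am twenty."])
# sentence pair -> ids, mask and segment ids for a two-segment input
token_ids, input_mask, segment_ids = tokenizer.convert_sentence_to_features(
    "how old are you?", "i am twenty.", maxlen=32)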
Example 2
def predict_input_fn_generator(input_file_or_list,
                               config: Params,
                               mode='predict'):
    # if the input is a string, treat it as a path to a file
    if isinstance(input_file_or_list, str):
        with open(input_file_or_list, 'r', encoding='utf8') as f:
            inputs = f.readlines()
    else:
        inputs = input_file_or_list

    tokenizer = FullTokenizer(config.vocab_file)

    data_dict = {}
    data_dict['input_ids'] = []
    data_dict['input_mask'] = []
    data_dict['segment_ids'] = []

    for doc in inputs:

        inputs_a = list(doc)
        tokens, target = tokenize_text_with_seqs(tokenizer, inputs_a, None)

        tokens_a, tokens_b, target = truncate_seq_pair(tokens, None, target,
                                                       config.max_seq_len)

        tokens, segment_ids, target = add_special_tokens_with_seqs(
            tokens_a, tokens_b, target)

        input_mask, tokens, segment_ids, target = create_mask_and_padding(
            tokens, segment_ids, target, config.max_seq_len)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        data_dict['input_ids'] = input_ids
        data_dict['input_mask'] = input_mask
        data_dict['segment_ids'] = segment_ids
        # yield a copy so downstream consumers do not all hold references to
        # the same mutable dict
        yield dict(data_dict)
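
One possible way (not from the original source) to consume the generator with tf.data; `config` is assumed to provide vocab_file, max_seq_len and batch_size:

dataset = tf.data.Dataset.from_generator(
    lambda: predict_input_fn_generator(["some raw input text"], config),
    output_types={'input_ids': tf.int32,
                  'input_mask': tf.int32,
                  'segment_ids': tf.int32})
dataset = dataset.batch(config.batch_size)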
Example 3
def prepare_train_dataset(input_file,
                          output_data_dir,
                          output_filename,
                          sliding_window_size,
                          config,
                          tokenizer=None,
                          vocab_file=None,
                          language="english",
                          max_doc_length: int = None,
                          is_training=True,
                          demo=False,
                          lowercase=False):
    if vocab_file is None:
        if not lowercase:
            vocab_file = os.path.join(REPO_PATH, "data_utils",
                                      "uppercase_vocab.txt")
        else:
            vocab_file = os.path.join(REPO_PATH, "data_utils",
                                      "lowercase_vocab.txt")

    if tokenizer is None:
        tokenizer = FullTokenizer(vocab_file=vocab_file,
                                  do_lower_case=lowercase)

    writer = tf.python_io.TFRecordWriter(
        os.path.join(output_data_dir,
                     "{}.{}.tfrecord".format(output_filename, language)))
    doc_map = {}
    documents = read_conll_file(input_file)
    for doc_idx, document in enumerate(documents):
        doc_info = parse_document(document, language)
        tokenized_document = tokenize_document(config,
                                               doc_info,
                                               tokenizer,
                                               max_doc_length=max_doc_length)
        doc_key = tokenized_document['doc_key']
        token_windows, mask_windows, text_len = convert_to_sliding_window(
            tokenized_document, sliding_window_size)
        input_id_windows = [
            tokenizer.convert_tokens_to_ids(tokens) for tokens in token_windows
        ]
        span_start, span_end, mention_span, cluster_ids = flatten_clusters(
            tokenized_document['clusters'])

        # {'sub_tokens': sub_tokens, 'sentence_map': sentence_map, 'subtoken_map': subtoken_map,
        # 'speakers': speakers, 'clusters': clusters, 'doc_key': doc_info['doc_key']}
        tmp_speaker_ids = tokenized_document["speakers"]
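        # NOTE: the speaker ids read above are immediately replaced below with
        # zero-valued placeholders of a fixed shape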
        tmp_speaker_ids = [[0] * 130] * config["max_training_sentences"]
        instance = (input_id_windows, mask_windows, text_len, tmp_speaker_ids,
                    tokenized_document["genre"], is_training, span_start,
                    span_end, cluster_ids, tokenized_document['sentence_map'])
        write_instance_to_example_file(writer, instance, doc_key, config)
        doc_map[doc_idx] = doc_key
        if demo and doc_idx > 3:
            break
    with open(
            os.path.join(output_data_dir,
                         "{}.{}.map".format(output_filename, language)),
            'w') as fo:
        json.dump(doc_map, fo, indent=2)
Example 4
def prepare_training_data(data_dir: str, language: str, vocab_file: str,
                          sliding_window_size: int):
    tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
    for dataset in ['train', 'dev', 'test']:
        conll_file_path = os.path.join(data_dir,
                                       F"{dataset}.{language}.v4_gold_conll")
        writer = tf.python_io.TFRecordWriter(
            os.path.join(data_dir, F"{dataset}.{language}.tfrecord"))
        doc_map = {}
        documents = read_conll_file(conll_file_path)
        for doc_idx, document in enumerate(documents):
            doc_info = parse_document(document, language)
            checkout_clusters(doc_info)
            tokenized_document = tokenize_document(doc_info, tokenizer)
            doc_map[doc_idx] = tokenized_document['doc_key']
            token_windows, mask_windows = convert_to_sliding_window(
                tokenized_document, sliding_window_size)
            input_id_windows = [
                tokenizer.convert_tokens_to_ids(tokens)
                for tokens in token_windows
            ]
            span_starts, span_ends, cluster_ids = flatten_clusters(
                tokenized_document['clusters'])
            instance = (doc_idx, tokenized_document['sentence_map'],
                        tokenized_document['subtoken_map'], input_id_windows,
                        mask_windows, span_starts, span_ends, cluster_ids)
            write_instance_to_example_file(writer, instance)
        with open(os.path.join(data_dir, F"{dataset}.{language}.map"),
                  'w') as fo:
            json.dump(doc_map, fo, indent=2)
Example 5
class Inferer:
  def __init__(self, checkpoint, attr_values_file, vocab_file):
    self.checkpoint = checkpoint
    self.attr_values_file = attr_values_file
    self.vocab_file = vocab_file
    if not os.path.exists(self.checkpoint):
      raise Exception("local checkpoint %s does not exist" % self.checkpoint)
    if not os.path.exists(self.attr_values_file):
      raise Exception("local attr_values_file %s does not exist" % self.attr_values_file)
    if not os.path.exists(self.vocab_file):
      raise Exception("local vocab_file %s does not exist" % self.vocab_file)
    self.config = InferConfig()
    self.tokenizer = FullTokenizer(self.vocab_file)
    with open(self.attr_values_file, 'rb') as fr:
      attr_values, attr_values_r = pickle.load(fr)
    self.attr_values_r = attr_values_r
    self.config.output_dim = len(attr_values_r)

    self.graph = tf.Graph()
    with self.graph.as_default():
      self.input_ids_p = tf.placeholder(tf.int32, [None, self.config.max_seq_length])
      self.token_type_ids_p = tf.placeholder(tf.int32, [None, self.config.max_seq_length])
      self.input_mask_p = tf.placeholder(tf.int32, [None, self.config.max_seq_length])
      model = Model(self.config)
      self.inference = model.infer(self.input_ids_p, self.token_type_ids_p, self.input_mask_p)
      ckpt_state = tf.train.get_checkpoint_state(self.checkpoint)
      if not (ckpt_state and ckpt_state.model_checkpoint_path):
        raise Exception('No model to eval yet at: ' + self.checkpoint)
      self.sess = tf.Session(config = tf.ConfigProto(allow_soft_placement = True))
      saver = tf.train.Saver()
      saver.restore(self.sess, ckpt_state.model_checkpoint_path)

  def infer(self, sequences):
    transforms = [self._transform(s) for s in sequences if s != '']
    input_ids, token_type_ids, input_mask = list(map(lambda x: list(x), zip(*transforms)))
    with self.graph.as_default():
      result = self.sess.run(self.inference, feed_dict = {
                                                           self.input_ids_p: input_ids,
                                                           self.token_type_ids_p: token_type_ids,
                                                           self.input_mask_p: input_mask
                                                         })
    return [self.attr_values_r[e] for e in result]

  def _transform(self, sequence):
    tokens = self.tokenizer.tokenize(sequence)
    if len(tokens) > self.config.max_seq_length - 2:
      tokens = tokens[0:self.config.max_seq_length - 2]
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

    input_ids_1 = token_ids[0:self.config.max_seq_length] + [0] * (self.config.max_seq_length - len(token_ids))
    token_type_ids_1 = [0] * self.config.max_seq_length
    input_mask_1 = [1] * len(token_ids) + [0] * (self.config.max_seq_length - len(token_ids))
    return input_ids_1, token_type_ids_1, input_mask_1
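
A hypothetical call of the class above; the checkpoint directory, attribute-values pickle and vocab file are placeholder paths that must exist locally:

inferer = Inferer(checkpoint="./checkpoint_dir",
                  attr_values_file="./attr_values.pkl",
                  vocab_file="./vocab.txt")
print(inferer.infer(["first input sentence", "second input sentence"]))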
Example 6
    def test_compare(self):

        model_dir = tempfile.TemporaryDirectory().name
        os.makedirs(model_dir)
        save_path = MiniBertFactory.create_mini_bert_weights(model_dir)
        tokenizer = FullTokenizer(vocab_file=os.path.join(
            model_dir, "vocab.txt"),
                                  do_lower_case=True)

        # prepare input
        max_seq_len = 16
        input_str = "hello, bert!"
        input_tokens = tokenizer.tokenize(input_str)
        input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
        input_mask = [1] * len(input_tokens) + [0] * (max_seq_len -
                                                      len(input_tokens))
        token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len -
                                                          len(input_tokens))

        input_ids = np.array([input_ids], dtype=np.int32)
        input_mask = np.array([input_mask], dtype=np.int32)
        token_type_ids = np.array([token_type_ids], dtype=np.int32)

        print("   tokens:", input_tokens)
        print(
            "input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len,
                                        input_ids), input_ids.shape,
            token_type_ids)

        bert_1_seq_out = CompareBertActivationsTest.predict_on_stock_model(
            model_dir, input_ids, input_mask, token_type_ids)
        bert_2_seq_out = CompareBertActivationsTest.predict_on_keras_model(
            model_dir, input_ids, input_mask, token_type_ids)

        np.set_printoptions(precision=9,
                            threshold=20,
                            linewidth=200,
                            sign="+",
                            floatmode="fixed")

        print("stock bert res", bert_1_seq_out.shape)
        print("keras bert res", bert_2_seq_out.shape)

        print("stock bert res:\n {}".format(bert_1_seq_out[0, :2, :10]),
              bert_1_seq_out.dtype)
        print("keras bert_res:\n {}".format(bert_2_seq_out[0, :2, :10]),
              bert_2_seq_out.dtype)

        abs_diff = np.abs(bert_1_seq_out - bert_2_seq_out).flatten()
        print("abs diff:", np.max(abs_diff), np.argmax(abs_diff))
        self.assertTrue(np.allclose(bert_1_seq_out, bert_2_seq_out, atol=1e-6))
Example 7
    def test_finetune(self):

        model_dir = tempfile.TemporaryDirectory().name
        os.makedirs(model_dir)
        save_path = MiniBertFactory.create_mini_bert_weights(model_dir)
        tokenizer = FullTokenizer(vocab_file=os.path.join(
            model_dir, "vocab.txt"),
                                  do_lower_case=True)

        # prepare input
        max_seq_len = 24
        input_str_batch = ["hello, bert!", "how are you doing!"]

        input_ids_batch = []
        token_type_ids_batch = []
        for input_str in input_str_batch:
            input_tokens = tokenizer.tokenize(input_str)
            input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]

            print("input_tokens len:", len(input_tokens))

            input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
            input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
            token_type_ids = [0] * len(input_tokens) + [0] * (
                max_seq_len - len(input_tokens))

            input_ids_batch.append(input_ids)
            token_type_ids_batch.append(token_type_ids)

        input_ids = np.array(input_ids_batch, dtype=np.int32)
        token_type_ids = np.array(token_type_ids_batch, dtype=np.int32)

        print("   tokens:", input_tokens)
        print(
            "input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len,
                                        input_ids), input_ids.shape,
            token_type_ids)

        model = CompareBertActivationsTest.load_keras_model(
            model_dir, max_seq_len)
        model.compile(optimizer=keras.optimizers.Adam(),
                      loss=keras.losses.mean_squared_error)

        pres = model.predict([input_ids, token_type_ids
                              ])  # just for fetching the shape of the output
        print("pres:", pres.shape)

        model.fit(x=(input_ids, token_type_ids),
                  y=np.zeros_like(pres),
                  batch_size=2,
                  epochs=2)
Example 8
    def test_direct_keras_to_stock_compare(self):
        from tests.ext.modeling import BertModel, BertConfig, get_assignment_map_from_checkpoint

        bert_config = BertConfig.from_json_file(self.bert_config_file)
        tokenizer = FullTokenizer(
            vocab_file=os.path.join(self.bert_ckpt_dir, "vocab.txt"))

        # prepare input
        max_seq_len = 6
        input_str = "Hello, Bert!"
        input_tokens = tokenizer.tokenize(input_str)
        input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
        input_mask = [1] * len(input_tokens) + [0] * (max_seq_len -
                                                      len(input_tokens))
        token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len -
                                                          len(input_tokens))

        input_ids = np.array([input_ids], dtype=np.int32)
        input_mask = np.array([input_mask], dtype=np.int32)
        token_type_ids = np.array([token_type_ids], dtype=np.int32)

        print("   tokens:", input_tokens)
        print(
            "input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len,
                                        input_ids), input_ids.shape,
            token_type_ids)

        s_res = self.predict_on_stock_model(input_ids, input_mask,
                                            token_type_ids)
        k_res = self.predict_on_keras_model(input_ids, input_mask,
                                            token_type_ids)

        np.set_printoptions(precision=9,
                            threshold=20,
                            linewidth=200,
                            sign="+",
                            floatmode="fixed")
        print("s_res", s_res.shape)
        print("k_res", k_res.shape)

        print("s_res:\n {}".format(s_res[0, :2, :10]), s_res.dtype)
        print("k_res:\n {}".format(k_res[0, :2, :10]), k_res.dtype)

        adiff = np.abs(s_res - k_res).flatten()
        print("diff:", np.max(adiff), np.argmax(adiff))
        self.assertTrue(np.allclose(s_res, k_res, atol=1e-6))
Example 9
def tokenize_single_input(text, tokenizer: btk.FullTokenizer,
                          max_input_length):
    tokens = ['[CLS]']
    tokens += tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    token_masks = [1] * len(token_ids)
    segment_ids = [0] * max_input_length

    if len(token_ids) > max_input_length:
        raise ValueError(
            'The input is %i while the maximum input can be only %i.' %
            (len(token_ids), max_input_length))

    while len(token_ids) != max_input_length:
        token_ids.append(0)
        token_masks.append(0)

    return token_ids, token_masks, segment_ids
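
A hypothetical call; `btk` is the bert tokenization module aliased in the type hint above, and vocab.txt is a placeholder vocabulary path:

tokenizer = btk.FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
ids, masks, segments = tokenize_single_input("hello, bert!",
                                             tokenizer,
                                             max_input_length=16)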
Example 10
def tokenize_data(input_str_batch, max_seq_len, model_dir):
    tokenizer = FullTokenizer(vocab_file=os.path.join(model_dir, "vocab.txt"),
                              do_lower_case=True)
    input_ids_batch = []
    token_type_ids_batch = []
    for input_str in input_str_batch:
        input_tokens = tokenizer.tokenize(input_str)
        input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]

        print("input_tokens len:", len(input_tokens))

        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        if len(input_tokens) > max_seq_len:
            input_ids = input_ids[:max_seq_len]
        else:
            input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
        # token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len - len(input_tokens))
        token_type_ids = [0] * max_seq_len
        input_ids_batch.append(input_ids)
        token_type_ids_batch.append(token_type_ids)
    return input_ids_batch, token_type_ids_batch
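
A hypothetical call; model_dir is a placeholder directory that must contain a matching vocab.txt:

ids_batch, type_ids_batch = tokenize_data(["hello, bert!", "how are you doing!"],
                                          max_seq_len=24,
                                          model_dir="mini_bert_model")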
Example 11
def predict_input_fn(input_file_or_list, config: Params, mode='predict'):

    # if the input is a string, treat it as a path to a file
    if isinstance(input_file_or_list, str):
        with open(input_file_or_list, 'r', encoding='utf8') as f:
            inputs = f.readlines()
    else:
        inputs = input_file_or_list

    tokenizer = FullTokenizer(config.vocab_file)

    data_dict = {}
    data_dict['input_ids'] = []
    data_dict['input_mask'] = []
    data_dict['segment_ids'] = []

    for doc in inputs:
        inputs_a = list(doc)
        tokens, target = tokenize_text_with_seqs(tokenizer, inputs_a, None)

        tokens_a, tokens_b, target = truncate_seq_pair(tokens, None, target,
                                                       config.max_seq_len)

        tokens, segment_ids, target = add_special_tokens_with_seqs(
            tokens_a, tokens_b, target)

        input_mask, tokens, segment_ids, target = create_mask_and_padding(
            tokens, segment_ids, target, config.max_seq_len)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        data_dict['input_ids'].append(input_ids)
        data_dict['input_mask'].append(input_mask)
        data_dict['segment_ids'].append(segment_ids)

    dataset = tf.data.Dataset.from_tensor_slices(data_dict)
    dataset = dataset.batch(config.batch_size * 2)

    return dataset
Example 12
class BERTEmbeddingEvaluator(SentenceEmbeddingEvaluator):
    def __init__(
            self,
            model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
            bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
            vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
            max_seq_length=32,
            dimension=768,
            num_labels=2,
            use_notebook=False):

        super().__init__("bert", dimension, use_notebook)
        config = BertConfig.from_json_file(bertconfig_fname)
        self.max_seq_length = max_seq_length
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname,
                                       do_lower_case=False)
        self.model, self.input_ids, self.input_mask, self.segment_ids, self.probs = make_bert_graph(
            config, max_seq_length, 1.0, num_labels, tune=False)
        saver = tf.train.Saver(tf.global_variables())
        self.sess = tf.Session()
        checkpoint_path = tf.train.latest_checkpoint(model_fname)
        saver.restore(self.sess, checkpoint_path)

    def predict(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        probs = self.sess.run(self.probs, model_input)
        return probs

    """
    sentence를 입력하면 토크나이즈 결과와 token 벡터 시퀀스를 반환한다
        - shape :[[# of tokens], [batch size, max seq length, dimension]]
    """

    def get_token_vector_sequence(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        return [
            tokens,
            self.sess.run(self.model.get_sequence_output()[0],
                          model_input)[:len(tokens) + 2]
        ]

    """
    sentence를 입력하면 토크나이즈 결과와 [CLS] 벡터를 반환한다
         - shape :[[# of tokens], [batch size, dimension]]
    """

    def get_sentence_vector(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        return [
            tokens,
            self.sess.run(self.model.pooled_output, model_input)[0]
        ]

    """
    sentence를 입력하면 토크나이즈 결과와 self-attention score matrix를 반환한다
        - shape :[[# of tokens], [batch size, # of tokens, # of tokens]]
    """

    def get_self_attention_score(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        # raw_score : shape=[# of layers, batch_size, num_attention_heads, max_seq_length, max_seq_length]
        raw_score = self.sess.run(self.model.attn_probs_for_visualization_list,
                                  model_input)
        # take the last layer, then sum over the attention heads (axis=0)
        scores = np.sum(raw_score[-1][0], axis=0)
        # keep only as many rows/columns of the score matrix as there are tokens
        scores = scores[:len(tokens), :len(tokens)]
        return [tokens, scores]

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(convert_to_unicode(sentence))

    def make_input(self, tokens):
        tokens = tokens[:(self.max_seq_length - 2)]
        token_sequence = ["[CLS]"] + tokens + ["[SEP]"]
        segment = [0] * len(token_sequence)
        sequence = self.tokenizer.convert_tokens_to_ids(token_sequence)
        current_length = len(sequence)
        padding_length = self.max_seq_length - current_length
        input_feed = {
            self.input_ids:
            np.array([sequence + [0] * padding_length]),
            self.segment_ids:
            np.array([segment + [0] * padding_length]),
            self.input_mask:
            np.array([[1] * current_length + [0] * padding_length])
        }
        return input_feed

    def visualize_self_attention_scores(self, sentence):
        tokens, scores = self.get_self_attention_score(sentence)
        visualize_self_attention_scores(tokens,
                                        scores,
                                        use_notebook=self.use_notebook)
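
A hypothetical call; this is TF 1.x-style code, and the default checkpoint, bert_config.json and vocab.txt paths baked into __init__ must exist for it to run:

evaluator = BERTEmbeddingEvaluator()
probs = evaluator.predict("this movie was great")
tokens, scores = evaluator.get_self_attention_score("this movie was great")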
Example 13
class BERTVectorizer:
    def __init__(
        self,
        sess,
        is_bert,
        # bert_model_hub_path='https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'
        bert_model_hub_path="https://tfhub.dev/google/albert_base/1"):
        self.sess = sess
        self.is_bert = is_bert
        self.bert_model_hub_path = bert_model_hub_path
        self.create_tokenizer_from_hub_module(is_bert=is_bert)

    def create_tokenizer_from_hub_module(self, is_bert):
        """Get the vocab file and casing info from the Hub module."""
        bert_module = hub.Module(self.bert_model_hub_path)
        tokenization_info = bert_module(signature="tokenization_info",
                                        as_dict=True)
        vocab_file, do_lower_case = self.sess.run([
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ])

        if is_bert:
            from bert.tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            from vectorizers.albert_tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case,
                                           spm_model_file=vocab_file)

    def tokenize(self, text: str):
        words = text.split()  # whitespace tokenizer
        tokens = []
        valid_positions = []
        for i, word in enumerate(words):
            token = self.tokenizer.tokenize(word)
            tokens.extend(token)
            for i in range(len(token)):
                if i == 0:
                    valid_positions.append(1)
                else:
                    valid_positions.append(0)
        return tokens, valid_positions

    def transform(self, text_arr):
        input_ids = []
        input_mask = []
        segment_ids = []
        valid_positions = []
        for text in text_arr:
            ids, mask, seg_ids, valid_pos = self.__vectorize(text)
            input_ids.append(ids)
            input_mask.append(mask)
            segment_ids.append(seg_ids)
            valid_positions.append(valid_pos)

        sequence_lengths = np.array([len(i) for i in input_ids])
        input_ids = tf.keras.preprocessing.sequence.pad_sequences(
            input_ids, padding='post')
        input_mask = tf.keras.preprocessing.sequence.pad_sequences(
            input_mask, padding='post')
        segment_ids = tf.keras.preprocessing.sequence.pad_sequences(
            segment_ids, padding='post')
        valid_positions = tf.keras.preprocessing.sequence.pad_sequences(
            valid_positions, padding='post')
        return input_ids, input_mask, segment_ids, valid_positions, sequence_lengths

    def __vectorize(self, text: str):
        tokens, valid_positions = self.tokenize(text)
        # insert "[CLS]"
        tokens.insert(0, '[CLS]')
        valid_positions.insert(0, 1)
        # insert "[SEP]"
        tokens.append('[SEP]')
        valid_positions.append(1)

        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        input_mask = [1] * len(input_ids)

        return input_ids, input_mask, segment_ids, valid_positions
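
A hypothetical TF 1.x-style usage of the vectorizer above (hub.Module needs graph mode, so tf.compat.v1 is used; the Hub URL is the BERT handle from the commented-out default):

tf.compat.v1.disable_v2_behavior()
sess = tf.compat.v1.Session()
vectorizer = BERTVectorizer(
    sess, is_bert=True,
    bert_model_hub_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")
input_ids, input_mask, segment_ids, valid_positions, lengths = vectorizer.transform(
    ["add leah kauffman to my uncharted 4 nathan drake playlist"])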
Example 14
class BERTModel:
    def __init__(self):
        bert_pretrained_dir = args.pretrain_models_path + args.bert_model_name
        self.do_lower_case = args.bert_model_name.startswith('uncased')
        self.vocab_file = os.path.join(bert_pretrained_dir, 'vocab.txt')
        self.config_file = os.path.join(bert_pretrained_dir,
                                        'bert_config.json')
        self.tokenizer = FullTokenizer(vocab_file=self.vocab_file,
                                       do_lower_case=self.do_lower_case)

        self.input_id = tf.placeholder(tf.int64, [None, None], 'input_ids')
        self.input_mask = tf.placeholder(tf.int64, [None, None], 'input_mask')
        self.segment_ids = tf.placeholder(tf.int64, [None, None],
                                          'segment_ids')

        bert_config = BertConfig.from_json_file(self.config_file)
        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=self.input_id,
                          input_mask=self.input_mask,
                          token_type_ids=self.segment_ids,
                          use_one_hot_embeddings=True,
                          scope='bert')
        self.output_layer = model.get_sequence_output()
        self.embedding_layer = model.get_embedding_output()

        saver = tf.train.Saver()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        saver.restore(self.session, bert_pretrained_dir + '/bert_model.ckpt')

    def tokenize(self, token_list, attributes_list):
        num_attributes = len(attributes_list)
        output_list = [[] for _ in range(num_attributes)]
        token_ids = []
        masks = []

        token_ids.append("[CLS]")
        for token_id, token in enumerate(token_list):
            new_tokens = self.tokenizer.tokenize(token)
            token_ids.extend(new_tokens)

            for att_id in range(num_attributes):
                l_ = [
                    attributes_list[att_id][token_id]
                    for _ in range(len(new_tokens))
                ]
                output_list[att_id].extend(l_)

            m = [0 for _ in range(len(new_tokens))]
            m[0] = 1
            masks.extend(m)

        token_ids.append("[SEP]")

        token_ids = self.tokenizer.convert_tokens_to_ids(token_ids)
        last_layer, embedding = self.get_embeddings(token_ids)

        if len(last_layer) != len(output_list[0]):
            print(token_list)
            print(token_ids)
            for list_i in output_list:
                print(list_i)

        assert len(last_layer) == len(output_list[0])

        return last_layer, embedding, token_ids[1:-1], output_list, masks

    def get_embeddings(self, token_ids):
        input_mask = [[1] * len(token_ids)]
        segment_ids = [[0] * len(token_ids)]
        input_id = [token_ids]

        outputs, emb = self.session.run(
            [self.output_layer, self.embedding_layer],
            feed_dict={
                self.input_mask: input_mask,
                self.segment_ids: segment_ids,
                self.input_id: input_id
            })

        return outputs[0][1:-1], emb[0][1:-1]

    def tokenize_sentence(self, token_list):
        token_ids = []

        token_ids.append("[CLS]")
        for token_id, token in enumerate(token_list):
            new_tokens = self.tokenizer.tokenize(token)
            token_ids.extend(new_tokens)
        token_ids.append("[SEP]")

        token_ids = self.tokenizer.convert_tokens_to_ids(token_ids)
        return token_ids[1:-1]
Example 15
def texts_to_X(texts: List[List[str]], max_sentence_length: int, data_name: str, path_to_bert: str) -> np.ndarray:
    if os.path.isfile(data_name):
        with open(data_name, 'rb') as fp:
            X = pickle.load(fp)
        if not isinstance(X, np.ndarray):
            raise ValueError('The file `{0}` does not contain a `{1}` object.'.format(
                data_name, type(np.array([1, 2]))))
        if X.shape != (len(texts), max_sentence_length, EMBEDDING_SIZE):
            raise ValueError(
                'The file `{0}` contains an inadmissible `{1}` object. Shapes are wrong. Expected {2}, got {3}.'.format(
                    data_name, type(np.array([1, 2])), (len(texts), max_sentence_length, EMBEDDING_SIZE), X.shape)
            )
    else:
        path_to_bert_ = os.path.normpath(path_to_bert)
        if not check_path_to_bert(path_to_bert_):
            raise ValueError('`path_to_bert` is wrong! There are no BERT files into the directory `{0}`.'.format(
                path_to_bert))
        if os.path.basename(path_to_bert_).find('_uncased_') >= 0:
            do_lower_case = True
        else:
            if os.path.basename(path_to_bert_).find('_cased_') >= 0:
                do_lower_case = False
            else:
                do_lower_case = None
        if do_lower_case is None:
            raise ValueError('`{0}` is a bad path to the BERT model, because the tokenization mode (lower case '
                             'or not) cannot be detected.'.format(path_to_bert))
        X = np.zeros((len(texts), max_sentence_length, EMBEDDING_SIZE), dtype=np.float32)
        batch_size = 4
        n_batches = int(math.ceil(len(texts) / float(batch_size)))
        max_seq_length_for_bert = 512
        with tf.Graph().as_default():
            input_ids_ = tf.placeholder(shape=(batch_size, max_seq_length_for_bert), dtype=tf.int32, name='input_ids')
            input_mask_ = tf.placeholder(shape=(batch_size, max_seq_length_for_bert), dtype=tf.int32, name='input_mask')
            segment_ids_ = tf.placeholder(shape=(batch_size, max_seq_length_for_bert), dtype=tf.int32,
                                          name='segment_ids')
            bert_config = BertConfig.from_json_file(os.path.join(path_to_bert, 'bert_config.json'))
            tokenizer = FullTokenizer(vocab_file=os.path.join(path_to_bert, 'vocab.txt'),
                                            do_lower_case=do_lower_case)
            bert_model = BertModel(config=bert_config, is_training=False, input_ids=input_ids_,
                                   input_mask=input_mask_, token_type_ids=segment_ids_,
                                   use_one_hot_embeddings=False)
            sequence_output = bert_model.sequence_output
            tvars = tf.trainable_variables()
            init_checkpoint = os.path.join(path_to_bert_, 'bert_model.ckpt')
            (assignment_map, initialized_variable_names) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                sess.run(tf.tables_initializer())
                tokenized_texts = []
                bert2tokens = []
                for cur_text in texts:
                    new_text = []
                    new_bert2tokens = []
                    start_pos = 0
                    for word_idx, cur_word in enumerate(cur_text):
                        bert_tokens = tokenizer.tokenize(cur_word)
                        new_text += bert_tokens
                        new_bert2tokens.append((start_pos + 1, start_pos + len(bert_tokens) + 1))
                        start_pos += len(bert_tokens)
                    if len(new_text) > (max_seq_length_for_bert - 2):
                        new_text = new_text[:(max_seq_length_for_bert - 2)]
                        new_bert2tokens = new_bert2tokens[:(max_seq_length_for_bert - 2)]
                    new_text = ['[CLS]'] + new_text + ['[SEP]']
                    tokenized_texts.append(tokenizer.convert_tokens_to_ids(new_text))
                    bert2tokens.append(tuple(new_bert2tokens))
                del tokenizer
                for batch_idx in range(n_batches):
                    start_pos = batch_idx * batch_size
                    end_pos = min(len(texts), (batch_idx + 1) * batch_size)
                    embeddings_of_texts_as_numpy = sess.run(
                        sequence_output,
                        feed_dict={
                            ph: x for ph, x in zip(
                                [input_ids_, input_mask_, segment_ids_],
                                texts_to_batch_for_bert(tokenized_texts[start_pos:end_pos], batch_size,
                                                        max_seq_length_for_bert)
                            )
                        }
                    )
                    for idx in range(end_pos - start_pos):
                        text_idx = start_pos + idx
                        for token_idx in range(min(len(texts[text_idx]), max_sentence_length)):
                            token_start, token_end = bert2tokens[text_idx][token_idx]
                            X[text_idx][token_idx] = embeddings_of_texts_as_numpy[idx][token_start:token_end].mean(
                                axis=0)
                    del embeddings_of_texts_as_numpy
                for k in list(sess.graph.get_all_collection_keys()):
                    sess.graph.clear_collection(k)
        with open(data_name, mode='wb') as fp:
            pickle.dump(X, fp, protocol=2)
        tf.reset_default_graph()
    return X
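
A hypothetical call; texts is a list of word lists, data_name is where the embeddings get cached, and path_to_bert points at a local BERT checkpoint directory:

X = texts_to_X(texts=[["Hello", "world"], ["BERT", "embeddings"]],
               max_sentence_length=32,
               data_name="cached_embeddings.pkl",
               path_to_bert="/path/to/cased_L-12_H-768_A-12")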
Example 16
class PredicateInfer(LoadModelBase):
    def __init__(self,
                 vocab_file,
                 export_dir=None,
                 url=None,
                 model_name='models',
                 signature_name=None,
                 do_lower_case=True):
        super(PredicateInfer, self).__init__(export_dir, url, model_name,
                                             signature_name)
        # load the paragraph processor
        # self.sen_processor = SentenceProcessor()
        # load the bert tokenizer
        self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)

        # connect via grpc
        if url:
            self.stub, self.request = self.load_grpc_connect()

        if export_dir:
            self.predict_fn = self.load_pb_model()

        self.id_map_predicate = self.id_to_label(model_config.PREDICATE_LABEL)

    def process(self, sentences, max_seq_length=64):
        if not sentences or not isinstance(sentences, list):
            raise ValueError(
                '`sentences` must be a non-empty list!')

        examples = []
        for sentence in sentences:
            feature = self.convert_single_example(sentence, max_seq_length)
            example = self.convert_single_feature(feature)
            examples.append(example)

        return examples

    def convert_single_example(self, sentence, max_seq_length):
        """
        处理单个语句
        sentence: str, 预测句子
        max_seq_length: int,句子最大长度
        :return:
        """
        sentence = self.tokenizer.tokenize(sentence)
        if len(sentence) > max_seq_length - 2:
            sentence = sentence[0:(max_seq_length - 2)]

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in sentence:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        feature = InputFeatures(input_ids=input_ids,
                                input_mask=input_mask,
                                segment_ids=segment_ids)

        return feature

    def convert_single_feature(self, feature):
        features = dict()
        features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(
            value=feature.input_ids))
        features['input_mask'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=feature.input_mask))
        features['segment_ids'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=feature.segment_ids))
        example = tf.train.Example(features=tf.train.Features(
            feature=features))

        return example.SerializeToString()

    def infer(self, sentences, max_seq_length, top_n=3):
        """
        预测调用
        sentences: list,输入一批预测句子
        max_seq_length: int, 输入最大长度
        top_n: int,返回前多少个类别
        :return:
        list,例如 [[('作者', 0.98), ('出生地', 0.02)...], ]
        """
        result = []
        examples = self.process(sentences, max_seq_length)
        if self.url:
            predictions = self.tf_serving_infer(examples)

        else:
            s = time.time()
            predictions = self.local_infer(examples)
            print('predicate:', time.time() - s)

        predictions = predictions['predictions']

        for p in predictions:
            top_n_idx = p.argsort()[::-1][0:top_n]
            label = list(
                map(lambda x: (self.id_map_predicate[x], p[x]), top_n_idx))

            result.append(label)

        return result

    def tf_serving_infer(self, examples):
        self.request.inputs['examples'].CopyFrom(
            tf.make_tensor_proto(examples, dtype=types_pb2.DT_STRING))
        response = self.stub.Predict(self.request, 5.0)
        predictions = {}
        for key in response.outputs:
            tensor_proto = response.outputs[key]
            nd_array = tf.contrib.util.make_ndarray(tensor_proto)
            predictions[key] = nd_array

        return predictions

    def local_infer(self, examples):
        """
        本地进行预测,参数解释同上
        """
        predictions = self.predict_fn({'examples': examples})

        return predictions

    def id_to_label(self, labels):
        return dict([(i, label) for i, label in enumerate(labels)])
Example 17
class BERTVectorizer:

    def __init__(self, sess, bert_model_hub_path):
        self.sess = sess
        self.bert_model_hub_path = bert_model_hub_path
        self.create_tokenizer_from_hub_module()

    def create_tokenizer_from_hub_module(self):
        # get the vocabulary file and the lowercasing flag directly from the BERT TF Hub module
        bert_module = hub.Module(self.bert_model_hub_path)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        vocab_file, do_lower_case = self.sess.run(
            [
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"]
            ]
        )
        self.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) #do_lower_case=True
        # print(tokenizer.tokenize('hello world!'))  --> ['hello', 'world', '!']

    def tokenize(self, text:str): ## tokenize every sentence
        words = text.split()
        ## # text: add leah kauffman to my uncharted 4 nathan drake playlist
        ## # words: ['add', 'leah', 'kauffman', 'to', 'my', 'uncharted', '4', 'nathan', 'drake', 'playlist']
        tokens = []
        ## # tokens: ['add', 'leah', 'ka', '##uf', '##fm', '##an', 'to', 'my', 'un', '##cha', '##rted', '4', 'nathan', 'drake', 'play', '##list']
        valid_positions = []
        ## # valid_positions:[1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0]
        for i, word in enumerate(words):
            token = self.tokenizer.tokenize(word)
            tokens.extend(token)
            for i in range(len(token)):
                if i == 0:
                    valid_positions.append(1)
                else:
                    valid_positions.append(0)
        return tokens, valid_positions

    def transform(self, text_arr):
        input_ids = []
        input_mask = []
        segment_ids = []
        valid_positions = []
        for text in text_arr:
            ids, mask, seg_ids, valid_pos = self.__vectorize(text)
            input_ids.append(ids)
            input_mask.append(mask)
            segment_ids.append(seg_ids)
            valid_positions.append(valid_pos)

        sequence_length = np.array([len(i) for i in input_ids])

        ## cap the maximum length at 50
        input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, maxlen=50, truncating='post', padding='post')
        input_mask = tf.keras.preprocessing.sequence.pad_sequences(input_mask, maxlen=50, truncating='post', padding='post')
        segment_ids = tf.keras.preprocessing.sequence.pad_sequences(segment_ids, maxlen=50, truncating='post', padding='post')
        valid_positions = tf.keras.preprocessing.sequence.pad_sequences(valid_positions, maxlen=50, truncating='post', padding='post')

        # input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, padding='post')
        # input_mask = tf.keras.preprocessing.sequence.pad_sequences(input_mask, padding='post')
        # segment_ids = tf.keras.preprocessing.sequence.pad_sequences(segment_ids, padding='post')
        # valid_positions = tf.keras.preprocessing.sequence.pad_sequences(valid_positions, padding='post')

        return input_ids, input_mask, segment_ids, valid_positions, sequence_length

    def __vectorize(self, text:str):
        tokens, valid_positions = self.tokenize(text)

        ## insert the first token "[CLS]"
        tokens.insert(0, '[CLS]')
        valid_positions.insert(0, 1)
        ## insert the last token "[SEP]"
        tokens.append('[SEP]')
        valid_positions.append(1)
        ## ['[CLS]', 'add', 'leah', 'ka', '##uf', '##fm', '##an', 'to', 'my', 'un', '##cha', '##rted', '4', 'nathan', 'drake', 'play', '##list', '[SEP]']
        ## [1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1]

        '''
        (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        
        Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        '''
        segment_ids = [0] * len(tokens)
        ## # segment_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        ## # input_ids: [101, 5587, 14188, 10556, 16093, 16715, 2319, 2000, 2026, 4895, 7507, 17724, 1018, 7150, 7867, 2377, 9863, 102] and the first is always 101 and the last is 102

        input_mask = [1] * len(input_ids) ## The mask has 1 for real tokens and 0 for padding tokens.
        ## # input_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

        return input_ids, input_mask, segment_ids, valid_positions
Example 18
class BertPreprocessor(Preprocessor):
    """Preprocessor for BERT embedding.

    This class does all the work needed to create the inputs (and outputs) of a neural network that uses BERT
    as its embedding layer. Currently only single-sequence classification is supported.
    """

    def __init__(self,
                 pretrained_model_path: str,
                 **kwargs):
        super().__init__(**kwargs)

        info = hub.Module(spec=pretrained_model_path)(signature="tokenization_info", as_dict=True)

        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [
                    info["vocab_file"],
                    info["do_lower_case"]
                ]
            )

        # Create the tokenizer with the vocabulary of the pretrained model
        self._tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

        basic_tokens = self._tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])
        self._CLS_token = basic_tokens[0]
        self._SEP_token = basic_tokens[1]

    def _padding_sentence(self):
        """Return a zero length sentence to pad last batch.

        :return: Three sequences of zeros (tokens, masks, segment ids).
        """

        return [0] * self._max_seq_len, [0] * self._max_seq_len, [0] * self._max_seq_len

    def tokenize(self, text: str):
        """Convert a sequence of words into a sequence of tokens and also compute the masking- and segment ids.

        For further details please read BERT paper.

        :param text: The sequence of words.
        :return: The sequence of tokens, masks and segment ids.
        """

        input_ids = [0] * self._max_seq_len
        input_mask = [0] * self._max_seq_len
        input_segment_ids = [0] * self._max_seq_len

        tokens_input = self._tokenizer.tokenize(text)

        # if too long cut to size (the first token will be [CLS], the last [SEP])
        if len(tokens_input) > self._max_seq_len - 2:
            tokens_input = tokens_input[0: (self._max_seq_len - 2)]

        idx = 0
        input_ids[idx] = self._CLS_token
        idx += 1

        for element in self._tokenizer.convert_tokens_to_ids(tokens_input):
            input_ids[idx] = element
            idx += 1

        input_ids[idx] = self._SEP_token

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        for i in range(idx + 1):
            input_mask[i] = 1

        # safety check
        assert len(input_ids) == self._max_seq_len
        assert len(input_mask) == self._max_seq_len
        assert len(input_segment_ids) == self._max_seq_len

        return input_ids, input_mask, input_segment_ids

    def fit(self, texts: List[str]) -> 'BertPreprocessor':
        """This function does nothing in case of BERT but must be implemented.

        :param texts: -
        :return: self
        """

        return self

    def transform(self, texts: List[str]) -> list:
        """Transform sequences of words into sequences of tokens, masks and segment ids.

        Masks are used to separate valid and padding tokens. Here the segment ids are all zero since the whole
        sequence forms a single segment.

        For further details please read BERT paper.

        :param texts: The sequences of texts.
        :return: The sequences of tokens, masks and segment ids.
        """
        
        input_masks = np.empty([len(texts), self._max_seq_len], dtype=np.int64)
        segment_ids = np.empty([len(texts), self._max_seq_len], dtype=np.int64)

        # input_ids, input_masks, segment_ids = [], [], []
        
        input_ids, input_masks, segment_ids = zip(*Pool(processes=8).map(self.tokenize, texts))

        # for i, text in enumerate(texts):
            # input_ids[i], input_masks[i], segment_ids[i] = self.tokenize(text=text)
            # input_id, input_mask, segment_id = self.tokenize(text=text)
            # input_ids.append(input_id)
            # input_masks.append(input_mask)
            # segment_ids.append(segment_id)

        # return [np.array(input_ids), np.array(input_masks), np.array(segment_ids)]
        return [input_ids, input_masks, segment_ids]

    def inverse_transform(self, sequences: np.ndarray):
        """Transform sequences of tokens back to sequences of words (sentences).

        :param sequences: The sequences of tokens.
        :return: The sequences of words
        """

        return self._tokenizer.convert_ids_to_tokens(sequences)
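
A hypothetical usage, assuming the (unshown) Preprocessor base class accepts a max_seq_len keyword that sets self._max_seq_len:

preprocessor = BertPreprocessor(
    pretrained_model_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
    max_seq_len=64)
token_ids, masks, segment_ids = preprocessor.transform(["hello bert", "how are you doing"])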
Example 19
class EntityInfer(LoadModelBase):
    def __init__(self,
                 vocab_file,
                 export_dir=None,
                 url=None,
                 model_name='models',
                 signature_name=None,
                 do_lower_case=True):
        super(EntityInfer, self).__init__(export_dir, url, model_name,
                                          signature_name)
        self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)
        # connect via grpc
        if url:
            self.stub, self.request = self.load_grpc_connect()

        if export_dir:
            self.predict_fn = self.load_pb_model()

        self.id_map_predicate = self.id_to_label(model_config.PREDICATE_LABEL)
        self.predicate_map_id = self.label_to_id(model_config.PREDICATE_LABEL)
        self.id_map_sequence = self.id_to_label(model_config.SEQ_LABEL)

    def id_to_label(self, labels):
        return dict([(i, label) for i, label in enumerate(labels)])

    def label_to_id(self, labels):
        return dict([(label, i) for i, label in enumerate(labels)])

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""

        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def process(self, sentences, predicate_labels, max_seq_length=64):
        if not sentences or (not isinstance(sentences, list)
                             and not isinstance(sentences, tuple)):
            raise ValueError(
                '`sentences` must be a non-empty list or tuple!')

        examples = []
        for sentence, predicate_label in zip(sentences, predicate_labels):
            feature = self.convert_single_example(sentence, predicate_label,
                                                  max_seq_length)
            example = self.convert_single_feature(feature)
            examples.append(example)

        return examples

    def convert_single_example(self, sentence, predicate_label,
                               max_seq_length):
        tokens = []
        for token in sentence:
            tokens.extend(self.tokenizer.tokenize(token))

        tokens_b = [predicate_label] * len(tokens)
        predicate_label_id = self.predicate_map_id[predicate_label]

        # truncate tokens and tokens_b together so that their total length is at most max_seq_length - 3
        self._truncate_seq_pair(tokens, tokens_b, max_seq_length - 3)

        tokens_a = []
        segment_ids = []
        tokens_a.append("[CLS]")
        segment_ids.append(0)
        for token in tokens:
            tokens_a.append(token)
            segment_ids.append(0)

        tokens_a.append("[SEP]")
        segment_ids.append(0)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens_a)

        # bert_tokenizer.convert_tokens_to_ids(["[SEP]"]) --->[102]
        # ids 1-100 in the BERT vocab are [unused] tokens, so predicate label ids can be mapped into that range
        bias = 1
        for token in tokens_b:
            # add the bias so label ids do not collide with real word-piece ids
            tokens.append(token)
            input_ids.append(predicate_label_id + bias)
            segment_ids.append(1)

        tokens.append('[SEP]')
        # the `[SEP]` token id is 102
        input_ids.append(self.tokenizer.convert_tokens_to_ids(["[SEP]"])[0])
        segment_ids.append(1)

        input_mask = [1] * len(input_ids)

        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            tokens.append("[Padding]")

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        feature = InputFeatures(input_ids=input_ids,
                                input_mask=input_mask,
                                segment_ids=segment_ids)

        return feature

    def convert_single_feature(self, feature):
        features = dict()
        features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(
            value=feature.input_ids))
        features['input_mask'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=feature.input_mask))
        features['segment_ids'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=feature.segment_ids))
        example = tf.train.Example(features=tf.train.Features(
            feature=features))

        return example.SerializeToString()

    def infer(self,
              sentences,
              predicate_labels,
              max_seq_length,
              predicate_probabilities=None):
        """
        预测调用
        sentences: list,句子,['xxxx', 'xxxx'...]
        predicate_labels: list, 标签, ['作者', '出生地'...]
        max_seq_length: int
        predicate_probabilities: list, [0.92, 0.01, ...]
        :return:
        list, [
        [{'predicate': predicate, 'subject': subj, 'object': entity}, {'predicate': predicate...],
        [{'predicate': predicate, 'subject': subj, 'object': entity}, {'predicate': predicate...]...
        ]
        """
        examples = self.process(sentences, predicate_labels, max_seq_length)
        if self.url:
            predictions = self.tf_serving_infer(examples)
        else:
            s = time.time()
            predictions = self.local_infer(examples)
            print('sequence:', time.time() - s)

        token_label_predictions = predictions['token_label_predictions']
        predicate_predictions = predictions['predicate_predictions']
        predicate_labels_index = np.argmax(predicate_predictions, -1)

        result = []
        for i in range(len(sentences)):
            token_label = list(
                map(lambda x: self.id_map_sequence[x],
                    token_label_predictions[i]))
            entities = self.entity_extract(
                sentences[i], token_label[1:token_label.index('[SEP]')])
            predicate_label_index = predicate_labels_index[i]
            # compare the predicate passed in (with its probability) against the
            # relation-classification head's prediction and keep the more confident one
            if predicate_probabilities:
                predicate_label = max(
                    [(predicate_labels[i], predicate_probabilities[i]),
                     (self.id_map_predicate[predicate_label_index],
                      predicate_predictions[i][predicate_label_index])],
                    key=lambda x: x[1])
            else:
                # keep the same (label, probability) tuple shape as the branch above
                predicate_label = (
                    self.id_map_predicate[predicate_label_index],
                    predicate_predictions[i][predicate_label_index])

            triplets = self.organize_triplet(entities, predicate_label[0])
            if triplets:
                result.append(triplets)

        return result

    def organize_triplet(self, entities, predicate):
        """
        把三元组转成字典形式, 可解决一个关系、一个主体(subject)、多个客体(object)
        entities: list, [('xx公司', 'SUB'), ('xx公司', 'OBJ')]
        predicate: str, 关系
        :return:
        list, [{'predicate': predicate, 'subject': subj, 'object': entity},
               {'predicate': predicate, 'subject': subj, 'object': entity}...]
        """
        triplets = []
        subj = None
        for entity, tag in entities:
            if tag == 'SUB':
                subj = entity
                break

        for entity, tag in entities:
            if tag == 'OBJ':
                triplet = {
                    'predicate': predicate,
                    'subject': subj,
                    'object': entity
                }
                triplets.append(triplet)

        return triplets

    def entity_extract(self, sentence, tags):
        """
        依据tags,从sentence抽取实体
        sentence: str,句子
        tags: list, 序列标记,例如 ['O', 'B-SUB', 'I-SUB'...]
        :return:
        list, [('xx公司', 'SUB'), ('xx公司', 'OBJ')]
        """
        entities = []
        sentence_len = len(sentence)
        if sentence_len != len(tags):
            warnings.warn(
                'Token and tags have different lengths.\ndetails:\n{}\n{}'.
                format(sentence, tags))

        entity = Entity(None)
        t_zip = zip(sentence, tags)

        for i, (token, tag) in enumerate(t_zip):
            if tag == 'O':
                if entity.types:
                    entities.append(entity.get_entity_types())
                    entity = Entity(None)
                continue

            elif tag[0] == 'B':
                if entity.types:
                    entities.append(entity.get_entity_types())
                entity = Entity(tag[2:])
                entity.begin = token

            elif tag[0] == 'I':
                if i == sentence_len - 1:
                    entity.intermediate = token
                    entities.append(entity.get_entity_types())
                    break

                try:
                    entity.intermediate = token
                except Exception as e:
                    print(e)

        return entities

    def tf_serving_infer(self, examples):
        self.request.inputs['examples'].CopyFrom(
            tf.make_tensor_proto(examples, dtype=types_pb2.DT_STRING))
        response = self.stub.Predict(self.request, 5.0)
        predictions = {}
        for key in response.outputs:
            tensor_proto = response.outputs[key]
            nd_array = tf.contrib.util.make_ndarray(tensor_proto)
            predictions[key] = nd_array

        return predictions

    def local_infer(self, examples):
        """
        本地进行预测,参数解释同上
        """
        predictions = self.predict_fn({'examples': examples})

        return predictions
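
# A hedged usage sketch of the triplet extractor defined above (not part of the
# original source). `extractor` stands for an already-constructed instance with a
# loaded predict_fn or TF-Serving stub; the sentence and predicate label are
# illustrative only.
#
#   triplets = extractor.infer(
#       sentences=['《三体》是刘慈欣创作的长篇科幻小说。'],
#       predicate_labels=['作者'],
#       max_seq_length=128)
#
# Each element of `triplets` is a list of {'predicate', 'subject', 'object'} dicts,
# as described in the infer() docstring.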
class TrainDataReader():
    def __init__(self, config, category_dir, vocab_file):
        self.config = config
        self.category_dir = category_dir
        self.tokenizer = FullTokenizer(vocab_file)
        if not os.path.exists(
                os.path.join(self.category_dir, 'train_data', 'raw.csv')):
            raise Exception("local raw train data not exists!!")
        if not os.path.exists(vocab_file):
            raise Exception("local vocab_file not exists")

    def transform(self):
        with open(os.path.join(self.category_dir, 'train_data', 'raw.csv')) as fr, \
             open(os.path.join(self.category_dir, 'attr_values.pkl'), 'wb') as fwa:
            attr_values_c = {}
            for row in fr:
                if row.strip() == '' or len(row.strip().split('\t')) != 10:
                    continue
                segment = row.strip().split('\t')
                attr_values_c[(segment[8], segment[9])] = 1
            attr_values = {k: i for i, k in enumerate(attr_values_c.keys())}
            attr_values_r = {i: k for k, i in attr_values.items()}
            print('start to write local attr_values.pkl!!')
            pickle.dump((attr_values, attr_values_r), fwa)
        with open(os.path.join(self.category_dir, 'train_data', 'raw.csv')) as fr, \
             open(os.path.join(self.category_dir, 'train_data', 'transform.csv'), 'w') as fwt:
            print('start to write local train_data transform.csv!!')
            for row in fr:
                if row.strip() == '' or len(row.strip().split('\t')) != 10:
                    continue
                segment = row.strip().split('\t')
                label = attr_values[(segment[8], segment[9])]
                tokens = self.tokenizer.tokenize(segment[7])
                if len(tokens) > self.config.max_seq_length - 2:
                    tokens = tokens[0:self.config.max_seq_length - 2]
                tokens = ['[CLS]'] + tokens + ['[SEP]']
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                token_ids_patch = token_ids[0:self.config.max_seq_length] + [
                    0
                ] * (self.config.max_seq_length - len(token_ids))
                token_ids_patch = list(map(lambda x: str(x), token_ids_patch))
                fwt.write(
                    str(label) + ',' +
                    str(min(len(token_ids), len(token_ids_patch))) + ',' +
                    ','.join(token_ids_patch) + '\n')
        return len(attr_values)

    def read(self):
        transform = os.path.join(self.category_dir, 'train_data',
                                 'transform.csv')
        queue = tf.train.string_input_producer([transform])
        reader = tf.TextLineReader()
        _, value = reader.read(queue)
        row = tf.decode_csv(value, [[0]] * (self.config.max_seq_length + 2))
        label = tf.stack(row[0])
        length = tf.stack(row[1])
        mask = tf.cast(tf.sequence_mask(length, self.config.max_seq_length),
                       tf.int32)
        sequence = tf.stack(row[2:self.config.max_seq_length + 2])
        return tf.train.shuffle_batch([label, sequence, mask],
                                      self.config.batch_size, 50000, 10000)
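
# A hedged usage sketch for TrainDataReader (paths and config values are assumptions):
# transform() writes train_data/transform.csv plus attr_values.pkl, and read() returns
# shuffled-batch tensors that must be driven by TF1 queue runners.
from types import SimpleNamespace

reader_config = SimpleNamespace(max_seq_length=128, batch_size=32)
reader = TrainDataReader(reader_config,
                         category_dir='./category',
                         vocab_file='./vocab.txt')
num_labels = reader.transform()
label_op, sequence_op, mask_op = reader.read()

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    labels, sequences, masks = sess.run([label_op, sequence_op, mask_op])
    coord.request_stop()
    coord.join(threads)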
Esempio n. 21
from bert.tokenization import FullTokenizer

export_dir = '/home/CAIL/bert_text/examples/SequenceLabel/saved_models'

vocab_file = "/home/CAIL/bert_text/examples/SequenceLabel/vocab.txt"


text = "这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"

tokenizer = FullTokenizer(vocab_file=vocab_file)

tokens = tokenizer.tokenize(text=text)

max_seq_length = 256

input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_masks = [1] * len(input_ids)
if len(input_ids) < max_seq_length:
    input_ids += [0] * (max_seq_length - len(input_ids))
    input_masks += [0] * (max_seq_length - len(input_masks))

segment_ids = [0] * max_seq_length

with tf.Session() as sess:
    meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], export_dir)
    signature = meta_graph_def.signature_def

    x1_tensor_name = signature['predict'].inputs['input_ids'].name
    x2_tensor_name = signature['predict'].inputs['input_masks'].name
    x3_tensor_name = signature['predict'].inputs['segment_ids'].name
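
    # Hedged continuation (not in the original example): the script above stops after
    # reading the input tensor names. The output key 'predictions' is an assumption --
    # inspect signature['predict'].outputs for the name actually exported by the model.
    y_tensor_name = signature['predict'].outputs['predictions'].name

    labels = sess.run(y_tensor_name,
                      feed_dict={x1_tensor_name: [input_ids],
                                 x2_tensor_name: [input_masks],
                                 x3_tensor_name: [segment_ids]})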
Esempio n. 22
class BertNer(object):
    def __init__(self, **kwargs):
        self.tf = import_tf(kwargs['gpu_no'], kwargs['verbose'])
        self.logger = set_logger('BertNer', kwargs['log_dir'],
                                 kwargs['verbose'])
        self.model_dir = kwargs['ner_model']

        from bert.tokenization import FullTokenizer
        self.tokenizer = FullTokenizer(
            os.path.join(self.model_dir, 'vocab.txt'))

        self.ner_sq_len = 128
        self.input_ids = self.tf.placeholder(self.tf.int32,
                                             (None, self.ner_sq_len),
                                             'input_ids')
        self.input_mask = self.tf.placeholder(self.tf.int32,
                                              (None, self.ner_sq_len),
                                              'input_mask')

        # init graph
        self._init_graph()

        # init ner assist data
        self._init_predict_var()

        self.per_proun = [
            '甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸', '子', '丑', '寅',
            '卯', '辰', '巳', '午', '未', '申', '酉', '戌', '亥'
        ]

    def _init_graph(self):
        """
        init bert ner graph
        :return:
        """
        try:
            with self.tf.gfile.GFile(
                    os.path.join(self.model_dir, 'ner_model.pb'), 'rb') as f:
                graph_def = self.tf.GraphDef()
                graph_def.ParseFromString(f.read())
                input_map = {
                    "input_ids:0": self.input_ids,
                    'input_mask:0': self.input_mask
                }

                self.pred_ids = self.tf.import_graph_def(
                    graph_def,
                    name='',
                    input_map=input_map,
                    return_elements=['pred_ids:0'])[0]
                graph = self.pred_ids.graph

                sess_config = self.tf.ConfigProto(allow_soft_placement=True)
                sess_config.gpu_options.allow_growth = True

                self.sess = self.tf.Session(graph=graph, config=sess_config)
                self.sess.run(self.tf.global_variables_initializer())
                self.tf.reset_default_graph()

        except Exception as e:
            self.logger.error(e)

    def _init_predict_var(self):
        """
        initialize assist of bert ner
        :return: labels num of ner, label to id dict, id to label dict
        """
        with open(os.path.join(self.model_dir, 'label2id.pkl'), 'rb') as rf:
            self.id2label = {
                value: key
                for key, value in pickle.load(rf).items()
            }

    def _convert_lst_to_features(self,
                                 lst_str,
                                 is_tokenized=True,
                                 mask_cls_sep=False):
        """
        Loads a data file into a list of `InputBatch`s.
        :param lst_str: list str
        :param is_tokenized: whether token unknown word
        :param mask_cls_sep: masking the embedding on [CLS] and [SEP] with zero.
        :return: input feature instance
        """
        from bert.extract_features import read_tokenized_examples, read_examples, InputFeatures

        examples = read_tokenized_examples(
            lst_str) if is_tokenized else read_examples(lst_str)

        _tokenize = lambda x: self.tokenizer.mark_unk_tokens(
            x) if is_tokenized else self.tokenizer.tokenize(x)

        for (ex_index, example) in enumerate(examples):
            tokens_a = _tokenize(example.text_a)

            tokens_b = None
            if example.text_b:
                tokens_b = _tokenize(example.text_b)

            if tokens_b:
                # Modifies `tokens_a` and `tokens_b` in place so that the total
                # length is less than the specified length.
                # Account for [CLS], [SEP], [SEP] with "- 3"
                self._truncate_seq_pair(tokens_a, tokens_b)
            else:
                # Account for [CLS] and [SEP] with "- 2"
                if len(tokens_a) > self.ner_sq_len - 2:
                    tokens_a = tokens_a[0:(self.ner_sq_len - 2)]

            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids: 0     0   0   0  0     0 0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for `type=0` and
            # `type=1` were learned during pre-training and are added to the wordpiece
            # embedding vector (and position vector). This is not *strictly* necessary
            # since the [SEP] token unambiguously separates the sequences, but it makes
            # it easier for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to [CLS]) is
            # used as as the "sentence vector". Note that this only makes sense because
            # the entire model is fine-tuned.
            tokens = ['[CLS]'] + tokens_a + ['[SEP]']
            input_type_ids = [0] * len(tokens)
            input_mask = [int(not mask_cls_sep)
                          ] + [1] * len(tokens_a) + [int(not mask_cls_sep)]

            if tokens_b:
                tokens += tokens_b + ['[SEP]']
                input_type_ids += [1] * (len(tokens_b) + 1)
                input_mask += [1] * len(tokens_b) + [int(not mask_cls_sep)]

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

            # Zero-pad up to the sequence length. more pythonic
            pad_len = self.ner_sq_len - len(input_ids)
            input_ids += [0] * pad_len
            input_mask += [0] * pad_len
            input_type_ids += [0] * pad_len

            assert len(input_ids) == self.ner_sq_len
            assert len(input_mask) == self.ner_sq_len
            assert len(input_type_ids) == self.ner_sq_len

            yield InputFeatures(input_ids=input_ids,
                                input_mask=input_mask,
                                input_type_ids=input_type_ids)

    def _truncate_seq_pair(self, tokens_a, tokens_b):
        """
        Truncates a sequence pair in place to the maximum length.
        :param tokens_a: text a
        :param tokens_b: text b
        """
        try:
            while True:
                total_length = len(tokens_a) + len(tokens_b)

                if total_length <= self.ner_sq_len - 3:
                    break
                if len(tokens_a) > len(tokens_b):
                    tokens_a.pop()
                else:
                    tokens_b.pop()
        except Exception as e:
            self.logger.error(e)

    def _convert_id_to_label(self, pred_ids_result, batch_size):
        """
        turn id to label
        :param pred_ids_result: predict result
        :param batch_size: batch size of predict ids result
        :return: label list
        """
        result = []
        index_result = []
        for row in range(batch_size):
            curr_seq = []
            curr_idx = []
            ids = pred_ids_result[row]
            for idx, id in enumerate(ids):
                if id == 0:
                    break
                curr_label = self.id2label[id]
                if curr_label in ['[CLS]', '[SEP]']:
                    if id == 102 and (idx == len(ids) - 1 or ids[idx + 1] == 0):
                        break
                    continue
                # elif curr_label == '[SEP]':
                #     break
                curr_seq.append(curr_label)
                curr_idx.append(id)
            result.append(curr_seq)
            index_result.append(curr_idx)
        return result, index_result

    def predict(self, contents):
        """
        bert ner predict
        :param content_list: content list
        :return: predict result
        """
        try:
            splited_contents = []
            all_terms = []
            for content in contents:
                content_len = len(content)
                if content_len % (self.ner_sq_len - 2) == 0:
                    terms = int(content_len / (self.ner_sq_len - 2))
                else:
                    terms = int(content_len / (self.ner_sq_len - 2)) + 1
                all_terms.append(terms)

                for i in range(terms):
                    splited_contents.append(
                        content[i * (self.ner_sq_len - 2):(i + 1) *
                                (self.ner_sq_len - 2)])

            tmp_f = list(self._convert_lst_to_features(splited_contents))
            input_ids = [f.input_ids for f in tmp_f]
            input_masks = [f.input_mask for f in tmp_f]

            pred_result = self.sess.run(self.pred_ids,
                                        feed_dict={
                                            self.input_ids: input_ids,
                                            self.input_mask: input_masks
                                        })

            # restore to original string
            tmp = []
            index = 0
            for terms in all_terms:
                sub_preds = []
                for i in range(terms):
                    sub_preds.extend(pred_result[index + i])
                tmp.append(sub_preds)
                index += terms

            pred_result = tmp

            pred_result = self._convert_id_to_label(pred_result,
                                                    len(pred_result))[0]

            # zip str predict id
            str_pred = []
            for w in zip(contents, pred_result):
                sub_list = []
                for z in zip(list(w[0]), w[1]):
                    sub_list.append([z[0], z[1]])

                str_pred.append(sub_list)

            # get ner
            ner_result = [self._combine_ner(s) for s in str_pred]
            return ner_result

        except Exception as e:
            self.logger.error(e)
            return [[]]

    def _combine_ner(self, pred_result):
        """
        combine ner
        :param pred_result: model predict result and origin content words list
        :return: entity words and index
        """
        words_len = len(pred_result)
        i = 0
        tmp = ''
        _ner_list = []

        while i < words_len:
            word = pred_result[i]
            # add personal pronoun
            if word[0] in self.per_proun and word[1][0] == 'O':
                _ner_list.append([word[0], 'PER'])

            if word[1][0] == 'O' and tmp != '':
                _ner_list.append([tmp, pred_result[i - 1][1][2:]])
                tmp = ''

            elif word[1][0] == 'I':
                tmp = tmp + word[0]
                if i == words_len - 1:
                    _ner_list.append([tmp, word[1][2:]])

            elif word[1][0] == 'B':
                if tmp != '':
                    _ner_list.append([tmp, pred_result[i - 1][1][2:]])

                tmp = word[0]
                if i == words_len - 1:
                    _ner_list.append([tmp, word[1][2:]])

            i += 1

        return _ner_list
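
# A hedged usage sketch for BertNer; the constructor kwargs mirror the keys read in
# __init__ and the paths/values below are assumptions.
ner = BertNer(gpu_no=0, verbose=False, log_dir='./logs', ner_model='./ner_model')
entities = ner.predict(['张三出生于上海,现任某科技公司的法定代表人。'])
# predict() returns, per input text, a list of [entity_text, entity_type] pairs.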
Esempio n. 23
class DisasterDetector:
    def __init__(self, bert_layer, max_sql, lr, batch_size, epochs):

        self.bert_layer = bert_layer
        self.max_sql = max_sql
        vocab = self.bert_layer.resolved_object.vocab_file.asset_path.numpy()
        lowercase = self.bert_layer.resolved_object.do_lower_case.numpy()
        self.token = FullTokenizer(vocab, lowercase)
        self.lr = lr
        self.batch_size = batch_size
        self.epochs = epochs
        self.models = []
        self.scores = {}

    def encode(self, texts):

        all_tokens = []
        all_masks = []
        all_segments = []
        for text in texts:
            text = self.token.tokenize(text)
            text = text[:self.max_sql - 2]
            input_seq = ['[CLS]'] + text + ['[SEP]']
            pad_len = self.max_sql - len(input_seq)
            tokens = self.token.convert_tokens_to_ids(input_seq)
            tokens += [0] * pad_len
            pad_masks = [1] * len(input_seq) + [0] * pad_len
            segment_ids = [0] * self.max_sql
            all_tokens.append(tokens)
            all_masks.append(pad_masks)
            all_segments.append(segment_ids)
        return np.array(all_tokens), np.array(all_masks), np.array(
            all_segments)

    def build_model(self):

        input_words = Input(shape=(self.max_sql, ),
                            dtype=tf.int32,
                            name='input_words')
        input_mask = Input(shape=(self.max_sql, ),
                           dtype=tf.int32,
                           name='input_mask')
        segmentids = Input(shape=(self.max_sql, ),
                           dtype=tf.int32,
                           name='segment_ids')
        _, sequence_output = self.bert_layer(
            [input_words, input_mask, segmentids])  # discard pooled output, keep sequence output
        clf_output = sequence_output[:, 0, :]
        out = Dense(1, activation='sigmoid')(clf_output)

        model = Model(inputs=[input_words, input_mask, segmentids],
                      outputs=out)
        optimizer = Adam(learning_rate=self.lr)
        model.compile(loss='binary_crossentropy',
                      optimizer=optimizer,
                      metrics=['accuracy'])
        return model

    def fit(self, x):
        xtrain, xval, ytrain, yval = train_test_split(x,
                                                      x.target_relabeled,
                                                      test_size=0.2,
                                                      random_state=878)
        ytrain = xtrain.target_relabeled
        xtrain = self.encode(xtrain.cleaned.str.lower())
        yval = xval.target_relabeled
        xval = self.encode(xval.cleaned.str.lower())
        metrics = ClassificationReport(train=(xtrain, ytrain),
                                       val=(xval, yval))
        checkpoint = ModelCheckpoint('model_BERT.h5',
                                     monitor='val_loss',
                                     save_best_only=True)
        model = self.build_model()
        model.fit(xtrain,
                  ytrain,
                  validation_data=(xval, yval),
                  callbacks=[metrics, checkpoint],
                  epochs=self.epochs,
                  batch_size=self.batch_size)

    def predict(self, x):
        model = self.build_model()
        model.load_weights('model_BERT.h5')
        xtest = self.encode(x.cleaned.str.lower())
        ypred = model.predict(xtest)
        return ypred
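
# A hedged usage sketch for DisasterDetector. The TF-Hub handle is one public BERT
# model; train_df/test_df are assumed pandas DataFrames with the 'cleaned' and
# 'target_relabeled' columns referenced in fit() and predict() above.
import tensorflow_hub as hub

bert_layer = hub.KerasLayer(
    'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2', trainable=True)
clf = DisasterDetector(bert_layer, max_sql=128, lr=2e-5, batch_size=16, epochs=3)
clf.fit(train_df)
predictions = clf.predict(test_df)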
Esempio n. 24
class BertPreprocessor(Preprocessor):
    """Preprocessor for BERT embedding.

    This class can be used to do all the work to create the inputs (and outputs) of a Neural Network using BERT
    as embedding. Currently only single sequence classification is supported.

    Source: https://github.com/google-research/bert_keras
    """
    def __init__(self, pretrained_model_path: str, **kwargs):

        super().__init__(**kwargs)

        info = hub.Module(spec=pretrained_model_path)(
            signature="tokenization_info", as_dict=True)

        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [info["vocab_file"], info["do_lower_case"]])

        # Create the tokenizer with the vocabulary of the pretrained model
        self._tokenizer = FullTokenizer(vocab_file=vocab_file,
                                        do_lower_case=do_lower_case)

        basic_tokens = self._tokenizer.convert_tokens_to_ids(
            ["[CLS]", "[SEP]"])
        self._CLS_token = basic_tokens[0]
        self._SEP_token = basic_tokens[1]

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""

        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def _padding_sentence(self):
        """Return a zero length sentence to pad last batch.

        :return: Three sequences of zeros (tokens, masks, segment ids).
        """

        return [0] * self._max_seq_len, [0] * self._max_seq_len, [
            0
        ] * self._max_seq_len

    def tokenize(self, text_a: str, text_b: str = None):
        """Convert sequence(s) of words into sequence(s) of tokens and also compute the masking- and segment ids.

        For further details please read BERT paper.

        :param text_a: First sequence
        :param text_b: Second sequence
        :return: The sequence of tokens, masks and segment ids.
        """

        input_ids = [0] * self._max_seq_len
        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        input_mask = [0] * self._max_seq_len
        # The segment ids are 0 for text_a and 1 for text_b
        input_segment_ids = [0] * self._max_seq_len

        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = None
        if text_b:
            tokens_b = self._tokenizer.tokenize(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b, self._max_seq_len - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self._max_seq_len - 2:
                tokens_a = tokens_a[0:(self._max_seq_len - 2)]

        idx = 0
        input_ids[idx] = self._CLS_token
        idx += 1

        for element in self._tokenizer.convert_tokens_to_ids(tokens_a):
            input_ids[idx] = element
            input_mask[idx] = 1
            idx += 1

        if tokens_b:
            for element in self._tokenizer.convert_tokens_to_ids(tokens_b):
                input_ids[idx] = element
                input_mask[idx] = 1
                input_segment_ids[idx] = 1
                idx += 1

        input_ids[idx] = self._SEP_token

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        for i in range(idx + 1):
            input_mask[i] = 1

        # safety check
        assert len(input_ids) == self._max_seq_len
        assert len(input_mask) == self._max_seq_len
        assert len(input_segment_ids) == self._max_seq_len

        return input_ids, input_mask, input_segment_ids

    def fit(self, texts: List[str]) -> 'BertPreprocessor':
        """This function does nothing in case of BERT but must be implemented.

        :param texts: -
        :return: self
        """

        return self

    def transform(self, examples: List[InputExample]) -> list:
        """Transform sequences of words into sequences of tokens, masks and segment ids.

        Masks are used to separate valid and padding tokens. Here the segment ids are always one since the whole
        sequence belongs together.

        For further details please read BERT paper.

        :param texts: The sequences of texts.
        :return: The sequences of tokens, masks and segment ids.
        """

        input_ids, input_masks, segment_ids = [], [], []

        for i, example in enumerate(examples):
            input_id, input_mask, segment_id = self.tokenize(
                text_a=example.text_a, text_b=example.text_b)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        return [
            np.array(input_ids),
            np.array(input_masks),
            np.array(segment_ids)
        ]

    def inverse_transform(self, sequences: np.ndarray):
        """Transform sequences of tokens back to sequences of words (sentences).

        :param sequences: The sequences of tokens.
        :return: The sequences of words
        """

        return self._tokenizer.convert_ids_to_tokens(sequences)
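
# A hedged usage sketch for BertPreprocessor; the TF-Hub handle is a public BERT module,
# the max_seq_len keyword (assumed to be consumed by the parent Preprocessor to set
# self._max_seq_len) is a guess, and InputExample is the google-research BERT container.
prep = BertPreprocessor(
    pretrained_model_path='https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1',
    max_seq_len=128)
token_ids, input_masks, segment_ids = prep.transform(
    [InputExample(guid='0', text_a='the movie was great', text_b=None, label='1')])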
Esempio n. 25
class BertInputProcessor(InputProcessor):
    def __init__(self, params):

        self.name = 'bert_input_processor'
        self._max_sent_len = params.max_sent_len
        self._use_dict = params.use_dict

        bert_module = hub.Module(params.bert_path)
        tokenization_info = bert_module(signature="tokenization_info",
                                        as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"]
            ])
            self._do_lower_case = do_lower_case

        self._tokenizer = FullTokenizer(vocab_file, do_lower_case)

        label_vocab_path = params.data_dir + '/vocab/label_vocab.txt'
        logging.info('load label vocab from :{}'.format(label_vocab_path))
        self.label_vocab = Vocab(params.vocab_pad, params.vocab_unk)
        self.label_vocab.load(label_vocab_path)
        params.num_labels = len(self.label_vocab)
        self._num_labels = params.num_labels

        if params.use_dict:
            dict_vocab_path = params.data_dir + '/vocab/dict_vocab.txt'
            logging.info(
                'load user dict vocab from : {}'.format(dict_vocab_path))
            self.dict_vocab = Vocab(params.vocab_pad, params.vocab_unk)
            self.dict_vocab.load(dict_vocab_path)
            params.dict_vocab_size = len(self.dict_vocab)

            dict_path = params.data_dir + '/user_dict.json'
            logging.info('load user dict form : {}'.format(dict_path))
            with open(dict_path, 'r') as file:
                self.user_dict = json.load(file)

    def transform(self, data, mode="predict", verbose=False):
        word_seqs = data['word']

        bert_word_seqs = []
        mapping_seqs = []

        for word_seq in word_seqs:
            bert_word_seq, mapping_seq = [], []
            for index, word in enumerate(word_seq):
                if self._do_lower_case:
                    word = word.lower()
                temp_words = self._tokenizer.wordpiece_tokenizer.tokenize(word)
                for i, temp_word in enumerate(temp_words):
                    bert_word_seq.append(temp_word)
                    if i == 0:
                        mapping_seq.append(index)
                    else:
                        mapping_seq.append('X')
            bert_word_seq = ["[CLS]"] + bert_word_seq[:self._max_sent_len -
                                                      2] + ["[SEP]"]
            mapping_seq = ["[CLS]"] + mapping_seq[:self._max_sent_len -
                                                  2] + ["[SEP]"]

            bert_word_seqs.append(bert_word_seq)
            mapping_seqs.append(mapping_seq)

        data['bert_mapping'] = mapping_seqs

        bert_word_id_seqs = [
            self._tokenizer.convert_tokens_to_ids(sent)
            for sent in bert_word_seqs
        ]
        bert_mask_id_seqs = [[1] * len(sent) for sent in bert_word_seqs]
        bert_segment_id_seqs = [[0] * len(sent) for sent in bert_word_seqs]

        padded_bert_word_id_seqs = pad_sequences(bert_word_id_seqs,
                                                 padding='post',
                                                 maxlen=self._max_sent_len)
        padded_bert_mask_id_seqs = pad_sequences(bert_mask_id_seqs,
                                                 padding='post',
                                                 maxlen=self._max_sent_len)
        padded_bert_segment_id_seqs = pad_sequences(bert_segment_id_seqs,
                                                    padding='post',
                                                    maxlen=self._max_sent_len)

        x_inputs = [
            padded_bert_word_id_seqs, padded_bert_mask_id_seqs,
            padded_bert_segment_id_seqs
        ]

        if self._use_dict:
            bert_dict_tag_seqs = []
            for word_seq, mapping_seq in zip(word_seqs, mapping_seqs):
                bert_dict_tag_seq = []
                for mapping_i in mapping_seq:
                    if mapping_i in {"[CLS]", "[SEP]", "X"}:
                        dict_tag = mapping_i
                    else:
                        dict_tag = self.user_dict.get(word_seq[mapping_i],
                                                      self.dict_vocab._unknown)
                    bert_dict_tag_seq.append(dict_tag)
                bert_dict_tag_seqs.append(bert_dict_tag_seq)

            bert_dict_id_seq = [[
                self.dict_vocab.encode(dict_tag) for dict_tag in dict_tag_sent
            ] for dict_tag_sent in bert_dict_tag_seqs]
            padded_dict_id_seq = pad_sequences(bert_dict_id_seq,
                                               padding='post',
                                               maxlen=self._max_sent_len)
            x_inputs.append(padded_dict_id_seq)

        assert len(padded_bert_word_id_seqs[0]) == self._max_sent_len
        assert len(padded_bert_mask_id_seqs[0]) == self._max_sent_len
        assert len(padded_bert_segment_id_seqs[0]) == self._max_sent_len

        if mode == 'evaluate':
            label_seqs = data['label']
            bert_label_seqs = []
            for label_seq, mapping_seq in zip(label_seqs, mapping_seqs):
                bert_label_seq = []
                for mapping_i in mapping_seq:
                    if mapping_i in {"[CLS]", "[SEP]", "X"}:
                        bert_label_seq.append(mapping_i)
                    else:
                        bert_label_seq.append(label_seq[mapping_i])

                bert_label_seqs.append(bert_label_seq)

            bert_label_id_seqs = [[
                self.label_vocab.encode(label, allow_oov=False)
                for label in sent
            ] for sent in bert_label_seqs]
            padded_bert_label_id_seqs = pad_sequences(
                bert_label_id_seqs, padding='post', maxlen=self._max_sent_len)
            y_seqs = to_categorical(padded_bert_label_id_seqs,
                                    self._num_labels).astype(int)
            y_seqs = y_seqs if len(y_seqs.shape) == 3 else np.expand_dims(
                y_seqs, axis=0)

            return x_inputs, y_seqs

        elif mode == 'predict':
            return x_inputs
        else:
            raise ValueError('mode must be predict or evaluate')
Esempio n. 26
class BertEmbeddingsResolver:
    def __init__(self, model_folder, max_length=256, lowercase=True):

        # 1. Create tokenizer
        self.max_length = max_length
        vocab_file = os.path.join(model_folder, 'vocab.txt')
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case=lowercase)

        # 2. Read Config
        config_file = os.path.join(model_folder, 'bert_config.json')
        self.config = BertConfig.from_json_file(config_file)

        # 3. Create Model
        self.session = tf.Session()
        self.token_ids_op = tf.placeholder(tf.int32,
                                           shape=(None, max_length),
                                           name='token_ids')
        self.model = BertModel(config=self.config,
                               is_training=False,
                               input_ids=self.token_ids_op,
                               use_one_hot_embeddings=False)

        # 4. Restore Trained Model
        self.saver = tf.train.Saver()
        ckpt_file = os.path.join(model_folder, 'bert_model.ckpt')
        # RCS ckpt_file = os.path.join(model_folder, 'model.ckpt-1000000')
        self.saver.restore(self.session, ckpt_file)

        hidden_layers = self.config.num_hidden_layers
        self.embeddings_op = tf.get_default_graph().get_tensor_by_name(
            "bert/encoder/Reshape_{}:0".format(hidden_layers + 1))

    def tokenize_sentence(self, tokens, add_service_tokens=True):
        result = []
        is_word_start = []
        for token in tokens:
            pieces = self.tokenizer.tokenize(token)
            result.extend(pieces)
            starts = [False] * len(pieces)
            starts[0] = True
            is_word_start.extend(starts)

        if add_service_tokens:
            if len(result) > self.max_length - 2:
                result = result[:self.max_length - 2]
                is_word_start = is_word_start[:self.max_length - 2]

            result = ['[CLS]'] + result + ['[SEP]']
            is_word_start = [False] + is_word_start + [False]
        else:
            if len(result) > self.max_length:
                result = result[:self.max_length]
                is_word_start = is_word_start[:self.max_length]

        return (result, is_word_start)

    def resolve_sentences(self, sentences):
        batch_is_word_start = []
        batch_token_ids = []
        batch_tokens = []

        for sentence in sentences:
            tokens, is_word_start = self.tokenize_sentence(sentence)
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            to_input = np.pad(token_ids,
                              [(0, self.max_length - len(token_ids))],
                              mode='constant')
            batch_token_ids.append(to_input)
            batch_tokens.append(tokens)
            batch_is_word_start.append(is_word_start)

        embeddings = self.session.run(
            self.embeddings_op, feed_dict={self.token_ids_op: batch_token_ids})

        result = []
        for i in range(len(sentences)):
            tokens = batch_tokens[i]
            is_word_start = batch_is_word_start[i]
            item_embeddings = embeddings[i, :len(tokens), :]

            resolved = TokenEmbeddings.create_sentence(tokens, is_word_start,
                                                       item_embeddings)
            result.append(resolved)

        return result

    def resolve_sentence(self, sentence):
        tokens, is_word_start = self.tokenize_sentence(sentence)

        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        to_input = np.pad(token_ids, [(0, self.max_length - len(token_ids))],
                          mode='constant')
        to_input = to_input.reshape((1, self.max_length))

        embeddings = self.session.run(self.embeddings_op,
                                      feed_dict={self.token_ids_op: to_input})
        embeddings = np.squeeze(embeddings)
        embeddings = embeddings[:len(token_ids), :]

        return TokenEmbeddings.create_sentence(tokens, is_word_start,
                                               embeddings)
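
# A hedged usage sketch; model_folder is assumed to contain vocab.txt, bert_config.json
# and the bert_model.ckpt checkpoint from the original BERT release.
resolver = BertEmbeddingsResolver(model_folder='./uncased_L-12_H-768_A-12')
single = resolver.resolve_sentence(['BERT', 'produces', 'contextual', 'embeddings'])
batch = resolver.resolve_sentences([['first', 'sentence'], ['second', 'one']])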
Esempio n. 27
def build_dataset(conll_file,
                  tfrecord_file,
                  pos2id,
                  dep2id,
                  path2id,
                  truncate=False):
    max_len = 0

    tokenizer = FullTokenizer(vocab_file=VOCAB_FILE,
                              do_lower_case=DO_LOWER_CASE)

    with open(conll_file, 'r') as reader:
        text = reader.read().strip()
    sentences = text.split('\n\n')

    tf_writer = tf.python_io.TFRecordWriter(tfrecord_file)
    for sent in sentences:
        subword_list = ["[CLS]"]
        span_list = [0]
        mask_list = [0]
        cue_list = [0]

        pos_list = [0]
        dep_list = [0]
        path_list = [0]
        lpath_list = [-1]
        cp_list = [-1]

        subword_id_list = tokenizer.convert_tokens_to_ids(["[CLS]"])

        for token in sent.split('\n'):
            if len(token) >= 8:
                token = token.split('\t')

                token_ = token[0]
                subword = tokenizer.tokenize(token_)

                span = [int(token[8]) for _ in range(len(subword))]
                cue = [int(token[7]) for _ in range(len(subword))]

                pos = [
                    int(mapping(pos2id, token[2])) for _ in range(len(subword))
                ]
                dep = [
                    int(mapping(dep2id, token[3])) for _ in range(len(subword))
                ]
                path = [
                    int(mapping(path2id, token[4]))
                    for _ in range(len(subword))
                ]
                lpath = [int(token[5]) for _ in range(len(subword))]
                cp = [int(token[6]) for _ in range(len(subword))]

                mask = [0 for _ in range(len(subword))]
                mask[0] = 1

                sub_id = tokenizer.convert_tokens_to_ids(subword)

                subword_list.extend(subword)
                mask_list.extend(mask)
                subword_id_list.extend(sub_id)

                pos_list.extend(pos)
                dep_list.extend(dep)
                path_list.extend(path)
                lpath_list.extend(lpath)
                cp_list.extend(cp)

                cue_list.extend(cue)
                span_list.extend(span)

        subword_list.append("[SEP]")
        span_list.append(0)
        cue_list.append(0)
        mask_list.append(0)
        subword_id_list.extend(tokenizer.convert_tokens_to_ids(["[SEP]"]))

        pos_list.append(0)
        dep_list.append(0)
        path_list.append(0)
        lpath_list.append(-1)
        cp_list.append(-1)

        assert len(subword_list) == len(span_list) == len(mask_list) == len(
            subword_id_list)

        max_len = max(max_len, len(subword_id_list))

        if len(subword_list) > 2:
            if (not truncate) or (len(subword_id_list) <= 64):
                # write tfrecord
                token_id = [
                    tf.train.Feature(int64_list=tf.train.Int64List(value=[t_]))
                    for t_ in subword_id_list
                ]
                mask = [
                    tf.train.Feature(int64_list=tf.train.Int64List(value=[m_]))
                    for m_ in mask_list
                ]
                span = [
                    tf.train.Feature(int64_list=tf.train.Int64List(value=[s_]))
                    for s_ in span_list
                ]
                cue = [
                    tf.train.Feature(int64_list=tf.train.Int64List(value=[c_]))
                    for c_ in cue_list
                ]

                pos_features = [
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[pos_])) for pos_ in pos_list
                ]
                dep_features = [
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[dep_])) for dep_ in dep_list
                ]
                path_features = [
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[path_])) for path_ in path_list
                ]
                lpath_features = [
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[lpath_])) for lpath_ in lpath_list
                ]
                cp_features = [
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[cp_])) for cp_ in cp_list
                ]

                feature_list = {
                    'token_id': tf.train.FeatureList(feature=token_id),
                    'span': tf.train.FeatureList(feature=span),
                    'masks': tf.train.FeatureList(feature=mask),
                    'cue': tf.train.FeatureList(feature=cue),
                    'pos': tf.train.FeatureList(feature=pos_features),
                    'dep': tf.train.FeatureList(feature=dep_features),
                    'path': tf.train.FeatureList(feature=path_features),
                    'lpath': tf.train.FeatureList(feature=lpath_features),
                    'cp': tf.train.FeatureList(feature=cp_features),
                }

                context = tf.train.Features(
                    feature={
                        "length":
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=[len(subword_id_list)])),
                    })

                feature_lists = tf.train.FeatureLists(
                    feature_list=feature_list)
                ex = tf.train.SequenceExample(feature_lists=feature_lists,
                                              context=context)
                tf_writer.write(ex.SerializeToString())

    tf_writer.close()
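
# A hedged sketch (not in the original source) of reading the SequenceExamples written
# by build_dataset() back into a tf.data pipeline; feature names follow the writer above
# and the file name is illustrative.
def parse_sequence_example(serialized):
    context_features = {'length': tf.io.FixedLenFeature([], tf.int64)}
    sequence_features = {
        name: tf.io.FixedLenSequenceFeature([], tf.int64)
        for name in
        ['token_id', 'span', 'masks', 'cue', 'pos', 'dep', 'path', 'lpath', 'cp']
    }
    context, sequences = tf.io.parse_single_sequence_example(
        serialized,
        context_features=context_features,
        sequence_features=sequence_features)
    return context['length'], sequences


dataset = tf.data.TFRecordDataset('train.tfrecord').map(parse_sequence_example)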
class BertNerPreprocessor:
    """Takes tokens and splits them into bert subtokens, encode subtokens with their indices.
    Creates mask of subtokens (one for first subtoken, zero for later subtokens).
    If tags are provided, calculate tags for subtokens.
    Args:
        vocab_file: path to vocabulary
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        max_subword_length: replace token to <unk> if it's length is larger than this
            (defaults to None, which is equal to +infinity)
        token_mask_prob: probability of masking token while training
        provide_subword_tags: output tags for subwords or for words
    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        max_subword_length: rmax lenght of a bert subtoken
        tokenizer: instance of Bert FullTokenizer
    """
    def __init__(self,
                 max_seq_length: int = 4096,
                 max_subword_length: int = 15,
                 token_maksing_prob: float = 0.0,
                 provide_subword_tags: bool = False,
                 **kwargs):
        self._re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.provide_subword_tags = provide_subword_tags
        self.mode = kwargs.get('mode')
        self.max_seq_length = max_seq_length
        self.max_subword_length = max_subword_length
        self.tokenizer = FullTokenizer(vocab_file=VOCAB_PATH,
                                       do_lower_case=False)
        self.token_maksing_prob = token_maksing_prob

        self.log = getLogger(__name__)

    def __call__(self,
                 tokens: Union[List[List[str]], List[str]],
                 tags: List[List[str]] = None,
                 **kwargs):
        if isinstance(tokens[0], str):
            tokens = [re.findall(self._re_tokenizer, s) for s in tokens]
        subword_tokens, subword_tok_ids, subword_masks, subword_tags = [], [], [], []
        for i in range(len(tokens)):
            toks = tokens[i]
            ys = ['O'] * len(toks) if tags is None else tags[i]
            mask = [int(y != 'X') for y in ys]
            print("toks")
            print(toks)
            print("ys")
            print(ys)
            print("KKKK")
            assert len(toks) == len(ys) == len(mask), \
                f"toks({len(toks)}) should have the same length as " \
                    f" ys({len(ys)}) and mask({len(mask)}), tokens = {toks}."
            sw_toks, sw_mask, sw_ys = self._ner_bert_tokenize(
                toks,
                mask,
                ys,
                self.tokenizer,
                self.max_subword_length,
                mode=self.mode,
                token_maksing_prob=self.token_maksing_prob)
            if self.max_seq_length is not None:
                if len(sw_toks) > self.max_seq_length:
                    raise RuntimeError(
                        f"input sequence after bert tokenization shouldn't exceed"
                        f" {self.max_seq_length} tokens, but got {len(sw_toks)}: {sw_toks}")
            subword_tokens.append(sw_toks)
            subword_tok_ids.append(
                self.tokenizer.convert_tokens_to_ids(sw_toks))
            subword_masks.append(sw_mask)
            subword_tags.append(sw_ys)
            assert len(sw_mask) == len(sw_toks) == len(subword_tok_ids[-1]) == len(sw_ys), \
                f"length of mask({len(sw_mask)}), tokens({len(sw_toks)})," \
                    f" token ids({len(subword_tok_ids[-1])}) and ys({len(sw_ys)})" \
                    f" for tokens = `{toks}` should match"
        subword_tok_ids = self.zero_pad(subword_tok_ids, dtype=int, padding=0)
        subword_masks = self.zero_pad(subword_masks, dtype=int, padding=0)
        if tags is not None:
            if self.provide_subword_tags:
                return tokens, subword_tokens, subword_tok_ids, subword_masks, subword_tags
            else:
                nonmasked_tags = [[t for t in ts if t != 'X'] for ts in tags]
                for swts, swids, swms, ts in zip(subword_tokens,
                                                 subword_tok_ids,
                                                 subword_masks,
                                                 nonmasked_tags):
                    if (len(swids) != len(swms)) or (len(ts) != sum(swms)):
                        self.log.warning(
                            'Not matching lengths of the tokenization!')
                        self.log.warning(
                            f'Tokens len: {len(swts)}\n Tokens: {swts}')
                        self.log.warning(
                            f'Masks len: {len(swms)}, sum: {sum(swms)}')
                        self.log.warning(f'Masks: {swms}')
                        self.log.warning(f'Tags len: {len(ts)}\n Tags: {ts}')
                return tokens, subword_tokens, subword_tok_ids, subword_masks, nonmasked_tags
        return tokens, subword_tokens, subword_tok_ids, subword_masks

    @staticmethod
    def _ner_bert_tokenize(
        tokens: List[str],
        mask: List[int],
        tags: List[str],
        tokenizer: FullTokenizer,
        max_subword_len: int = None,
        mode: str = None,
        token_maksing_prob: float = 0.0
    ) -> Tuple[List[str], List[int], List[str]]:
        tokens_subword = ['[CLS]']
        mask_subword = [0]
        tags_subword = ['X']
        for token, flag, tag in zip(tokens, mask, tags):
            subwords = tokenizer.tokenize(token)
            if not subwords or \
                    ((max_subword_len is not None) and (len(subwords) > max_subword_len)):
                tokens_subword.append('[UNK]')
                mask_subword.append(flag)
                tags_subword.append(tag)
            else:
                if mode == 'train' and token_maksing_prob > 0.0 and np.random.rand(
                ) < token_maksing_prob:
                    tokens_subword.extend(['[MASK]'] * len(subwords))
                else:
                    tokens_subword.extend(subwords)
                mask_subword.extend([flag] + [0] * (len(subwords) - 1))
                tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))

        tokens_subword.append('[SEP]')
        mask_subword.append(0)
        tags_subword.append('X')

        return tokens_subword, mask_subword, tags_subword

    def zero_pad(self, batch, zp_batch=None, dtype=np.float32, padding=0):
        if zp_batch is None:
            dims = self.get_dimensions(batch)
            zp_batch = np.ones(dims, dtype=dtype) * padding
        if zp_batch.ndim == 1:
            zp_batch[:len(batch)] = batch
        else:
            for b, zp in zip(batch, zp_batch):
                self.zero_pad(b, zp)
        return zp_batch

    def get_dimensions(self, batch) -> List[int]:
        return list(map(max, self.get_all_dimensions(batch)))

    def get_all_dimensions(
            self,
            batch: Sequence,
            level: int = 0,
            res: Optional[List[List[int]]] = None) -> List[List[int]]:
        if not level:
            res = [[len(batch)]]
        if len(batch) and isinstance(batch[0],
                                     Sized) and not isinstance(batch[0], str):
            level += 1
            if len(res) <= level:
                res.append([])
            for item in batch:
                res[level].append(len(item))
                self.get_all_dimensions(item, level, res)
        return res
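
# A hedged usage sketch for BertNerPreprocessor; VOCAB_PATH must already point at a BERT
# vocab.txt before the class is instantiated, and the sentence/tags below are illustrative.
preprocessor = BertNerPreprocessor(max_seq_length=512, max_subword_length=15)
tokens, subword_tokens, subword_tok_ids, subword_masks = preprocessor(
    ['John Smith works at Google'])
# With tags provided (training/evaluation) a fifth element with word-level tags is
# returned unless provide_subword_tags=True:
#     tokens, sw_toks, sw_ids, sw_masks, tags = preprocessor(
#         [['John', 'Smith', 'works', 'at', 'Google']],
#         tags=[['B-PER', 'I-PER', 'O', 'O', 'B-ORG']])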