Example 1
    def parse_line(self, line, max_seq_len=512):
        """ parse one line to token_ids, sentence_ids, pos_ids, label
        """

        line = line.strip().split(",")
        assert len(line) == 3, \
            "One sample must have %d fields!" % 3

        text_left, text_right, masklabel = line
        tokenizer = FullTokenizer(self.vocab_path)
        # tokenizer = FullTokenizer(vocab_path)
        text_left = tokenizer.tokenize(text_left)
        masklabel = tokenizer.tokenize(masklabel)
        masklabel_ = len(masklabel) * ["[MASK]"]
        text_right = tokenizer.tokenize(text_right)
        all_tokens = text_left + masklabel_ + text_right
        token_ids = tokenizer.convert_tokens_to_ids(all_tokens)
        sent_ids = [0] * len(all_tokens)
        pos_ids = [i for i in range(len(all_tokens))]
        input_mask = [1.0] * len(all_tokens)
        # we still need to build mask_pos here
        mask_pos = []
        for idx, mask in enumerate(token_ids):
            if mask == self.mask_id:
                mask_pos.append(idx)
        # also collect the mask_label ids
        mask_label = list(tokenizer.convert_tokens_to_ids(masklabel))
        assert len(token_ids) == len(sent_ids) == len(pos_ids) == len(
            input_mask
        ), "[Must be true] len(token_ids) == len(sent_ids) == len(pos_ids) == len(input_mask)"
        if len(token_ids) > max_seq_len:
            return None
        return [token_ids, sent_ids, pos_ids, input_mask, mask_pos, mask_label]
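A minimal, self-contained sketch of the mask-position lookup above; the id 103 for "[MASK]" matches the standard English BERT vocab and is only illustrative here:

mask_id = 103                          # assumed: id of "[MASK]" in the vocab
token_ids = [7592, 103, 103, 2088]     # illustrative ids with two masked positions
mask_pos = [idx for idx, tid in enumerate(token_ids) if tid == mask_id]
print(mask_pos)                        # [1, 2]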
Example 2
def parse_sequence(tokenizer: t10n.FullTokenizer,
                   sequence: str) -> SequenceParseResult:
    tokens = tokenizer.tokenize(sequence)
    tokens.insert(0, '[CLS]')
    tokens.append('[SEP]')

    # Could be 0 or 1, not sure which index is *supposed* to represent a first segment
    token_type_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    attention_mask = [1] * len(tokens)
    attention_mask[0] = 0

    # Default for our model
    max_seq_length = 128

    # Pad arrays
    while len(input_ids) < max_seq_length:

        # Not sure if padding belongs to the sequence or not
        token_type_ids.append(0)

        # Zero is the [PAD]-token for the BERT-vocab
        input_ids.append(0)

        # We probably should exclude the sequence padding from the attention-mask
        attention_mask.append(0)

    return SequenceParseResult(
        tokens=tokens,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        input_ids=input_ids,
    )
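For reference, the usual single-segment layout is sketched below (standard BERT convention, not taken from this codebase): segment ids are 0 for the first segment and for padding, and the attention mask is 1 for real tokens, including [CLS], and 0 for padding.

tokens         = ["[CLS]", "hello", "world", "[SEP]", "[PAD]", "[PAD]"]
token_type_ids = [0, 0, 0, 0, 0, 0]    # first segment (and padding) = 0
attention_mask = [1, 1, 1, 1, 0, 0]    # real tokens = 1, padding = 0; [CLS] normally stays 1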
Example 3
def preprocess(data):
    tokenizer = FullTokenizer(vocab_file)
    tok_ip = np.zeros((len(data), 128), dtype="int32")
    sent_ip = np.zeros((len(data), 128), dtype="int8")
    pos_ip = np.zeros((len(data), 128), dtype="int8")
    masks = np.zeros((len(data), 128), dtype="int8")

    for pos, text in tqdm.tqdm_notebook(enumerate(data)):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        if len(tok) > 128:
            tok = tok[:127] + ["[SEP]"]
        pad_len = 128 - len(tok)
        tok_len = len(tok)
        tok0_len = len(tok0)
        tok = tokenizer.convert_tokens_to_ids(tok) + [0] * pad_len
        pos_val = range(128)
        sent = [0] * tok0_len + [1] * (tok_len - tok0_len) + [0] * pad_len
        mask = [1] * tok_len + [0] * pad_len

        tok_ip[pos] = tok
        sent_ip[pos] = sent
        pos_ip[pos] = pos_val
        masks[pos] = mask

    masks = masks[:, None, None, :]
    return tok_ip, sent_ip, pos_ip, masks
Example 4
 def test_tokenize(self):
     tokenizer = FullTokenizer()
     sentence = '実質的変化はなかった'
     res = tokenizer.tokenize(sentence)
     firsts = [0, 2, 3, 5, 6, 9]
     tokens = [
         CharToken(c, is_first=i in firsts) for i, c in enumerate(sentence)
     ]
     self.assertEqual(res, tokens)
Example 5
def get_lens(data):
    tokenizer = FullTokenizer(vocab_file)
    lens = []
    for pos, text in tqdm.tqdm(enumerate(data)):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        lens.append(len(tok))

    return np.array(lens)
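A possible follow-up, sketched under the assumption that numpy is imported as np (as elsewhere in these snippets): pick a maximum sequence length that covers most pairs while leaving room for the special tokens.

import numpy as np

lens = np.array([12, 48, 96, 130, 250])          # stand-in for get_lens(data)
max_seq_len = int(np.percentile(lens, 99)) + 3   # +3 leaves room for [CLS] and two [SEP]
print(max_seq_len)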
Example 6
def convert_single_example(ex_index, example: InputExample, tag_list: list, label_list: list, max_seq_length,
                           tokenizer: tokenization.FullTokenizer):
    query = tokenizer.tokenize(example.text)

    if len(query) > max_seq_length - 2:
        query = query[0:(max_seq_length - 2)]

    tokens = ["[CLS]"]
    tags = ["[CLS]"]
    for idx, token in enumerate(query):
        tokens.append(token)
        tags.append(example.tag[idx])
    tokens.append("[SEP]")
    tags.append("[SEP]")
    segment_ids = [0] * len(tokens)

    tag_map = {}
    for idx, tag in enumerate(tag_list):
        tag_map[tag] = idx
    label_map = {}
    for idx, label in enumerate(label_list):
        label_map[label] = idx

    tag_ids = [tag_map[tag] for tag in tags]
    label_id = label_map[example.label]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        tag_ids.append(0)

    if ex_index < 5:
        logger.info("*** Example ***")
        logger.info("guid: %s" % example.guid)
        logger.info("tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
        logger.info("tag: %s" % " ".join(tags))
        logger.info("label: %s" % example.label)

    feature = InputFeature(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        tag_ids=tag_ids,
        label_id=label_id
    )

    return feature
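A tiny, self-contained sketch of the tag/label mapping used above (the tag names are illustrative; note that "[CLS]" and "[SEP]" must appear in tag_list for the lookup to work):

tag_list = ["[CLS]", "[SEP]", "O", "B-LOC", "I-LOC"]
tag_map = {tag: idx for idx, tag in enumerate(tag_list)}
tags = ["[CLS]", "O", "B-LOC", "I-LOC", "[SEP]"]
tag_ids = [tag_map[tag] for tag in tags]
print(tag_ids)  # [0, 2, 3, 4, 1]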
Example 7
 def test_tokenize_with_nelogd(self):
     NEOLOGD_PATH = "/usr/local/lib/mecab/dic/ipadic/mecab-user-dict-seed.dic"
     if not os.path.isfile(NEOLOGD_PATH):
         raise ValueError(
             'NEOLOGD_PATH is invalid. Please set a file path to neologd dic'
         )
     sentence = '実質的変化はなかった'
     tokenizer = FullTokenizer(userdic_path=NEOLOGD_PATH)
     firsts = [0, 3, 5, 6, 9]
     tokens = [
         CharToken(c, is_first=i in firsts) for i, c in enumerate(sentence)
     ]
     res = tokenizer.tokenize(sentence)
     self.assertEqual(res, tokens)
Example 8
def adaptERNIEtokenization(all_sentences):
    tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
    ernie_tokens = [
        tokenizer.tokenize(sentence) for sentence in tqdm(all_sentences)
    ]
    print("Parsed to ERNIE tokens!")
    all_cleaned_tokens = []
    for line in tqdm(ernie_tokens):
        cleaned_tokens = []
        for i, token in enumerate(line):
            if token[:2] == "##":
                cleaned_tokens[-1] += token[2:]
            else:
                cleaned_tokens.append(token)
        all_cleaned_tokens.append(cleaned_tokens)
    return all_cleaned_tokens
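A minimal, self-contained sketch of the "##" wordpiece merge performed above (no vocab file needed):

wordpieces = ["ernie", "token", "##ization", "works"]
merged = []
for token in wordpieces:
    if token.startswith("##"):
        merged[-1] += token[2:]
    else:
        merged.append(token)
print(merged)  # ['ernie', 'tokenization', 'works']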
Example 9
class BERTTextEncoder(TextEncoder):
    def __init__(self, vocab_file: str, do_lower_case: bool = True) -> None:
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case)
        super().__init__(len(self.tokenizer.vocab))
        self.bert_unk_id = self.tokenizer.vocab['[UNK]']
        self.bert_msk_id = self.tokenizer.vocab['[MASK]']

    def standardize_ids(self, ids: List[int]) -> List[int]:
        for i in range(len(ids)):
            if ids[i] == self.bert_unk_id:  # UNK
                ids[i] = 0
            else:  # VOCAB
                ids[i] -= self.bert_msk_id
        return ids

    def encode(self, sent: str) -> List[int]:
        return self.standardize_ids(
            self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(sent)))
Example 10
class BERTFunction(object):
    def __init__(self, bert_config_file, init_checkpoint, max_seq_length, vocab_file, num_labels, use_gpu=False):
        # needed to load the pre-trained parameters
        self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
        self.init_checkpoint = init_checkpoint
        # needed for the dataset and the computation
        self.max_seq_length = max_seq_length
        self.num_labels = num_labels
        # needed for data preprocessing
        self.vocab_file = vocab_file
        self.tokenizer = FullTokenizer(self.vocab_file, do_lower_case=False)  # cased model by default
        # gpu
        self.use_gpu=use_gpu
        
        self.graph = tf.Graph()  # declare the computation graph
        with self.graph.as_default():
            # define placeholders
            self.input_ids = tf.placeholder(dtype=tf.int64, shape=(None,self.max_seq_length))
            self.input_mask = tf.placeholder(dtype=tf.int64, shape=(None,self.max_seq_length))
            self.segment_ids = tf.placeholder(dtype=tf.int64, shape=(None,self.max_seq_length))
                
            # define the computation
            (self.logits, self.probabilities) = create_predict_model(self.bert_config, 
            self.input_ids, self.input_mask, self.segment_ids, self.num_labels)
            
            # load the pre-trained parameters
            self.tvars = tf.trainable_variables()  # once the graph is built, the trainable variables exist
            self.initialized_variable_names = {}
            if self.init_checkpoint:  # init_checkpoint is a pre-trained BERT (or previously trained) ckpt passed on the command line
                (self.assignment_map, self.initialized_variable_names  # values of the usable variables from init_checkpoint (the intersection of the pre-trained model and this task's graph)
                 ) = modeling.get_assignment_map_from_checkpoint(self.tvars, self.init_checkpoint)
            tf.train.init_from_checkpoint(self.init_checkpoint, self.assignment_map)
        
    def cal(self, text_a, text_b):
        features=self.process_input(text_a, text_b)
        config = None
        if self.use_gpu:
            config = tf.ConfigProto(log_device_placement=True)  
            config.gpu_options.allow_growth = True  
            print("trying to use gpu")
        else:
            print("not using cpu")
        with tf.Session(graph=self.graph, config=config) as session:
            session.run(tf.global_variables_initializer())
            feed_dict={self.input_ids: features["input_ids"], 
                       self.input_mask: features["input_mask"], 
                       self.segment_ids: features["segment_ids"]}
            prob = session.run(self.probabilities, feed_dict=feed_dict)
            print("prob: \n", prob)
            return prob
    def process_input(self, text_a, text_b):  # a slimmed-down convert_single_example
          tokens_a = self.tokenizer.tokenize(text_a)
          tokens_b = self.tokenizer.tokenize(text_b)
          # the _truncate_seq_pair logic
          while True:
              total_length = len(tokens_a) + len(tokens_b)
              if total_length <= self.max_seq_length - 3:  # leave room for [CLS] and two [SEP]
                  break
              if len(tokens_a) > len(tokens_b):
                  tokens_a.pop()
              else:
                  tokens_b.pop()
          # slimmed-down convert_single_example
          tokens = []
          segment_ids = []
          tokens.append("[CLS]")
          segment_ids.append(0)
          for token in tokens_a:
              tokens.append(token)
              segment_ids.append(0)
          tokens.append("[SEP]")
          segment_ids.append(0)

          for token in tokens_b:
              tokens.append(token)
              segment_ids.append(1)
          tokens.append("[SEP]")
          segment_ids.append(1)
          
          input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
          
          input_mask = [1] * len(input_ids)

          while len(input_ids) < self.max_seq_length:
              input_ids.append(0)
              input_mask.append(0)
              segment_ids.append(0)
        
          assert len(input_ids) == self.max_seq_length
          assert len(input_mask) == self.max_seq_length
          assert len(segment_ids) == self.max_seq_length
          
          print("tokens: %s" % " ".join([printable_text(x) for x in tokens]))
#          print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
#          print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
#          print("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
          
          return {"input_ids": nparray([input_ids], dtype=npint64),
                  "input_mask": nparray([input_mask], dtype=npint64),
                  "segment_ids": nparray([segment_ids], dtype=npint64)}

# usage example
# NOTE: max_seq_length fixes the tensor shapes in the graph; otherwise, given the transformer architecture, the length could be more flexible
# import os
# pretrained_dir="../pretrained/multi_cased_L-12_H-768_A-12/"
# init_checkpoint = os.path.join(pretrained_dir, "./bert_model.ckpt")
# bert_config_file=os.path.join(pretrained_dir, "./bert_config.json")
# vocab_file = os.path.join(pretrained_dir, "./vocab.txt")
# max_seq_length =160 
# num_labels = 2
#
# func=BERTFunction(bert_config_file, init_checkpoint, max_seq_length, vocab_file, num_labels)
# res=func.cal("I'm gogo, who are you?","I'm Trump, I'm fine.")
Example 11
bert_tokenizer = FullTokenizer(
    vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy(),
    do_lower_case=bert_layer.resolved_object.do_lower_case.numpy())
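
For context, bert_layer here is presumably a TF-Hub BERT layer built as in Example 13 below; a sketch, with an illustrative hub handle that is not taken from this snippet:

# import tensorflow_hub as hub
# bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
#                             trainable=True)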

# TODO:
# Documents longer than 512 tokens cannot be encoded by BERT,
# since its positional encoding has a hard limit of 512 positions.
# For better results we may need to summarize the document into <= 512 tokens,
# or encode sentence by sentence then pool together.
maxlen = 256

# TODO:
# We need to manually handle the [CLS] and [SEP] special tokens at the start and end of each sequence.

# Encode text with padding, masking, and segmentation (required by BERT even if we don't use it).
tok_seq_train = [bert_tokenizer.tokenize(text) for text in imdb_reviews_train]
wid_seq_train = [
    bert_tokenizer.convert_tokens_to_ids(toks)[:maxlen]
    for toks in tok_seq_train
]
wid_seq_train_padded = pad_sequences(wid_seq_train,
                                     padding="post",
                                     maxlen=maxlen)
wid_seq_train_mask = (wid_seq_train_padded > 0).astype(int)
segment_ids_train = np.zeros_like(wid_seq_train_mask)

tok_seq_test = [bert_tokenizer.tokenize(text) for text in imdb_reviews_test]
wid_seq_test = [
    bert_tokenizer.convert_tokens_to_ids(toks)[:maxlen]
    for toks in tok_seq_test
]
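
A hedged sketch of the TODO above about special tokens, assuming bert_tokenizer and maxlen as defined in this example: wrap each text with [CLS]/[SEP] before converting to ids, truncating first so the two specials still fit.

def encode_with_specials(text, tokenizer, maxlen=256):
    # truncate to maxlen - 2 so "[CLS]" and "[SEP]" still fit
    toks = ["[CLS]"] + tokenizer.tokenize(text)[:maxlen - 2] + ["[SEP]"]
    return tokenizer.convert_tokens_to_ids(toks)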
Example 12
class BertInference(object):
    """
    The bert model.
    """
    def __init__(self, bert_meta):
        self.graph = self._load_graph(bert_meta.model_file)

        self.tokenizer = FullTokenizer(vocab_file=bert_meta.vocab_file,
                                       do_lower_case=True)
        self.max_seq_length = 128

        # Input.
        self.input_ids = self.graph.get_tensor_by_name('infer/input_ids:0')
        self.word_ids = self.graph.get_tensor_by_name('infer/input_mask:0')
        self.segment_ids = self.graph.get_tensor_by_name('infer/segment_ids:0')
        # Output.
        self.predictions = self.graph.get_tensor_by_name(
            'infer/loss/Softmax:0')

        self.sess = tf.Session(graph=self.graph)

        self.inference(BertInputPackage(u'预热一下'))  # warm up the model

    def inference(self, bert_input):
        """
        Call model.
        """
        input_ids, input_mask, segment_ids = self._convert_single_example(
            bert_input.query)
        preds_evaluated = self.sess.run(self.predictions,
                                        feed_dict={
                                            self.input_ids: [input_ids],
                                            self.word_ids: [input_mask],
                                            self.segment_ids: [segment_ids]
                                        })

        return preds_evaluated

    def _load_graph(self, frozen_graph_filename):
        with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())

        with tf.Graph().as_default() as graph:
            tf.import_graph_def(graph_def,
                                input_map=None,
                                return_elements=None,
                                name="infer",
                                op_dict=None,
                                producer_op_list=None)

        return graph

    def _convert_single_example(self, text_a):
        tokens_a = self.tokenizer.tokenize(text_a)

        if len(tokens_a) > self.max_seq_length - 2:
            tokens_a = tokens_a[0:(self.max_seq_length - 2)]

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        input_mask = [1] * len(input_ids)

        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        return input_ids, input_mask, segment_ids
Example 13
File: bert.py Project: mindis/k9
bert_layer = hub.KerasLayer(bert_model_path, trainable=True)

# Build tokenizer from pre-trained BERT vocabulary.
bert_tokenizer = FullTokenizer(
    vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy(),
    do_lower_case=bert_layer.resolved_object.do_lower_case.numpy())

# TODO:
# Documents longer than 512 tokens cannot be encoded by BERT,
# since its positional encoding has a hard limit of 512 positions.
# We need to summarize the document into <= 512 tokens, or encode sentence by sentence then pool together.
maxlen = 256

# Encode text with padding, masking, and segmentation (required by BERT even if we don't use it).
tok_seq_train = [
    bert_tokenizer.tokenize(text) for text in newsgroups_train.data
]
wid_seq_train = [
    bert_tokenizer.convert_tokens_to_ids(toks)[:maxlen]
    for toks in tok_seq_train
]
wid_seq_train_padded = pad_sequences(wid_seq_train,
                                     padding="post",
                                     maxlen=maxlen)
wid_seq_train_mask = (wid_seq_train_padded > 0).astype(int)
segment_ids_train = np.zeros_like(wid_seq_train_mask)

tok_seq_test = [bert_tokenizer.tokenize(text) for text in newsgroups_test.data]
wid_seq_test = [
    bert_tokenizer.convert_tokens_to_ids(toks)[:maxlen]
    for toks in tok_seq_test
]
Example 14
#!/usr/bin/env python
# coding: utf-8

from modeling import BertForQuestionAnswering, BertConfig

#config = BertConfig.from_json_file('uncased_L-12_H-768_A-12/bert_config.json')
# config = BertConfig.from_json_file('configs/pals_config.json')
# model = BertForQuestionAnswering(config)
# model.load_pretained('initial_bert.bin', patch=True)
# print(model)

from tokenization import FullTokenizer, BasicTokenizer

tokenizer = FullTokenizer('uncased_L-12_H-768_A-12/vocab.txt', do_lower_case=True)
tokens = tokenizer.tokenize('I love China!!')
print(tokens)
tokenizer = BasicTokenizer()
tokens = tokenizer.tokenize('[SEP]')
print(tokens)
Example 15
from tokenization import FullTokenizer

tokenizer = FullTokenizer("./bert_model/chinese_L-12_H-768_A-12/vocab.txt")
print(tokenizer.tokenize("我 shi 许@#$%$%海明"))
print("".join(tokenizer.tokenize("我 shi 许海明")))
Example 16
def main():
    # pd_all = pd.read_csv(os.path.join(path, "weibo_senti_100k.csv"))
    # pd_all = shuffle(pd_all)
    # x_data, y_data = pd_all.review.values, pd_all.label.values
    # x_data = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) for text in x_data]
    # x_train, x_test, y_train, y_test = train_test_split(np.array(x_data), y_data, test_size=0.2)
    #(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    tokenizer = FullTokenizer("vocab.txt")
    print('Loading data...')
    # read the training data
    train_data = pd.read_csv(os.path.join(path, "train.csv"))
    # read the validation data
    dev_data = pd.read_csv(os.path.join(path, "dev.csv"))
    # read the test data
    test_data = pd.read_csv(os.path.join(path, "test.csv"))
    x_train, y_train = train_data.review.values, train_data.label.values
    x_dev, y_dev = dev_data.review.values, dev_data.label.values
    x_test, y_test = test_data.review.values, test_data.label.values
    # tokenize to ids
    x_train = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for text in x_train
    ]
    x_dev = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for text in x_dev
    ]
    x_test = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for text in x_test
    ]

    max_features = 21128
    # cut texts after this number of words (among top max_features most common words)
    maxlen = 128
    batch_size = 32

    print(len(x_train), 'train sequences')
    print(len(x_dev), 'dev sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = keras.preprocessing.sequence.pad_sequences(x_train,
                                                         maxlen=maxlen)
    x_dev = keras.preprocessing.sequence.pad_sequences(x_dev, maxlen=maxlen)
    x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_dev shape:', x_dev.shape)
    print('x_test shape:', x_test.shape)

    print('Build model...')
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(max_features, 200))
    model.add(keras.layers.LSTM(300, dropout=0.2, recurrent_dropout=0.2))
    model.add(keras.layers.Dense(1, activation='sigmoid'))

    # try using different optimizers and different optimizer configs
    # metrics option 1: use built-in Keras functions or custom metric function names
    # model.compile(loss='binary_crossentropy',optimizer='adam'
    #               ,metrics=['accuracy',metric_precision,metric_recall,metric_F1score])
    # metrics option 2: use metric instances; these exist only in tensorflow.keras.metrics
    metrics = keras.metrics
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy',
                           metrics.Precision(),
                           metrics.Recall()])

    print('Train...')
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=15,
              verbose=1,
              validation_data=(x_dev, y_dev))

    # way 1 to measure accuracy on the test set
    y_predicts = model.predict(x_test, batch_size=batch_size, verbose=1)
    #
    print('y_predicts.shape:', y_predicts.shape)
    print('y_predicts:', y_predicts)
    # if the predictions have more than one column, take the argmax column index as each sample's predicted class
    if y_predicts.shape[-1] > 1:
        print('if true')
        y_predicts = y_predicts.argmax(axis=-1).tolist()
    else:
        print('if false')
        y_predicts = (y_predicts > 0.5).astype('int32').tolist()
    right_num = 0
    total = len(y_test)
    for i in range(total):
        if y_predicts[i][0] == y_test[i]:
            right_num += 1
    result = 'Test accuracy:%.2f' % (right_num * 100 / total)
    # way 2: simply evaluate the test set with the Keras model's evaluate method
    evaluate = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
    result += '\n=========================================\n' + 'loss,accuracy,precision,recall,f1-score:' + str(
        evaluate)
    # way 3: use scikit-learn's classification_report to compute precision, recall, and F1
    y_predict = model.predict_classes(x_test, batch_size=batch_size, verbose=1)
    report = classification_report(y_test, y_predict, digits=4)
    result += '\n=========================================\n' + report
    print(result)
    with open(output_path + 'train_lstm_result.txt', 'w',
              encoding='utf-8') as f:
        f.write(result)
    # save the trained model
    model.save(output_path + 'weibo_lstm_model.h5')
    print('模型保存成功')
Example 17
class ModelServer:
    def __init__(self, param):

        self.model_path = os.path.abspath(param["model_path"])
        self.bert_config_file = os.path.abspath(param["bert_config_file"])
        bert_config = modeling.BertConfig.from_json_file(self.bert_config_file)
        self.fulltoken = FullTokenizer(os.path.abspath(param["vocab_file"]))
        self.vocab_dict = self.fulltoken.vocab

        target_start_ids = self.vocab_dict["[CLS]"]
        target_end_ids = self.vocab_dict["[SEP]"]

        num_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))
        tf.logging.info("num_gpus is {}".format(num_gpus))
        if param["use_mul_gpu"]:
            distribute = tf.contrib.distribute.MirroredStrategy(
                num_gpus=num_gpus)
        else:
            distribute = None
        run_config = tf.estimator.RunConfig(model_dir=os.path.abspath(
            self.model_path),
                                            save_summary_steps=200,
                                            keep_checkpoint_max=2,
                                            save_checkpoints_steps=3000,
                                            train_distribute=distribute,
                                            eval_distribute=distribute)
        self.input_max_seq_length = param["max_seq_length"]
        model_fn = model_fn_builder(
            bert_config,
            init_checkpoint=None,
            learning_rate=0.0001,
            num_train_steps=10000,
            num_warmup_steps=100,
            use_one_hot_embeddings=False,  # when use tpu ,it's True
            input_seq_length=param["max_seq_length"],
            target_seq_length=param["max_target_seq_length"],
            target_start_ids=target_start_ids,
            target_end_ids=target_end_ids,
            batch_size=param["batch_size"],
            mode_type=param["mode_type"])
        self.estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                config=run_config)

    #input:[(str_mask_tokens,str_labels),list_str_mask_words]
    #label 0:Not mentioned,
    #   1:Negative,
    #   2:Neutral,
    #   3:Positive
    def predict(self, inputs, limitNum=3):
        predicts = []
        if not isinstance(inputs, list):
            inputs = [inputs]

        def token_input():
            for input in inputs:
                tokens = input[0]
                labels = [int(label) for label in input[1]][:20]
                mask_words = input[2]
                assert max(labels) < 4 and min(labels) >= 0
                tokens = self.fulltoken.tokenize(
                    tokens)[:self.input_max_seq_length - 2]

                def replace_Mask(tokens, mask_words):
                    mask_index = []
                    first_maskwords = [x[0] for x in mask_words]

                    for index, token in enumerate(tokens):
                        if token in first_maskwords:
                            for mask_words_x in mask_words:
                                if token == mask_words_x[0]:
                                    _token = "".join([
                                        _t.replace("#", '')
                                        for _t in tokens[index:index +
                                                         len(mask_words_x)]
                                    ])
                                    if _token == mask_words_x:
                                        for i in range(len(mask_words_x)):
                                            mask_index.append(index + i)
                                        mask_words = [
                                            x_ for x_ in mask_words
                                            if x_ != mask_words_x
                                        ]
                                        first_maskwords = [
                                            x[0] for x in mask_words
                                        ]
                        if len(mask_words) < 1:
                            break
                    for mask_index_ in mask_index:
                        tokens[mask_index_] = '[MASK]'
                    return tokens

                tokens = replace_Mask(tokens, mask_words)
                ids = self.fulltoken.convert_tokens_to_ids(['[CLS]'] + tokens +
                                                           ['[SEP]'])
                input_mask = [1] * len(ids)
                segment_ids = [0] * self.input_max_seq_length
                while len(ids) < self.input_max_seq_length:
                    ids.append(0)
                    input_mask.append(0)
                while len(labels) < 20:
                    labels.append(0)

                yield ([ids], [input_mask], [labels], [segment_ids])

        def input_fn():

            dataset = tf.data.Dataset.from_generator(
                token_input, (tf.int64, tf.int64, tf.int64, tf.int64),
                output_shapes=(tf.TensorShape([
                    None, self.input_max_seq_length
                ]), tf.TensorShape([None, self.input_max_seq_length]),
                               tf.TensorShape([None, 20]),
                               tf.TensorShape(
                                   [None, self.input_max_seq_length])))
            dataset = dataset.map(
                lambda ids, input_mask, labels, segment_ids: {
                    "sentiment_labels": labels,
                    "input_token_ids": ids,
                    "input_mask": input_mask,
                    "target_token_ids": tf.zeros_like([1, 1]),
                    "target_mask": tf.zeros_like([1, 1]),
                    "segment_ids": segment_ids
                })

            # (ids, input_mask, labels, segment_ids)=dataset
            # features={
            #     "sentiment_labels": labels,
            #     "input_token_ids": ids,
            #     "input_mask": input_mask,
            #     "target_token_ids": tf.zeros_like([1, 1]),
            #     "target_mask": tf.zeros_like([1, 1]),
            #     "segment_ids": segment_ids}
            #
            # return features

            return dataset

        result = self.estimator.predict(input_fn=input_fn)
        for prediction in result:
            sample_id = prediction['sample_id'][:, :limitNum].T.tolist()
            ans = []
            for sample_id_ in sample_id:
                token = self.fulltoken.convert_ids_to_tokens(sample_id_)
                ans.append("".join(token[:-1]))
            predicts.append(ans)
            input = prediction['inputs'].tolist()
            print(self.fulltoken.convert_ids_to_tokens(input))

        return predicts
Example 18
class ApiModel:
    def __init__(self):
        self.THRESHOLD = 0.1
        self.PROB_THRESHOLD = 0.8
        
        self.LABELS_32 = [
            "sentimental",
            "afraid",
            "proud",
            "faithful",
            "terrified",
            "joyful",
            "angry",
            "sad",
            "jealous",
            "grateful",
            "prepared",
            "embarrassed",
            "excited",
            "annoyed",
            "lonely",
            "ashamed",
            "guilty",
            "surprised",
            "nostalgic",
            "confident",
            "furious",
            "disappointed",
            "caring",
            "trusting",
            "disgusted",
            "anticipating",
            "anxious",
            "hopeful",
            "content",
            "impressed",
            "apprehensive",
            "devastated"
        ]

        self.MAX_SEQ_LENGTH = 50

        self.tokenizer = FullTokenizer(
            vocab_file='vocab.txt', do_lower_case=True)

        self.model = load_model('model_data/model32')

        self.matrix = np.genfromtxt('emotion_multiplier.csv')

        self.map_probabilities = np.vectorize(lambda x: 1 if x >= self.PROB_THRESHOLD else 0)

    def predict(self, text: str):

        input_ids, input_mask, segment_ids, label_ids = self._convert_single_example(
            text)

        features: str = self._serialize_features(
            input_ids, input_mask, segment_ids, label_ids)

        probabilities = self.model({'examples': [features]})[
            "probabilities"][0]
        
        # excluded_emotions = ['nostalgic', 'sentimental', 'prepared', 'anticipating']
        # emotions = [k for k,v in zip(self.LABELS_32, probabilities) if (v>self.PROB_THRESHOLD) and (k not in excluded_emotions)] # recheck
        # if len(emotions) == 0:
        #     emotions = ['neutral']

        animations = list(np.matmul(self.matrix, self.map_probabilities(probabilities)))

        top_probabilities = [(k, v)
                             for k, v in zip(self.LABELS_32, probabilities)
                             if v >= self.THRESHOLD]
        top_emotions = dict(sorted(top_probabilities, key=lambda x: -x[1]))

        return {'emotions': top_emotions, 'animations': animations}


    def _convert_single_example(self, text):
        """Modified from goemotions/bert_classifier.py"""
        tokens = self.tokenizer.tokenize(text)

        if len(tokens) > self.MAX_SEQ_LENGTH - 2:
            tokens = tokens[0:(self.MAX_SEQ_LENGTH - 2)]

        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        while len(input_ids) < self.MAX_SEQ_LENGTH:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        return input_ids, input_mask, segment_ids, [0] * len(self.LABELS_32)

    def _serialize_features(self, input_ids, input_mask, segment_ids, label_ids):
        features = {
            "input_ids": self._create_int_feature(input_ids),
            "input_mask": self._create_int_feature(input_mask),
            "segment_ids": self._create_int_feature(segment_ids),
            "label_ids": self._create_int_feature(label_ids)
        }

        tf_example = Example(features=Features(feature=features))

        return tf_example.SerializeToString()

    def _create_int_feature(self, values):
        return Feature(int64_list=Int64List(value=list(values)))
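The Example, Features, Feature, and Int64List helpers used above presumably come from TensorFlow's tf.train protobuf API; a sketch of the likely import (an assumption, not shown in this snippet):

# from tensorflow.train import Example, Features, Feature, Int64List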
Example 19
            if user_inputs.startswith("!"):
                cmd = user_inputs[1:].upper()
                if cmd == "EXIT":
                    break
                elif cmd == "HELP":
                    print(usage)
                else:
                    with os.popen(cmd) as f:
                        for line in f:
                            print(line, end="")
                continue

            user_inputs = user_inputs.split("&")
            _replace_mask = lambda x: [
                token if token != "#" else "[MASK]"
                for token in tokenizer.tokenize(x)
            ]

            sentence = _replace_mask(user_inputs[0])
            print("第一个句子:", "".join(sentence))
            if len(user_inputs) > 1:
                sentence_next = _replace_mask(user_inputs[1])
                print("第二个句子:", "".join(sentence_next))
            else:
                sentence_next = None
            example = create_single_example(tokenizer, sentence, sentence_next)
            scores, indices, nsp_pred = sess.run(
                [mlm_scores, mlm_indices, nsp_predictions], feed_dict=example)

            if sentence_next is not None:
                print("输入句子相关联的概率: {:.3f}".format(nsp_pred[0]))
Example 20
class NERTransformer(object):
    '''
    Input: a dict with ['text'], ['spo_list'], ['docid'] and ['entity_list']
    Output: tsv file with ['text_a'], ['labels'], tokens separated by u"". Keep the doc order as in json.

    Todo: add docid?
    '''
    def __init__(self, sep=u"", do_lower_case=True):
        self.tokenizer = FullTokenizer(vocab_file='config/vocab.txt',
                                       do_lower_case=do_lower_case)
        self.sep = sep

    def _transform_one(self, instance):
        '''
        Tokenize a piece of text and generate a sequence of NER labels for tokens.
        '''
        text_tokens = self.tokenizer.tokenize(instance['text'])
        labels = ["O"] * len(text_tokens)

        entities = instance.get('entity_list', [])
        n_overlap = 0
        for e in entities:
            e_tokens = self.tokenizer.tokenize(e['text'])
            try:
                e_start, e_end = self._find_sublist_boundary(
                    e_tokens, text_tokens)
            except:
                continue

            # if the span already labelled, skip
            if len(set(labels[e_start:e_end + 1])) > 1:
                # print('Overlap.')
                n_overlap += 1
                continue

            # Add entity BIO labels (57 labels)
            labels[e_start] = 'B-%s' % e['type']
            labels[e_start + 1:e_end +
                   1] = ['I-%s' % e['type']] * (len(e_tokens) - 1)
            # print('entity:', text_tokens[e_start:e_end+1], labels[e_start:e_end+1])

        return n_overlap, dict(text_a=self.sep.join(text_tokens),
                               label=self.sep.join(labels))

    def _find_sublist_boundary(self, sublist, full_list):
        '''
        Todo: A few instances cannot find sublist boundary.
        '''
        for start in (i for i, v in enumerate(full_list) if v == sublist[0]):
            if full_list[start:start + len(sublist)] == sublist:
                return (start, start + len(sublist) - 1)

    # end def

    def transform(self, instances):
        '''
        Transform a list of json instances (documents) into a list of NER instances.
        '''
        print("Json instances: %d" % len(instances))

        transformed = []
        n_total, n_overlap = 0, 0
        for i, d in enumerate(instances):
            if i % 10000 == 0:
                print('Processed %d documents.' % i)
            n_total += len(d.get('entity_list', []))
            overlap_count, ner_instance = self._transform_one(d)
            n_overlap += overlap_count
            transformed.append(ner_instance)

        print('(Entities) Total: %d, Overlapped: %d, Labelled: %d' %
              (n_total, n_overlap, n_total - n_overlap))

        return transformed
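A standalone sketch of the sublist-boundary search used by _find_sublist_boundary above:

def find_sublist_boundary(sublist, full_list):
    for start in (i for i, v in enumerate(full_list) if v == sublist[0]):
        if full_list[start:start + len(sublist)] == sublist:
            return (start, start + len(sublist) - 1)

print(find_sublist_boundary(["new", "york"], ["i", "love", "new", "york", "city"]))  # (2, 3)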
Example 21
class BERTFunction(object):
    def __init__(self,
                 bert_config_file,
                 init_checkpoint,
                 max_seq_length,
                 vocab_file,
                 num_labels,
                 use_gpu=False):
        # needed to load the pre-trained parameters
        self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
        self.init_checkpoint = init_checkpoint
        # needed for the dataset and the computation
        self.max_seq_length = max_seq_length  # the (geometric-progression style) position embedding has a hard maximum of 512 fixed in the graph; the concrete length used for a computation can be chosen freely
        self.num_labels = num_labels
        # needed for data preprocessing
        self.vocab_file = vocab_file
        self.tokenizer = FullTokenizer(self.vocab_file,
                                       do_lower_case=False)  # cased model by default
        # gpu
        self.use_gpu = use_gpu

        self.graph = tf.Graph()  # declare the computation graph
        with self.graph.as_default():
            # define placeholders
            self.input_ids = tf.placeholder(dtype=tf.int64,
                                            shape=(None, self.max_seq_length))
            self.input_mask = tf.placeholder(dtype=tf.int64,
                                             shape=(None, self.max_seq_length))
            self.segment_ids = tf.placeholder(dtype=tf.int64,
                                              shape=(None,
                                                     self.max_seq_length))

            # define the computation
            (self.logits, self.probabilities) = create_predict_model(
                self.bert_config, self.input_ids, self.input_mask,
                self.segment_ids, self.num_labels)

            # load the pre-trained parameters
            self.tvars = tf.trainable_variables()  # once the graph is built, the trainable variables exist

            self.initialized_variable_names = {}
            if self.init_checkpoint:  # init_checkpoint is a pre-trained BERT (or previously trained) ckpt passed on the command line
                (
                    self.assignment_map,
                    self.
                    initialized_variable_names  # values of the usable variables from init_checkpoint (the intersection of the pre-trained model and this task's graph)
                ) = modeling.get_assignment_map_from_checkpoint(
                    self.tvars, self.init_checkpoint)
            tf.train.init_from_checkpoint(
                self.init_checkpoint,
                self.assignment_map)  # must be part of the graph, but then when does it actually run?

            init = tf.global_variables_initializer(
            )  # in theory, if the model has been fully restored, re-initialization is no longer needed

        # create the session; it lives as long as this object, keeps the restored parameters, and is reused for new inputs
        config = None
        if self.use_gpu:
            config = tf.ConfigProto(log_device_placement=True)
            config.gpu_options.allow_growth = True
            print("trying to use gpu")
        else:
            print("using cpu")
        self.sess = tf.Session(graph=self.graph, config=config)
        self.sess.run(init)

    def print_tvars_names(self):
        print(self.tvars)

    def print_tvar_value(self, i):  # get the tvar at index i of the trainable_variables list
        try:
            print(self.sess.run(self.tvars[i]))
        except:
            print("can't get it, may be the index is out of range.")

    def cal(self, features):
        feed_dict = {
            self.input_ids: features["input_ids"],
            self.input_mask: features["input_mask"],
            self.segment_ids: features["segment_ids"]
        }
        prob = self.sess.run(self.probabilities, feed_dict=feed_dict)
        print("prob: \n", prob)
        return prob

    def process_batch_input(self, text_as,
                            text_bs):  # a slimmed-down convert_single_example
        input_idss = []
        input_masks = []
        segment_idss = []
        for text_a, text_b in zip(text_as, text_bs):
            print(text_a)
            print(text_b)
            tokens_a = self.tokenizer.tokenize(text_a)
            tokens_b = self.tokenizer.tokenize(text_b)
            # the _truncate_seq_pair logic
            while True:
                total_length = len(tokens_a) + len(tokens_b)
                if total_length <= self.max_seq_length - 3:
                    break
                if len(tokens_a) > len(tokens_b):
                    tokens_a.pop()
                else:
                    tokens_b.pop()
            # slimmed-down convert_single_example
            tokens = []
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in tokens_a:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

            input_mask = [1] * len(input_ids)

            while len(input_ids) < self.max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            assert len(input_ids) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length
            assert len(segment_ids) == self.max_seq_length

            print("tokens: %s" % " ".join([printable_text(x) for x in tokens]))

            input_idss.append(input_ids)
            input_masks.append(input_mask)
            segment_idss.append(segment_ids)
#          print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
#          print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
#          print("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

        return {
            "input_ids": nparray(input_idss, dtype=npint64),
            "input_mask": nparray(input_masks, dtype=npint64),
            "segment_ids": nparray(segment_idss, dtype=npint64)
        }

    def batch_cal(self, text_as, text_bs):
        features = self.process_batch_input(text_as, text_bs)
        return self.cal(features)

    def process_context_list_and_candidates(self, context_list, candidates):
        # split max_seq_length in half between text_a and text_b to keep them balanced;
        # if text_b (the candidate reply) is shorter, give the leftover space to text_a (the preceding context)
        input_idss = []
        input_masks = []
        segment_idss = []
        for cdd in candidates:
            t_c = self.tokenizer.tokenize(cdd)
            length = len(t_c) + 2  #'[CLS]', '[SEP]'
            t_us = []
            tokens = []
            for utterance in context_list[-1::-1]:
                t_u = self.tokenizer.tokenize(utterance)
                length += len(t_u) + 1
                while length > self.max_seq_length:
                    if len(t_c) + 1 > self.max_seq_length / 2:
                        t_c.pop()
                        length -= 1
                    else:
                        t_u.pop()
                        length -= 1
                t_u.append('[SEP]')
                t_us = t_u + t_us
                if length == self.max_seq_length and len(
                        t_c) + 1 <= self.max_seq_length / 2:
                    break
            tokens.append('[CLS]')
            tokens.extend(t_us)
            tokens.extend(t_c)
            tokens.append('[SEP]')
            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            segment_ids = [1] * (len(t_us) + 1) + ([0] * (len(t_c) + 1))

            while len(input_ids) < self.max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)
            assert len(input_ids) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length
            assert len(segment_ids) == self.max_seq_length

            print("tokens: %s" % " ".join([printable_text(x) for x in tokens]))
            print("length:" + str(len(tokens)))

            input_idss.append(input_ids)
            input_masks.append(input_mask)
            segment_idss.append(segment_ids)

        return {
            "input_ids": nparray(input_idss, dtype=npint64),
            "input_mask": nparray(input_masks, dtype=npint64),
            "segment_ids": nparray(segment_idss, dtype=npint64)
        }

    def context_list_and_candidates_cal(self, context_list, candidates):
        features = self.process_context_list_and_candidates(
            context_list, candidates)
        return self.cal(features)


# usage example
# NOTE: max_seq_length fixes the tensor shapes in the graph; otherwise, given the transformer architecture, the length could be more flexible
# import os
# pretrained_dir="./pretrained/multi_cased_L-12_H-768_A-12/"
# init_checkpoint = os.path.join(pretrained_dir, "./bert_model.ckpt")
# bert_config_file=os.path.join(pretrained_dir, "./bert_config.json")
# vocab_file = os.path.join(pretrained_dir, "./vocab.txt")
# max_seq_length =160
# num_labels = 2
#
# func=BERTFunction(bert_config_file, init_checkpoint, max_seq_length, vocab_file, num_labels)
# res=func.cal("I'm gogo, who are you?","I'm Trump, I'm fine.")
parser.add_argument("--separator",
                    required=False,
                    type=str,
                    default=" ",
                    help="token separator. DEFAULT:whitespace")
parser.add_argument("--case_sensitive",
                    action="store_true",
                    help="case sensitive tokenizatoin. DEFAULT:False")
parser.add_argument("--eos",
                    required=False,
                    type=str,
                    default="",
                    help="end-of-sentence characters. DEFAULT:empty")
args = parser.parse_args()

sys.path.append(args.bert_dir)

from tokenization import FullTokenizer

tokenizer = FullTokenizer(vocab_file=args.vocab_file,
                          do_lower_case=not args.case_sensitive)

separator = args.separator
eos = args.eos
with io.open(args.corpus, mode="r") as ifs:
    for sentence in ifs:
        sentence = sentence.strip()
        if len(sentence) == 0:
            continue
        lst_token = tokenizer.tokenize(sentence)
        print(separator.join(lst_token) + eos)
Example 23
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tokenization import FullTokenizer

path = "./"
pd_all = pd.read_csv(os.path.join(path, "weibo_senti_100k.csv"))

tokenizer = FullTokenizer("vocab.txt")

pd_all = shuffle(pd_all)

x_data, y_data = pd_all.review.values, pd_all.label.values

x_data = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    for text in x_data
]

x_train, x_test, y_train, y_test = train_test_split(np.array(x_data),
                                                    y_data,
                                                    test_size=0.2)

max_features = 21128
# cut texts after this number of words (among top max_features most common words)
maxlen = 128
batch_size = 32

print('Loading data...')
#(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)