Example 1
    def parse_line(self, line, max_seq_len=512):
        """ parse one line to token_ids, sentence_ids, pos_ids, label
        """

        line = line.strip().split(",")
        assert len(line) == 3, \
            "One sample must have %d fields!" % 3

        text_left, text_right, masklabel = line
        tokenizer = FullTokenizer(self.vocab_path)
        # tokenizer = FullTokenizer(vocab_path)
        text_left = tokenizer.tokenize(text_left)
        masklabel = tokenizer.tokenize(masklabel)
        masklabel_ = len(masklabel) * ["[MASK]"]
        text_right = tokenizer.tokenize(text_right)
        all_tokens = text_left + masklabel_ + text_right
        token_ids = tokenizer.convert_tokens_to_ids(all_tokens)
        sent_ids = [0] * len(all_tokens)
        pos_ids = list(range(len(all_tokens)))
        input_mask = [1.0] * len(all_tokens)
        # Collect the positions of the [MASK] tokens.
        mask_pos = []
        for idx, token_id in enumerate(token_ids):
            if token_id == self.mask_id:
                mask_pos.append(idx)
        # Convert the original (unmasked) label tokens to ids.
        mask_label = list(tokenizer.convert_tokens_to_ids(masklabel))
        assert len(token_ids) == len(sent_ids) == len(pos_ids) == len(
            input_mask
        ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids) == len(input_mask)"
        if len(token_ids) > max_seq_len:
            return None
        return [token_ids, sent_ids, pos_ids, input_mask, mask_pos, mask_label]
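For intuition, here is a minimal standalone trace of the mask-position bookkeeping above. The ids are illustrative and assume the standard BERT vocab, where 103 is [MASK]:

# Toy trace of the mask_pos loop: collect indices whose id equals mask_id.
token_ids = [2023, 103, 103, 2003]
mask_id = 103
mask_pos = [idx for idx, tid in enumerate(token_ids) if tid == mask_id]
print(mask_pos)  # [1, 2]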
Example 2
import numpy as np
import tqdm
from tokenization import FullTokenizer

def preprocess(data):
    tokenizer = FullTokenizer(vocab_file)
    tok_ip = np.zeros((len(data), 128), dtype="int32")
    sent_ip = np.zeros((len(data), 128), dtype="int8")
    pos_ip = np.zeros((len(data), 128), dtype="int8")
    masks = np.zeros((len(data), 128), dtype="int8")

    for pos, text in tqdm.tqdm_notebook(enumerate(data)):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        if len(tok) > 128:
            tok = tok[:127] + ["[SEP]"]
        pad_len = 128 - len(tok)
        tok_len = len(tok)
        tok0_len = len(tok0)
        tok = tokenizer.convert_tokens_to_ids(tok) + [0] * pad_len
        pos_val = range(128)
        sent = [0] * tok0_len + [1] * (tok_len - tok0_len) + [0] * pad_len
        mask = [1] * tok_len + [0] * pad_len

        tok_ip[pos] = tok
        sent_ip[pos] = sent
        pos_ip[pos] = pos_val
        masks[pos] = mask

    masks = masks[:, None, None, :]
    return tok_ip, sent_ip, pos_ip, masks
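A hedged usage sketch: preprocess expects data to be a sequence of (text_a, text_b) pairs, and the module-level vocab_file must point at a real BERT vocabulary. The file name and texts below are assumptions, not part of the original:

vocab_file = "vocab.txt"  # assumption: a standard BERT vocab on disk
data = [("a great movie", "would watch again"),
        ("terrible plot", "fell asleep twice")]
tok_ip, sent_ip, pos_ip, masks = preprocess(data)
print(tok_ip.shape, masks.shape)  # (2, 128) (2, 1, 1, 128)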
Example 3
def parse_sequence(tokenizer: t10n.FullTokenizer,
                   sequence: str) -> SequenceParseResult:
    tokens = tokenizer.tokenize(sequence)
    tokens.insert(0, '[CLS]')
    tokens.append('[SEP]')

    # Zero is the conventional token_type for the first segment in BERT.
    token_type_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # All real tokens, including [CLS], must be visible to attention.
    attention_mask = [1] * len(tokens)

    # Default for our model
    max_seq_length = 128

    # Pad all arrays out to the fixed model length
    while len(input_ids) < max_seq_length:

        # Padding conventionally belongs to segment 0
        token_type_ids.append(0)

        # Zero is the [PAD]-token for the BERT-vocab
        input_ids.append(0)

        # Padding must be excluded from the attention-mask
        attention_mask.append(0)

    return SequenceParseResult(
        tokens=tokens,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        input_ids=input_ids,
    )
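For a concrete trace of the fields returned above (the sequence is illustrative; actual ids depend on the vocab file):

# sequence = "hello world"
# tokens         -> ['[CLS]', 'hello', 'world', '[SEP]']
# token_type_ids -> [0, 0, 0, 0], then 0-padded to 128
# attention_mask -> [1, 1, 1, 1], then 0-padded to 128
# input_ids      -> four vocab ids, then 0 ([PAD]) up to 128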
Example 4
def convert_single_example(ex_index, example: InputExample, tag_list: list, label_list: list, max_seq_length,
                           tokenizer: tokenization.FullTokenizer):
    query = tokenizer.tokenize(example.text)

    if len(query) > max_seq_length - 2:
        query = query[0:(max_seq_length - 2)]

    tokens = ["[CLS]"]
    tags = ["[CLS]"]
    for idx, token in enumerate(query):
        tokens.append(token)
        tags.append(example.tag[idx])
    tokens.append("[SEP]")
    tags.append("[SEP]")
    segment_ids = [0] * len(tokens)

    tag_map = {}
    for idx, tag in enumerate(tag_list):
        tag_map[tag] = idx
    label_map = {}
    for idx, label in enumerate(label_list):
        label_map[label] = idx

    tag_ids = [tag_map[tag] for tag in tags]
    label_id = label_map[example.label]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        tag_ids.append(0)

    if ex_index < 5:
        logger.info("*** Example ***")
        logger.info("guid: %s" % example.guid)
        logger.info("tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
        logger.info("tag: %s" % " ".join(tags))
        logger.info("label: %s" % example.label)

    feature = InputFeature(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        tag_ids=tag_ids,
        label_id=label_id
    )

    return feature
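A hedged call sketch; the InputExample fields (guid, text, tag, label) are inferred from how the function reads them, and the tag/label lists and vocab path are placeholders. Note that tag_list must contain "[CLS]" and "[SEP]" since the function inserts them as tags:

example = InputExample(guid="dev-1", text="John lives in Paris",
                       tag=["B-PER", "O", "O", "B-LOC"], label="positive")
feature = convert_single_example(
    ex_index=0, example=example,
    tag_list=["[CLS]", "[SEP]", "O", "B-PER", "B-LOC"],
    label_list=["negative", "positive"],
    max_seq_length=128,
    tokenizer=tokenization.FullTokenizer("vocab.txt"))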
Example 5
class BERTTextEncoder(TextEncoder):
    def __init__(self, vocab_file: str, do_lower_case: bool = True) -> None:
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case)
        super().__init__(len(self.tokenizer.vocab))
        self.bert_unk_id = self.tokenizer.vocab['[UNK]']
        self.bert_msk_id = self.tokenizer.vocab['[MASK]']

    def standardize_ids(self, ids: List[int]) -> List[int]:
        for i in range(len(ids)):
            if ids[i] == self.bert_unk_id:  # UNK
                ids[i] = 0
            else:  # VOCAB
                ids[i] -= self.bert_msk_id
        return ids

    def encode(self, sent: str) -> List[int]:
        return self.standardize_ids(
            self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(sent)))
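Reading standardize_ids: every [UNK] collapses to 0, and every other id is shifted down by the [MASK] id, re-basing the BERT vocabulary for a downstream embedding table. With the standard uncased vocab (an assumption), [UNK] = 100 and [MASK] = 103:

# encoder = BERTTextEncoder('vocab.txt')   # hypothetical instance
# encoder.standardize_ids([100, 103, 104, 2000])
# -> [0, 0, 1, 1897]   # [UNK] -> 0; all other ids minus 103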
Example 6
bert_tokenizer = FullTokenizer(
    vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy(),
    do_lower_case=bert_layer.resolved_object.do_lower_case.numpy())

# TODO:
# Documents longer than 512 tokens can't be encoded by BERT,
# since its positional encoding has a hard limit of 512 positions.
# For better results we may need to summarize the document into <= 512 tokens,
# or encode sentence by sentence and then pool the results.
maxlen = 256

# TODO:
# We need to manually handle the [CLS] and [SEP] special tokens for sequence
# beginning and ending (a sketch follows this example).

# Encode text with padding, masking, and segmentation (required by BERT even if we don't use it).
tok_seq_train = [bert_tokenizer.tokenize(text) for text in imdb_reviews_train]
wid_seq_train = [
    bert_tokenizer.convert_tokens_to_ids(toks)[:maxlen]
    for toks in tok_seq_train
]
wid_seq_train_padded = pad_sequences(wid_seq_train,
                                     padding="post",
                                     maxlen=maxlen)
wid_seq_train_mask = (wid_seq_train_padded > 0).astype(int)
segment_ids_train = np.zeros_like(wid_seq_train_mask)

tok_seq_test = [bert_tokenizer.tokenize(text) for text in imdb_reviews_test]
wid_seq_test = [
    bert_tokenizer.convert_tokens_to_ids(toks)[:maxlen]
    for toks in tok_seq_test
]
wid_seq_test_padded = pad_sequences(wid_seq_test,
                                    padding="post",
                                    maxlen=maxlen)
wid_seq_test_mask = (wid_seq_test_padded > 0).astype(int)
segment_ids_test = np.zeros_like(wid_seq_test_mask)
Example 7
class BertInference(object):
    """
    The bert model.
    """
    def __init__(self, bert_meta):
        self.graph = self._load_graph(bert_meta.model_file)

        self.tokenizer = FullTokenizer(vocab_file=bert_meta.vocab_file,
                                       do_lower_case=True)
        self.max_seq_length = 128

        # Input.
        self.input_ids = self.graph.get_tensor_by_name('infer/input_ids:0')
        self.input_mask = self.graph.get_tensor_by_name('infer/input_mask:0')
        self.segment_ids = self.graph.get_tensor_by_name('infer/segment_ids:0')
        # Output.
        self.predictions = self.graph.get_tensor_by_name(
            'infer/loss/Softmax:0')

        self.sess = tf.Session(graph=self.graph)

        # Warm up the session with a dummy query ("预热一下" = "warm up").
        self.inference(BertInputPackage(u'预热一下'))

    def inference(self, bert_input):
        """
        Call model.
        """
        input_ids, input_mask, segment_ids = self._convert_single_example(
            bert_input.query)
        preds_evaluated = self.sess.run(self.predictions,
                                        feed_dict={
                                            self.input_ids: [input_ids],
                                            self.input_mask: [input_mask],
                                            self.segment_ids: [segment_ids]
                                        })

        return preds_evaluated

    def _load_graph(self, frozen_graph_filename):
        with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())

        with tf.Graph().as_default() as graph:
            tf.import_graph_def(graph_def,
                                input_map=None,
                                return_elements=None,
                                name="infer",
                                op_dict=None,
                                producer_op_list=None)

        return graph

    def _convert_single_example(self, text_a):
        tokens_a = self.tokenizer.tokenize(text_a)

        if len(tokens_a) > self.max_seq_length - 2:
            tokens_a = tokens_a[0:(self.max_seq_length - 2)]

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        input_mask = [1] * len(input_ids)

        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        return input_ids, input_mask, segment_ids
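A hedged construction sketch; BertMeta below is a hypothetical stand-in for any object exposing the model_file and vocab_file attributes that __init__ reads, and BertInputPackage is assumed to wrap a single query string as used above:

from collections import namedtuple

BertMeta = namedtuple("BertMeta", ["model_file", "vocab_file"])  # hypothetical
model = BertInference(BertMeta("frozen_model.pb", "vocab.txt"))
probs = model.inference(BertInputPackage(u"what a pleasant surprise"))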
Example 8
def main():
    # pd_all = pd.read_csv(os.path.join(path, "weibo_senti_100k.csv"))
    # pd_all = shuffle(pd_all)
    # x_data, y_data = pd_all.review.values, pd_all.label.values
    # x_data = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) for text in x_data]
    # x_train, x_test, y_train, y_test = train_test_split(np.array(x_data), y_data, test_size=0.2)
    #(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    tokenizer = FullTokenizer("vocab.txt")
    print('Loading data...')
    # Load the training set.
    train_data = pd.read_csv(os.path.join(path, "train.csv"))
    # Load the validation set.
    dev_data = pd.read_csv(os.path.join(path, "dev.csv"))
    # Load the test set.
    test_data = pd.read_csv(os.path.join(path, "test.csv"))
    x_train, y_train = train_data.review.values, train_data.label.values
    x_dev, y_dev = dev_data.review.values, dev_data.label.values
    x_test, y_test = test_data.review.values, test_data.label.values
    # tokenize to ids
    x_train = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for text in x_train
    ]
    x_dev = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for text in x_dev
    ]
    x_test = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for text in x_test
    ]

    max_features = 21128
    # cut texts after this number of words (among top max_features most common words)
    maxlen = 128
    batch_size = 32

    print(len(x_train), 'train sequences')
    print(len(x_dev), 'dev sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = keras.preprocessing.sequence.pad_sequences(x_train,
                                                         maxlen=maxlen)
    x_dev = keras.preprocessing.sequence.pad_sequences(x_dev, maxlen=maxlen)
    x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_dev shape:', x_dev.shape)
    print('x_test shape:', x_test.shape)

    print('Build model...')
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(max_features, 200))
    model.add(keras.layers.LSTM(300, dropout=0.2, recurrent_dropout=0.2))
    model.add(keras.layers.Dense(1, activation='sigmoid'))

    # try using different optimizers and different optimizer configs
    # Metrics option 1: Keras built-in names or custom metric functions, e.g.
    # model.compile(loss='binary_crossentropy', optimizer='adam',
    #               metrics=['accuracy', metric_precision, metric_recall, metric_F1score])
    # Metrics option 2: metric instances, available in tensorflow.keras.metrics.
    metrics = keras.metrics
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy',
                           metrics.Precision(),
                           metrics.Recall()])

    print('Train...')
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=15,
              verbose=1,
              validation_data=(x_dev, y_dev))

    # Method 1: compute test accuracy by hand from predict().
    y_predicts = model.predict(x_test, batch_size=batch_size, verbose=1)
    print('y_predicts.shape:', y_predicts.shape)
    print('y_predicts:', y_predicts)
    # If predictions have more than one column, take the argmax as the class;
    # otherwise threshold the single sigmoid output at 0.5.
    if y_predicts.shape[-1] > 1:
        print('if true')
        y_predicts = y_predicts.argmax(axis=-1).tolist()
    else:
        print('if false')
        y_predicts = (y_predicts > 0.5).astype('int32').tolist()
    right_num = 0
    total = len(y_test)
    for i in range(total):
        if y_predicts[i][0] == y_test[i]:
            right_num += 1
    result = 'Test accuracy:%.2f' % (right_num * 100 / total)
    # Method 2: simply call model.evaluate on the test set.
    evaluate = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
    result += '\n=========================================\n' + 'loss,accuracy,precision,recall,f1-score:' + str(
        evaluate)
    # Method 3: use scikit-learn's classification_report for precision/recall/F1.
    y_predict = model.predict_classes(x_test, batch_size=batch_size, verbose=1)
    report = classification_report(y_test, y_predict, digits=4)
    result += '\n=========================================\n' + report
    print(result)
    with open(output_path + 'train_lstm_result.txt', 'w',
              encoding='utf-8') as f:
        f.write(result)
    # Save the trained model.
    model.save(output_path + 'weibo_lstm_model.h5')
    print('Model saved successfully')
Example 9
class ApiModel:
    def __init__(self):
        self.THRESHOLD = 0.1
        self.PROB_THRESHOLD = 0.8
        
        self.LABELS_32 = [
            "sentimental",
            "afraid",
            "proud",
            "faithful",
            "terrified",
            "joyful",
            "angry",
            "sad",
            "jealous",
            "grateful",
            "prepared",
            "embarrassed",
            "excited",
            "annoyed",
            "lonely",
            "ashamed",
            "guilty",
            "surprised",
            "nostalgic",
            "confident",
            "furious",
            "disappointed",
            "caring",
            "trusting",
            "disgusted",
            "anticipating",
            "anxious",
            "hopeful",
            "content",
            "impressed",
            "apprehensive",
            "devastated"
        ]

        self.MAX_SEQ_LENGTH = 50

        self.tokenizer = FullTokenizer(
            vocab_file='vocab.txt', do_lower_case=True)

        self.model = load_model('model_data/model32')

        self.matrix = np.genfromtxt('emotion_multiplier.csv')

        self.map_probabilities = np.vectorize(lambda x: 1 if x >= self.PROB_THRESHOLD else 0)

    def predict(self, text: str):

        input_ids, input_mask, segment_ids, label_ids = self._convert_single_example(
            text)

        features: str = self._serialize_features(
            input_ids, input_mask, segment_ids, label_ids)

        probabilities = self.model({'examples': [features]})[
            "probabilities"][0]
        
        # excluded_emotions = ['nostalgic', 'sentimental', 'prepared', 'anticipating']
        # emotions = [k for k,v in zip(self.LABELS_32, probabilities) if (v>self.PROB_THRESHOLD) and (k not in excluded_emotions)] # recheck
        # if len(emotions) == 0:
        #     emotions = ['neutral']

        animations = list(np.matmul(self.matrix, self.map_probabilities(probabilities)))

        top_probabilities = [(k, v)
                             for k, v in zip(self.LABELS_32, probabilities)
                             if v >= self.THRESHOLD]
        top_emotions = dict(sorted(top_probabilities, key=lambda x: -x[1]))

        return {'emotions': top_emotions, 'animations': animations}


    def _convert_single_example(self, text):
        """Modified from goemotions/bert_classifier.py"""
        tokens = self.tokenizer.tokenize(text)

        if len(tokens) > self.MAX_SEQ_LENGTH - 2:
            tokens = tokens[0:(self.MAX_SEQ_LENGTH - 2)]

        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        while len(input_ids) < self.MAX_SEQ_LENGTH:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        return input_ids, input_mask, segment_ids, [0] * len(self.LABELS_32)

    def _serialize_features(self, input_ids, input_mask, segment_ids, label_ids):
        features = {
            "input_ids": self._create_int_feature(input_ids),
            "input_mask": self._create_int_feature(input_mask),
            "segment_ids": self._create_int_feature(segment_ids),
            "label_ids": self._create_int_feature(label_ids)
        }

        tf_example = Example(features=Features(feature=features))

        return tf_example.SerializeToString()

    def _create_int_feature(self, values):
        return Feature(int64_list=Int64List(value=list(values)))
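A hedged usage sketch: the vocab, model, and CSV paths read in __init__ must already exist on disk, and the printed values below are purely illustrative:

api = ApiModel()
result = api.predict("I can't believe I won the lottery!")
print(result["emotions"])      # e.g. {'excited': 0.91, 'surprised': 0.35}
print(len(result["animations"]))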
Example 10
class BERTFunction(object):
    def __init__(self,
                 bert_config_file,
                 init_checkpoint,
                 max_seq_length,
                 vocab_file,
                 num_labels,
                 use_gpu=False):
        # Needed for loading the pretrained weights.
        self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
        self.init_checkpoint = init_checkpoint
        # Needed for the dataset and the computation.
        # The positional embedding allows at most 512 positions; that limit is
        # baked into the graph, while the actual sequence length is up to the caller.
        self.max_seq_length = max_seq_length
        self.num_labels = num_labels
        # Needed for preprocessing.
        self.vocab_file = vocab_file
        self.tokenizer = FullTokenizer(self.vocab_file,
                                       do_lower_case=False)  # cased model by default
        # gpu
        self.use_gpu = use_gpu

        self.graph = tf.Graph()  # create the computation graph
        with self.graph.as_default():
            # Define placeholders.
            self.input_ids = tf.placeholder(dtype=tf.int64,
                                            shape=(None, self.max_seq_length))
            self.input_mask = tf.placeholder(dtype=tf.int64,
                                             shape=(None, self.max_seq_length))
            self.segment_ids = tf.placeholder(dtype=tf.int64,
                                              shape=(None,
                                                     self.max_seq_length))

            # Define the forward computation.
            (self.logits, self.probabilities) = create_predict_model(
                self.bert_config, self.input_ids, self.input_mask,
                self.segment_ids, self.num_labels)

            # Load the pretrained weights.
            self.tvars = tf.trainable_variables()  # trainable variables exist once the graph is built

            self.initialized_variable_names = {}
            if self.init_checkpoint:  # a pretrained BERT or previously trained .ckpt file
                # Intersect the checkpoint's variables with the task graph's variables.
                (self.assignment_map, self.initialized_variable_names
                 ) = modeling.get_assignment_map_from_checkpoint(
                     self.tvars, self.init_checkpoint)
                # Must run inside the graph context; the checkpoint values are
                # applied when the variables are initialized below.
                tf.train.init_from_checkpoint(self.init_checkpoint,
                                              self.assignment_map)

            # In principle, once the model is fully restored from the checkpoint,
            # these initializers are overridden by the checkpoint values.
            init = tf.global_variables_initializer()

        # Create the session; it lives as long as this object, holding the restored
        # weights and serving all subsequent computations.
        config = None
        if self.use_gpu:
            config = tf.ConfigProto(log_device_placement=True)
            config.gpu_options.allow_growth = True
            print("trying to use gpu")
        else:
            print("using cpu")
        self.sess = tf.Session(graph=self.graph, config=config)
        self.sess.run(init)

    def print_tvars_names(self):
        print(self.tvars)

    def print_tvar_value(self, i):  # print the value of tvars[i]
        try:
            print(self.sess.run(self.tvars[i]))
        except IndexError:
            print("can't get it; the index may be out of range.")

    def cal(self, features):
        feed_dict = {
            self.input_ids: features["input_ids"],
            self.input_mask: features["input_mask"],
            self.segment_ids: features["segment_ids"]
        }
        prob = self.sess.run(self.probabilities, feed_dict=feed_dict)
        print("prob: \n", prob)
        return prob

    def process_batch_input(self, text_as,
                            text_bs):  # a slimmed-down convert_single_example
        input_idss = []
        input_masks = []
        segment_idss = []
        for text_a, text_b in zip(text_as, text_bs):
            print(text_a)
            print(text_b)
            tokens_a = self.tokenizer.tokenize(text_a)
            tokens_b = self.tokenizer.tokenize(text_b)
            # inlined _truncate_seq_pair; reserve 3 slots for [CLS] and two [SEP]s
            while True:
                total_length = len(tokens_a) + len(tokens_b)
                if total_length <= self.max_seq_length - 3:
                    break
                if len(tokens_a) > len(tokens_b):
                    tokens_a.pop()
                else:
                    tokens_b.pop()
            # core of convert_single_example
            tokens = []
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in tokens_a:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

            input_mask = [1] * len(input_ids)

            while len(input_ids) < self.max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            assert len(input_ids) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length
            assert len(segment_ids) == self.max_seq_length

            print("tokens: %s" % " ".join([printable_text(x) for x in tokens]))

            input_idss.append(input_ids)
            input_masks.append(input_mask)
            segment_idss.append(segment_ids)
#          print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
#          print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
#          print("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

        return {
            "input_ids": nparray(input_idss, dtype=npint64),
            "input_mask": nparray(input_masks, dtype=npint64),
            "segment_ids": nparray(segment_idss, dtype=npint64)
        }

    def batch_cal(self, text_as, text_bs):
        features = self.process_batch_input(text_as, text_bs)
        return self.cal(features)

    def process_context_list_and_candidates(self, context_list, candidates):
        # Split max_seq_length in half between text_a and text_b to keep them
        # balanced; if text_b (the candidate reply) is shorter, give the leftover
        # space to text_a (the context).
        input_idss = []
        input_masks = []
        segment_idss = []
        for cdd in candidates:
            t_c = self.tokenizer.tokenize(cdd)
            length = len(t_c) + 2  #'[CLS]', '[SEP]'
            t_us = []
            tokens = []
            for utterance in context_list[-1::-1]:
                t_u = self.tokenizer.tokenize(utterance)
                length += len(t_u) + 1
                while length > self.max_seq_length:
                    if len(t_c) + 1 > self.max_seq_length / 2:
                        t_c.pop()
                        length -= 1
                    else:
                        t_u.pop()
                        length -= 1
                t_u.append('[SEP]')
                t_us = t_u + t_us
                if length == self.max_seq_length and len(
                        t_c) + 1 <= self.max_seq_length / 2:
                    break
            tokens.append('[CLS]')
            tokens.extend(t_us)
            tokens.extend(t_c)
            tokens.append('[SEP]')
            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            segment_ids = [1] * (len(t_us) + 1) + ([0] * (len(t_c) + 1))

            while len(input_ids) < self.max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)
            assert len(input_ids) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length
            assert len(segment_ids) == self.max_seq_length

            print("tokens: %s" % " ".join([printable_text(x) for x in tokens]))
            print("length:" + str(len(tokens)))

            input_idss.append(input_ids)
            input_masks.append(input_mask)
            segment_idss.append(segment_ids)

        return {
            "input_ids": nparray(input_idss, dtype=npint64),
            "input_mask": nparray(input_masks, dtype=npint64),
            "segment_ids": nparray(segment_idss, dtype=npint64)
        }

    def context_list_and_candidates_cal(self, context_list, candidates):
        features = self.process_context_list_and_candidates(
            context_list, candidates)
        return self.cal(features)


# Usage example
# NOTE: confirm how max_seq_length constrains tensor shapes inside the model;
# by the nature of the transformer, the sequence length could otherwise be freer.
# import os
# pretrained_dir="./pretrained/multi_cased_L-12_H-768_A-12/"
# init_checkpoint = os.path.join(pretrained_dir, "./bert_model.ckpt")
# bert_config_file=os.path.join(pretrained_dir, "./bert_config.json")
# vocab_file = os.path.join(pretrained_dir, "./vocab.txt")
# max_seq_length =160
# num_labels = 2
#
# func=BERTFunction(bert_config_file, init_checkpoint, max_seq_length, vocab_file, num_labels)
# res=func.batch_cal(["I'm gogo, who are you?"], ["I'm Trump, I'm fine."])
Example 11
def create_pretraining_data_from_docs(docs,
                                      save_path,
                                      vocab_path,
                                      token_method='wordpiece',
                                      language='en',
                                      max_seq_length=128,
                                      dupe_factor=10,
                                      short_seq_prob=0.1,
                                      masked_lm_prob=0.15,
                                      max_predictions_per_seq=20):
    """docs: sequence of sequence of sentences.

    Args:
        docs: Sequence of sequence. Docs is a sequence of documents.
            A document is a sequence of sentences.
        save_path: path to save pretraining data.
        vocab_path: The vocabulary file that the BERT model was trained on.
            Only used when token_method='wordpiece'.
        token_method: string. 'wordpiece' or 'spacy'
        language: string. 'en' or 'chn'
        max_seq_length: integer. Maximum sequence length.
        dupe_factor: integer. Number of times to duplicate the input data (with different masks).
        short_seq_prob: float. Probability of creating sequences which are shorter than the maximum length.
        masked_lm_prob: float. Masked LM probability.
        max_predictions_per_seq: integer. Maximum number of masked LM predictions per sequence.
    """

    if not hasattr(docs, '__len__'):
        raise ValueError("`docs` should be sequence of sequence.")
    else:
        if not hasattr(docs[0], '__len__'):
            raise ValueError("`docs` should be sequence of sequence.")
    if token_method not in ['wordpiece', 'spacy']:
        raise ValueError(
            "`token_method` must be one of `wordpiece` and `spacy`.")
    if language not in ['en', 'chn']:
        raise ValueError("`language` should be one of `en` and `chn`.")

    if token_method == "spacy" and language == "chn":
        raise ValueError(
            "spacy tokenizer only enable when `language` is `en`.")

    if token_method == "wordpiece":
        tokenizer = FullTokenizer(vocab_path, do_lower_case=True)
    else:
        tokenizer = SpacyTokenizer(vocab_path, do_lower_case=True)

    instances = create_training_instances(
        docs,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        dupe_factor=dupe_factor,
        short_seq_prob=short_seq_prob,
        masked_lm_prob=masked_lm_prob,
        max_predictions_per_seq=max_predictions_per_seq)

    pretraining_data = dict(tokens=[],
                            segment_ids=[],
                            is_random_next=[],
                            masked_lm_positions=[],
                            masked_lm_labels=[])

    for i, instance in enumerate(instances):
        if i < 10:
            print("num-{}: {}".format(i, instance))
        pretraining_data['tokens'].append(instance.tokens)
        pretraining_data['segment_ids'].append(instance.segment_ids)
        pretraining_data['is_random_next'].append(int(instance.is_random_next))
        pretraining_data['masked_lm_positions'].append(
            instance.masked_lm_positions)
        pretraining_data['masked_lm_labels'].append(instance.masked_lm_labels)

    tokens_ids = []
    tokens_mask = []
    for tokens in pretraining_data['tokens']:
        sub_ids = tokenizer.convert_tokens_to_ids(tokens)
        sub_mask = [1] * len(sub_ids)
        tokens_ids.append(sub_ids)
        tokens_mask.append(sub_mask)

    masked_lm_ids = []
    for mask_labels in pretraining_data['masked_lm_labels']:
        sub_masked_lm_ids = tokenizer.convert_tokens_to_ids(mask_labels)
        masked_lm_ids.append(sub_masked_lm_ids)

    # input (pad to the configured lengths instead of hardcoding 128 and 20)
    tokens_ids = pad_sequences(tokens_ids,
                               maxlen=max_seq_length,
                               padding='post',
                               truncating='post')
    tokens_mask = pad_sequences(tokens_mask,
                                maxlen=max_seq_length,
                                padding='post',
                                truncating='post')
    segment_ids = pad_sequences(pretraining_data['segment_ids'],
                                maxlen=max_seq_length,
                                padding='post',
                                truncating='post')
    masked_lm_positions = pad_sequences(
        pretraining_data['masked_lm_positions'],
        maxlen=max_predictions_per_seq,
        padding='post',
        truncating='post')
    # label
    is_random_next = to_categorical(pretraining_data['is_random_next'],
                                    num_classes=2)
    masked_lm_labels = pad_sequences(masked_lm_ids,
                                     maxlen=max_predictions_per_seq,
                                     padding='post',
                                     truncating='post')

    # save
    np.savez(file=save_path,
             tokens_ids=tokens_ids,
             tokens_mask=tokens_mask,
             segment_ids=segment_ids,
             is_random_next=is_random_next,
             masked_lm_positions=masked_lm_positions,
             masked_lm_labels=masked_lm_labels)

    print("[INFO] number of train data:", len(tokens_ids))
    print("[INFO] is_random_next ratio:",
          np.sum(pretraining_data['is_random_next']) / len(is_random_next))
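A hedged call sketch showing the docs structure the validation above expects, i.e. a sequence of documents, each itself a sequence of sentences (the file paths are assumptions):

docs = [
    ["The cat sat on the mat.", "It purred loudly."],                # document 1
    ["BERT uses WordPiece tokens.", "Masking drives pretraining."],  # document 2
]
create_pretraining_data_from_docs(docs,
                                  save_path="pretraining_data.npz",
                                  vocab_path="vocab.txt")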
Example 12
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tokenization import FullTokenizer

path = "./"
pd_all = pd.read_csv(os.path.join(path, "weibo_senti_100k.csv"))

tokenizer = FullTokenizer("vocab.txt")

pd_all = shuffle(pd_all)

x_data, y_data = pd_all.review.values, pd_all.label.values

x_data = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    for text in x_data
]

x_train, x_test, y_train, y_test = train_test_split(np.array(x_data),
                                                    y_data,
                                                    test_size=0.2)

max_features = 21128
# cut texts after this number of words (among top max_features most common words)
maxlen = 128
batch_size = 32

print('Loading data...')
#(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
Example 13
class ModelServer:
    def __init__(self, param):

        self.model_path = os.path.abspath(param["model_path"])
        self.bert_config_file = os.path.abspath(param["bert_config_file"])
        bert_config = modeling.BertConfig.from_json_file(self.bert_config_file)
        self.fulltoken = FullTokenizer(os.path.abspath(param["vocab_file"]))
        self.vocab_dict = self.fulltoken.vocab

        target_start_ids = self.vocab_dict["[CLS]"]
        target_end_ids = self.vocab_dict["[SEP]"]

        num_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))
        tf.logging.info("num_gpus is {}".format(num_gpus))
        if param["use_mul_gpu"]:
            distribute = tf.contrib.distribute.MirroredStrategy(
                num_gpus=num_gpus)
        else:
            distribute = None
        run_config = tf.estimator.RunConfig(model_dir=os.path.abspath(
            self.model_path),
                                            save_summary_steps=200,
                                            keep_checkpoint_max=2,
                                            save_checkpoints_steps=3000,
                                            train_distribute=distribute,
                                            eval_distribute=distribute)
        self.input_max_seq_length = param["max_seq_length"]
        model_fn = model_fn_builder(
            bert_config,
            init_checkpoint=None,
            learning_rate=0.0001,
            num_train_steps=10000,
            num_warmup_steps=100,
            use_one_hot_embeddings=False,  # when use tpu ,it's True
            input_seq_length=param["max_seq_length"],
            target_seq_length=param["max_target_seq_length"],
            target_start_ids=target_start_ids,
            target_end_ids=target_end_ids,
            batch_size=param["batch_size"],
            mode_type=param["mode_type"])
        self.estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                config=run_config)

    #input:[(str_mask_tokens,str_labels),list_str_mask_words]
    #label 0:Not mentioned,
    #   1:Negative,
    #   2:Neutral,
    #   3:Positive
    def predict(self, inputs, limitNum=3):
        predicts = []
        if not isinstance(inputs, list):
            inputs = [inputs]

        def token_input():
            for input in inputs:
                tokens = input[0]
                labels = [int(label) for label in input[1]][:20]
                mask_words = input[2]
                assert max(labels) < 4 and min(labels) >= 0
                tokens = self.fulltoken.tokenize(
                    tokens)[:self.input_max_seq_length - 2]

                def replace_Mask(tokens, mask_words):
                    mask_index = []
                    first_maskwords = [x[0] for x in mask_words]

                    for index, token in enumerate(tokens):
                        if token in first_maskwords:
                            for mask_words_x in mask_words:
                                if token == mask_words_x[0]:
                                    _token = "".join([
                                        _t.replace("#", '')
                                        for _t in tokens[index:index +
                                                         len(mask_words_x)]
                                    ])
                                    if _token == mask_words_x:
                                        for i in range(len(mask_words_x)):
                                            mask_index.append(index + i)
                                        mask_words = [
                                            x_ for x_ in mask_words
                                            if x_ != mask_words_x
                                        ]
                                        first_maskwords = [
                                            x[0] for x in mask_words
                                        ]
                        if len(mask_words) < 1:
                            break
                    for mask_index_ in mask_index:
                        tokens[mask_index_] = '[MASK]'
                    return tokens

                tokens = replace_Mask(tokens, mask_words)
                ids = self.fulltoken.convert_tokens_to_ids(['[CLS]'] + tokens +
                                                           ['[SEP]'])
                input_mask = [1] * len(ids)
                segment_ids = [0] * self.input_max_seq_length
                while len(ids) < self.input_max_seq_length:
                    ids.append(0)
                    input_mask.append(0)
                while len(labels) < 20:
                    labels.append(0)

                yield ([ids], [input_mask], [labels], [segment_ids])

        def input_fn():

            dataset = tf.data.Dataset.from_generator(
                token_input, (tf.int64, tf.int64, tf.int64, tf.int64),
                output_shapes=(tf.TensorShape([
                    None, self.input_max_seq_length
                ]), tf.TensorShape([None, self.input_max_seq_length]),
                               tf.TensorShape([None, 20]),
                               tf.TensorShape(
                                   [None, self.input_max_seq_length])))
            dataset = dataset.map(
                lambda ids, input_mask, labels, segment_ids: {
                    "sentiment_labels": labels,
                    "input_token_ids": ids,
                    "input_mask": input_mask,
                    "target_token_ids": tf.zeros_like([1, 1]),
                    "target_mask": tf.zeros_like([1, 1]),
                    "segment_ids": segment_ids
                })

            # (ids, input_mask, labels, segment_ids)=dataset
            # features={
            #     "sentiment_labels": labels,
            #     "input_token_ids": ids,
            #     "input_mask": input_mask,
            #     "target_token_ids": tf.zeros_like([1, 1]),
            #     "target_mask": tf.zeros_like([1, 1]),
            #     "segment_ids": segment_ids}
            #
            # return features

            return dataset

        result = self.estimator.predict(input_fn=input_fn)
        for prediction in result:
            sample_id = prediction['sample_id'][:, :limitNum].T.tolist()
            ans = []
            for sample_id_ in sample_id:
                token = self.fulltoken.convert_ids_to_tokens(sample_id_)
                ans.append("".join(token[:-1]))
            predicts.append(ans)
            input = prediction['inputs'].tolist()
            print(self.fulltoken.convert_ids_to_tokens(input))

        return predicts
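A hedged construction sketch, listing only the param keys that __init__ actually reads; every value is a placeholder, and CUDA_VISIBLE_DEVICES must be set because __init__ reads it from the environment:

param = {
    "model_path": "./model_dir",             # Estimator model_dir
    "bert_config_file": "bert_config.json",
    "vocab_file": "vocab.txt",
    "use_mul_gpu": False,
    "max_seq_length": 128,
    "max_target_seq_length": 32,
    "batch_size": 8,
    "mode_type": "default",                  # forwarded to model_fn_builder
}
server = ModelServer(param)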
Example 14
class BERTFunction(object):
    def __init__(self, bert_config_file, init_checkpoint, max_seq_length, vocab_file, num_labels, use_gpu=False):
        # Needed for loading the pretrained weights.
        self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
        self.init_checkpoint = init_checkpoint
        # Needed for the dataset and the computation.
        self.max_seq_length = max_seq_length
        self.num_labels = num_labels
        # Needed for preprocessing.
        self.vocab_file = vocab_file
        self.tokenizer = FullTokenizer(self.vocab_file, do_lower_case=False)  # cased model by default
        # gpu
        self.use_gpu = use_gpu

        self.graph = tf.Graph()  # create the computation graph
        with self.graph.as_default():
            # Define placeholders.
            self.input_ids = tf.placeholder(dtype=tf.int64, shape=(None,self.max_seq_length))
            self.input_mask = tf.placeholder(dtype=tf.int64, shape=(None,self.max_seq_length))
            self.segment_ids = tf.placeholder(dtype=tf.int64, shape=(None,self.max_seq_length))
                
            # Define the forward computation.
            (self.logits, self.probabilities) = create_predict_model(self.bert_config, 
            self.input_ids, self.input_mask, self.segment_ids, self.num_labels)
            
            # Load the pretrained weights.
            self.tvars = tf.trainable_variables()  # trainable variables exist once the graph is built
            self.initialized_variable_names = {}
            if self.init_checkpoint:  # a pretrained BERT or previously trained .ckpt file
                # Intersect the checkpoint's variables with the task graph's variables.
                (self.assignment_map, self.initialized_variable_names
                 ) = modeling.get_assignment_map_from_checkpoint(self.tvars, self.init_checkpoint)
                tf.train.init_from_checkpoint(self.init_checkpoint, self.assignment_map)
        
    def cal(self, text_a, text_b):
        features=self.process_input(text_a, text_b)
        config = None
        if self.use_gpu:
            config = tf.ConfigProto(log_device_placement=True)  
            config.gpu_options.allow_growth = True  
            print("trying to use gpu")
        else:
            print("not using cpu")
        with tf.Session(graph=self.graph, config=config) as session:
            session.run(tf.global_variables_initializer())
            feed_dict={self.input_ids: features["input_ids"], 
                       self.input_mask: features["input_mask"], 
                       self.segment_ids: features["segment_ids"]}
            prob = session.run(self.probabilities, feed_dict=feed_dict)
            print("prob: \n", prob)
            return prob
    def process_input(self, text_a, text_b):  # a slimmed-down convert_single_example
        tokens_a = self.tokenizer.tokenize(text_a)
        tokens_b = self.tokenizer.tokenize(text_b)
        # inlined _truncate_seq_pair; reserve 3 slots for [CLS] and two [SEP]s
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= self.max_seq_length - 3:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()
        # core of convert_single_example
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        input_mask = [1] * len(input_ids)

        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(segment_ids) == self.max_seq_length

        print("tokens: %s" % " ".join([printable_text(x) for x in tokens]))
#          print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
#          print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
#          print("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

        return {"input_ids": nparray([input_ids], dtype=npint64),
                "input_mask": nparray([input_mask], dtype=npint64),
                "segment_ids": nparray([segment_ids], dtype=npint64)}

# Usage example
# NOTE: confirm how max_seq_length constrains tensor shapes inside the model;
# by the nature of the transformer, the sequence length could otherwise be freer.
# import os
# pretrained_dir="../pretrained/multi_cased_L-12_H-768_A-12/"
# init_checkpoint = os.path.join(pretrained_dir, "./bert_model.ckpt")
# bert_config_file=os.path.join(pretrained_dir, "./bert_config.json")
# vocab_file = os.path.join(pretrained_dir, "./vocab.txt")
# max_seq_length =160 
# num_labels = 2
#
# func=BERTFunction(bert_config_file, init_checkpoint, max_seq_length, vocab_file, num_labels)
# res=func.cal("I'm gogo, who are you?","I'm Trump, I'm fine.")