Example #1
def parse_sequence(tokenizer: t10n.FullTokenizer,
                   sequence: str) -> SequenceParseResult:
    tokens = tokenizer.tokenize(sequence)
    tokens.insert(0, '[CLS]')
    tokens.append('[SEP]')

    # Token type id 0 marks the first (and here only) segment
    token_type_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Attend to every real token, including [CLS] and [SEP]
    attention_mask = [1] * len(tokens)

    # Default for our model
    max_seq_length = 128

    # Pad arrays
    while len(input_ids) < max_seq_length:

        # Padding positions keep token type id 0
        token_type_ids.append(0)

        # Zero is the id of the [PAD] token in the BERT vocab
        input_ids.append(0)

        # Padding is excluded from the attention mask
        attention_mask.append(0)

    return SequenceParseResult(
        tokens=tokens,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        input_ids=input_ids,
    )
Example #2
def preprocess(data):
    tokenizer = FullTokenizer(vocab_file)
    tok_ip = np.zeros((len(data), 128), dtype="int32")
    sent_ip = np.zeros((len(data), 128), dtype="int8")
    pos_ip = np.zeros((len(data), 128), dtype="int8")
    masks = np.zeros((len(data), 128), dtype="int8")

    for pos, text in tqdm.tqdm_notebook(enumerate(data)):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        if len(tok) > 128:
            tok = tok[:127] + ["[SEP]"]
        pad_len = 128 - len(tok)
        tok_len = len(tok)
        tok0_len = len(tok0)
        tok = tokenizer.convert_tokens_to_ids(tok) + [0] * pad_len
        pos_val = range(128)
        sent = [0] * tok0_len + [1] * (tok_len - tok0_len) + [0] * pad_len
        mask = [1] * tok_len + [0] * pad_len

        tok_ip[pos] = tok
        sent_ip[pos] = sent
        pos_ip[pos] = pos_val
        masks[pos] = mask

    masks = masks[:, None, None, :]
    return tok_ip, sent_ip, pos_ip, masks
Example #3
 def __init__(self, bert_config_file, init_checkpoint, max_seq_length, vocab_file, num_labels, use_gpu=False):
     # Needed for loading the pre-trained parameters
     self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
     self.init_checkpoint = init_checkpoint
     # Needed for the dataset and the computation
     self.max_seq_length = max_seq_length
     self.num_labels = num_labels
     # Needed for data preprocessing
     self.vocab_file = vocab_file
     self.tokenizer = FullTokenizer(self.vocab_file, do_lower_case=False)  # cased model by default
     # gpu
     self.use_gpu=use_gpu
     
     self.graph = tf.Graph()  # declare the computation graph
     with self.graph.as_default():
         # Define placeholders
         self.input_ids = tf.placeholder(dtype=tf.int64, shape=(None,self.max_seq_length))
         self.input_mask = tf.placeholder(dtype=tf.int64, shape=(None,self.max_seq_length))
         self.segment_ids = tf.placeholder(dtype=tf.int64, shape=(None,self.max_seq_length))
             
         # Define the computation
         (self.logits, self.probabilities) = create_predict_model(self.bert_config, 
         self.input_ids, self.input_mask, self.segment_ids, self.num_labels)
         
         # Load the pre-trained parameters
         self.tvars = tf.trainable_variables()  # trainable variables are created together with the graph
         self.initialized_variable_names = {}
         if self.init_checkpoint:  # init_checkpoint is the pre-trained BERT (or previously trained) .ckpt file passed on the command line
             # Fetch the values of the matching variables from init_checkpoint
             # (the intersection of the pre-trained model's variables and the task graph's variables)
             (self.assignment_map, self.initialized_variable_names
              ) = modeling.get_assignment_map_from_checkpoint(self.tvars, self.init_checkpoint)
             tf.train.init_from_checkpoint(self.init_checkpoint, self.assignment_map)
Example #4
 def test_tokenize(self):
     tokenizer = FullTokenizer()
     sentence = '実質的変化はなかった'
     res = tokenizer.tokenize(sentence)
     firsts = [0, 2, 3, 5, 6, 9]
     tokens = [
         CharToken(c, is_first=i in firsts) for i, c in enumerate(sentence)
     ]
     self.assertEqual(res, tokens)
Example #5
def get_lens(data):
    tokenizer = FullTokenizer(vocab_file)
    lens = []
    for pos, text in tqdm.tqdm(enumerate(data)):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        lens.append(len(tok))

    return np.array(lens)
Example #6
def convert_single_example(ex_index, example: InputExample, tag_list: list, label_list: list, max_seq_length,
                           tokenizer: tokenization.FullTokenizer):
    query = tokenizer.tokenize(example.text)

    if len(query) > max_seq_length - 2:
        query = query[0:(max_seq_length - 2)]

    tokens = ["[CLS]"]
    tags = ["[CLS]"]
    for idx, token in enumerate(query):
        tokens.append(token)
        tags.append(example.tag[idx])
    tokens.append("[SEP]")
    tags.append("[SEP]")
    segment_ids = [0] * len(tokens)

    tag_map = {}
    for idx, tag in enumerate(tag_list):
        tag_map[tag] = idx
    label_map = {}
    for idx, label in enumerate(label_list):
        label_map[label] = idx

    tag_ids = [tag_map[tag] for tag in tags]
    label_id = label_map[example.label]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        tag_ids.append(0)

    if ex_index < 5:
        logger.info("*** Example ***")
        logger.info("guid: %s" % example.guid)
        logger.info("tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
        logger.info("tag: %s" % " ".join(tags))
        logger.info("label: %s" % example.label)

    feature = InputFeature(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        tag_ids=tag_ids,
        label_id=label_id
    )

    return feature
Example #7
 def test_tokenize_with_neologd(self):
     NEOLOGD_PATH = "/usr/local/lib/mecab/dic/ipadic/mecab-user-dict-seed.dic"
     if not os.path.isfile(NEOLOGD_PATH):
         raise ValueError(
             'NEOLOGD_PATH is invalid. Please set a file path to neologd dic'
         )
     sentence = '実質的変化はなかった'
     tokenizer = FullTokenizer(userdic_path=NEOLOGD_PATH)
     firsts = [0, 3, 5, 6, 9]
     tokens = [
         CharToken(c, is_first=i in firsts) for i, c in enumerate(sentence)
     ]
     res = tokenizer.tokenize(sentence)
     self.assertEqual(res, tokens)
Example #8
    def __init__(self):
        self.THRESHOLD = 0.1
        self.PROB_THRESHOLD = 0.8
        
        self.LABELS_32 = [
            "sentimental",
            "afraid",
            "proud",
            "faithful",
            "terrified",
            "joyful",
            "angry",
            "sad",
            "jealous",
            "grateful",
            "prepared",
            "embarrassed",
            "excited",
            "annoyed",
            "lonely",
            "ashamed",
            "guilty",
            "surprised",
            "nostalgic",
            "confident",
            "furious",
            "disappointed",
            "caring",
            "trusting",
            "disgusted",
            "anticipating",
            "anxious",
            "hopeful",
            "content",
            "impressed",
            "apprehensive",
            "devastated"
        ]

        self.MAX_SEQ_LENGTH = 50

        self.tokenizer = FullTokenizer(
            vocab_file='vocab.txt', do_lower_case=True)

        self.model = load_model('model_data/model32')

        self.matrix = np.genfromtxt('emotion_multiplier.csv')

        self.map_probabilities = np.vectorize(lambda x: 1 if x >= self.PROB_THRESHOLD else 0)
Example #9
def main(_):
    tokenizer_zh = FullTokenizer(vocab_file=FLAGS.bert_vocab_file,
                                 do_lower_case=True)

    tokenizer_en = load_subword_vocab(FLAGS.vocab_file)
    target_vocab_size = tokenizer_en.vocab_size + 2

    config = FileConfig(FLAGS.config_file)
    transformer = Transformer(config=config,
                              target_vocab_size=target_vocab_size,
                              bert_config_file=FLAGS.bert_config_file)

    inp = tf.random.uniform((1, FLAGS.max_seq_length))
    tar_inp = tf.random.uniform((1, FLAGS.max_seq_length))
    fn_out, _ = transformer(inp,
                            tar_inp,
                            True,
                            look_ahead_mask=None,
                            dec_padding_mask=None)

    transformer.load_weights(FLAGS.init_checkpoint)

    print(transformer.encoder.weights[0])

    result, _ = evaluate(transformer, tokenizer_zh, tokenizer_en,
                         FLAGS.inp_sentence, FLAGS.max_seq_length)

    predicted_sentence = tokenizer_en.decode(
        [i for i in result if i < tokenizer_en.vocab_size])

    print('Input: {}'.format(FLAGS.inp_sentence))
    print('Predicted translation: {}'.format(predicted_sentence))
Example #10
def adaptERNIEtokenization(all_sentences):
    tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
    ernie_tokens = [
        tokenizer.tokenize(sentence) for sentence in tqdm(all_sentences)
    ]
    print("Parsed to ERNIE tokens!")
    all_cleaned_tokens = []
    for line in tqdm(ernie_tokens):
        cleaned_tokens = []
        for i, token in enumerate(line):
            if token[:2] == "##":
                cleaned_tokens[-1] += token[2:]
            else:
                cleaned_tokens.append(token)
        all_cleaned_tokens.append(cleaned_tokens)
    return all_cleaned_tokens
Example #11
def bert_tokenizer(sess):
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info",
                                    as_dict=True)
    vocab_file = sess.run(tokenization_info["vocab_file"])
    do_lower_case = sess.run(tokenization_info["do_lower_case"])
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
Example #12
def create_tokenizer_from_hub_module():
	"""Get the vocab file and casing info from the Hub module."""
	global do_lower_case
	bert_module = hub.Module(bert_path)
	tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
	vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], tokenization_info["do_lower_case"]])
	return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
Example #13
    def __init__(self,gcn=False,soft_masked=True, lr=5e-5,  beta=3.0, layers_num=2, dropout=1.0, bert_max_len=192, dir_model='../checkpoint/train/soft-masked-bert/', bert_dir='../rest-api/app/models/chinese_L-12_H-768_A-12', train=True):
        # super(BERT_GCN, self).__init__(config)
        self.gcn = gcn
        self.soft_masked=soft_masked
        self.lr = lr
        self.dir_model = dir_model
        # self.nepochs=nepochs
        self.dropout = dropout  # 1
        self.bert_dir = bert_dir
        self.bert_max_len = bert_max_len
        self.beta = beta
        self.layers_num = layers_num
        config_file = bert_dir + '/bert_config.json'
        self.init_checkpoint = bert_dir + '/bert_model.ckpt'
        vocab_file = bert_dir + '/vocab.txt'
        self.vocab = construct_vocab(vocab_file)
        self.tokenizer = FullTokenizer(
            vocab_file=vocab_file, do_lower_case=True)
        self.bert_api = BertModel(
            config=BertConfig.from_json_file(config_file),soft_masked=soft_masked)
        if self.soft_masked:
            self.tags_dict = {'O': 0, 'B-Err': 1}
        if self.gcn:
            # Read data from checkpoint file
            reader = pywrap_tensorflow.NewCheckpointReader(self.init_checkpoint)
            var_to_shape_map = reader.get_variable_to_shape_map()
            # Print tensor name and values

            for key in var_to_shape_map:
                if "word_embeddings" in key:
                    emb_table = reader.get_tensor(key)
                    break

            with open("filter_dict.json", 'r') as load_f:
                dict_filter = json.load(load_f)
                # in_conf_ind=sorted(dict_filter.items(),key=lambda d:d[0])
            zero_id = len(dict_filter)
            print("zero_id:", zero_id)
            self.emb_table_filted = []
            # y=-1
            self.w_index = [zero_id]*21128
            # self.b_index = []
            self.emb_mask = np.ones([21128, 768])
            for x in dict_filter:

                self.w_index[int(x)] = dict_filter[x]
                self.emb_table_filted.append(emb_table[int(x)])
                # emb_table[int(x)]=np.zeros([768])
                self.emb_mask[int(x)] = np.zeros([768])

            self.emb_table_filted = np.array(self.emb_table_filted)
            r = np.load('spellgcn_adj_norm.npz')
            self.p_A = r['A_p'].astype(np.float32)
            self.s_A = r['A_s'].astype(np.float32)
            self.p_A = tf.constant(self.p_A)
            self.s_A = tf.constant(self.s_A)
Example #14
def dump_node_feat(args):
    log.info("Dump node feat starting...")
    id2str = np.load(os.path.join(args.outpath, "id2str.npy"), mmap_mode="r")
    pool = multiprocessing.Pool()
    tokenizer = FullTokenizer(args.vocab_file)
    term_ids = pool.map(partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen), id2str)
    np.save(os.path.join(args.outpath, "term_ids.npy"), np.array(term_ids))
    log.info("Dump node feat done.")
    pool.terminate()
Example #15
    def __init__(self, bert_meta):
        self.graph = self._load_graph(bert_meta.model_file)

        self.tokenizer = FullTokenizer(vocab_file=bert_meta.vocab_file,
                                       do_lower_case=True)
        self.max_seq_length = 128

        # Input.
        self.input_ids = self.graph.get_tensor_by_name('infer/input_ids:0')
        self.word_ids = self.graph.get_tensor_by_name('infer/input_mask:0')
        self.segment_ids = self.graph.get_tensor_by_name('infer/segment_ids:0')
        # Output.
        self.predictions = self.graph.get_tensor_by_name(
            'infer/loss/Softmax:0')

        self.sess = tf.Session(graph=self.graph)

        self.inference(BertInputPackage(u'预热一下'))
Example #16
class BERTTextEncoder(TextEncoder):
    def __init__(self, vocab_file: str, do_lower_case: bool = True) -> None:
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case)
        super().__init__(len(self.tokenizer.vocab))
        self.bert_unk_id = self.tokenizer.vocab['[UNK]']
        self.bert_msk_id = self.tokenizer.vocab['[MASK]']

    def standardize_ids(self, ids: List[int]) -> List[int]:
        for i in range(len(ids)):
            if ids[i] == self.bert_unk_id:  # UNK
                ids[i] = 0
            else:  # VOCAB
                ids[i] -= self.bert_msk_id
        return ids

    def encode(self, sent: str) -> List[int]:
        return self.standardize_ids(
            self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(sent)))
Example #17
    def __init__(self, param):

        self.model_path = os.path.abspath(param["model_path"])
        self.bert_config_file = os.path.abspath(param["bert_config_file"])
        bert_config = modeling.BertConfig.from_json_file(self.bert_config_file)
        self.fulltoken = FullTokenizer(os.path.abspath(param["vocab_file"]))
        self.vocab_dict = self.fulltoken.vocab

        target_start_ids = self.vocab_dict["[CLS]"]
        target_end_ids = self.vocab_dict["[SEP]"]

        num_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))
        tf.logging.info("num_gpus is {}".format(num_gpus))
        if param["use_mul_gpu"]:
            distribute = tf.contrib.distribute.MirroredStrategy(
                num_gpus=num_gpus)
        else:
            distribute = None
        run_config = tf.estimator.RunConfig(model_dir=os.path.abspath(
            self.model_path),
                                            save_summary_steps=200,
                                            keep_checkpoint_max=2,
                                            save_checkpoints_steps=3000,
                                            train_distribute=distribute,
                                            eval_distribute=distribute)
        self.input_max_seq_length = param["max_seq_length"]
        model_fn = model_fn_builder(
            bert_config,
            init_checkpoint=None,
            learning_rate=0.0001,
            num_train_steps=10000,
            num_warmup_steps=100,
            use_one_hot_embeddings=False,  # True when running on TPU
            input_seq_length=param["max_seq_length"],
            target_seq_length=param["max_target_seq_length"],
            target_start_ids=target_start_ids,
            target_end_ids=target_end_ids,
            batch_size=param["batch_size"],
            mode_type=param["mode_type"])
        self.estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                config=run_config)
Example #18
    def parse_line(self, line, max_seq_len=512):
        """ parse one line to token_ids, sentence_ids, pos_ids, label
        """

        line = line.strip().split(",")
        assert len(line) == 3, \
            "One sample must have %d fields!" % 3

        text_left, text_right, masklabel = line
        tokenizer = FullTokenizer(self.vocab_path)
        # tokenizer = FullTokenizer(vocab_path)
        text_left = tokenizer.tokenize(text_left)
        masklabel = tokenizer.tokenize(masklabel)
        masklabel_ = len(masklabel) * ["[MASK]"]
        text_right = tokenizer.tokenize(text_right)
        all_tokens = text_left + masklabel_ + text_right
        token_ids = tokenizer.convert_tokens_to_ids(all_tokens)
        sent_ids = [0] * len(all_tokens)
        pos_ids = [i for i in range(len(all_tokens))]
        input_mask = [1.0] * len(all_tokens)
        # Still need the mask positions (mask_pos) here
        mask_pos = []
        for idx, mask in enumerate(token_ids):
            if mask == self.mask_id:
                mask_pos.append(idx)
        # Add the mask labels (mask_label)
        mask_label = list(tokenizer.convert_tokens_to_ids(masklabel))
        assert len(token_ids) == len(sent_ids) == len(pos_ids) == len(
            input_mask
        ), "[Must be true] len(token_ids) == len(sent_ids) == len(pos_ids) == len(input_mask)"
        if len(token_ids) > max_seq_len:
            return None
        return [token_ids, sent_ids, pos_ids, input_mask, mask_pos, mask_label]
Example #19
def dump_node_feat(args):
    log.info("Dump node feat starting...")
    id2str = [
        line.strip("\n").split("\t")[1]
        for line in io.open(os.path.join(args.outpath, "terms.txt"),
                            encoding=args.encoding)
    ]
    pool = multiprocessing.Pool()
    tokenizer = FullTokenizer(args.vocab_file)
    term_ids = pool.map(
        partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen),
        id2str)
    np.save(os.path.join(args.outpath, "term_ids.npy"),
            np.array(term_ids, np.uint16))
    log.info("Dump node feat done.")
    pool.terminate()
Example #20
 def setUp(self):
     with NamedTemporaryFile(mode='w') as tf:
         tf.write("a\n[CLS]\nb\n[SEP]c\nd\ne\nf\ng\nh\n")
         tf.seek(0)
         tokenizer = FullTokenizer(vocab_file=tf.name)
         self.vocab_words = list(tokenizer.vocab.keys())
     self.tokens = [
         CharToken('a', True),
         CharToken('b', False),
         CharToken('c', False),
         CharToken('d', True),
         CharToken('e', False),
         CharToken('f', True),
         CharToken('g', False),
         CharToken('h', True)
     ]
Example #21
 def load_model(self,
                model_dir: str,
                model_config: str = "model_config.json"):
     model_config = os.path.join(model_dir, model_config)
     model_config = json.load(open(model_config))
     bert_config = json.load(
         open(os.path.join(model_dir, "bert_config.json")))
     model = BertNer(bert_config, tf.float32, model_config['num_labels'],
                     model_config['max_seq_length'])
     ids = tf.ones((1, 128), dtype=tf.int64)
     _ = model(ids, ids, ids, ids, training=False)
     model.load_weights(os.path.join(model_dir, "model.h5"))
     vocab = os.path.join(model_dir, "vocab.txt")
     tokenizer = FullTokenizer(vocab_file=vocab,
                               do_lower_case=model_config["do_lower"])
     return model, tokenizer, model_config
Example #22
def gen_data(in_file, out_file, tagType):
    with open(in_file, 'r', encoding='utf8') as f:
        raw_data = [_.strip() for _ in f.readlines()]

    vocab_file = '../models/vocab.txt'
    full_tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    basic_tokenizer = BasicTokenizer(do_lower_case=True)

    data_all = [
        preprocess2dict(s, tagType, full_tokenizer, basic_tokenizer)
        for s in tqdm(raw_data)
    ]

    df = pd.DataFrame(data_all)
    # separate with \t
    df.to_csv(out_file, sep='\t', encoding='utf-8', index=False)

    print('Finish writing generated ' + tagType + ' data in ' + out_file)
Example #23
 def gen(self):
     from extract_features import convert_lst_to_features
     from tokenization import FullTokenizer
     tokenizer = FullTokenizer(
         vocab_file=os.path.join(self.bert_model_dir, 'vocab.txt'))
     # Windows does not support logger in MP environment, thus get a new logger
     # This while loop keeps the generator alive so estimator.predict does not have to reload the model
     while not self.closed:
         is_tokenized = all(isinstance(el, list) for el in self.text)
         tmp_f = list(
             convert_lst_to_features(self.text,
                                     self.seq_length,
                                     tokenizer,
                                     is_tokenized,
                                     mask_cls_sep=True))
         # print([f.input_ids for f in tmp_f])
         yield {
             'input_ids': [f.input_ids for f in tmp_f],
             'input_mask': [f.input_mask for f in tmp_f],
             'input_type_ids': [f.input_type_ids for f in tmp_f]
         }
Example #24
    clf_output = sequence_output[:, 0, :]

    out = keras.layers.Dense(1, activation='sigmoid')(clf_output)

    model = keras.Model(inputs=[input_word_ids, input_mask, segment_ids],
                        outputs=out)
    model.compile(keras.optimizers.Adam(lr=0.00001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

train_input = bert_encode(train.text1.values, tokenizer, maxlen)
test_input = bert_encode(test.text1.values, tokenizer, maxlen)

model = build_model(bert_layer, maxlen)
model.summary()

# model = keras.Sequential([
#         bert_layer([input_word_ids, input_mask, segment_ids]),
#         # keras.layers.Dropout(0.3),
#         keras.layers.LSTM(128),
#         # keras.layers.Dropout(0.3),
#         keras.layers.Dense(64),
#         keras.layers.Dense(1, activation = 'sigmoid')
#     ]
Example #25
        help="Maximum number of contexts to output for an example.")
    parser.add_argument(
        "--max_position",
        type=int,
        default=50,
        help="Maximum context position for which to generate special tokens.")
    parser.add_argument(
        "--skip_nested_contexts",
        type=bool,
        default=True,
        help=
        "Completely ignore contexts that are not top-level nodes in the page.")

    args = parser.parse_args()
    tokenizer = FullTokenizer(
        'check_points/bert-large-wwm-finetuned-squad/vocab.txt',
        do_lower_case=True)

    # train preprocess
    import ipdb

    output_file = os.path.join(
        args.output_dir, 'train_data_maxlen{}.bin'.format(args.max_seq_length))
    ipdb.set_trace()
    examples = read_nq_examples(input_file=args.train_file,
                                is_training=True,
                                args=args)
    num_spans_to_ids, features = convert_examples_to_features(
        examples=examples, tokenizer=tokenizer, is_training=True, args=args)
    torch.save((features, examples), output_file)
    for spans, ids in num_spans_to_ids.items():
Example #26
 def onSetup(self):
     BERT_DIR = os.path.join(ue.get_content_dir(), 'Scripts', 'BertModel')
     self.imported = tf.saved_model.load(BERT_DIR)
     self.f = self.imported.signatures["serving_default"]
     VOCAB_PATH = os.path.join(BERT_DIR, "assets", "vocab.txt")
     self.tokenizer = FullTokenizer(VOCAB_PATH)
Example #27
for x, y in imdb["train"].batch(128):
    imdb_reviews_train.extend(x.numpy())
    y_train.extend(y.numpy())
for x, y in imdb["test"].batch(128):
    imdb_reviews_test.extend(x.numpy())
    y_test.extend(y.numpy())
y_train = np.array(y_train)
y_test = np.array(y_test)

# Extract pre-trained BERT as a Keras layer.
bert_model_path = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(bert_model_path, trainable=False)

# Build tokenizer from pre-trained BERT vocabulary.
bert_tokenizer = FullTokenizer(
    vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy(),
    do_lower_case=bert_layer.resolved_object.do_lower_case.numpy())

# TODO:
# Documents longer than 512 tokens cannot be encoded by BERT,
# since its positional encoding has a hard limit of 512 positions.
# For better results we may need to summarize the document into <= 512 tokens,
# or encode sentence by sentence and then pool the results together.
maxlen = 256

# TODO:
# We need to manually add the [CLS] and [SEP] special tokens at the beginning and end of each sequence.

# Encode text with padding, masking, and segment ids (required by BERT even if we use only one segment).
tok_seq_train = [bert_tokenizer.tokenize(text) for text in imdb_reviews_train]
wid_seq_train = [
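
A minimal sketch of the manual [CLS]/[SEP] handling, padding, and masking that the TODO comments above describe. It assumes the `bert_tokenizer` and `maxlen` defined in this snippet; `bert_encode_sketch` and `texts` are hypothetical names, not part of the original example.

def bert_encode_sketch(texts, tokenizer, max_len):
    # Build fixed-length id/mask/segment arrays for a batch of single-segment texts.
    word_ids, masks, segments = [], [], []
    for text in texts:
        tokens = tokenizer.tokenize(text)[:max_len - 2]   # leave room for [CLS] and [SEP]
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        ids = tokenizer.convert_tokens_to_ids(tokens)
        pad_len = max_len - len(ids)
        word_ids.append(ids + [0] * pad_len)              # 0 is the [PAD] id in the BERT vocab
        masks.append([1] * len(ids) + [0] * pad_len)      # attend only to real tokens
        segments.append([0] * max_len)                    # single segment, so all zeros
    return np.array(word_ids), np.array(masks), np.array(segments)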
Example #28
class BertInference(object):
    """
    The bert model.
    """
    def __init__(self, bert_meta):
        self.graph = self._load_graph(bert_meta.model_file)

        self.tokenizer = FullTokenizer(vocab_file=bert_meta.vocab_file,
                                       do_lower_case=True)
        self.max_seq_length = 128

        # Input.
        self.input_ids = self.graph.get_tensor_by_name('infer/input_ids:0')
        self.word_ids = self.graph.get_tensor_by_name('infer/input_mask:0')
        self.segment_ids = self.graph.get_tensor_by_name('infer/segment_ids:0')
        # Output.
        self.predictions = self.graph.get_tensor_by_name(
            'infer/loss/Softmax:0')

        self.sess = tf.Session(graph=self.graph)

        self.inference(BertInputPackage(u'预热一下'))

    def inference(self, bert_input):
        """
        Call model.
        """
        input_ids, input_mask, segment_ids = self._convert_single_example(
            bert_input.query)
        preds_evaluated = self.sess.run(self.predictions,
                                        feed_dict={
                                            self.input_ids: [input_ids],
                                            self.word_ids: [input_mask],
                                            self.segment_ids: [segment_ids]
                                        })

        return preds_evaluated

    def _load_graph(self, frozen_graph_filename):
        with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())

        with tf.Graph().as_default() as graph:
            tf.import_graph_def(graph_def,
                                input_map=None,
                                return_elements=None,
                                name="infer",
                                op_dict=None,
                                producer_op_list=None)

        return graph

    def _convert_single_example(self, text_a):
        tokens_a = self.tokenizer.tokenize(text_a)

        if len(tokens_a) > self.max_seq_length - 2:
            tokens_a = tokens_a[0:(self.max_seq_length - 2)]

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        input_mask = [1] * len(input_ids)

        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        return input_ids, input_mask, segment_ids
Example #29
def main(_):
  import time
  os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.gpu_id)
  current_path = os.path.dirname(os.path.abspath(__file__))
  tokenizer = FullTokenizer(os.path.join(current_path, './model/chinese_L-12_H-768_A-12/vocab.txt'))
  Configuration=namedtuple('Configuration', ['fp16', 'bert_config', 'checkpoint_path', 'graph_tmp_dir', 'max_seq_length'])

  fp16=False
  bert_config='./model/chinese_L-12_H-768_A-12/bert_config.json'
  #checkpoint_path='./model/ad/model_0622/model.ckpt-610194'
  checkpoint_path='./model/ad/model_0626/model.ckpt-610194'
  checkpoint_path='./model/ad/model_pretrain_ctr_0826/model.ckpt-16352'
  graph_tmp_dir='./model/ad/tmp/'
  max_seq_length=70
  configuration=Configuration(fp16, bert_config, checkpoint_path,graph_tmp_dir,max_seq_length)

  graph_path, bert_config = optimize_graph(configuration)
  worker = BertWorker(0, graph_path, configuration)

  start=time.time()
  for no in range(140):
    suffix=10000+no
    slice_path='/data1/zhangpengpeng/ad_data/eval_ins2_20190701/lm_validset_%s' % str(suffix)[1:]
    slice_output_path=os.path.join('/data1/zhangpengpeng/ad_data/eval_ins2_20190701/', 'lm_validset_mask_output_%s' % str(suffix)[1:])
    slice_output_file=tf.gfile.Open(slice_output_path, 'w')
    if not tf.gfile.Exists(slice_path):
      continue
    if tf.gfile.Exists(slice_output_path):
      continue
    print(slice_path, slice_output_path)
    count=0
    with tf.gfile.Open(slice_path, 'r') as f:
      input_ids_list=[]
      input_mask_list=[]
      segment_ids_list=[]
      rows=[]
      for index, line in enumerate(f):
        row=line.split('\t', 4)
        if len(row)!=5:
          continue
        text_a=row[3]
        text_b=row[-1].strip()
        text_c=row[1]
        start_position=text_b.find(text_c)
        feature=get_example(tokenizer, text_a, text_b, text_c, start_position, 70)
        input_ids_list.append(feature[0])
        input_mask_list.append(feature[1])
        segment_ids_list.append(feature[2])
        rows.append(row)
        if len(input_ids_list) == 60:
          features=(input_ids_list, input_mask_list, segment_ids_list)
          tags, scores = worker.predict(features)
          for i in range(len(input_ids_list)):
            slice_output_file.write('%f\t%s\n' % (scores[i][1], '\t'.join(rows[i][:3])))
          input_ids_list=[]
          input_mask_list=[]
          segment_ids_list=[]
          rows=[]
        count+=1
      if len(rows)>0:
        features=(input_ids_list, input_mask_list, segment_ids_list)
        tags, scores = worker.predict(features)
        for i in range(len(input_ids_list)):
          slice_output_file.write('%f\t%s\n' % (scores[i][1], '\t'.join(rows[i][:3])))
      slice_output_file.close()
    end=time.time()
    print("filename: %s\tqps: %d" % (slice_path, count/(end-start)))
Example #30
def convert_single_example(example,
                           max_seq_length=256,
                           tokenizer=FullTokenizer()):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    label_map = {label: i for i, label in enumerate(label_list)}

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    if tokens_b:
        tokens += tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    if len(input_ids) < max_seq_length:
        input_ids += [0] * (max_seq_length - len(input_ids))
        input_mask += [0] * (max_seq_length - len(input_mask))
        segment_ids += [0] * (max_seq_length - len(segment_ids))
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label] if example.label else 0

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            is_real_example=True)
    return feature
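
Example #30 calls `_truncate_seq_pair` without showing it. Below is a minimal sketch consistent with the comment in that example (shorten the longer of the two token lists in place until the pair fits the budget); treat it as an illustrative assumption rather than the exact helper used by that repository.

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a token pair in place so len(tokens_a) + len(tokens_b) <= max_length."""
    while len(tokens_a) + len(tokens_b) > max_length:
        # Always trim the longer sequence so the truncation stays balanced.
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()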