Example no. 1
def bert_tokenizer(sess):
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info",
                                    as_dict=True)
    vocab_file = sess.run(tokenization_info["vocab_file"])
    do_lower_case = sess.run(tokenization_info["do_lower_case"])
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
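Not part of the original snippet: a minimal usage sketch, assuming TF1 graph mode, import tensorflow as tf, import tensorflow_hub as hub, and a bert_path that points at a TF-Hub BERT module.

import tensorflow as tf

with tf.Session() as sess:
    tokenizer = bert_tokenizer(sess)  # builds the hub.Module and reads vocab_file / do_lower_case
    print(tokenizer.tokenize("BERT tokenization example"))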
Example no. 2
 def __init__(self, bert_config_file, init_checkpoint, max_seq_length, vocab_file, num_labels, use_gpu=False):
     # Needed to load the pre-trained parameters
     self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
     self.init_checkpoint = init_checkpoint
     # Needed for the dataset and the computation
     self.max_seq_length = max_seq_length
     self.num_labels = num_labels
     # Needed for data preprocessing
     self.vocab_file = vocab_file
     self.tokenizer = FullTokenizer(self.vocab_file, do_lower_case=False)  # cased model by default
     # GPU
     self.use_gpu = use_gpu

     self.graph = tf.Graph()  # declare the computation graph
     with self.graph.as_default():
         # Define the placeholders
         self.input_ids = tf.placeholder(dtype=tf.int64, shape=(None, self.max_seq_length))
         self.input_mask = tf.placeholder(dtype=tf.int64, shape=(None, self.max_seq_length))
         self.segment_ids = tf.placeholder(dtype=tf.int64, shape=(None, self.max_seq_length))

         # Define the computation
         (self.logits, self.probabilities) = create_predict_model(
             self.bert_config, self.input_ids, self.input_mask,
             self.segment_ids, self.num_labels)

         # Load the pre-trained parameters
         self.tvars = tf.trainable_variables()  # the trainable variables exist once the graph has been built
         self.initialized_variable_names = {}
         if self.init_checkpoint:  # the pre-trained BERT (or a previously trained) .ckpt passed on the command line
             # Take from init_checkpoint the values of the usable variables
             # (intersection of the pre-trained model and the task graph).
             (self.assignment_map, self.initialized_variable_names
              ) = modeling.get_assignment_map_from_checkpoint(self.tvars, self.init_checkpoint)
             # Only initialize from a checkpoint when one was given; assignment_map is undefined otherwise.
             tf.train.init_from_checkpoint(self.init_checkpoint, self.assignment_map)
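Not part of the original class: a sketch of how such a wrapper is typically driven (the class name BertPredictor and the file paths are illustrative assumptions; tf.train.init_from_checkpoint only takes effect when the variable initializers are run).

import numpy as np
import tensorflow as tf

predictor = BertPredictor("bert_config.json", "bert_model.ckpt", max_seq_length=128,
                          vocab_file="vocab.txt", num_labels=2)
with predictor.graph.as_default():
    init_op = tf.global_variables_initializer()  # initializers now map to the checkpoint values
with tf.Session(graph=predictor.graph) as sess:
    sess.run(init_op)
    feed = {predictor.input_ids: np.zeros((1, 128), dtype=np.int64),
            predictor.input_mask: np.ones((1, 128), dtype=np.int64),
            predictor.segment_ids: np.zeros((1, 128), dtype=np.int64)}
    probs = sess.run(predictor.probabilities, feed_dict=feed)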
Example no. 3
def main(_):
    tokenizer_zh = FullTokenizer(vocab_file=FLAGS.bert_vocab_file,
                                 do_lower_case=True)

    tokenizer_en = load_subword_vocab(FLAGS.vocab_file)
    target_vocab_size = tokenizer_en.vocab_size + 2

    config = FileConfig(FLAGS.config_file)
    transformer = Transformer(config=config,
                              target_vocab_size=target_vocab_size,
                              bert_config_file=FLAGS.bert_config_file)

    inp = tf.random.uniform((1, FLAGS.max_seq_length))
    tar_inp = tf.random.uniform((1, FLAGS.max_seq_length))
    fn_out, _ = transformer(inp,
                            tar_inp,
                            True,
                            look_ahead_mask=None,
                            dec_padding_mask=None)

    transformer.load_weights(FLAGS.init_checkpoint)

    print(transformer.encoder.weights[0])

    result, _ = evaluate(transformer, tokenizer_zh, tokenizer_en,
                         FLAGS.inp_sentence, FLAGS.max_seq_length)

    predicted_sentence = tokenizer_en.decode(
        [i for i in result if i < tokenizer_en.vocab_size])

    print('Input: {}'.format(FLAGS.inp_sentence))
    print('Predicted translation: {}'.format(predicted_sentence))
Example no. 4
def create_tokenizer_from_hub_module():
	"""Get the vocab file and casing info from the Hub module."""
	global do_lower_case
	bert_module = hub.Module(bert_path)
	tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
	vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], tokenization_info["do_lower_case"]])
	return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
Example no. 5
def preprocess(data):
    tokenizer = FullTokenizer(vocab_file)
    tok_ip = np.zeros((len(data), 128), dtype="int32")
    sent_ip = np.zeros((len(data), 128), dtype="int8")
    pos_ip = np.zeros((len(data), 128), dtype="int8")
    masks = np.zeros((len(data), 128), dtype="int8")

    for pos, text in tqdm.tqdm_notebook(enumerate(data)):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        if len(tok) > 128:
            tok = tok[:127] + ["[SEP]"]
        pad_len = 128 - len(tok)
        tok_len = len(tok)
        tok0_len = len(tok0)
        tok = tokenizer.convert_tokens_to_ids(tok) + [0] * pad_len
        pos_val = range(128)
        sent = [0] * tok0_len + [1] * (tok_len - tok0_len) + [0] * pad_len
        mask = [1] * tok_len + [0] * pad_len

        tok_ip[pos] = tok
        sent_ip[pos] = sent
        pos_ip[pos] = pos_val
        masks[pos] = mask

    masks = masks[:, None, None, :]
    return tok_ip, sent_ip, pos_ip, masks
Example no. 6
    def parse_line(self, line, max_seq_len=512):
        """ parse one line to token_ids, sentence_ids, pos_ids, label
        """

        line = line.strip().split(",")
        assert len(line) == 3, \
            "One sample must have %d fields!" % 3

        text_left, text_right, masklabel = line
        tokenizer = FullTokenizer(self.vocab_path)
        # tokenizer = FullTokenizer(vocab_path)
        text_left = tokenizer.tokenize(text_left)
        masklabel = tokenizer.tokenize(masklabel)
        masklabel_ = len(masklabel) * ["[MASK]"]
        text_right = tokenizer.tokenize(text_right)
        all_tokens = text_left + masklabel_ + text_right
        token_ids = tokenizer.convert_tokens_to_ids(all_tokens)
        sent_ids = [0] * len(all_tokens)
        pos_ids = [i for i in range(len(all_tokens))]
        input_mask = [1.0] * len(all_tokens)
        # we still need a mask_pos here
        mask_pos = []
        for idx, mask in enumerate(token_ids):
            if mask == self.mask_id:
                mask_pos.append(idx)
        # add a mask_label
        mask_label = list(tokenizer.convert_tokens_to_ids(masklabel))
        assert len(token_ids) == len(sent_ids) == len(pos_ids) == len(
            input_mask
        ), "[Must be true] len(token_ids) == len(sent_ids) == len(pos_ids) == len(input_mask)"
        if len(token_ids) > max_seq_len:
            return None
        return [token_ids, sent_ids, pos_ids, input_mask, mask_pos, mask_label]
Example no. 7
    def __init__(self,gcn=False,soft_masked=True, lr=5e-5,  beta=3.0, layers_num=2, dropout=1.0, bert_max_len=192, dir_model='../checkpoint/train/soft-masked-bert/', bert_dir='../rest-api/app/models/chinese_L-12_H-768_A-12', train=True):
        # super(BERT_GCN, self).__init__(config)
        self.gcn = gcn
        self.soft_masked=soft_masked
        self.lr = lr
        self.dir_model = dir_model
        # self.nepochs=nepochs
        self.dropout = dropout  # 1
        self.bert_dir = bert_dir
        self.bert_max_len = bert_max_len
        self.beta = beta
        self.layers_num = layers_num
        config_file = bert_dir + '/bert_config.json'
        self.init_checkpoint = bert_dir + '/bert_model.ckpt'
        vocab_file = bert_dir + '/vocab.txt'
        self.vocab = construct_vocab(vocab_file)
        self.tokenizer = FullTokenizer(
            vocab_file=vocab_file, do_lower_case=True)
        self.bert_api = BertModel(
            config=BertConfig.from_json_file(config_file),soft_masked=soft_masked)
        if self.soft_masked:
            self.tags_dict = {'O': 0, 'B-Err': 1}
        if self.gcn:
            # Read data from checkpoint file
            reader = pywrap_tensorflow.NewCheckpointReader(self.init_checkpoint)
            var_to_shape_map = reader.get_variable_to_shape_map()
            # Print tensor name and values

            for key in var_to_shape_map:
                if "word_embeddings" in key:
                    emb_table = reader.get_tensor(key)
                    break

            with open("filter_dict.json", 'r') as load_f:
                dict_filter = json.load(load_f)
                # in_conf_ind=sorted(dict_filter.items(),key=lambda d:d[0])
            zero_id = len(dict_filter)
            print("zero_id:", zero_id)
            self.emb_table_filted = []
            # y=-1
            self.w_index = [zero_id]*21128
            # self.b_index = []
            self.emb_mask = np.ones([21128, 768])
            for x in dict_filter:

                self.w_index[int(x)] = dict_filter[x]
                self.emb_table_filted.append(emb_table[int(x)])
                # emb_table[int(x)]=np.zeros([768])
                self.emb_mask[int(x)] = np.zeros([768])



            self.emb_table_filted = np.array(self.emb_table_filted)
            r = np.load('spellgcn_adj_norm.npz')
            self.p_A = r['A_p'].astype(np.float32)
            self.s_A = r['A_s'].astype(np.float32)
            self.p_A = tf.constant(self.p_A)
            self.s_A = tf.constant(self.s_A)
Example no. 8
 def test_tokenize(self):
     tokenizer = FullTokenizer()
     sentence = '実質的変化はなかった'
     res = tokenizer.tokenize(sentence)
     firsts = [0, 2, 3, 5, 6, 9]
     tokens = [
         CharToken(c, is_first=i in firsts) for i, c in enumerate(sentence)
     ]
     self.assertEqual(res, tokens)
Example no. 9
def dump_node_feat(args):
    log.info("Dump node feat starting...")
    id2str = np.load(os.path.join(args.outpath, "id2str.npy"), mmap_mode="r")
    pool = multiprocessing.Pool()
    tokenizer = FullTokenizer(args.vocab_file)
    term_ids = pool.map(partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen), id2str)
    np.save(os.path.join(args.outpath, "term_ids.npy"), np.array(term_ids))
    log.info("Dump node feat done.")
    pool.terminate()
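term2id is not shown in this snippet; a minimal sketch of what such a helper typically does (an assumption, not the repository's actual implementation): tokenize, map to vocabulary ids, then truncate and right-pad to max_seqlen.

def term2id(text, tokenizer, max_seqlen):
    # Hypothetical helper: WordPiece-tokenize, convert to ids, pad/truncate to a fixed length.
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))[:max_seqlen]
    return ids + [0] * (max_seqlen - len(ids))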
Example no. 10
def get_lens(data):
    tokenizer = FullTokenizer(vocab_file)
    lens = []
    for pos, text in tqdm.tqdm(enumerate(data)):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        lens.append(len(tok))

    return np.array(lens)
Example no. 11
 def test_tokenize_with_nelogd(self):
     NEOLOGD_PATH = "/usr/local/lib/mecab/dic/ipadic/mecab-user-dict-seed.dic"
     if not os.path.isfile(NEOLOGD_PATH):
         raise ValueError(
             'NEOLOGD_PATH is invalid. Please set a file path to neologd dic'
         )
     sentence = '実質的変化はなかった'
     tokenizer = FullTokenizer(userdic_path=NEOLOGD_PATH)
     firsts = [0, 3, 5, 6, 9]
     tokens = [
         CharToken(c, is_first=i in firsts) for i, c in enumerate(sentence)
     ]
     res = tokenizer.tokenize(sentence)
     self.assertEqual(res, tokens)
Example no. 12
    def __init__(self):
        self.THRESHOLD = 0.1
        self.PROB_THRESHOLD = 0.8
        
        self.LABELS_32 = [
            "sentimental",
            "afraid",
            "proud",
            "faithful",
            "terrified",
            "joyful",
            "angry",
            "sad",
            "jealous",
            "grateful",
            "prepared",
            "embarrassed",
            "excited",
            "annoyed",
            "lonely",
            "ashamed",
            "guilty",
            "surprised",
            "nostalgic",
            "confident",
            "furious",
            "disappointed",
            "caring",
            "trusting",
            "disgusted",
            "anticipating",
            "anxious",
            "hopeful",
            "content",
            "impressed",
            "apprehensive",
            "devastated"
        ]

        self.MAX_SEQ_LENGTH = 50

        self.tokenizer = FullTokenizer(
            vocab_file='vocab.txt', do_lower_case=True)

        self.model = load_model('model_data/model32')

        self.matrix = np.genfromtxt('emotion_multiplier.csv')

        self.map_probabilities = np.vectorize(lambda x: 1 if x >= self.PROB_THRESHOLD else 0)
Example no. 13
 def setUp(self):
     with NamedTemporaryFile(mode='w') as tf:
         tf.write("a\n[CLS]\nb\n[SEP]c\nd\ne\nf\ng\nh\n")
         tf.seek(0)
         tokenizer = FullTokenizer(vocab_file=tf.name)
         self.vocab_words = list(tokenizer.vocab.keys())
     self.tokens = [
         CharToken('a', True),
         CharToken('b', False),
         CharToken('c', False),
         CharToken('d', True),
         CharToken('e', False),
         CharToken('f', True),
         CharToken('g', False),
         CharToken('h', True)
     ]
Example no. 14
def adaptERNIEtokenization(all_sentences):
    tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
    ernie_tokens = [
        tokenizer.tokenize(sentence) for sentence in tqdm(all_sentences)
    ]
    print("Parsed to ERNIE tokens!")
    all_cleaned_tokens = []
    for line in tqdm(ernie_tokens):
        cleaned_tokens = []
        for i, token in enumerate(line):
            if token[:2] == "##":
                cleaned_tokens[-1] += token[2:]
            else:
                cleaned_tokens.append(token)
        all_cleaned_tokens.append(cleaned_tokens)
    return all_cleaned_tokens
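The "##" merge rule above, applied in isolation to a hand-made WordPiece sequence (illustrative tokens, not taken from a real vocabulary):

tokens = ["play", "##ing", "nice", "##ly", "!"]
merged = []
for token in tokens:
    if token[:2] == "##":
        merged[-1] += token[2:]   # glue the piece onto the previous word
    else:
        merged.append(token)
print(merged)                     # ['playing', 'nicely', '!']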
Example no. 15
 def load_model(self,
                model_dir: str,
                model_config: str = "model_config.json"):
     model_config = os.path.join(model_dir, model_config)
     model_config = json.load(open(model_config))
     bert_config = json.load(
         open(os.path.join(model_dir, "bert_config.json")))
     model = BertNer(bert_config, tf.float32, model_config['num_labels'],
                     model_config['max_seq_length'])
     ids = tf.ones((1, 128), dtype=tf.int64)
     _ = model(ids, ids, ids, ids, training=False)
     model.load_weights(os.path.join(model_dir, "model.h5"))
     vocab = os.path.join(model_dir, "vocab.txt")
     tokenizer = FullTokenizer(vocab_file=vocab,
                               do_lower_case=model_config["do_lower"])
     return model, tokenizer, model_config
Example no. 16
def dump_node_feat(args):
    log.info("Dump node feat starting...")
    id2str = [
        line.strip("\n").split("\t")[1]
        for line in io.open(os.path.join(args.outpath, "terms.txt"),
                            encoding=args.encoding)
    ]
    pool = multiprocessing.Pool()
    tokenizer = FullTokenizer(args.vocab_file)
    term_ids = pool.map(
        partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen),
        id2str)
    np.save(os.path.join(args.outpath, "term_ids.npy"),
            np.array(term_ids, np.uint16))
    log.info("Dump node feat done.")
    pool.terminate()
Example no. 17
    def __init__(self, bert_meta):
        self.graph = self._load_graph(bert_meta.model_file)

        self.tokenizer = FullTokenizer(vocab_file=bert_meta.vocab_file,
                                       do_lower_case=True)
        self.max_seq_length = 128

        # Input.
        self.input_ids = self.graph.get_tensor_by_name('infer/input_ids:0')
        self.word_ids = self.graph.get_tensor_by_name('infer/input_mask:0')
        self.segment_ids = self.graph.get_tensor_by_name('infer/segment_ids:0')
        # Output.
        self.predictions = self.graph.get_tensor_by_name(
            'infer/loss/Softmax:0')

        self.sess = tf.Session(graph=self.graph)

        self.inference(BertInputPackage(u'预热一下'))  # warm-up inference ("预热一下" = "warm up")
Example no. 18
def gen_data(in_file, out_file, tagType):
    with open(in_file, 'r', encoding='utf8') as f:
        raw_data = [_.strip() for _ in f.readlines()]

    vocab_file = '../models/vocab.txt'
    full_tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    basic_tokenizer = BasicTokenizer(do_lower_case=True)

    data_all = [
        preprocess2dict(s, tagType, full_tokenizer, basic_tokenizer)
        for s in tqdm(raw_data)
    ]

    df = pd.DataFrame(data_all)
    # separate with \t
    df.to_csv(out_file, sep='\t', encoding='utf-8', index=False)

    print('Finish writing generated ' + tagType + ' data in ' + out_file)
Example no. 19
    def __init__(self, param):

        self.model_path = os.path.abspath(param["model_path"])
        self.bert_config_file = os.path.abspath(param["bert_config_file"])
        bert_config = modeling.BertConfig.from_json_file(self.bert_config_file)
        self.fulltoken = FullTokenizer(os.path.abspath(param["vocab_file"]))
        self.vocab_dict = self.fulltoken.vocab

        target_start_ids = self.vocab_dict["[CLS]"]
        target_end_ids = self.vocab_dict["[SEP]"]

        num_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))
        tf.logging.info("num_gpus is {}".format(num_gpus))
        if param["use_mul_gpu"]:
            distribute = tf.contrib.distribute.MirroredStrategy(
                num_gpus=num_gpus)
        else:
            distribute = None
        run_config = tf.estimator.RunConfig(model_dir=os.path.abspath(
            self.model_path),
                                            save_summary_steps=200,
                                            keep_checkpoint_max=2,
                                            save_checkpoints_steps=3000,
                                            train_distribute=distribute,
                                            eval_distribute=distribute)
        self.input_max_seq_length = param["max_seq_length"]
        model_fn = model_fn_builder(
            bert_config,
            init_checkpoint=None,
            learning_rate=0.0001,
            num_train_steps=10000,
            num_warmup_steps=100,
            use_one_hot_embeddings=False,  # when use tpu ,it's True
            input_seq_length=param["max_seq_length"],
            target_seq_length=param["max_target_seq_length"],
            target_start_ids=target_start_ids,
            target_end_ids=target_end_ids,
            batch_size=param["batch_size"],
            mode_type=param["mode_type"])
        self.estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                config=run_config)
Example no. 20
 def gen(self):
     from extract_features import convert_lst_to_features
     from tokenization import FullTokenizer
     tokenizer = FullTokenizer(
         vocab_file=os.path.join(self.bert_model_dir, 'vocab.txt'))
     # Windows does not support logger in MP environment, thus get a new logger
     # this while loop keeps the generator alive, so estimator.predict does not have to reload the graph
     while not self.closed:
         is_tokenized = all(isinstance(el, list) for el in self.text)
         tmp_f = list(
             convert_lst_to_features(self.text,
                                     self.seq_length,
                                     tokenizer,
                                     is_tokenized,
                                     mask_cls_sep=True))
         # print([f.input_ids for f in tmp_f])
         yield {
             'input_ids': [f.input_ids for f in tmp_f],
             'input_mask': [f.input_mask for f in tmp_f],
             'input_type_ids': [f.input_type_ids for f in tmp_f]
         }
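Not part of the original snippet: a sketch of how a generator like gen is usually consumed, by wrapping it in a tf.data.Dataset that estimator.predict pulls from without rebuilding the graph (the method name, dtypes and shapes below are assumptions; tensorflow is imported as tf).

def input_fn(self):
    # Hypothetical input_fn: stream batches produced by gen() into the estimator.
    return tf.data.Dataset.from_generator(
        self.gen,
        output_types={'input_ids': tf.int32,
                      'input_mask': tf.int32,
                      'input_type_ids': tf.int32},
        output_shapes={'input_ids': (None, None),
                       'input_mask': (None, None),
                       'input_type_ids': (None, None)})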
Example no. 21
        help="Maximum number of contexts to output for an example.")
    parser.add_argument(
        "--max_position",
        type=int,
        default=50,
        help="Maximum context position for which to generate special tokens.")
    parser.add_argument(
        "--skip_nested_contexts",
        type=bool,
        default=True,
        help=
        "Completely ignore contexts that are not top-level nodes in the page.")

    args = parser.parse_args()
    tokenizer = FullTokenizer(
        'check_points/bert-large-wwm-finetuned-squad/vocab.txt',
        do_lower_case=True)

    # train preprocess
    import ipdb

    output_file = os.path.join(
        args.output_dir, 'train_data_maxlen{}.bin'.format(args.max_seq_length))
    ipdb.set_trace()
    examples = read_nq_examples(input_file=args.train_file,
                                is_training=True,
                                args=args)
    num_spans_to_ids, features = convert_examples_to_features(
        examples=examples, tokenizer=tokenizer, is_training=True, args=args)
    torch.save((features, examples), output_file)
    for spans, ids in num_spans_to_ids.items():
Example no. 22
 def onSetup(self):
     BERT_DIR = os.path.join(ue.get_content_dir(), 'Scripts', 'BertModel')
     self.imported = tf.saved_model.load(BERT_DIR)
     self.f = self.imported.signatures["serving_default"]
     VOCAB_PATH = os.path.join(BERT_DIR, "assets", "vocab.txt")
     self.tokenizer = FullTokenizer(VOCAB_PATH)
Example no. 23
for x, y in imdb["train"].batch(128):
    imdb_reviews_train.extend(x.numpy())
    y_train.extend(y.numpy())
for x, y in imdb["test"].batch(128):
    imdb_reviews_test.extend(x.numpy())
    y_test.extend(y.numpy())
y_train = np.array(y_train)
y_test = np.array(y_test)

# Extract pre-trained BERT as a Keras layer.
bert_model_path = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(bert_model_path, trainable=False)

# Build tokenizer from pre-trained BERT vocabulary.
bert_tokenizer = FullTokenizer(
    vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy(),
    do_lower_case=bert_layer.resolved_object.do_lower_case.numpy())

# TODO:
# Documents longer than 512 tokens can't be encoded by BERT,
# since its positional encoding has a hard limit of 512 positions.
# For better results we may need to summarize the document into <= 512 tokens,
# or encode sentence by sentence and then pool the results together.
maxlen = 256

# TODO:
# We need to manually handle the [CLS] and [SEP] special tokens for sequence beginning and ending.

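# A minimal sketch of that manual handling (an illustration, not the original
# pipeline): wrap each truncated token list with the special tokens before
# converting to ids, for example:
#   tokens = ["[CLS]"] + bert_tokenizer.tokenize(text)[:maxlen - 2] + ["[SEP]"]
#   ids = bert_tokenizer.convert_tokens_to_ids(tokens)
#   ids = ids + [0] * (maxlen - len(ids))   # right-pad to maxlen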
# Encode text with padding, masking, and segmentation (required by BERT even if we don't use it).
tok_seq_train = [bert_tokenizer.tokenize(text) for text in imdb_reviews_train]
wid_seq_train = [
Example no. 24
    clf_output = sequence_output[:, 0, :]

    # NOTE: this LSTM layer is created but never applied; the Dense head below reads clf_output directly.
    out = keras.layers.LSTM(128)
    out = keras.layers.Dense(1, activation='sigmoid')(clf_output)

    model = keras.Model(inputs=[input_word_ids, input_mask, segment_ids],
                        outputs=out)
    model.compile(keras.optimizers.Adam(lr=0.00001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

train_input = bert_encode(train.text1.values, tokenizer, maxlen)
test_input = bert_encode(test.text1.values, tokenizer, maxlen)

model = build_model(bert_layer, maxlen)
model.summary()

# model = keras.Sequential([
#         bert_layer([input_word_ids, input_mask, segment_ids]),
#         # keras.layers.Dropout(0.3),
#         keras.layers.LSTM(128),
#         # keras.layers.Dropout(0.3),
#         keras.layers.Dense(64),
#         keras.layers.Dense(1, activation = 'sigmoid')
#     ]
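bert_encode is not defined in this snippet; a minimal sketch of such a helper (an assumption about its contract: it returns the three arrays the BERT layer expects), for single-sentence inputs:

import numpy as np

def bert_encode(texts, tokenizer, max_len):
    # Hypothetical helper: build input_word_ids, input_mask and segment_ids.
    all_ids, all_masks, all_segments = [], [], []
    for text in texts:
        tokens = ["[CLS]"] + tokenizer.tokenize(text)[:max_len - 2] + ["[SEP]"]
        ids = tokenizer.convert_tokens_to_ids(tokens)
        pad_len = max_len - len(ids)
        all_ids.append(ids + [0] * pad_len)
        all_masks.append([1] * len(ids) + [0] * pad_len)
        all_segments.append([0] * max_len)
    return np.array(all_ids), np.array(all_masks), np.array(all_segments)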
Example no. 25
def convert_single_example(example,
                           max_seq_length=256,
                           tokenizer=FullTokenizer()):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    label_map = {label: i for i, label in enumerate(label_list)}

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    if tokens_b:
        tokens += tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    if len(input_ids) < max_seq_length:
        input_ids += [0] * (max_seq_length - len(input_ids))
        input_mask += [0] * (max_seq_length - len(input_mask))
        segment_ids += [0] * (max_seq_length - len(segment_ids))
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label] if example.label else 0

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            is_real_example=True)
    return feature
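A usage sketch (assuming the surrounding module defines label_list and the InputExample / InputFeatures classes from BERT's run_classifier, and that vocab.txt is the model's vocabulary file):

example = InputExample(guid="demo-0",
                       text_a="is this jacksonville ?",
                       text_b="no it is not .",
                       label=label_list[0])
feature = convert_single_example(example, max_seq_length=128,
                                 tokenizer=FullTokenizer("vocab.txt", do_lower_case=True))
print(len(feature.input_ids))  # 128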
Example no. 26
def main(_):
  import time
  os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.gpu_id)
  current_path = os.path.dirname(os.path.abspath(__file__))
  tokenizer = FullTokenizer(os.path.join(current_path, './model/chinese_L-12_H-768_A-12/vocab.txt'))
  Configuration=namedtuple('Configuration', ['fp16', 'bert_config', 'checkpoint_path', 'graph_tmp_dir', 'max_seq_length'])

  fp16=False
  bert_config='./model/chinese_L-12_H-768_A-12/bert_config.json'
  #checkpoint_path='./model/ad/model_0622/model.ckpt-610194'
  checkpoint_path='./model/ad/model_0626/model.ckpt-610194'
  checkpoint_path='./model/ad/model_pretrain_ctr_0826/model.ckpt-16352'
  graph_tmp_dir='./model/ad/tmp/'
  max_seq_length=70
  configuration=Configuration(fp16, bert_config, checkpoint_path,graph_tmp_dir,max_seq_length)

  graph_path, bert_config = optimize_graph(configuration)
  worker = BertWorker(0, graph_path, configuration)

  start=time.time()
  for no in range(140):
    suffix=10000+no
    slice_path='/data1/zhangpengpeng/ad_data/eval_ins2_20190701/lm_validset_%s' % str(suffix)[1:]
    slice_output_path=os.path.join('/data1/zhangpengpeng/ad_data/eval_ins2_20190701/', 'lm_validset_mask_output_%s' % str(suffix)[1:])
    slice_output_file=tf.gfile.Open(slice_output_path, 'w')
    if not tf.gfile.Exists(slice_path):
      continue
    if tf.gfile.Exists(slice_output_path):
      continue
    print(slice_path, slice_output_path)
    count=0
    with tf.gfile.Open(slice_path, 'r') as f:
      input_ids_list=[]
      input_mask_list=[]
      segment_ids_list=[]
      rows=[]
      for index, line in enumerate(f):
        row=line.split('\t', 4)
        if len(row)!=5:
          continue
        text_a=row[3]
        text_b=row[-1].strip()
        text_c=row[1]
        start_position=text_b.find(text_c)
        feature=get_example(tokenizer, text_a, text_b, text_c, start_position, 70)
        input_ids_list.append(feature[0])
        input_mask_list.append(feature[1])
        segment_ids_list.append(feature[2])
        rows.append(row)
        if len(input_ids_list) == 60:
          features=(input_ids_list, input_mask_list, segment_ids_list)
          tags, scores = worker.predict(features)
          for i in range(len(input_ids_list)):
            slice_output_file.write('%f\t%s\n' % (scores[i][1], '\t'.join(rows[i][:3])))
          input_ids_list=[]
          input_mask_list=[]
          segment_ids_list=[]
          rows=[]
        count+=1
      if len(rows)>0:
        features=(input_ids_list, input_mask_list, segment_ids_list)
        tags, scores = worker.predict(features)
        for i in range(len(input_ids_list)):
          slice_output_file.write('%f\t%s\n' % (scores[i][1], '\t'.join(rows[i][:3])))
    # close the per-slice output even when no leftover rows remained
    slice_output_file.close()
    end=time.time()
    print("filename: %s\tqps: %d" % (slice_path, count/(end-start)))
Example no. 27
        axis=1,
    )

    data.test_InputExamples = data.test.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN],
                                              text_b=None,
                                              label=x[LABEL_COLUMN]),
        axis=1,
    )
#%%
# We'll set sequences to be at most 256 tokens long.
MAX_SEQ_LENGTH = 256

VOC_FNAME = "./64000_vocab_sp_70m.txt"
tokenizer = FullTokenizer(VOC_FNAME)

for data in tqdm(all_datasets):
    # Convert our train and test features to InputFeatures that BERT understands.
    data.train_features = run_classifier.convert_examples_to_features(
        data.train_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer)
    data.test_features = run_classifier.convert_examples_to_features(
        data.test_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer)

# %%
import pickle

with open("all_datasets_64k_farasa_256.pickle", "wb") as fp:  # Pickling
    pickle.dump(all_datasets, fp)

# %%
Example no. 28
#!/usr/bin/env python
# coding: utf-8

from modeling import BertForQuestionAnswering, BertConfig

#config = BertConfig.from_json_file('uncased_L-12_H-768_A-12/bert_config.json')
# config = BertConfig.from_json_file('configs/pals_config.json')
# model = BertForQuestionAnswering(config)
# model.load_pretained('initial_bert.bin', patch=True)
# print(model)

from tokenization import FullTokenizer, BasicTokenizer

tokenizer = FullTokenizer('uncased_L-12_H-768_A-12/vocab.txt', do_lower_case=True)
tokens = tokenizer.tokenize('I love China!!')
print(tokens)
tokenizer = BasicTokenizer()
tokens = tokenizer.tokenize('[SEP]')
print(tokens)
Example no. 29
                    help='whether to use gpu for finetuning')
args = parser.parse_args()
logging.info(args)
batch_size = args.batch_size
test_batch_size = args.test_batch_size
lr = args.lr

ctx = mx.cpu() if args.gpu is None or args.gpu == '' else mx.gpu()

bert, vocabulary = bert_12_768_12(dataset_name='book_corpus_wiki_en_uncased',
                                  pretrained=True,
                                  ctx=ctx,
                                  use_pooler=True,
                                  use_decoder=False,
                                  use_classifier=False)
tokenizer = FullTokenizer(vocabulary, do_lower_case=True)

model = BERTClassifier(bert, dropout=0.1)
model.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
model.hybridize(static_alloc=True)
logging.info(model)

loss_function = gluon.loss.SoftmaxCELoss()
loss_function.hybridize(static_alloc=True)

metric = mx.metric.Accuracy()

trans = ClassificationTransform(tokenizer, MRPCDataset.get_labels(),
                                args.max_len)
data_train = MRPCDataset('train').transform(trans)
data_dev = MRPCDataset('dev').transform(trans)
Example no. 30
#set_dir = 'pleasant_unpleasant'
set_dir = 'career_family'
#set_dir = 'unpleasant_pleasant'
#set_dir = 'follower_leader'

targets = ['male', 'female']

attributes = ['career', 'family']
#attributes = ['unpleasant']
#attributes = ['pleasant']
#attributes = ['leader', 'follower']

templates = ['templates']

vocab_file = 'uncased_L-12_H-768_A-12/vocab.txt'
tok = FullTokenizer(vocab_file)

if SWAP_TARGETS:
    tmp = copy.deepcopy(targets)
    targets = attributes
    attributes = tmp


def open_results_file(path):
    result = pickle.load(open(path, 'rb'))
    for res in result:
        res['file'] = path
    return result


def load_results(base_results_dir,