def bert_tokenizer(sess):
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file = sess.run(tokenization_info["vocab_file"])
    do_lower_case = sess.run(tokenization_info["do_lower_case"])
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
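# A minimal usage sketch for the helper above (an illustration, not part of the
# original source): it assumes TF1-style sessions, tensorflow_hub, and that
# `bert_path` points at a TF-Hub BERT module that exposes the
# "tokenization_info" signature.
import tensorflow as tf
import tensorflow_hub as hub

bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"  # assumed module URL

with tf.Graph().as_default():
    with tf.Session() as sess:
        tokenizer = bert_tokenizer(sess)
        print(tokenizer.tokenize("a quick tokenization check"))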
def __init__(self, bert_config_file, init_checkpoint, max_seq_length,
             vocab_file, num_labels, use_gpu=False):
    # Needed for loading the pre-trained parameters
    self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
    self.init_checkpoint = init_checkpoint
    # Needed for the dataset and the computation
    self.max_seq_length = max_seq_length
    self.num_labels = num_labels
    # Needed for data preprocessing
    self.vocab_file = vocab_file
    self.tokenizer = FullTokenizer(self.vocab_file, do_lower_case=False)  # cased model by default
    # GPU
    self.use_gpu = use_gpu

    self.graph = tf.Graph()  # declare the computation graph
    with self.graph.as_default():
        # Define placeholders
        self.input_ids = tf.placeholder(dtype=tf.int64, shape=(None, self.max_seq_length))
        self.input_mask = tf.placeholder(dtype=tf.int64, shape=(None, self.max_seq_length))
        self.segment_ids = tf.placeholder(dtype=tf.int64, shape=(None, self.max_seq_length))
        # Define the computation
        (self.logits, self.probabilities) = create_predict_model(
            self.bert_config, self.input_ids, self.input_mask,
            self.segment_ids, self.num_labels)
        # Load the pre-trained parameters
        self.tvars = tf.trainable_variables()  # once the graph is built, the trainable variables exist
        self.initialized_variable_names = {}
        if self.init_checkpoint:
            # init_checkpoint is the pre-trained BERT (or previously trained) .ckpt file passed on the command line.
            # Fetch the values of the usable variables from init_checkpoint
            # (the intersection of the pre-trained model's variables and the task graph's variables).
            (self.assignment_map, self.initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(self.tvars, self.init_checkpoint)
            tf.train.init_from_checkpoint(self.init_checkpoint, self.assignment_map)
def main(_):
    tokenizer_zh = FullTokenizer(vocab_file=FLAGS.bert_vocab_file, do_lower_case=True)
    tokenizer_en = load_subword_vocab(FLAGS.vocab_file)
    target_vocab_size = tokenizer_en.vocab_size + 2

    config = FileConfig(FLAGS.config_file)
    transformer = Transformer(config=config,
                              target_vocab_size=target_vocab_size,
                              bert_config_file=FLAGS.bert_config_file)

    # Run a dummy forward pass so the model variables are created before loading weights.
    inp = tf.random.uniform((1, FLAGS.max_seq_length))
    tar_inp = tf.random.uniform((1, FLAGS.max_seq_length))
    fn_out, _ = transformer(inp, tar_inp, True,
                            look_ahead_mask=None,
                            dec_padding_mask=None)
    transformer.load_weights(FLAGS.init_checkpoint)
    print(transformer.encoder.weights[0])

    result, _ = evaluate(transformer, tokenizer_zh, tokenizer_en,
                         FLAGS.inp_sentence, FLAGS.max_seq_length)
    predicted_sentence = tokenizer_en.decode(
        [i for i in result if i < tokenizer_en.vocab_size])

    print('Input: {}'.format(FLAGS.inp_sentence))
    print('Predicted translation: {}'.format(predicted_sentence))
def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    global do_lower_case
    # `sess` is assumed to be an existing tf.Session in the enclosing scope.
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                          tokenization_info["do_lower_case"]])
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
def preprocess(data):
    tokenizer = FullTokenizer(vocab_file)
    tok_ip = np.zeros((len(data), 128), dtype="int32")
    sent_ip = np.zeros((len(data), 128), dtype="int8")
    pos_ip = np.zeros((len(data), 128), dtype="int8")
    masks = np.zeros((len(data), 128), dtype="int8")
    for pos, text in tqdm.tqdm_notebook(enumerate(data)):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        if len(tok) > 128:
            tok = tok[:127] + ["[SEP]"]
        pad_len = 128 - len(tok)
        tok_len = len(tok)
        tok0_len = len(tok0)
        tok = tokenizer.convert_tokens_to_ids(tok) + [0] * pad_len
        pos_val = range(128)
        sent = [0] * tok0_len + [1] * (tok_len - tok0_len) + [0] * pad_len
        mask = [1] * tok_len + [0] * pad_len
        tok_ip[pos] = tok
        sent_ip[pos] = sent  # store segment ids (0 for the first text, 1 for the second)
        pos_ip[pos] = pos_val
        masks[pos] = mask
    masks = masks[:, None, None, :]
    return tok_ip, sent_ip, pos_ip, masks
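# Hedged usage sketch (not from the original source): `preprocess` expects an
# iterable of (text_a, text_b) pairs plus a module-level `vocab_file` path to a
# standard BERT vocabulary; the shapes below follow from the hard-coded length 128.
vocab_file = "uncased_L-12_H-768_A-12/vocab.txt"  # assumed location

pairs = [("the movie was great", "i would watch it again"),
         ("the plot was thin", "the acting saved it")]
tok_ip, sent_ip, pos_ip, masks = preprocess(pairs)
print(tok_ip.shape, sent_ip.shape, pos_ip.shape, masks.shape)
# -> (2, 128) (2, 128) (2, 128) (2, 1, 1, 128)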
def parse_line(self, line, max_seq_len=512):
    """ parse one line to token_ids, sentence_ids, pos_ids, label
    """
    line = line.strip().split(",")
    assert len(line) == 3, \
        "One sample must have %d fields!" % 3
    text_left, text_right, masklabel = line
    tokenizer = FullTokenizer(self.vocab_path)
    # tokenizer = FullTokenizer(vocab_path)
    text_left = tokenizer.tokenize(text_left)
    masklabel = tokenizer.tokenize(masklabel)
    masklabel_ = len(masklabel) * ["[MASK]"]
    text_right = tokenizer.tokenize(text_right)
    all_tokens = text_left + masklabel_ + text_right
    token_ids = tokenizer.convert_tokens_to_ids(all_tokens)
    sent_ids = [0] * len(all_tokens)
    pos_ids = [i for i in range(len(all_tokens))]
    input_mask = [1.0] * len(all_tokens)
    # Collect the positions of the [MASK] tokens.
    mask_pos = []
    for idx, mask in enumerate(token_ids):
        if mask == self.mask_id:
            mask_pos.append(idx)
    # Add the mask labels (the ids of the original masked tokens).
    mask_label = list(tokenizer.convert_tokens_to_ids(masklabel))
    assert len(token_ids) == len(sent_ids) == len(pos_ids) == len(
        input_mask
    ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids) == len(seg_labels)"
    if len(token_ids) > max_seq_len:
        return None
    return [token_ids, sent_ids, pos_ids, input_mask, mask_pos, mask_label]
def __init__(self, gcn=False, soft_masked=True, lr=5e-5, beta=3.0, layers_num=2,
             dropout=1.0, bert_max_len=192,
             dir_model='../checkpoint/train/soft-masked-bert/',
             bert_dir='../rest-api/app/models/chinese_L-12_H-768_A-12',
             train=True):
    # super(BERT_GCN, self).__init__(config)
    self.gcn = gcn
    self.soft_masked = soft_masked
    self.lr = lr
    self.dir_model = dir_model
    # self.nepochs = nepochs
    self.dropout = dropout  # 1
    self.bert_dir = bert_dir
    self.bert_max_len = bert_max_len
    self.beta = beta
    self.layers_num = layers_num

    config_file = bert_dir + '/bert_config.json'
    self.init_checkpoint = bert_dir + '/bert_model.ckpt'
    vocab_file = bert_dir + '/vocab.txt'
    self.vocab = construct_vocab(vocab_file)
    self.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.bert_api = BertModel(
        config=BertConfig.from_json_file(config_file), soft_masked=soft_masked)
    if self.soft_masked:
        self.tags_dict = {'O': 0, 'B-Err': 1}

    if self.gcn:
        # Read the word-embedding table from the checkpoint file.
        reader = pywrap_tensorflow.NewCheckpointReader(self.init_checkpoint)
        var_to_shape_map = reader.get_variable_to_shape_map()
        for key in var_to_shape_map:
            if "word_embeddings" in key:
                emb_table = reader.get_tensor(key)
                break
        with open("filter_dict.json", 'r') as load_f:
            dict_filter = json.load(load_f)
        # in_conf_ind = sorted(dict_filter.items(), key=lambda d: d[0])
        zero_id = len(dict_filter)
        print("zero_id:", zero_id)
        self.emb_table_filted = []
        self.w_index = [zero_id] * 21128
        # self.b_index = []
        self.emb_mask = np.ones([21128, 768])
        for x in dict_filter:
            self.w_index[int(x)] = dict_filter[x]
            self.emb_table_filted.append(emb_table[int(x)])
            # emb_table[int(x)] = np.zeros([768])
            self.emb_mask[int(x)] = np.zeros([768])
        self.emb_table_filted = np.array(self.emb_table_filted)
        r = np.load('spellgcn_adj_norm.npz')
        self.p_A = r['A_p'].astype(np.float32)
        self.s_A = r['A_s'].astype(np.float32)
        self.p_A = tf.constant(self.p_A)
        self.s_A = tf.constant(self.s_A)
def test_tokenize(self):
    tokenizer = FullTokenizer()
    sentence = '実質的変化はなかった'
    res = tokenizer.tokenize(sentence)
    firsts = [0, 2, 3, 5, 6, 9]
    tokens = [
        CharToken(c, is_first=i in firsts) for i, c in enumerate(sentence)
    ]
    self.assertEqual(res, tokens)
def dump_node_feat(args):
    log.info("Dump node feat starting...")
    id2str = np.load(os.path.join(args.outpath, "id2str.npy"), mmap_mode="r")
    pool = multiprocessing.Pool()
    tokenizer = FullTokenizer(args.vocab_file)
    term_ids = pool.map(
        partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen),
        id2str)
    np.save(os.path.join(args.outpath, "term_ids.npy"), np.array(term_ids))
    log.info("Dump node feat done.")
    pool.terminate()
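# `term2id` is referenced above but not defined in this snippet. A purely
# illustrative sketch of what such a helper could look like (the real
# implementation may differ) converts one string into a fixed-length id row:
def term2id_sketch(string, tokenizer, max_seqlen):
    # Tokenize, truncate to leave room for a trailing [SEP], convert to ids,
    # and zero-pad so every node-feature row has length max_seqlen.
    tokens = tokenizer.tokenize(string)[:max_seqlen - 1] + ["[SEP]"]
    ids = tokenizer.convert_tokens_to_ids(tokens)
    return ids + [0] * (max_seqlen - len(ids))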
def get_lens(data):
    tokenizer = FullTokenizer(vocab_file)
    lens = []
    for pos, text in tqdm.tqdm(enumerate(data)):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        lens.append(len(tok))
    return np.array(lens)
def test_tokenize_with_nelogd(self):
    NEOLOGD_PATH = "/usr/local/lib/mecab/dic/ipadic/mecab-user-dict-seed.dic"
    if not os.path.isfile(NEOLOGD_PATH):
        raise ValueError(
            'NEOLOGD_PATH is invalid. Please set a file path to neologd dic')
    sentence = '実質的変化はなかった'
    tokenizer = FullTokenizer(userdic_path=NEOLOGD_PATH)
    firsts = [0, 3, 5, 6, 9]
    tokens = [
        CharToken(c, is_first=i in firsts) for i, c in enumerate(sentence)
    ]
    res = tokenizer.tokenize(sentence)
    self.assertEqual(res, tokens)
def __init__(self):
    self.THRESHOLD = 0.1
    self.PROB_THRESHOLD = 0.8
    self.LABELS_32 = [
        "sentimental", "afraid", "proud", "faithful", "terrified", "joyful",
        "angry", "sad", "jealous", "grateful", "prepared", "embarrassed",
        "excited", "annoyed", "lonely", "ashamed", "guilty", "surprised",
        "nostalgic", "confident", "furious", "disappointed", "caring",
        "trusting", "disgusted", "anticipating", "anxious", "hopeful",
        "content", "impressed", "apprehensive", "devastated"
    ]
    self.MAX_SEQ_LENGTH = 50
    self.tokenizer = FullTokenizer(vocab_file='vocab.txt', do_lower_case=True)
    self.model = load_model('model_data/model32')
    self.matrix = np.genfromtxt('emotion_multiplier.csv')
    self.map_probabilities = np.vectorize(
        lambda x: 1 if x >= self.PROB_THRESHOLD else 0)
def setUp(self):
    with NamedTemporaryFile(mode='w') as tf:
        tf.write("a\n[CLS]\nb\n[SEP]c\nd\ne\nf\ng\nh\n")
        tf.seek(0)
        tokenizer = FullTokenizer(vocab_file=tf.name)
        self.vocab_words = list(tokenizer.vocab.keys())
    self.tokens = [
        CharToken('a', True),
        CharToken('b', False),
        CharToken('c', False),
        CharToken('d', True),
        CharToken('e', False),
        CharToken('f', True),
        CharToken('g', False),
        CharToken('h', True)
    ]
def adaptERNIEtokenization(all_sentences):
    tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
    ernie_tokens = [
        tokenizer.tokenize(sentence) for sentence in tqdm(all_sentences)
    ]
    print("Parsed to ERNIE tokens!")
    all_cleaned_tokens = []
    for line in tqdm(ernie_tokens):
        cleaned_tokens = []
        for i, token in enumerate(line):
            if token[:2] == "##":
                # Merge WordPiece continuation pieces back onto the previous token.
                cleaned_tokens[-1] += token[2:]
            else:
                cleaned_tokens.append(token)
        all_cleaned_tokens.append(cleaned_tokens)
    return all_cleaned_tokens
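# Quick check of the behaviour above (illustrative, assuming a local vocab.txt
# and tqdm are available): "##" continuation pieces are folded back into the
# preceding token, so the result is a list of whole words per sentence.
sentences = ["tokenization can be reversed approximately"]
print(adaptERNIEtokenization(sentences))
# e.g. [['tokenization', 'can', 'be', 'reversed', 'approximately']]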
def load_model(self, model_dir: str, model_config: str = "model_config.json"):
    model_config = os.path.join(model_dir, model_config)
    model_config = json.load(open(model_config))
    bert_config = json.load(open(os.path.join(model_dir, "bert_config.json")))
    model = BertNer(bert_config, tf.float32, model_config['num_labels'],
                    model_config['max_seq_length'])
    ids = tf.ones((1, 128), dtype=tf.int64)
    _ = model(ids, ids, ids, ids, training=False)
    model.load_weights(os.path.join(model_dir, "model.h5"))
    vocab = os.path.join(model_dir, "vocab.txt")
    tokenizer = FullTokenizer(vocab_file=vocab,
                              do_lower_case=model_config["do_lower"])
    return model, tokenizer, model_config
def dump_node_feat(args):
    log.info("Dump node feat starting...")
    id2str = [
        line.strip("\n").split("\t")[1]
        for line in io.open(os.path.join(args.outpath, "terms.txt"),
                            encoding=args.encoding)
    ]
    pool = multiprocessing.Pool()
    tokenizer = FullTokenizer(args.vocab_file)
    term_ids = pool.map(
        partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen),
        id2str)
    np.save(os.path.join(args.outpath, "term_ids.npy"),
            np.array(term_ids, np.uint16))
    log.info("Dump node feat done.")
    pool.terminate()
def __init__(self, bert_meta):
    self.graph = self._load_graph(bert_meta.model_file)
    self.tokenizer = FullTokenizer(vocab_file=bert_meta.vocab_file, do_lower_case=True)
    self.max_seq_length = 128

    # Input.
    self.input_ids = self.graph.get_tensor_by_name('infer/input_ids:0')
    self.word_ids = self.graph.get_tensor_by_name('infer/input_mask:0')
    self.segment_ids = self.graph.get_tensor_by_name('infer/segment_ids:0')

    # Output.
    self.predictions = self.graph.get_tensor_by_name('infer/loss/Softmax:0')

    self.sess = tf.Session(graph=self.graph)
    # Warm up the session with a dummy inference call.
    self.inference(BertInputPackage(u'预热一下'))
def gen_data(in_file, out_file, tagType):
    with open(in_file, 'r', encoding='utf8') as f:
        raw_data = [_.strip() for _ in f.readlines()]
    vocab_file = '../models/vocab.txt'
    full_tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    basic_tokenizer = BasicTokenizer(do_lower_case=True)
    data_all = [
        preprocess2dict(s, tagType, full_tokenizer, basic_tokenizer)
        for s in tqdm(raw_data)
    ]
    df = pd.DataFrame(data_all)
    # separate with \t
    df.to_csv(out_file, sep='\t', encoding='utf-8', index=False)
    print('Finish writing generated ' + tagType + ' data in ' + out_file)
def __init__(self, param):
    self.model_path = os.path.abspath(param["model_path"])
    self.bert_config_file = os.path.abspath(param["bert_config_file"])
    bert_config = modeling.BertConfig.from_json_file(self.bert_config_file)
    self.fulltoken = FullTokenizer(os.path.abspath(param["vocab_file"]))
    self.vocab_dict = self.fulltoken.vocab

    target_start_ids = self.vocab_dict["[CLS]"]
    target_end_ids = self.vocab_dict["[SEP]"]

    num_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))
    tf.logging.info("num_gpus is {}".format(num_gpus))
    if param["use_mul_gpu"]:
        distribute = tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)
    else:
        distribute = None
    run_config = tf.estimator.RunConfig(
        model_dir=os.path.abspath(self.model_path),
        save_summary_steps=200,
        keep_checkpoint_max=2,
        save_checkpoints_steps=3000,
        train_distribute=distribute,
        eval_distribute=distribute)
    self.input_max_seq_length = param["max_seq_length"]
    model_fn = model_fn_builder(
        bert_config,
        init_checkpoint=None,
        learning_rate=0.0001,
        num_train_steps=10000,
        num_warmup_steps=100,
        use_one_hot_embeddings=False,  # True only when running on TPU
        input_seq_length=param["max_seq_length"],
        target_seq_length=param["max_target_seq_length"],
        target_start_ids=target_start_ids,
        target_end_ids=target_end_ids,
        batch_size=param["batch_size"],
        mode_type=param["mode_type"])
    self.estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
def gen(self):
    from extract_features import convert_lst_to_features
    from tokenization import FullTokenizer
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(self.bert_model_dir, 'vocab.txt'))
    # Windows does not support logger in MP environment, thus get a new logger
    # This while loop keeps the generator alive, so estimator.predict does not
    # have to reload the model.
    while not self.closed:
        is_tokenized = all(isinstance(el, list) for el in self.text)
        tmp_f = list(
            convert_lst_to_features(self.text, self.seq_length, tokenizer,
                                    is_tokenized, mask_cls_sep=True))
        # print([f.input_ids for f in tmp_f])
        yield {
            'input_ids': [f.input_ids for f in tmp_f],
            'input_mask': [f.input_mask for f in tmp_f],
            'input_type_ids': [f.input_type_ids for f in tmp_f]
        }
help="Maximum number of contexts to output for an example.") parser.add_argument( "--max_position", type=int, default=50, help="Maximum context position for which to generate special tokens.") parser.add_argument( "--skip_nested_contexts", type=bool, default=True, help= "Completely ignore context that are not top level nodes in the page.") args = parser.parse_args() tokenizer = FullTokenizer( 'check_points/bert-large-wwm-finetuned-squad/vocab.txt', do_lower_case=True) # train preprocess import ipdb output_file = os.path.join( args.output_dir, 'train_data_maxlen{}.bin'.format(args.max_seq_length)) ipdb.set_trace() examples = read_nq_examples(input_file=args.train_file, is_training=True, args=args) num_spans_to_ids, features = convert_examples_to_features( examples=examples, tokenizer=tokenizer, is_training=True, args=args) torch.save((features, examples), output_file) for spans, ids in num_spans_to_ids.items():
def onSetup(self):
    BERT_DIR = os.path.join(ue.get_content_dir(), 'Scripts', 'BertModel')
    self.imported = tf.saved_model.load(BERT_DIR)
    self.f = self.imported.signatures["serving_default"]
    VOCAB_PATH = os.path.join(BERT_DIR, "assets", "vocab.txt")
    self.tokenizer = FullTokenizer(VOCAB_PATH)
for x, y in imdb["train"].batch(128):
    imdb_reviews_train.extend(x.numpy())
    y_train.extend(y.numpy())
for x, y in imdb["test"].batch(128):
    imdb_reviews_test.extend(x.numpy())
    y_test.extend(y.numpy())
y_train = np.array(y_train)
y_test = np.array(y_test)

# Extract pre-trained BERT as a Keras layer.
bert_model_path = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(bert_model_path, trainable=False)

# Build tokenizer from pre-trained BERT vocabulary.
bert_tokenizer = FullTokenizer(
    vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy(),
    do_lower_case=bert_layer.resolved_object.do_lower_case.numpy())

# TODO:
# Documents longer than 512 tokens cannot be encoded by BERT,
# since its positional encoding has a hard limit of 512 positions.
# For better results we may need to summarize the document into <= 512 tokens,
# or encode sentence by sentence and then pool together.
maxlen = 256

# TODO:
# We need to manually handle the [CLS] and [SEP] special tokens for sequence beginning and ending.

# Encode text with padding, masking, and segmentation (required by BERT even if we don't use it).
tok_seq_train = [bert_tokenizer.tokenize(text) for text in imdb_reviews_train]
wid_seq_train = [
    clf_output = sequence_output[:, 0, :]
    out = keras.layers.LSTM(128)  # NOTE: this LSTM layer is instantiated but never applied; `out` is overwritten below
    out = keras.layers.Dense(1, activation='sigmoid')(clf_output)
    model = keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(keras.optimizers.Adam(lr=0.00001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

train_input = bert_encode(train.text1.values, tokenizer, maxlen)
test_input = bert_encode(test.text1.values, tokenizer, maxlen)

model = build_model(bert_layer, maxlen)
model.summary()

# model = keras.Sequential([
#     bert_layer([input_word_ids, input_mask, segment_ids]),
#     # keras.layers.Dropout(0.3),
#     keras.layers.LSTM(128),
#     # keras.layers.Dropout(0.3),
#     keras.layers.Dense(64),
#     keras.layers.Dense(1, activation='sigmoid')
# ]
def convert_single_example(example, max_seq_length=256, tokenizer=FullTokenizer()):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    label_map = {label: i for i, label in enumerate(label_list)}

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)
    if tokens_b:
        tokens += tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    if len(input_ids) < max_seq_length:
        input_ids += [0] * (max_seq_length - len(input_ids))
        input_mask += [0] * (max_seq_length - len(input_mask))
        segment_ids += [0] * (max_seq_length - len(segment_ids))

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label] if example.label else 0
    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            is_real_example=True)
    return feature
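# Hypothetical usage (not in the original source), following the BERT
# run_classifier conventions for InputExample; `label_list` is the module-level
# global the function reads, and "vocab.txt" is a placeholder vocabulary path.
label_list = ["0", "1"]
example = InputExample(guid="demo-1",
                       text_a="the movie was great",
                       text_b=None,
                       label="1")
feature = convert_single_example(example,
                                 max_seq_length=64,
                                 tokenizer=FullTokenizer("vocab.txt"))
print(len(feature.input_ids), sum(feature.input_mask))  # 64, number of real tokens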
def main(_):
    import time
    os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.gpu_id)
    current_path = os.path.dirname(os.path.abspath(__file__))
    tokenizer = FullTokenizer(
        os.path.join(current_path, './model/chinese_L-12_H-768_A-12/vocab.txt'))
    Configuration = namedtuple(
        'Configuration',
        ['fp16', 'bert_config', 'checkpoint_path', 'graph_tmp_dir', 'max_seq_length'])
    fp16 = False
    bert_config = './model/chinese_L-12_H-768_A-12/bert_config.json'
    # checkpoint_path = './model/ad/model_0622/model.ckpt-610194'
    checkpoint_path = './model/ad/model_0626/model.ckpt-610194'
    checkpoint_path = './model/ad/model_pretrain_ctr_0826/model.ckpt-16352'
    graph_tmp_dir = './model/ad/tmp/'
    max_seq_length = 70
    configuration = Configuration(fp16, bert_config, checkpoint_path,
                                  graph_tmp_dir, max_seq_length)
    graph_path, bert_config = optimize_graph(configuration)
    worker = BertWorker(0, graph_path, configuration)

    start = time.time()
    for no in range(140):
        suffix = 10000 + no
        slice_path = '/data1/zhangpengpeng/ad_data/eval_ins2_20190701/lm_validset_%s' % str(suffix)[1:]
        slice_output_path = os.path.join(
            '/data1/zhangpengpeng/ad_data/eval_ins2_20190701/',
            'lm_validset_mask_output_%s' % str(suffix)[1:])
        slice_output_file = tf.gfile.Open(slice_output_path, 'w')
        if not tf.gfile.Exists(slice_path):
            continue
        if tf.gfile.Exists(slice_output_path):
            continue
        print(slice_path, slice_output_path)
        count = 0
        with tf.gfile.Open(slice_path, 'r') as f:
            input_ids_list = []
            input_mask_list = []
            segment_ids_list = []
            rows = []
            for index, line in enumerate(f):
                row = line.split('\t', 4)
                if len(row) != 5:
                    continue
                text_a = row[3]
                text_b = row[-1].strip()
                text_c = row[1]
                start_position = text_b.find(text_c)
                feature = get_example(tokenizer, text_a, text_b, text_c,
                                      start_position, 70)
                input_ids_list.append(feature[0])
                input_mask_list.append(feature[1])
                segment_ids_list.append(feature[2])
                rows.append(row)
                if len(input_ids_list) == 60:
                    features = (input_ids_list, input_mask_list, segment_ids_list)
                    tags, scores = worker.predict(features)
                    for i in range(len(input_ids_list)):
                        slice_output_file.write(
                            '%f\t%s\n' % (scores[i][1], '\t'.join(rows[i][:3])))
                    input_ids_list = []
                    input_mask_list = []
                    segment_ids_list = []
                    rows = []
                count += 1
            if len(rows) > 0:
                features = (input_ids_list, input_mask_list, segment_ids_list)
                tags, scores = worker.predict(features)
                for i in range(len(input_ids_list)):
                    slice_output_file.write(
                        '%f\t%s\n' % (scores[i][1], '\t'.join(rows[i][:3])))
        slice_output_file.close()
        end = time.time()
        print("filename: %s\tqps: %d" % (slice_path, count / (end - start)))
        axis=1,
    )
    data.test_InputExamples = data.test.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN],
                                              text_b=None,
                                              label=x[LABEL_COLUMN]),
        axis=1,
    )

#%%
# We'll set sequences to be at most 256 tokens long.
MAX_SEQ_LENGTH = 256
VOC_FNAME = "./64000_vocab_sp_70m.txt"
tokenizer = FullTokenizer(VOC_FNAME)

for data in tqdm(all_datasets):
    # Convert our train and test features to InputFeatures that BERT understands.
    data.train_features = run_classifier.convert_examples_to_features(
        data.train_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer)
    data.test_features = run_classifier.convert_examples_to_features(
        data.test_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer)

# %%
import pickle

with open("all_datasets_64k_farasa_256.pickle", "wb") as fp:  # Pickling
    pickle.dump(all_datasets, fp)

# %%
#!/usr/bin/env python
# coding: utf-8

from modeling import BertForQuestionAnswering, BertConfig

# config = BertConfig.from_json_file('uncased_L-12_H-768_A-12/bert_config.json')
# config = BertConfig.from_json_file('configs/pals_config.json')
# model = BertForQuestionAnswering(config)
# model.load_pretained('initial_bert.bin', patch=True)
# print(model)

from tokenization import FullTokenizer, BasicTokenizer

tokenizer = FullTokenizer('uncased_L-12_H-768_A-12/vocab.txt', do_lower_case=True)
tokens = tokenizer.tokenize('I love China!!')
print(tokens)

tokenizer = BasicTokenizer()
tokens = tokenizer.tokenize('[SEP]')
print(tokens)
    help='whether to use gpu for finetuning')
args = parser.parse_args()
logging.info(args)

batch_size = args.batch_size
test_batch_size = args.test_batch_size
lr = args.lr
ctx = mx.cpu() if args.gpu is None or args.gpu == '' else mx.gpu()

bert, vocabulary = bert_12_768_12(dataset_name='book_corpus_wiki_en_uncased',
                                  pretrained=True,
                                  ctx=ctx,
                                  use_pooler=True,
                                  use_decoder=False,
                                  use_classifier=False)
tokenizer = FullTokenizer(vocabulary, do_lower_case=True)

model = BERTClassifier(bert, dropout=0.1)
model.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
model.hybridize(static_alloc=True)
logging.info(model)

loss_function = gluon.loss.SoftmaxCELoss()
loss_function.hybridize(static_alloc=True)
metric = mx.metric.Accuracy()

trans = ClassificationTransform(tokenizer, MRPCDataset.get_labels(), args.max_len)
data_train = MRPCDataset('train').transform(trans)
data_dev = MRPCDataset('dev').transform(trans)
# set_dir = 'pleasant_unpleasant'
set_dir = 'career_family'
# set_dir = 'unpleasant_pleasant'
# set_dir = 'follower_leader'

targets = ['male', 'female']
attributes = ['career', 'family']
# attributes = ['unpleasant']
# attributes = ['pleasant']
# attributes = ['leader', 'follower']
templates = ['templates']

vocab_file = 'uncased_L-12_H-768_A-12/vocab.txt'
tok = FullTokenizer(vocab_file)

if SWAP_TARGETS:
    tmp = copy.deepcopy(targets)
    targets = attributes
    attributes = tmp


def open_results_file(path):
    result = pickle.load(open(path, 'rb'))
    for res in result:
        res['file'] = path
    return result


def load_results(base_results_dir,