def predict(self, inputs, **kwargs):
    """Builds the answer-logit graph and registers the BERT checkpoint restore.

    Args:
      inputs: A dictionary of input tensors keyed by names.
      **kwargs: Accepted for interface compatibility; not read here.

    Returns:
      predictions: A dictionary with one entry, FIELD_ANSWER_PREDICTION,
        holding the value returned by `self._predict_logits`.
    """
    model_options = self._model_proto
    training = self._is_training
    checkpoint_path = model_options.bert_checkpoint_file

    # Helper objects handed to the logit predictor.
    vocab_lookup = token_to_id.TokenToIdLayer(
        model_options.bert_vocab_file, model_options.bert_unk_token_id)
    config = BertConfig.from_json_file(model_options.bert_config_file)
    fc_scope_fn = hyperparams.build_hyperparams(
        model_options.fc_hyperparams, training)
    fc_scope = fc_scope_fn()

    logits = self._predict_logits(
        inputs[self._field_answer_choices],
        inputs[self._field_answer_choices_len],
        vocab_lookup,
        config,
        fc_scope,
        model_options.dropout_keep_prob,
        training)

    # Map the variables created so far onto the checkpoint contents.
    restore_map, _ = get_assignment_map_from_checkpoint(
        tf.global_variables(), checkpoint_path)
    tf.compat.v1.train.init_from_checkpoint(checkpoint_path, restore_map)

    return {FIELD_ANSWER_PREDICTION: logits}
def _bert_model(self, input_ids, input_tag_features, input_masks):
    """Builds a tag-aware Bert model and returns its pooled output.

    Args:
      input_ids: A [batch, max_seq_len] int tensor.
      input_tag_features: Forwarded to BertModel as `tag_features`
        (shape is not visible from this method - TODO confirm).
      input_masks: A [batch, max_seq_len] int tensor.

    Returns:
      The value of `get_pooled_output()` on the constructed BertModel.
    """
    model_options = self._model_proto
    checkpoint_path = model_options.bert_checkpoint_file

    config = BertConfig.from_json_file(model_options.bert_config_file)
    model = BertModel(config,
                      self._is_training,
                      input_ids=input_ids,
                      input_mask=input_masks,
                      use_tag_embeddings=True,
                      tag_features=input_tag_features)

    # Restore from checkpoint; `global_step` is excluded from the mapping.
    restore_map, _ = get_assignment_map_from_checkpoint(
        tf.global_variables(), checkpoint_path)
    restore_map.pop('global_step', None)
    tf.compat.v1.train.init_from_checkpoint(checkpoint_path, restore_map)

    return model.get_pooled_output()
def __init__(self, path, training=False, max_seq_length=512):
    """Build a BERT graph from the files in `path` and open a session on it.

    Reads `bert_config.json`, `bert_model.ckpt` and `vocab.txt` from `path`.

    Args:
      path: Directory holding the three files above (joined with '/').
      training: Forwarded to BertModel as `is_training`.
      max_seq_length: Second dimension of the three int32 placeholders.
    """
    self.max_seq_length = max_seq_length
    checkpoint_path = path + '/bert_model.ckpt'
    placeholder_shape = (None, self.max_seq_length)

    self.graph = tf.Graph()
    with self.graph.as_default():
        self.input_ids = tf.compat.v1.placeholder(
            tf.int32, shape=placeholder_shape)
        self.input_mask = tf.compat.v1.placeholder(
            tf.int32, shape=placeholder_shape)
        self.segment_ids = tf.compat.v1.placeholder(
            tf.int32, shape=placeholder_shape)

        self.bert_config = BertConfig.from_json_file(
            path + '/bert_config.json')
        self.bert_module = BertModel(config=self.bert_config,
                                     is_training=training,
                                     input_ids=self.input_ids,
                                     input_mask=self.input_mask,
                                     token_type_ids=self.segment_ids,
                                     use_one_hot_embeddings=False)

        # Use the tf.compat.v1 entry points throughout: the bare
        # `tf.trainable_variables` / `tf.train.init_from_checkpoint` names
        # are TF1-only, while the rest of this method already goes through
        # tf.compat.v1.
        assignment_map, _ = get_assignment_map_from_checkpoint(
            tf.compat.v1.trainable_variables(), checkpoint_path)
        tf.compat.v1.train.init_from_checkpoint(checkpoint_path,
                                                assignment_map)

        # Bind the session to this object's graph explicitly rather than
        # relying on whichever graph is the default at this point.
        self.sess = tf.compat.v1.Session(graph=self.graph)
        self.sess.run(
            tf.group(tf.compat.v1.global_variables_initializer(),
                     tf.compat.v1.tables_initializer()))

        self.bert_outputs = {
            'sequence_output': self.bert_module.get_sequence_output(),
            'pooled_output': self.bert_module.get_pooled_output(),
        }

        self.tok = tokenization.FullTokenizer(vocab_file=path + '/vocab.txt',
                                              do_lower_case=True)
def __init__(self, train_corpus_fname, test_corpus_fname, vocab_fname,
             pretrain_model_fname, bertconfig_fname, model_save_path,
             max_seq_length=128, warmup_proportion=0.1, batch_size=32,
             learning_rate=2e-5, num_labels=2):
    """Set up corpus loading, hyper-parameters and the BERT training graph.

    Args:
      train_corpus_fname: Training corpus path; the tokenized copy is
        expected at this path plus ".bert-tokenized".
      test_corpus_fname: Test corpus path; same ".bert-tokenized" suffix rule.
      vocab_fname: Vocabulary file, forwarded to the parent initializer.
      pretrain_model_fname: Stored as `self.pretrain_model_fname`.
      bertconfig_fname: JSON file read through `BertConfig.from_json_file`.
      model_save_path: Forwarded to the parent initializer.
      max_seq_length: Stored and passed to `make_bert_graph`.
      warmup_proportion: Fraction of the training steps used for warm-up.
      batch_size: Batch size, also forwarded to the parent initializer.
      learning_rate: Stored as `self.learning_rate`.
      num_labels: Number of output classes (default 2).
    """
    # Load a corpus.
    super().__init__(train_corpus_fname=train_corpus_fname,
                     tokenized_train_corpus_fname=train_corpus_fname + ".bert-tokenized",
                     test_corpus_fname=test_corpus_fname,
                     batch_size=batch_size,
                     tokenized_test_corpus_fname=test_corpus_fname + ".bert-tokenized",
                     model_name="bert",
                     vocab_fname=vocab_fname,
                     model_save_path=model_save_path)
    # configurations
    config = BertConfig.from_json_file(bertconfig_fname)
    self.pretrain_model_fname = pretrain_model_fname
    self.max_seq_length = max_seq_length
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    # Follow the argument instead of a hard-coded 2 so this attribute always
    # agrees with the graph built below.
    self.num_labels = num_labels
    self.PAD_INDEX = 0
    self.CLS_TOKEN = "[CLS]"
    self.SEP_TOKEN = "[SEP]"
    # NOTE(review): `self.train_data`, `self.num_epochs` and
    # `self.dropout_keep_prob_rate` are not assigned in this method;
    # presumably the parent initializer sets them - confirm.
    steps_per_epoch = int((len(self.train_data) - 1) / self.batch_size) + 1
    self.num_train_steps = steps_per_epoch * self.num_epochs
    self.num_warmup_steps = int(self.num_train_steps * warmup_proportion)
    # Evaluate once per epoch.
    self.eval_every = int(self.num_train_steps / self.num_epochs)
    self.training = tf.placeholder(tf.bool)
    # build train graph
    (self.input_ids, self.input_mask, self.segment_ids, self.label_ids,
     self.logits, self.loss) = make_bert_graph(config,
                                               max_seq_length,
                                               self.dropout_keep_prob_rate,
                                               num_labels,
                                               tune=True)
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    """Create a BERT model, its tokenizer and its config for `bert_type`.

    The config and vocabulary are read from
    `bert_config_{bert_type}.json` and `vocab_{bert_type}.txt` under
    `BERT_PT_PATH`.

    NOTE(review): no checkpoint is loaded here and `no_pretraining` is
    never read, so the model keeps the weights `BertModel(bert_config)`
    constructs it with.

    Returns:
      A `(model_bert, tokenizer, bert_config)` tuple; the model has been
      moved to the module-level `device`.
    """
    bert_config = BertConfig.from_json_file(
        os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json'))
    tokenizer = tokenization.FullTokenizer(
        vocab_file=os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt'),
        do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
def _build_bert_model(self):
    """Create a (non-TPU) TPUEstimator around the pre-trained BERT model.

    `self.bert_model_dir` is used as a plain string prefix for
    `bert_config.json` and `bert_model.ckpt`.

    Returns:
      A `tf.contrib.tpu.TPUEstimator` with `use_tpu=False` and a
      prediction batch size of 32.
    """
    model_dir = self.bert_model_dir

    # Pre-trained model configuration.
    bert_config = BertConfig.from_json_file(model_dir + "bert_config.json")

    # TPU plumbing required by TPUEstimator; TPU use itself is disabled below.
    tpu_settings = tf.contrib.tpu.TPUConfig(
        num_shards=8,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)
    run_config = tf.contrib.tpu.RunConfig(master=None, tpu_config=tpu_settings)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        # The checkpoint is stored as several files but referenced by one name.
        init_checkpoint=model_dir + 'bert_model.ckpt',
        layer_indexes=self.layer_indexes,
        use_tpu=False,
        # The extract_features script recommends True here when using a TPU.
        use_one_hot_embeddings=False)

    return tf.contrib.tpu.TPUEstimator(use_tpu=False,
                                       model_fn=model_fn,
                                       config=run_config,
                                       predict_batch_size=32)
def __init__(self):
    """Build an inference-mode BERT graph and restore its checkpoint.

    Paths come from the module-level `args`: the model directory is
    `args.pretrain_models_path + args.bert_model_name`. Lower-casing is
    enabled when the model name starts with 'uncased'.
    """
    model_dir = args.pretrain_models_path + args.bert_model_name

    self.do_lower_case = args.bert_model_name.startswith('uncased')
    self.vocab_file = os.path.join(model_dir, 'vocab.txt')
    self.config_file = os.path.join(model_dir, 'bert_config.json')
    self.tokenizer = FullTokenizer(vocab_file=self.vocab_file,
                                   do_lower_case=self.do_lower_case)

    # Inputs: three int64 placeholders with both dimensions left open.
    self.input_id = tf.placeholder(tf.int64, [None, None], 'input_ids')
    self.input_mask = tf.placeholder(tf.int64, [None, None], 'input_mask')
    self.segment_ids = tf.placeholder(tf.int64, [None, None], 'segment_ids')

    bert = BertModel(config=BertConfig.from_json_file(self.config_file),
                     is_training=False,
                     input_ids=self.input_id,
                     input_mask=self.input_mask,
                     token_type_ids=self.segment_ids,
                     use_one_hot_embeddings=True,
                     scope='bert')
    self.output_layer = bert.get_sequence_output()
    self.embedding_layer = bert.get_embedding_output()

    # The saver is created after the model so it covers the model variables.
    saver = tf.train.Saver()
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    self.session = tf.Session(config=session_config)
    saver.restore(self.session, model_dir + '/bert_model.ckpt')
def __init__(self, model_folder, max_length=256, lowercase=True):
    """Load tokenizer, config and a restored BERT model from `model_folder`.

    Args:
      model_folder: Directory containing `vocab.txt`, `bert_config.json`
        and the `bert_model.ckpt` checkpoint.
      max_length: Second dimension of the token-id placeholder.
      lowercase: Forwarded to the tokenizer as `do_lower_case`.
    """
    self.max_length = max_length

    # Tokenizer.
    self.tokenizer = FullTokenizer(os.path.join(model_folder, 'vocab.txt'),
                                   do_lower_case=lowercase)

    # Configuration.
    self.config = BertConfig.from_json_file(
        os.path.join(model_folder, 'bert_config.json'))

    # Model graph, fed through a single int32 placeholder.
    self.session = tf.Session()
    self.token_ids_op = tf.placeholder(tf.int32,
                                       shape=(None, max_length),
                                       name='token_ids')
    self.model = BertModel(config=self.config,
                           is_training=False,
                           input_ids=self.token_ids_op,
                           use_one_hot_embeddings=False)

    # Restore the trained weights.
    self.saver = tf.train.Saver()
    self.saver.restore(self.session,
                       os.path.join(model_folder, 'bert_model.ckpt'))

    # The tensor is looked up by name; the index is the number of hidden
    # layers plus one.
    reshape_index = self.config.num_hidden_layers + 1
    self.embeddings_op = tf.get_default_graph().get_tensor_by_name(
        "bert/encoder/Reshape_{}:0".format(reshape_index))
def predict(self, inputs, **kwargs):
    """Scores each answer choice with BERT's pooled output and a dense layer.

    Args:
      inputs: A dictionary of input tensors keyed by names. The
        `answer_choices_with_question`, `answer_choices_with_question_len`
        and `answer_label` fields of InputFields are read.
      **kwargs: Accepted for interface compatibility; not read here.

    Returns:
      predictions: A dictionary with one entry, FIELD_ANSWER_PREDICTION,
        holding the squeezed output of the dense layer.
    """
    training = self._is_training
    model_options = self._model_proto

    choices = inputs[InputFields.answer_choices_with_question]
    choices_len = inputs[InputFields.answer_choices_with_question_len]
    answer_label = inputs[InputFields.answer_label]  # read but not used below

    # Create model layers.
    vocab_lookup = token_to_id.TokenToIdLayer(
        model_options.bert_vocab_file, model_options.bert_unk_token_id)

    # Tokens -> ids, with the batch and choice axes folded together.
    batch_size = choices.shape[0]
    flat_rows = batch_size * NUM_CHOICES
    choice_ids = vocab_lookup(choices)
    flat_ids = tf.reshape(choice_ids, [flat_rows, -1])
    choice_mask = tf.sequence_mask(choices_len,
                                   maxlen=tf.shape(choices)[-1])
    flat_mask = tf.reshape(choice_mask, [flat_rows, -1])

    # Bert prediction.
    config = BertConfig.from_json_file(model_options.bert_config_file)
    model = BertModel(config,
                      training,
                      input_ids=flat_ids,
                      input_mask=flat_mask)
    flat_cls = model.get_pooled_output()
    cls_feature = tf.reshape(flat_cls, [batch_size, NUM_CHOICES, -1])

    restore_map, _ = get_assignment_map_from_checkpoint(
        tf.global_variables(), model_options.bert_checkpoint_file)
    tf.compat.v1.train.init_from_checkpoint(
        model_options.bert_checkpoint_file, restore_map)

    # Classification layer: one unit per choice, then drop the last axis.
    scores = tf.compat.v1.layers.dense(cls_feature, units=1, activation=None)
    scores = tf.squeeze(scores, axis=-1)

    return {FIELD_ANSWER_PREDICTION: scores}
def convert(args):
    """Copy the variables of a TensorFlow checkpoint into a PyTorch BertModel.

    Reads `args.bert_config_file` and `args.tf_checkpoint_path`, walks each
    checkpoint variable name down the module tree of a freshly built
    `BertModel`, assigns the array to the matching parameter, and saves the
    resulting state dict to `args.pytorch_dump_path`.

    Raises:
      AssertionError: when a located parameter and its checkpoint array
        differ in shape (both shapes are appended to the error args).
    """
    # Initialise PyTorch model
    config = BertConfig.from_json_file(args.bert_config_file)
    model = BertModel(config)

    # Load weights from TF model
    path = args.tf_checkpoint_path
    print("Converting TensorFlow checkpoint from {}".format(path))

    init_vars = tf.train.list_variables(path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading {} with shape {}".format(name, shape))
        array = tf.train.load_variable(path, name)
        print("Numpy array shape {}".format(array.shape))
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        # Unconditionally drops the first five characters of every name.
        name = name[5:]  # skip "bert/"
        print("Loading {}".format(name))
        name = name.split('/')
        # NOTE(review): these look like 'cls/predictions' and
        # 'cls/seq_relationship' after the 5-character strip - confirm.
        if name[0] in ['redictions', 'eq_relationship']:
            print("Skipping")
            continue
        pointer = model
        for m_name in name:
            # "layer_3"-style components split into ['layer', '3', ''].
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel':
                pointer = getattr(pointer, 'weight')
            else:
                # NOTE(review): 'l_step' is presumably 'global_step' after
                # the 5-character strip - confirm.
                if l[0] != 'l_step':
                    # A missing attribute does not raise: `pointer` becomes
                    # the `name` list instead.
                    pointer = getattr(pointer, l[0], name)
                else:
                    print(l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        # `m_name` is the last path component once the loop has finished.
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        except AttributeError:
            # `pointer` has no `.shape` (e.g. it is the fallback list from
            # the getattr default above): skip this variable.
            continue
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    torch.save(model.state_dict(), args.pytorch_dump_path)
def build(self, data_iter, bert_config_file):
    """Build the inputs, BERT embeddings, loss and metric ops of the model.

    Args:
      data_iter: Iterator whose `get_next()` yields a mapping with the keys
        'usr', 'prd', 'content', 'rating' and 'doc_len'.
      bert_config_file: JSON file read through `BertConfig.from_json_file`.

    Returns:
      A `(loss, mse, correct_num, accuracy)` tuple of tensors. `accuracy`
      is the sum (not the mean) of the float-cast correctness flags.
    """
    sen_width = self.max_sen_len

    # get the inputs
    with tf.variable_scope('inputs'):
        batch = data_iter.get_next()
        usrid = batch['usr']
        prdid = batch['prd']
        input_x = batch['content']
        input_y = batch['rating']
        doc_len = batch['doc_len']

        input_x = tf.reshape(input_x, [-1, sen_width])
        sen_len = tf.count_nonzero(input_x, axis=-1)
        doc_len = doc_len // sen_width
        input_x = tf.cast(input_x, tf.int32)

        self.usr = lookup(self.embeddings['usr_emb'], usrid,
                          name='cur_usr_embedding')
        self.prd = lookup(self.embeddings['prd_emb'], prdid,
                          name='cur_prd_embedding')

        input_x = tf.reshape(input_x, [-1, sen_width])
        input_mask = tf.cast(tf.sequence_mask(sen_len, sen_width), tf.int32)

    # Only BERT's embedding output is consumed by the rest of the model.
    bert = BertModel(BertConfig.from_json_file(bert_config_file),
                     is_training=False,
                     input_ids=input_x,
                     input_mask=input_mask,
                     token_type_ids=None,
                     use_one_hot_embeddings=False)
    input_x = bert.get_embedding_output()

    # build the process of model
    d_hat = self.nsc(input_x, sen_width, self.max_doc_len // sen_width,
                     sen_len, doc_len)
    prediction = tf.argmax(d_hat, 1, name='prediction')

    with tf.variable_scope("loss"):
        self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=d_hat, labels=tf.one_hot(input_y, self.cls_cnt))

        # L2 penalty over every trainable variable except the embeddings.
        regularizer = tf.zeros(1)
        for variable in tf.trainable_variables():
            if variable in self.embeddings.values():
                continue
            regularizer = regularizer + tf.nn.l2_loss(variable)
        self.loss = tf.reduce_sum(self.loss) + self.l2_rate * regularizer

    prediction = tf.argmax(d_hat, 1, name='prediction')

    with tf.variable_scope("metrics"):
        is_correct = tf.equal(prediction, input_y)
        mse = tf.reduce_sum(tf.square(prediction - input_y), name="mse")
        correct_num = tf.reduce_sum(tf.cast(is_correct, dtype=tf.int32),
                                    name="correct_num")
        accuracy = tf.reduce_sum(tf.cast(is_correct, "float"),
                                 name="accuracy")

    return self.loss, mse, correct_num, accuracy
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                     pytorch_dump_path):
    """Convert a TensorFlow BERT checkpoint into a saved PyTorch state dict.

    Args:
      tf_checkpoint_path: TensorFlow checkpoint handed to
        `load_tf_weights_in_bert`.
      bert_config_file: JSON config used to build `BertForPreTraining`.
      pytorch_dump_path: Destination passed to `torch.save`.
    """
    bert_config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(
        str(bert_config)))
    pretraining_model = BertForPreTraining(bert_config)

    # Fill the freshly built model with the TensorFlow weights.
    load_tf_weights_in_bert(pretraining_model, tf_checkpoint_path)

    print("Save PyTorch model to {}".format(pytorch_dump_path))
    state = pretraining_model.state_dict()
    torch.save(state, pytorch_dump_path)
def get_bert(path_bert):
    """Load the uncased L-12/H-768/A-12 BERT model, tokenizer and config.

    Args:
      path_bert: String prefix prepended directly (no separator is added)
        to the config, vocabulary and weight file names.

    Returns:
      A `(model_bert, tokenizer, bert_config)` tuple. The model is moved
      to the module-level `device` only when the module-level `gpu` flag
      is truthy.
    """
    bert_config = BertConfig.from_json_file(
        path_bert + 'bert_config_uncased_L-12_H-768_A-12.json')
    tokenizer = tokenization.FullTokenizer(
        vocab_file=path_bert + 'vocab_uncased_L-12_H-768_A-12.txt',
        do_lower_case=True)

    model_bert = BertModel(bert_config)
    weights = torch.load(
        path_bert + 'pytorch_model_uncased_L-12_H-768_A-12.bin',
        map_location='cpu')
    model_bert.load_state_dict(weights)
    print("Load pre-trained parameters.")

    if gpu:
        model_bert.to(device)

    return model_bert, tokenizer, bert_config
def get_bert(BERT_PT_PATH, bert_type, do_lower_case):
    """Load a pretrained BERT model by name, plus its tokenizer and config.

    The config and vocabulary come from `bert_config_{bert_type}.json` and
    `vocab_{bert_type}.txt` under `BERT_PT_PATH`; the weights come from
    `BertModel.from_pretrained(bert_type)`.

    NOTE(review): `do_lower_case` is not forwarded - the tokenizer is
    always built with `do_lower_case=False`. Confirm this is intended.

    Returns:
      A `(model_bert, tokenizer, bert_config)` tuple; the model has been
      moved to the module-level `device`.
    """
    bert_config = BertConfig.from_json_file(
        os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json'))
    tokenizer = tokenization.FullTokenizer(
        vocab_file=os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt'),
        do_lower_case=False)
    bert_config.print_status()

    model_bert = BertModel.from_pretrained(bert_type)
    print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
def main():
    """Construct an Estimator around a BERT model function and print it."""
    config = BertConfig.from_json_file(
        './bert/models/uncased_L-12_H-768_A-12/bert_config.json')

    def model_fn(features, labels, mode, params):
        """Return BERT's sequence output for the given feature dictionary."""
        bert = BertModel(config,
                         True,
                         features["input_ids"],
                         features["input_mask"],
                         features["segment_ids"])
        return bert.get_sequence_output()

    est = tf.estimator.Estimator(model_fn)
    print(est)
def get_bert(BERT_PATH):
    """Load the uncased L-12/H-768/A-12 BERT model, tokenizer and config.

    Args:
      BERT_PATH: Directory prefix; file names are appended after a "/".

    Returns:
      A `(model_bert, tokenizer, bert_config)` tuple; the model has been
      moved to the module-level `device`.
    """
    bert_config = BertConfig.from_json_file(
        BERT_PATH + "/bert_config_uncased_L-12_H-768_A-12.json")
    tokenizer = tokenization.FullTokenizer(
        vocab_file=BERT_PATH + "/vocab_uncased_L-12_H-768_A-12.txt",
        do_lower_case=True)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    weights = torch.load(
        BERT_PATH + "/pytorch_model_uncased_L-12_H-768_A-12.bin",
        map_location='cpu')
    model_bert.load_state_dict(weights)
    print("Load pre-trained BERT parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
def get_model(self):
    """Build an inference BERT graph in its own tf.Graph and open a session.

    The placeholder is `self._seq_length + 2` wide. Variables are
    initialized with `tf.global_variables_initializer()`; no checkpoint is
    restored in this method.

    Returns:
      A `(session, input_ids_placeholder, sequence_output)` tuple.
    """
    logging.info("get bert model")

    bert_graph = tf.Graph()
    with bert_graph.as_default():
        ph_input_ids = tf.placeholder(dtype=tf.int32,
                                      shape=[None, self._seq_length + 2],
                                      name="ph_input_ids")
        bert_settings = BertConfig.from_json_file(
            config.PROJECT_ROOT + "/bert_config.json")
        model = BertModel(config=bert_settings,
                          is_training=False,
                          input_ids=ph_input_ids,
                          use_one_hot_embeddings=True)
        sequence_output = model.get_sequence_output()
        init_op = tf.global_variables_initializer()

        session = tf.Session(graph=bert_graph)
        session.run(init_op)

    return session, ph_input_ids, sequence_output
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    """Load a BERT model of the given type with its tokenizer and config.

    Files are resolved under `BERT_PT_PATH` as
    `bert_config_{bert_type}.json`, `vocab_{bert_type}.txt` and
    `pytorch_model_{bert_type}.bin`.

    NOTE(review): `no_pretraining` is never read - the checkpoint is
    always loaded. Confirm this is intended.

    Returns:
      A `(model_bert, tokenizer, bert_config)` tuple; the model has been
      moved to the module-level `device`.
    """
    bert_config = BertConfig.from_json_file(
        os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json'))
    tokenizer = tokenization.FullTokenizer(
        vocab_file=os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt'),
        do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    weights = torch.load(
        os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin'),
        map_location='cpu')
    model_bert.load_state_dict(weights)
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
def get_bert(BERT_PT_PATH):
    """Load BERT from `bert_config.json`, `vocab.txt` and `pytorch_model.bin`.

    Args:
      BERT_PT_PATH: Directory containing the three files.

    Returns:
      A `(model_bert, tokenizer, bert_config)` tuple; the model has been
      moved to the module-level `device`.
    """
    bert_config = BertConfig.from_json_file(
        os.path.join(BERT_PT_PATH, 'bert_config.json'))
    tokenizer = tokenization.FullTokenizer(
        vocab_file=os.path.join(BERT_PT_PATH, 'vocab.txt'))
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    weights = torch.load(os.path.join(BERT_PT_PATH, 'pytorch_model.bin'),
                         map_location='cpu')
    model_bert.load_state_dict(weights)
    print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
def bert(bert_config_file, mode, dim, input_ids, input_mask, input_type,
         activation, init_checkpoint=None):
    """Project BERT's pooled output through a dense layer of width `dim`.

    The model is built under the "bert_query" scope. In TRAIN mode the
    pooled output additionally passes through dropout with keep_prob 0.9.

    Args:
      bert_config_file: JSON file read through `BertConfig.from_json_file`.
      mode: Compared against `tf.estimator.ModeKeys.TRAIN`.
      dim: Number of units of the dense layer.
      input_ids: Forwarded to BertModel as `input_ids`.
      input_mask: Forwarded to BertModel as `input_mask`.
      input_type: Forwarded to BertModel as `token_type_ids`.
      activation: Activation of the dense layer.
      init_checkpoint: Optional checkpoint used to initialize the
        trainable variables under 'bert_query'.

    Returns:
      The output tensor of the dense layer.
    """
    training = mode == tf.estimator.ModeKeys.TRAIN

    bert_config = BertConfig.from_json_file(bert_config_file)
    model = BertModel(config=bert_config,
                      is_training=training,
                      input_ids=input_ids,
                      input_mask=input_mask,
                      token_type_ids=input_type,
                      scope="bert_query")

    pooled = model.get_pooled_output()
    if training:
        pooled = tf.nn.dropout(pooled, keep_prob=0.9)

    projection_init = tf.truncated_normal_initializer(
        stddev=bert_config.initializer_range)
    sig = tf.layers.dense(pooled,
                          dim,
                          activation=activation,
                          kernel_initializer=projection_init,
                          name="bert_query/query")

    query_vars = tf.trainable_variables('bert_query')
    if init_checkpoint:
        assignment_map, _ = get_assignment_map_from_checkpoint(
            query_vars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    return sig
def __init__(
        self,
        model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
        bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
        vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
        max_seq_length=32,
        dimension=768,
        num_labels=2):
    """Build the non-tuning BERT graph and restore the latest checkpoint.

    Args:
      model_fname: Directory searched by `tf.train.latest_checkpoint`.
      bertconfig_fname: JSON file read through `BertConfig.from_json_file`.
      vocab_fname: Vocabulary file for the (cased) tokenizer.
      max_seq_length: Stored and passed to `make_bert_graph`.
      dimension: Forwarded to the parent initializer.
      num_labels: Passed to `make_bert_graph`.
    """
    super().__init__("bert", dimension)

    bert_config = BertConfig.from_json_file(bertconfig_fname)
    self.max_seq_length = max_seq_length
    self.tokenizer = FullTokenizer(vocab_file=vocab_fname,
                                   do_lower_case=False)

    graph_parts = make_bert_graph(bert_config, max_seq_length, 1.0,
                                  num_labels, tune=False)
    (self.model, self.input_ids, self.input_mask,
     self.segment_ids, self.probs) = graph_parts

    # Restore every global variable from the newest checkpoint in the folder.
    restorer = tf.train.Saver(tf.global_variables())
    self.sess = tf.Session()
    restorer.restore(self.sess, tf.train.latest_checkpoint(model_fname))
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    """Build a BERT model of the given type, optionally with loaded weights.

    Files are resolved under `BERT_PT_PATH` as
    `bert_config_{bert_type}.json`, `vocab_{bert_type}.txt` and
    `pytorch_model_{bert_type}.bin`.

    Args:
      BERT_PT_PATH: Directory containing the files.
      bert_type: Inserted into the three file names.
      do_lower_case: Forwarded to the tokenizer.
      no_pretraining: When truthy, the checkpoint is not loaded.

    Returns:
      A `(model_bert, tokenizer, bert_config)` tuple; the model has been
      moved to the module-level `device`.
    """
    bert_config = BertConfig.from_json_file(
        os.path.join(BERT_PT_PATH, f"bert_config_{bert_type}.json"))
    tokenizer = tokenization.FullTokenizer(
        vocab_file=os.path.join(BERT_PT_PATH, f"vocab_{bert_type}.txt"),
        do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if not no_pretraining:
        weights = torch.load(
            os.path.join(BERT_PT_PATH, f"pytorch_model_{bert_type}.bin"),
            map_location="cpu")
        model_bert.load_state_dict(weights)
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
def __init__(self, aggregation_method):
    """Create the tokenizer, TFRecord writer and (non-TPU) estimator.

    Args:
      aggregation_method: One of 'cls_max', 'cls_avg', 'cls_attn' or
        'cls_transformer'; stored and forwarded to `model_fn_builder`.
    """
    self.aggregation_method = aggregation_method
    self.tokenizer = tokenization.FullTokenizer(vocab_file='vocab.txt')
    self.writer = tf.python_io.TFRecordWriter("output.tfrecords")

    tpu_settings = tf.estimator.tpu.TPUConfig(
        iterations_per_loop=1000,
        num_shards=8,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2)
    self.run_config = tf.estimator.tpu.RunConfig(
        cluster=None,
        model_dir=None,
        save_checkpoints_steps=1000,
        keep_checkpoint_max=1,
        tpu_config=tpu_settings)

    bert_config = BertConfig.from_json_file(
        'bert_models_onMSMARCO/vanilla_bert_tiny_on_MSMARCO/bert_config.json')
    self.model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=2,
        init_checkpoint='bert_models_onMSMARCO/vanilla_bert_tiny_on_MSMARCO/model.ckpt-1600000',
        learning_rate=5e-5,
        num_train_steps=None,
        num_warmup_steps=None,
        use_tpu=False,
        use_one_hot_embeddings=False,
        aggregation_method=self.aggregation_method,
        pretrained_model='bert',
        from_distilled_student=False)

    self.estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=self.model_fn,
        config=self.run_config,
        train_batch_size=32,
        eval_batch_size=32,
        predict_batch_size=32)
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    """Build BERT from `bert_config.json` / `vocab.txt` under `BERT_PT_PATH`.

    Args:
      BERT_PT_PATH: Directory with the config, vocabulary and (optionally)
        the `pytorch_model.bin` weights.
      bert_type: Not read by this function.
      do_lower_case: Forwarded to the tokenizer.
      no_pretraining: When truthy, the `.bin` weights are not loaded (and
        need not exist).

    Returns:
      A `(model_bert, tokenizer, bert_config)` tuple: the model (moved to
      the module-level `device`), the tokenizer and the BERT config.
    """
    # BERT configuration, read from its JSON file.
    bert_config = BertConfig.from_json_file(
        os.path.join(BERT_PT_PATH, 'bert_config.json'))

    # Tokenizer holding the vocabulary information.
    tokenizer = tokenization.FullTokenizer(
        vocab_file=os.path.join(BERT_PT_PATH, 'vocab.txt'),
        do_lower_case=do_lower_case)

    # Original author's note: this only prints the parameters.
    bert_config.print_status()

    # Model skeleton, initialized from the same configuration.
    model_bert = BertModel(bert_config)

    if not no_pretraining:
        # Load the pretrained parameters from the .bin file.
        weights = torch.load(os.path.join(BERT_PT_PATH, 'pytorch_model.bin'),
                             map_location='cpu')
        model_bert.load_state_dict(weights)
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
def get_bert(BERT_PT_PATH, bert_type, do_lower_case):
    """Load a pretrained BERT model by name, plus its tokenizer and config.

    The config and vocabulary come from `bert_config_{bert_type}.json` and
    `vocab_{bert_type}.txt` under `BERT_PT_PATH`; the weights come from
    `BertModel.from_pretrained(bert_type)`.

    NOTE(review): `do_lower_case` is not forwarded - the tokenizer is
    always built with `do_lower_case=False`. Confirm this is intended.

    Returns:
      A `(model_bert, tokenizer, bert_config)` tuple; the model has been
      moved to the module-level `device`.
    """
    config_path = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_path = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')

    bert_config = BertConfig.from_json_file(config_path)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path,
                                           do_lower_case=False)
    bert_config.print_status()

    model_bert = BertModel.from_pretrained(bert_type)
    print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
def main():
    """Command-line entry point: train, evaluate and/or run the pipeline.

    Parses the command line, builds a `BertForSpanAspectClassification`
    model from `--bert_config_file`, optionally restores it from
    `--init_checkpoint` or from `checkpoint.pth.tar` in `--output_dir`,
    and then runs whichever of `--do_train`, `--do_predict` and
    `--do_pipeline` were requested.

    Raises:
      ValueError: when no action flag is given, when an action's input
        file is missing, when `--gradient_accumulation_steps` is below 1,
        or when `--max_seq_length` exceeds the model's position embeddings.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_config_file", default=None, type=str, required=True,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=None, type=str, required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--debug", default=False, action='store_true',
                        help="Whether to run in debug mode.")
    parser.add_argument("--data_dir", default='data/semeval_14', type=str,
                        help="SemEval data dir")
    parser.add_argument("--train_file", default=None, type=str,
                        help="SemEval xml for training")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="SemEval csv for prediction")
    parser.add_argument("--extraction_file", default=None, type=str,
                        help="pkl file for extraction")
    parser.add_argument("--init_checkpoint", default=None, type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument("--max_seq_length", default=96, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_pipeline", default=False, action='store_true',
                        help="Whether to run pipeline on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=32, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=2e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as '%%'; a bare '%' breaks `--help`.
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
                             "of training.")
    parser.add_argument("--save_proportion", default=0.5, type=float,
                        help="Proportion of steps to save models for. E.g., 0.5 = 50%% "
                             "of training.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--optimize_on_cpu', default=False, action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=128,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    args = parser.parse_args()

    # Validate the requested actions and their input files.
    if not args.do_train and not args.do_predict and not args.do_pipeline:
        raise ValueError(
            "At least one of `do_train`, `do_predict` or `do_pipeline` must be True.")
    if args.do_train and not args.train_file:
        raise ValueError(
            "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError(
            "If `do_predict` is True, then `predict_file` must be specified.")
    if args.do_pipeline and not args.extraction_file:
        raise ValueError(
            "If `do_pipeline` is True, then `extraction_file` must be specified.")

    # Device selection: single process (possibly multi-GPU) or distributed.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("torch_version: {} device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        torch.__version__, device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # Seed every random number generator in use.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    logger.info('output_dir: {}'.format(args.output_dir))
    save_path = os.path.join(args.output_dir, 'checkpoint.pth.tar')
    log_path = os.path.join(args.output_dir, 'performance.txt')
    network_path = os.path.join(args.output_dir, 'network.txt')
    parameter_path = os.path.join(args.output_dir, 'parameter.txt')

    # Record the effective arguments; `with` closes the file even on error.
    with open(parameter_path, "w") as f:
        for arg in sorted(vars(args)):
            print("{}: {}".format(arg, getattr(args, arg)), file=f)

    logger.info("***** Preparing model *****")
    model = BertForSpanAspectClassification(bert_config)
    # The pre-trained checkpoint is used only when no fine-tuned one exists.
    if args.init_checkpoint is not None and not os.path.isfile(save_path):
        model = bert_load_state_dict(model, torch.load(args.init_checkpoint, map_location='cpu'))
        logger.info("Loading model from pretrained checkpoint: {}".format(args.init_checkpoint))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if os.path.isfile(save_path):
        checkpoint = torch.load(save_path)
        model.load_state_dict(checkpoint['model'])
        step = checkpoint['step']
        logger.info("Loading model from finetuned checkpoint: '{}' (step {})"
                    .format(save_path, step))

    # Dump the parameter inventory of the network.
    with open(network_path, "w") as f:
        for n, param in model.named_parameters():
            print("name: {}, size: {}, dtype: {}, requires_grad: {}"
                  .format(n, param.size(), param.dtype, param.requires_grad), file=f)
        total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in model.parameters())
        print("Total trainable parameters: {}".format(total_trainable_params), file=f)
        print("Total parameters: {}".format(total_params), file=f)

    logger.info("***** Preparing data *****")
    train_dataloader, num_train_steps = None, None
    eval_examples, eval_features, eval_dataloader = None, None, None
    # The per-step batch shrinks so the accumulated batch keeps its size.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
    if args.do_train:
        logger.info("***** Preparing training *****")
        train_dataloader, num_train_steps = read_train_data(args, tokenizer, logger)
        logger.info("***** Preparing evaluation *****")
        eval_examples, eval_features, eval_dataloader = read_eval_data(args, tokenizer, logger)

    logger.info("***** Preparing optimizer *****")
    optimizer, param_optimizer = prepare_optimizer(args, model, num_train_steps)

    global_step = 0
    if os.path.isfile(save_path):
        checkpoint = torch.load(save_path)
        optimizer.load_state_dict(checkpoint['optimizer'])
        step = checkpoint['step']
        logger.info("Loading optimizer from finetuned checkpoint: '{}' (step {})".format(save_path, step))
        global_step = step

    if args.do_train:
        logger.info("***** Running training *****")
        best_f1 = 0
        save_checkpoints_steps = int(num_train_steps / (5 * args.num_train_epochs))
        start_save_steps = int(num_train_steps * args.save_proportion)
        if args.debug:
            args.num_train_epochs = 1
            save_checkpoints_steps = 20
            start_save_steps = 0
        model.train()
        for epoch in range(int(args.num_train_epochs)):
            logger.info("***** Epoch: {} *****".format(epoch+1))
            global_step, model, best_f1 = run_train_epoch(args, global_step, model, param_optimizer,
                                                          train_dataloader, eval_examples, eval_features,
                                                          eval_dataloader, optimizer, n_gpu, device, logger,
                                                          log_path, save_path, save_checkpoints_steps,
                                                          start_save_steps, best_f1)

    if args.do_predict:
        logger.info("***** Running prediction *****")
        if eval_dataloader is None:
            eval_examples, eval_features, eval_dataloader = read_eval_data(args, tokenizer, logger)

        # restore from best checkpoint
        if save_path and os.path.isfile(save_path) and args.do_train:
            checkpoint = torch.load(save_path)
            model.load_state_dict(checkpoint['model'])
            step = checkpoint['step']
            logger.info("Loading model from finetuned checkpoint: '{}' (step {})"
                        .format(save_path, step))

        model.eval()
        metrics = evaluate(args, model, device, eval_examples, eval_features, eval_dataloader, logger,
                           write_pred=True)
        print("step: {}, P: {:.4f}, R: {:.4f}, F1: {:.4f} (common: {}, retrieved: {}, relevant: {})"
              .format(global_step, metrics['p'], metrics['r'], metrics['f1'],
                      metrics['common'], metrics['retrieved'], metrics['relevant']))

    if args.do_pipeline:
        logger.info("***** Running prediction *****")
        eval_examples, eval_features, eval_dataloader = pipeline_eval_data(args, tokenizer, logger)

        # restore from best checkpoint
        if save_path and os.path.isfile(save_path) and args.do_train:
            checkpoint = torch.load(save_path)
            model.load_state_dict(checkpoint['model'])
            step = checkpoint['step']
            logger.info("Loading model from finetuned checkpoint: '{}' (step {})"
                        .format(save_path, step))

        model.eval()
        metrics = evaluate(args, model, device, eval_examples, eval_features, eval_dataloader, logger,
                           write_pred=True)
        # Pipeline results are appended to the performance log.
        with open(log_path, "a") as f:
            print("pipeline, step: {}, P: {:.4f}, R: {:.4f}, F1: {:.4f} (common: {}, retrieved: {}, relevant: {})"
                  .format(global_step, metrics['p'], metrics['r'], metrics['f1'],
                          metrics['common'], metrics['retrieved'], metrics['relevant']), file=f)
            print(" ", file=f)
import pickle import json from bert import tokenization from bert.modeling import BertConfig from utils import (input_fn_builder, make_filename, read_squad_examples, FeatureWriter, convert_examples_to_features) from train import model_fn_builder, FLAGS from models.rnn_lstm import create_rnn_lstm_model, LSTMConfig from models.cnn import CNNConfig, create_cnn_model from models.cnn_keras import CNNKerasConfig, create_cnnKeras_model from models.contextualized_cnn import create_contextualized_cnn_model, ContextualizedCNNConfig from models.fully_connected import create_fully_connected_model, FullyConnectedConfig DATA_BERT_DIRECTORY = FLAGS.data_bert_directory BERT_CONFIG_FILE = "%s/bert_config.json" % DATA_BERT_DIRECTORY bert_config = BertConfig.from_json_file(BERT_CONFIG_FILE) INIT_CHECKPOINT = FLAGS.output_dir if FLAGS.init_checkpoint is not None: INIT_CHECKPOINT = '%s/%s' % (FLAGS.output_dir, FLAGS.init_checkpoint) DEV_FILENAME = make_filename('dev', 1., FLAGS.features_dir, FLAGS.fine_tune, FLAGS.n_examples) print('DEV_FILENAE %s' % DEV_FILENAME) RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"]) def load_and_save_config(filename): with tf.gfile.GFile(filename, 'r') as json_data:
def main():
    """Parse command-line options, build the MTMSN model, then train and/or predict.

    The steps run in this order:
      1. Parse and validate arguments; dump them to ``parameter.txt`` in
         ``--output_dir``.
      2. Select the device, seed the RNGs and load the BERT config.
      3. Build the model, optionally initialise it from ``--init_checkpoint``
         or resume from ``checkpoint.pth.tar`` in ``--output_dir``; dump the
         parameter inventory to ``network.txt``.
      4. Read training / evaluation data and build the optimizer.
      5. Run the remaining training epochs (``--do_train``) and/or evaluate
         and append the metrics to ``performance.txt`` (``--do_predict``).

    Raises:
        ValueError: if no answering ability is enabled, if neither
            ``--do_train`` nor ``--do_predict`` is given, if the matching
            data file is missing, if ``--gradient_accumulation_steps`` < 1,
            or if ``--max_seq_length`` exceeds the BERT config's
            ``max_position_embeddings``.
        AssertionError: if ``span_extraction`` and ``addition_subtraction``
            are not both enabled.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_config_file", default=None, type=str, required=True,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=None, type=str, required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Answering ablities
    parser.add_argument("--span_extraction", default=False, action='store_true',
                        help="Whether to use span extraction.")
    parser.add_argument("--addition_subtraction", default=False, action='store_true',
                        help="Whether to use addition subtraction.")
    parser.add_argument("--counting", default=False, action='store_true',
                        help="Whether to use counting.")
    parser.add_argument("--negation", default=False, action='store_true',
                        help="Whether to use negation.")
    # NOTE(review): `store_true` with default=True means this option is always
    # True; passing the flag changes nothing. Kept as-is for CLI compatibility.
    parser.add_argument("--include_more_numbers", default=True, action='store_true',
                        help="Whether to include more numbers.")
    parser.add_argument("--beam_size", default=3, type=int,
                        help="The size of beam search.")
    parser.add_argument("--max_count", default=4, type=int,
                        help="The maximal number of add_sub expressions.")
    parser.add_argument("--max_answer_number", default=8, type=int,
                        help="The maximal number of answers.")

    ## Other parameters
    parser.add_argument("--do_debug", default=False, action='store_true',
                        help="Whether to run in debug mode.")
    parser.add_argument("--train_file", default=None, type=str,
                        help="DROP json for training. E.g., drop_dataset_train.json")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="DROP json for predictions.")
    parser.add_argument("--init_checkpoint", default=None, type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=32, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3, type=float,
                        help="Total number of training epochs to perform.")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as `%%`; a bare `%` makes rendering `--help` raise.
    parser.add_argument("--warmup_proportion", default=0.05, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
                             "of training.")
    parser.add_argument("--length_heuristic", default=0.05, type=float,
                        help="Weight on length heuristic.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--data_parallel", default=False, action='store_true',
                        help="Whether not to use data parallel")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--optimize_on_cpu', default=False, action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=128,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    args = parser.parse_args()

    if not args.span_extraction and not args.addition_subtraction and not args.counting and not args.negation:
        raise ValueError("At least one of `span_extraction` or `addition_subtraction` or `counting` or `negation` must be True.")

    # Collect the enabled abilities in a fixed order; the model is built from this list.
    args.answering_abilities = []
    if args.span_extraction:
        args.answering_abilities.append("span_extraction")
    if args.addition_subtraction:
        args.answering_abilities.append("addition_subtraction")
    if args.counting:
        args.answering_abilities.append("counting")
    if args.negation:
        args.answering_abilities.append("negation")
    logger.info("Answering abilities: {}".format(args.answering_abilities))
    # Both of these abilities are mandatory regardless of the other flags.
    assert "span_extraction" in args.answering_abilities and "addition_subtraction" in args.answering_abilities

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    logger.info('output_dir: {}'.format(args.output_dir))
    save_path = os.path.join(args.output_dir, 'checkpoint.pth.tar')
    log_path = os.path.join(args.output_dir, 'performance.txt')
    network_path = os.path.join(args.output_dir, 'network.txt')
    parameter_path = os.path.join(args.output_dir, 'parameter.txt')

    # Record the effective arguments; `with` closes the file even if a write fails.
    with open(parameter_path, "w") as f:
        for arg in sorted(vars(args)):
            print("{}: {}".format(arg, getattr(args, arg)), file=f)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")

    if args.do_train and not args.train_file:
        raise ValueError(
            "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError(
            "If `do_predict` is True, then `predict_file` must be specified.")

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("torch_version: {} device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        torch.__version__, device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # --- Prepare model ---
    logger.info("***** Preparing model *****")
    model = MTMSN(bert_config, args.answering_abilities, args.max_answer_number)
    # The pretrained checkpoint is only used when there is no finetuned
    # checkpoint to resume from.
    if args.init_checkpoint is not None and not os.path.isfile(save_path):
        logger.info("Loading model from pretrained checkpoint: {}".format(args.init_checkpoint))
        model = bert_load_state_dict(model, torch.load(args.init_checkpoint, map_location='cpu'))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1 or args.data_parallel:
        model = torch.nn.DataParallel(model)

    # Resume the (possibly wrapped) model from the finetuned checkpoint.
    if os.path.isfile(save_path):
        checkpoint = torch.load(save_path)
        model.load_state_dict(checkpoint['model'])
        logger.info("Loading model from finetuned checkpoint: '{}' (step {}, epoch {})"
                    .format(save_path, checkpoint['step'], checkpoint['epoch']))

    # Dump a per-parameter inventory plus totals for later inspection.
    with open(network_path, "w") as f:
        for n, param in model.named_parameters():
            print("name: {}, size: {}, dtype: {}, requires_grad: {}"
                  .format(n, param.size(), param.dtype, param.requires_grad), file=f)
        total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in model.parameters())
        print("Total trainable parameters: {}".format(total_trainable_params), file=f)
        print("Total parameters: {}".format(total_params), file=f)

    # --- Prepare data ---
    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples, train_features, num_train_steps = None, None, None
    eval_examples, eval_features = None, None
    if args.do_train:
        logger.info("***** Preparing training *****")
        train_examples, train_features, num_train_steps = read_train_data(args, tokenizer, logger)
        logger.info("***** Preparing evaluation *****")
        eval_examples, eval_features = read_eval_data(args, tokenizer, logger)

    # Prediction-only runs still need the evaluation data.
    if args.do_predict and eval_features is None:
        logger.info("***** Preparing prediction *****")
        eval_examples, eval_features = read_eval_data(args, tokenizer, logger)

    # --- Prepare optimizer ---
    logger.info("***** Preparing optimizer *****")
    if args.fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_())
                           for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    # NOTE(review): `n in no_decay` is an exact-name list membership test, so
    # only a parameter literally named 'bias', 'gamma' or 'beta' is exempt from
    # weight decay; dotted names such as 'x.bias' fall in the decay group.
    # Left unchanged because altering the grouping would make optimizer state
    # saved in existing checkpoints fail to load.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0}
    ]
    optimizer = BERTAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step, global_epoch = 0, 1
    if os.path.isfile(save_path):
        checkpoint = torch.load(save_path)
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("Load optimizer from finetuned checkpoint: '{}' (step {}, epoch {})"
                    .format(save_path, checkpoint['step'], checkpoint['epoch']))
        global_step = checkpoint['step']
        # Resume with the epoch after the one stored in the checkpoint.
        global_epoch = checkpoint['epoch'] + 1

    # --- Run training ---
    if args.do_train and global_epoch < int(args.num_train_epochs) + 1:
        logger.info("***** Running training *****")
        best_f1 = 0
        for epoch in range(global_epoch, int(args.num_train_epochs) + 1):
            logger.info("***** Epoch: {} *****".format(epoch))
            global_step, model, best_f1 = run_train_epoch(args, global_step, n_gpu, device, model, param_optimizer,
                                                          optimizer, train_examples, train_features,
                                                          eval_examples, eval_features,
                                                          logger, log_path, save_path, best_f1, epoch)

    # --- Run prediction ---
    if args.do_predict:
        logger.info("***** Running prediction *****")
        # restore from best checkpoint
        if save_path and os.path.isfile(save_path):
            checkpoint = torch.load(save_path)
            model.load_state_dict(checkpoint['model'])
            logger.info("Loading model from finetuned checkpoint: '{}' (step {}, epoch {})"
                        .format(save_path, checkpoint['step'], checkpoint['epoch']))
            global_step = checkpoint['step']
            # Re-save the checkpoint without the 'optimizer' entry.
            torch.save({
                'model': model.state_dict(),
                'step': checkpoint['step'],
                'epoch': checkpoint['epoch']
            }, save_path)

        model.eval()
        metrics = evaluate(args, model, device, eval_examples, eval_features, logger, write_pred=True)
        with open(log_path, "a") as f:
            print("step: {}, test_em: {:.3f}, test_f1: {:.3f}"
                  .format(global_step, metrics['em'], metrics['f1']), file=f)
            print(" ", file=f)
def predict(self, inputs, **kwargs):
    """Builds the prediction graph and returns one logit per answer choice.

    Object features and answer-choice tokens are concatenated into a single
    sequence, passed through `self._bert_model`, and the result is projected
    to a single logit by a fully connected layer scoped as 'logits'.

    Args:
      inputs: A dictionary of input tensors keyed by names.

    Returns:
      predictions: A dictionary holding FIELD_ANSWER_PREDICTION, the logits
        reshaped to [batch_size, NUM_CHOICES].
    """
    is_training = self._is_training
    options = self._model_proto

    vocab_lookup = token_to_id.TokenToIdLayer(options.bert_vocab_file,
                                              options.bert_unk_token_id)
    bert_config = BertConfig.from_json_file(options.bert_config_file)
    fc_scope_fn = hyperparams.build_hyperparams(options.fc_hyperparams,
                                                is_training)
    slim_fc_scope = fc_scope_fn()

    # Object side: trim the detections, then embed the object features.
    # The trimmed boxes and scores are not needed below.
    trimmed = _trim_to_max_num_objects(
        inputs[InputFields.num_detections],
        inputs[InputFields.detection_boxes],
        inputs[InputFields.detection_classes],
        inputs[InputFields.detection_scores],
        inputs[InputFields.detection_features],
        max_num_objects=options.max_num_objects)
    num_objects, _, object_labels, _, object_features, max_num_objects = trimmed

    object_features = _predict_object_embeddings(
        object_features,
        bert_config.hidden_size,
        slim_fc_scope,
        keep_prob=options.dropout_keep_prob,
        is_training=is_training)

    # Text side: answer choices with their tags and lengths.
    choices = inputs[self._field_answer_choices]
    choice_tags = inputs[self._field_answer_choices_tag]
    choice_lens = inputs[self._field_answer_choices_len]
    batch_size = choices.shape[0]

    def _fold_choices_into_batch(tensor):
        # Merge the leading batch and choice axes into one.
        return tf.reshape(tensor, [batch_size * NUM_CHOICES, -1])

    choice_tags = _assign_invalid_tags(choice_tags, max_num_objects)

    # Tokens -> token ids, plus the matching sequence mask.
    choice_token_ids = _fold_choices_into_batch(vocab_lookup(choices))
    max_choice_len = tf.shape(choices)[-1]
    choice_mask = _fold_choices_into_batch(
        tf.sequence_mask(choice_lens, maxlen=max_choice_len))

    # Tag feature sequence for the text, and the tiled objects.
    choice_tags = _fold_choices_into_batch(choice_tags)
    choice_tag_embeddings = _ground_tag_using_object_features(
        object_features, choice_tags)

    object_label_ids = vocab_lookup(object_labels)
    tiled_object_masks, tiled_object_ids, tiled_object_features = _tile_objects(
        num_objects, object_label_ids, object_features)

    # Bert inputs: text first, then the tiled objects.
    input_ids = tf.concat([choice_token_ids, tiled_object_ids], -1)
    input_tag_embeddings = tf.concat(
        [choice_tag_embeddings, tiled_object_features], 1)
    input_mask = tf.concat([choice_mask, tiled_object_masks], -1)

    bert_output = self._bert_model(
        input_ids,
        input_tag_embeddings,
        input_mask,
        bert_config,
        bert_checkpoint_file=options.bert_checkpoint_file,
        is_training=is_training)

    # Classification layer.
    with slim.arg_scope(slim_fc_scope):
        logits = slim.fully_connected(bert_output,
                                      num_outputs=1,
                                      activation_fn=None,
                                      scope='logits')
    logits = tf.reshape(logits, [batch_size, NUM_CHOICES])

    return {FIELD_ANSWER_PREDICTION: logits}
def main(_):
    """Computes BERT outputs for this shard's annotations and saves them as .npy.

    Builds a BertModel over a string placeholder, restores it from
    `FLAGS.bert_checkpoint_file`, then walks the annotations loaded from
    `FLAGS.annotations_jsonl_file`. An annotation is handled only when the
    integer suffix of its 'annot_id' modulo `FLAGS.num_shards` equals
    `FLAGS.shard_id`, and only when its output file does not exist yet, so an
    interrupted run can be restarted.

    Args:
      _: Unused positional argument.

    Raises:
      AssertionError: if `FLAGS.shard_id` is not in [0, FLAGS.num_shards).
    """
    logging.set_verbosity(logging.INFO)

    # One output sub-directory per partition, named '00', '01', ...
    for i in range(_NUM_PARTITIONS):
        tf.io.gfile.makedirs(
            os.path.join(FLAGS.output_bert_feature_dir, '%02d' % i))

    # Create Bert model.
    bert_tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.bert_vocab_file, do_lower_case=FLAGS.do_lower_case)

    # Bert prediction. The `tf.compat.v1` names are used throughout so that
    # the graph-mode calls in this function are spelled consistently.
    input_placeholder = tf.compat.v1.placeholder(shape=[None], dtype=tf.string)
    token_to_id_layer = token_to_id.TokenToIdLayer(FLAGS.bert_vocab_file,
                                                   unk_token_id=UNK)

    bert_config = BertConfig.from_json_file(FLAGS.bert_config_file)
    # The placeholder is expanded to a batch of one; index [0] strips it again.
    bert_model = BertModel(bert_config,
                           is_training=False,
                           input_ids=token_to_id_layer(
                               tf.expand_dims(input_placeholder, 0)))
    sequence_output = bert_model.get_sequence_output()[0]
    pooled_output = bert_model.get_pooled_output()[0]
    saver = tf.compat.v1.train.Saver()

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)
    try:
        sess.run(tf.compat.v1.tables_initializer())
        saver.restore(sess, FLAGS.bert_checkpoint_file)

        for name in sess.run(tf.compat.v1.report_uninitialized_variables()):
            logging.warning('%s is uninitialized!', name)

        def _bert_fn(sequence):
            return sess.run([sequence_output, pooled_output],
                            feed_dict={input_placeholder: sequence})

        # Load annotations.
        annots = _load_annotations(FLAGS.annotations_jsonl_file)
        logging.info('Loaded %i annotations.', len(annots))

        shard_id, num_shards = FLAGS.shard_id, FLAGS.num_shards
        assert 0 <= shard_id < num_shards

        for idx, annot in enumerate(annots):
            if (idx + 1) % 1000 == 0:
                logging.info('On example %i/%i.', idx + 1, len(annots))

            # Skip annotations that belong to another shard.
            annot_id = int(annot['annot_id'].split('-')[-1])
            if annot_id % num_shards != shard_id:
                continue

            # Check npy file.
            part_id = get_partition_id(annot['annot_id'])
            output_file = os.path.join(FLAGS.output_bert_feature_dir,
                                       '%02d' % part_id,
                                       annot['annot_id'] + '.npy')
            if os.path.isfile(output_file):
                logging.info('%s is there.', output_file)
                continue

            # Create TF example.
            bert_outputs = _create_bert_embeddings(annot, bert_tokenizer,
                                                   FLAGS.do_lower_case,
                                                   _bert_fn)
            with open(output_file, 'wb') as f:
                np.save(f, bert_outputs)
    finally:
        # Release the session even if processing fails.
        sess.close()

    logging.info('Done')