def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    """Convert a TensorFlow BERT checkpoint into a saved PyTorch state dict.

    Every variable of the checkpoint is loaded, mapped onto the matching
    attribute path of a freshly built ``BertForPreTraining`` model and copied
    into it; the resulting ``state_dict`` is written with ``torch.save``.

    Args:
        tf_checkpoint_path: Path of the TensorFlow checkpoint to read.
        bert_config_file: JSON file describing the model architecture.
        pytorch_dump_path: Destination handed to ``torch.save``.

    Raises:
        AssertionError: If a TensorFlow array and the PyTorch tensor it maps
            to have different shapes; ``args`` holds both shapes.
    """
    config_path = os.path.abspath(bert_config_file)
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {} with config at {}".format(
        tf_path, config_path))

    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are optimizer slots of AdamWeightDecayOptimizer;
        # they are not required for using the pretrained model.
        if any(n in ["adam_v", "adam_m"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue

        # Walk the attribute path named by the TF variable scope.
        pointer = model
        for m_name in name:
            # "layer_3" -> ["layer", "3", ""]: attribute name plus index.
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                parts = re.split(r'_(\d+)', m_name)
            else:
                parts = [m_name]

            if parts[0] in ('kernel', 'gamma', 'output_weights'):
                pointer = getattr(pointer, 'weight')
            elif parts[0] in ('output_bias', 'beta'):
                pointer = getattr(pointer, 'bias')
            else:
                pointer = getattr(pointer, parts[0])

            if len(parts) >= 2:
                pointer = pointer[int(parts[1])]

        # m_name is the last path component here.
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)

        # Explicit check instead of ``assert`` so it still runs under
        # ``python -O``; the exception type and args match the old behaviour.
        if pointer.shape != array.shape:
            raise AssertionError(pointer.shape, array.shape)

        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
def eval_all():
    """Load the saved QA model, evaluate it and optionally upload the result.

    The checkpoint at ``MODEL_PATH`` is loaded into a
    ``BertForQuestionAnswering`` built from ``args.config_name``. If the
    state dict does not fit the bare model, the model is wrapped in
    ``nn.DataParallel`` and the load is retried. When ``args.no_pai`` is
    false the result file is uploaded through ``client``.
    """
    output_model_file = MODEL_PATH
    output_config_file = os.path.join('../model_dir/', args.config_name)
    config = BertConfig(output_config_file)
    model = BertForQuestionAnswering(config)

    # With --no_pai the tensors are forced onto the CPU; otherwise
    # torch.load keeps its default placement (map_location=None).
    map_location = 'cpu' if args.no_pai else None
    # Read the checkpoint once and reuse it for the fallback attempt.
    state_dict = torch.load(output_model_file, map_location=map_location)
    try:
        model.load_state_dict(state_dict)
    except Exception:
        # Retry with the model wrapped in DataParallel.
        model = nn.DataParallel(model)
        model.load_state_dict(state_dict)

    result_file_path = os.path.join('../metric', args.result_file_name)
    evaluate(model, result_file=result_file_path)

    if not args.no_pai:
        print(os.getcwd())
        pai_file_output = "/Container/thsi_yicui/dureader-bert/Dureader/output"
        client.upload(pai_file_output, result_file_path, overwrite=True)
def __init__(self, output_dir: str, bert_config: str, max_seq_length: int):
    """Build the inference graph and restore the latest checkpoint.

    Args:
        output_dir: Directory under which the ``frozen`` sub-directory is
            created.
        bert_config: Path of the BERT configuration JSON file.
        max_seq_length: Second dimension of the input placeholders.
    """
    bert_config = BertConfig.from_json_file(bert_config)
    bert_config.experimental_gelu = FLAGS.experimental_gelu
    if FLAGS.precision:
        bert_config.precision = FLAGS.precision

    self.session = tf.compat.v1.Session()

    placeholder = tf.compat.v1.placeholder
    input_shape = [None, max_seq_length]
    self.input_ids = placeholder(tf.int32, input_shape, name='input_ids')
    self.input_mask = placeholder(tf.int32, input_shape, name='input_mask')
    self.segment_ids = placeholder(tf.int32, input_shape, name='segment_ids')

    (self.start_logits, self.end_logits) = \
        create_model_top(bert_config, False,  # is training
                         self.input_ids, self.input_mask, self.segment_ids,
                         False,  # use_one_hot_embeddings
                         None)  # frozen graph path

    # NOTE(review): the checkpoint is looked up in FLAGS.output_dir, not in
    # the ``output_dir`` argument -- confirm this is intended.
    latest_model = tf.train.latest_checkpoint(FLAGS.output_dir)
    saver = tf.compat.v1.train.Saver()
    saver.restore(self.session, latest_model)

    self.output_dir = output_dir
    self.dest_dir = os.path.join(self.output_dir, "frozen")
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race and also
    # creates a missing parent directory.
    os.makedirs(self.dest_dir, exist_ok=True)
def eval_all():
    """Load the saved QA model from ``MODEL_PATH`` and evaluate it.

    The model is built from ``CONFIG_PATH``. If the state dict does not fit
    the bare model, the model is wrapped in ``nn.DataParallel`` and the load
    is retried. Predictions go to ``../metric/<args.result_file_name>``.
    """
    output_model_file = MODEL_PATH
    output_config_file = CONFIG_PATH
    config = BertConfig(output_config_file)
    model = BertForQuestionAnswering(config)

    # Keep torch.load's default placement when the model already lives on a
    # GPU, otherwise map every tensor to the CPU.
    on_gpu = next(model.parameters()).is_cuda
    map_location = None if on_gpu else 'cpu'
    # Read the checkpoint once and reuse it for the fallback attempt.
    state_dict = torch.load(output_model_file, map_location=map_location)
    try:
        model.load_state_dict(state_dict)
    except Exception:
        # Retry with the model wrapped in DataParallel.
        model = nn.DataParallel(model)
        model.load_state_dict(state_dict)

    result_file_path = os.path.join('../metric', args.result_file_name)
    evaluate(model, result_file=result_file_path)
def prepare_model(args, device):
    """Create a ``BertForPreTraining`` model from a TensorFlow checkpoint.

    The configuration is read from ``args.bert_config_path``, its vocabulary
    size is rounded up to a multiple of 8, and a set of option attributes the
    model code expects is filled in before the weights are loaded from
    ``args.tf_checkpoint``. ``device`` is accepted but not used here.
    """
    config = BertConfig.from_json_file(args.bert_config_path)

    # Pad the vocabulary so its size is divisible by 8.
    remainder = config.vocab_size % 8
    if remainder != 0:
        config.vocab_size += 8 - remainder
        print('padded vocab size to: {}'.format(config.vocab_size))

    # Options the config is expected to carry; their values do not need to be
    # meaningful at this point, so all of them start out disabled.
    option_names = (
        'pad',
        'unpad',
        'dense_seq_output',
        'fused_mha',
        'fused_gelu_bias',
        'fuse_qkv',
        'fuse_scale',
        'fuse_mask',
        'fuse_dropout',
        'apex_softmax',
        'enable_stream',
    )
    for option in option_names:
        setattr(config, option, False)

    # Derived options, evaluated on the values assigned just above.
    if config.fuse_mask:
        config.apex_softmax = True
    if not config.pad:
        config.enable_stream = True
    if config.unpad:
        config.fused_mha = False

    # Load from TF checkpoint
    return BertForPreTraining.from_pretrained(args.tf_checkpoint,
                                              from_tf=True,
                                              config=config)
def __init__(self):
    """Set up config, tokenizer and fine-tuned classifier for inference.

    All files are resolved relative to the module-level ``path``. The model
    weights are read onto the CPU and the model is then moved to CUDA when
    available.

    Raises:
        ValueError: If the fixed sequence length (128) exceeds the
            configuration's ``max_position_embeddings``.
    """
    config_file = os.path.join(path, 'uncased_L-12_H-768_A-12/bert_config.json')
    self.bert_config = BertConfig.from_json_file(config_file)
    self.max_sequence_length = 128

    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)

    if self.max_sequence_length > self.bert_config.max_position_embeddings:
        message = (
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(self.max_sequence_length,
                    self.bert_config.max_position_embeddings))
        raise ValueError(message)

    self.processor = LogicProcessor()
    self.label_list = self.processor.get_labels()

    vocab_file = os.path.join(path, 'uncased_L-12_H-768_A-12/vocab.txt')
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                do_lower_case=False)

    num_labels = len(self.label_list)
    self.model = BertForSequenceClassification(self.bert_config, num_labels)

    # Restore the fine-tuned weights before moving the model to its device.
    init_checkpoint = os.path.join(path, 'model/logic_model_500.bin')
    if init_checkpoint is not None:
        weights = torch.load(init_checkpoint, map_location='cpu')
        self.model.load_state_dict(weights)
    self.model.to(self.device)
def load_stock_model(model_dir, max_seq_len):
    """Build the stock BERT graph and bind it to the checkpoint in ``model_dir``.

    Args:
        model_dir: Directory holding ``bert_config.json`` and
            ``bert_model.ckpt``.
        max_seq_len: Sequence length of the ``(1, max_seq_len)`` placeholders.

    Returns:
        Tuple ``(model, input_ids, token_type_ids, mask)`` where the last
        three are the int32 input placeholders.
    """
    from modeling import BertModel, BertConfig, get_assignment_map_from_checkpoint

    tf_v1 = tf.compat.v1
    # Reset so variable scope names stay stable when this runs more than once.
    tf_v1.reset_default_graph()

    config_file = os.path.join(model_dir, "bert_config.json")
    checkpoint_file = os.path.join(model_dir, "bert_model.ckpt")

    input_shape = (1, max_seq_len)
    pl_input_ids = tf_v1.placeholder(tf.int32, shape=input_shape)
    pl_mask = tf_v1.placeholder(tf.int32, shape=input_shape)
    pl_token_type_ids = tf_v1.placeholder(tf.int32, shape=input_shape)

    s_model = BertModel(config=BertConfig.from_json_file(config_file),
                        is_training=False,
                        input_ids=pl_input_ids,
                        input_mask=pl_mask,
                        token_type_ids=pl_token_type_ids,
                        use_one_hot_embeddings=False)

    # Map the graph's trainable variables onto the checkpoint's tensors.
    trainable = tf_v1.trainable_variables()
    assignment_map, _ = get_assignment_map_from_checkpoint(trainable,
                                                           checkpoint_file)
    tf_v1.train.init_from_checkpoint(checkpoint_file, assignment_map)

    return s_model, pl_input_ids, pl_token_type_ids, pl_mask
def get_model_from_args(args):
    """Create the question-answering model described by ``args``, in eval mode.

    The vocabulary size is rounded up to a multiple of 8. Weights are loaded
    non-strictly from ``args.checkpoint`` when that file exists; otherwise the
    model keeps its initial weights. The model is optionally converted to
    half precision and moved to ``cuda:0`` unless ``args.cpu`` is set.
    """
    config = BertConfig.from_json_file(args.config_file)
    remainder = config.vocab_size % 8
    if remainder != 0:
        config.vocab_size += 8 - remainder

    class BertForQuestionAnswering_int32_inputs(BertForQuestionAnswering):
        """Variant whose forward pass casts all three inputs to int64 first."""

        def forward(self, input_ids, segment_ids, attention_mask):
            return super().forward(input_ids.long(),
                                   segment_ids.long(),
                                   attention_mask.long())

    model = BertForQuestionAnswering_int32_inputs(config)
    model.enable_apex(False)

    if os.path.isfile(args.checkpoint):
        checkpoint = torch.load(args.checkpoint, map_location="cpu")
        # The weights may be nested under a "model" key or stored directly.
        weights = checkpoint["model"] if "model" in checkpoint else checkpoint
        model.load_state_dict(weights, strict=False)

    if args.precision == "fp16":
        model = model.half()

    target = "cpu" if args.cpu else "cuda:0"
    model = model.to(target)
    model.eval()

    model.bermuda_batch_axis = None if args.fixed_batch_dim else 0
    return model
def load_query_encoder(self, device, args):
    """Load the pretrained query encoder and its tokenizer.

    Args:
        device: Device the model is moved to after loading.
        args: Namespace providing ``metadata_dir``, ``vocab_name``,
            ``bert_config_name``, ``bert_model_option``, ``parallel``,
            ``query_encoder_path`` and ``do_case``.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    # Configure paths for query encoder serving
    vocab_path = os.path.join(args.metadata_dir, args.vocab_name)
    bert_config_path = os.path.join(
        args.metadata_dir,
        args.bert_config_name.replace(".json", "") + "_" + args.bert_model_option + ".json"
    )

    # Load pretrained QueryEncoder
    bert_config = BertConfig.from_json_file(bert_config_path)
    model = DenSPI(bert_config)
    if args.parallel:
        model = torch.nn.DataParallel(model)

    state = torch.load(args.query_encoder_path, map_location='cpu')
    # Accept both checkpoint layouts: weights nested under "model" or a raw
    # state dict. Loading the wrapper dict itself with strict=False would
    # match no parameter names and silently load nothing.
    weights = state['model'] if 'model' in state else state
    try:
        model.load_state_dict(weights)
        logger.info('load okay')
    except Exception:
        # Fall back to a partial load and report what differs.
        model.load_state_dict(weights, strict=False)
        check_diff(model.state_dict(), weights)
    logger.info('Model loaded from %s', args.query_encoder_path)
    model.to(device)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path,
                                           do_lower_case=not args.do_case)

    logger.info('Number of model parameters: {:,}'.format(
        sum(p.numel() for p in model.parameters())))
    return model, tokenizer
def convert():
    """Convert the ``bert*`` variables of a TF checkpoint to a PyTorch ``BertModel``.

    Paths come from the module-level ``args`` (``bert_config_file``,
    ``tf_checkpoint_path``, ``pytorch_dump_path``). Variables whose name does
    not start with ``bert`` and Adam optimizer slots are skipped.

    Raises:
        AssertionError: If a TensorFlow array and the PyTorch tensor it maps
            to have different shapes; ``args`` holds both shapes.
    """
    # Initialise PyTorch model
    config = BertConfig.from_json_file(args.bert_config_file)
    model = BertModel(config)

    # Load weights from TF model
    path = args.tf_checkpoint_path
    print("Converting TensorFlow checkpoint from {}".format(path))

    init_vars = tf.train.list_variables(path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading {} with shape {}".format(name, shape))
        array = tf.train.load_variable(path, name)
        print("Numpy array shape {}".format(array.shape))
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        if not name.startswith("bert"):
            print("Skipping {}".format(name))
            continue
        name = name.replace("bert/", "")  # skip "bert/"
        print("Loading {}".format(name))
        name = name.split('/')
        # adam_v and adam_m are optimizer slots of AdamWeightDecayOptimizer;
        # they are not required for using the pretrained model.
        if name[0] in ['redictions', 'eq_relationship'] or name[-1] in ("adam_v", "adam_m"):
            print("Skipping {}".format("/".join(name)))
            continue

        # Walk the attribute path named by the TF variable scope.
        pointer = model
        for m_name in name:
            # "layer_3" -> ["layer", "3", ""]: attribute name plus index.
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                parts = re.split(r'_(\d+)', m_name)
            else:
                parts = [m_name]
            if parts[0] == 'kernel':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, parts[0])
            if len(parts) >= 2:
                pointer = pointer[int(parts[1])]

        # m_name is the last path component here.
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)

        # Explicit check instead of ``assert`` so it still runs under
        # ``python -O``; the exception type and args match the old behaviour.
        if pointer.shape != array.shape:
            raise AssertionError(pointer.shape, array.shape)
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    torch.save(model.state_dict(), args.pytorch_dump_path)
def convert_tmp_to_pytorch(bert_config_file, pytorch_dump_path):
    """Build a PyTorch BERT state dict from the pickled ``tmp_names``/``tmp_arrays``.

    The two pickle files are read from the current working directory and are
    expected to hold parallel sequences of variable names and arrays.

    Args:
        bert_config_file: JSON file describing the model architecture.
        pytorch_dump_path: Destination handed to ``torch.save``.

    Raises:
        AssertionError: If an array and the PyTorch tensor it maps to have
            different shapes; ``args`` holds both shapes.
    """
    import torch
    from modeling import BertConfig, BertForPreTraining
    import pickle

    # NOTE(review): pickle.load can execute arbitrary code; these files must
    # only ever come from a trusted source.
    with open("tmp_names", "rb") as fp:  # Unpickling
        names = pickle.load(fp)
    with open("tmp_arrays", "rb") as fp:  # Unpickling
        arrays = pickle.load(fp)

    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are optimizer slots of AdamWeightDecayOptimizer;
        # they (and global_step) are not required for the pretrained model.
        if name[-1] in ["adam_v", "adam_m", 'global_step']:
            print("Skipping {}".format("/".join(name)))
            continue

        # Walk the attribute path named by the variable scope.
        pointer = model
        for m_name in name:
            # "layer_3" -> ["layer", "3", ""]: attribute name plus index.
            if fullmatch(r'[A-Za-z]+_\d+', m_name):
                parts = re.split(r'_(\d+)', m_name)
            else:
                parts = [m_name]

            if parts[0] in ('kernel', 'output_weights'):
                pointer = getattr(pointer, 'weight')
            elif parts[0] == 'output_bias':
                pointer = getattr(pointer, 'bias')
            else:
                pointer = getattr(pointer, parts[0])

            if len(parts) >= 2:
                pointer = pointer[int(parts[1])]

        # m_name is the last path component here.
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)

        # Explicit check instead of ``assert`` so it still runs under
        # ``python -O``; the exception type and args match the old behaviour.
        if pointer.shape != array.shape:
            raise AssertionError(pointer.shape, array.shape)

        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
def convert():
    """Convert the Chinese BERT TensorFlow checkpoint to a PyTorch ``BertModel``.

    Command-line arguments are parsed, but the checkpoint, config and output
    paths are then overwritten with hard-coded Windows-style paths under
    ``chinese_L-12_H-768_A-12``.

    Raises:
        AssertionError: If a TensorFlow array and the PyTorch tensor it maps
            to have different shapes; ``args`` holds both shapes.
    """
    args = parser.parse_args()
    args.tf_checkpoint_path = "chinese_L-12_H-768_A-12\\bert_model.ckpt"
    args.bert_config_file = "chinese_L-12_H-768_A-12\\bert_config.json"
    # Backslash escaped explicitly: "\p" is an invalid escape sequence.
    args.pytorch_dump_path = "chinese_L-12_H-768_A-12\\pytorch_model.bin"

    # Initialise PyTorch model
    config = BertConfig.from_json_file(args.bert_config_file)
    model = BertModel(config)

    # Load weights from TF model
    path = args.tf_checkpoint_path
    print("Converting TensorFlow checkpoint from {}".format(path))

    init_vars = tf.train.list_variables(path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading {} with shape {}".format(name, shape))
        array = tf.train.load_variable(path, name)
        print("Numpy array shape {}".format(array.shape))
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name[5:]  # skip "bert/"
        print("Loading {}".format(name))
        name = name.split('/')
        # After dropping five characters the "cls/..." heads show up with
        # these truncated names; they are not part of BertModel.
        if name[0] in ['redictions', 'eq_relationship']:
            print("Skipping")
            continue

        # Walk the attribute path named by the TF variable scope.
        pointer = model
        for m_name in name:
            # "layer_3" -> ["layer", "3", ""]: attribute name plus index.
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                parts = re.split(r'_(\d+)', m_name)
            else:
                parts = [m_name]
            if parts[0] == 'kernel':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, parts[0])
            if len(parts) >= 2:
                pointer = pointer[int(parts[1])]

        # m_name is the last path component here.
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)

        # Explicit check instead of ``assert`` so it still runs under
        # ``python -O``; the exception type and args match the old behaviour.
        if pointer.shape != array.shape:
            raise AssertionError(pointer.shape, array.shape)
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    torch.save(model.state_dict(), args.pytorch_dump_path)
def get_predictor_model(cls):
    """Build the QA model, load its weights, cache it on ``cls`` and return it.

    The weights are read onto the CPU from the ``"model"`` entry of the
    checkpoint at ``MODEL_PATH``; the model is then moved to ``device``.
    """
    predictor = BertForQuestionAnswering(BertConfig.from_json_file(config_file))
    checkpoint = torch.load(MODEL_PATH, map_location='cpu')
    predictor.load_state_dict(checkpoint["model"])
    predictor.to(device)
    cls.model = predictor
    return cls.model
def eval_all():
    """Evaluate the best saved QA model on the CPU.

    The model is built from ``../model_dir/bert_config.json``, restored from
    ``../model_dir/best_model`` and evaluated; predictions are written to
    ``../metric/predicts.json``.
    """
    output_model_file = "../model_dir/best_model"
    output_config_file = "../model_dir/bert_config.json"
    config = BertConfig(output_config_file)
    model = BertForQuestionAnswering(config)
    # Evaluation runs on the CPU, so map the tensors there while loading;
    # this also lets a GPU-saved checkpoint load on a CPU-only host.
    model.load_state_dict(torch.load(output_model_file, map_location='cpu'))
    evaluate(model.cpu(), result_file="../metric/predicts.json")
def prepare_model_and_optimizer(self):
    """Create the two pre-training models and the optimizer for ``self.model``.

    Reads ``self.args`` (``config_file``, ``phase2``, ``max_steps``,
    ``phase1_end_step``, ``learning_rate``, ``warmup_proportion``,
    ``optimizer``) and sets ``self.config``, ``self.model``,
    ``self.another_model`` and, for a recognised optimizer name,
    ``self.optimizer``.
    """
    # Prepare model
    self.config = BertConfig.from_json_file(self.args.config_file)

    # Padding for divisibility by 8
    if self.config.vocab_size % 8 != 0:
        self.config.vocab_size += 8 - (self.config.vocab_size % 8)

    self.model = BertForPreTraining(self.config)
    self.another_model = BertForPreTraining(self.config)
    self.model.to(self.device)
    self.another_model.to(self.device)

    # One parameter group per parameter; parameters whose name contains one
    # of the ``no_decay`` markers get no weight decay.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']
    optimizer_grouped_parameters = [
        {
            'params': [p],
            'weight_decay': 0.00 if any(nd in n for nd in no_decay) else 0.01,
            'name': n,
        }
        for n, p in self.model.named_parameters()
    ]

    if self.args.phase2:
        max_steps = self.args.max_steps
        # Scale the learning rate down by the share of the schedule that
        # phase 1 already consumed.
        r = self.args.phase1_end_step / (max_steps * 10)
        lr = self.args.learning_rate * (1 - r)
    else:
        max_steps = int(self.args.max_steps / 9 * 10)
        lr = self.args.learning_rate

    # NOTE: any other optimizer name leaves ``self.optimizer`` unset.
    if self.args.optimizer == "lamb":
        self.optimizer = BertLAMB(
            optimizer_grouped_parameters,
            lr=lr,
            warmup=self.args.warmup_proportion if not self.args.phase2 else -1,
            t_total=max_steps)
    elif self.args.optimizer == "adam":
        self.optimizer = BertAdam(
            optimizer_grouped_parameters,
            lr=lr,
            warmup=self.args.warmup_proportion if not self.args.phase2 else -1,
            t_total=max_steps)
def initialize_model(args):
    """Return the question-answering model with weights loaded, ready to trace.

    The vocabulary size is rounded up to a multiple of 8, the weights come
    from the ``"model"`` entry of ``args.checkpoint`` (read onto the CPU) and
    the model is converted to half precision when ``args.fp16`` is set.
    """
    config = BertConfig.from_json_file(args.config_file)
    remainder = config.vocab_size % 8
    if remainder != 0:
        config.vocab_size += 8 - remainder

    model = BertForQuestionAnswering(config)
    model.enable_apex(False)

    checkpoint = torch.load(args.checkpoint, map_location='cpu')
    model.load_state_dict(checkpoint["model"])

    if args.fp16:
        model.half()
    return model
def __init__(self,
             model_name: str,
             models_dir='_models',
             device='/device:GPU:0',
             is_training=False,
             use_one_hot_embeddings=False,
             verb=0):
    """Build the BERT graph for ``model_name`` and initialise it from its checkpoint.

    Args:
        model_name: Sub-folder of ``models_dir`` holding ``bert_config.json``
            and ``bert_model.ckpt``.
        models_dir: Root folder of the stored models.
        device: Device specification resolved through ``device_TF``.
        is_training: Forwarded to the parent constructor.
        use_one_hot_embeddings: Forwarded to the parent constructor.
        verb: Verbosity level; values above 0 and above 1 print progress.
    """
    self.model_name = model_name
    self.models_dir = models_dir
    if verb > 0:
        print('\n*** BertMC *** initializing (folder: %s, model: %s)' %
              (self.models_dir, self.model_name))

    self.graph = tf.Graph()
    with self.graph.as_default():
        device = device_TF(devices=device, verb=verb)[0]
        if verb > 1:
            print(' > building graph on cuda: %s' % device)

        with tf.device(device):
            # Three int32 inputs with unconstrained two-dimensional shape.
            self.features = {
                'input_ids': tf.placeholder(shape=[None, None], dtype=tf.int32),
                'input_mask': tf.placeholder(shape=[None, None], dtype=tf.int32),
                'input_type_ids': tf.placeholder(shape=[None, None], dtype=tf.int32)
            }

            config_file = self.models_dir + '/' + self.model_name + '/bert_config.json'
            super(BertMC, self).__init__(
                config=BertConfig.from_json_file(config_file),
                is_training=is_training,
                input_ids=self.features['input_ids'],
                input_mask=self.features['input_mask'],
                token_type_ids=self.features['input_type_ids'],
                use_one_hot_embeddings=use_one_hot_embeddings)

            # Bind the trainable variables to the stored checkpoint.
            self.tvars = tf.trainable_variables()
            checkpoint = self.models_dir + '/' + self.model_name + '/bert_model.ckpt'
            assignment_map, initialized_variable_names = \
                get_assignment_map_from_checkpoint(self.tvars, checkpoint)
            tf.train.init_from_checkpoint(checkpoint, assignment_map)

        init = tf.global_variables_initializer()

    session_config = tf.ConfigProto(allow_soft_placement=True)
    self.sess = tf.Session(graph=self.graph, config=session_config)
    self.sess.run(init)
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    """Convert a TensorFlow BERT checkpoint into a saved PyTorch state dict.

    Args:
        tf_checkpoint_path: TensorFlow checkpoint handed to
            ``load_tf_weights_in_bert``.
        bert_config_file: JSON file describing the model architecture.
        pytorch_dump_path: Destination handed to ``torch.save``.
    """
    # Build an untrained model with the requested architecture.
    architecture = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(architecture)))
    pytorch_model = BertForPreTraining(architecture)

    # Fill the model with the checkpoint's weights.
    load_tf_weights_in_bert(pytorch_model, tf_checkpoint_path)

    # Persist only the weights.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    weights = pytorch_model.state_dict()
    torch.save(weights, pytorch_dump_path)
def bert_module_fn(is_training):
    """Spec function for a token embedding module.

    Builds a BERT graph over three int32 placeholders, registers the config
    and vocabulary files as assets and adds two hub signatures: ``tokens``
    (inputs -> last encoder layer) and ``tokenization_info`` (vocabulary file
    and lower-casing flag). ``config_path``, ``vocab_path`` and
    ``do_lower_case`` are read from the enclosing scope.
    """
    def int_placeholder(name):
        return tf.placeholder(shape=[None, None], dtype=tf.int32, name=name)

    input_ids = int_placeholder("input_ids")
    input_mask = int_placeholder("input_mask")
    token_type = int_placeholder("segment_ids")

    model = BertModel(config=BertConfig.from_json_file(config_path),
                      is_training=is_training,
                      input_ids=input_ids,
                      input_mask=input_mask,
                      token_type_ids=token_type)
    model.input_to_output()
    seq_output = model.get_all_encoder_layers()[-1]

    # File paths stored as constants and registered as module assets.
    config_file = tf.constant(value=config_path, dtype=tf.string, name="config_file")
    vocab_file = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file")
    lower_case = tf.constant(do_lower_case)
    for asset in (config_file, vocab_file):
        tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, asset)

    hub.add_signature(
        name="tokens",
        inputs={
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": token_type
        },
        outputs={"sequence_output": seq_output})
    hub.add_signature(
        name="tokenization_info",
        inputs={},
        outputs={
            "vocab_file": vocab_file,
            "do_lower_case": lower_case
        })
def __init__(self, output_dir: str, task_name: str, bert_config: str, max_seq_length: int):
    """Build the classification inference graph and restore the latest checkpoint.

    Args:
        output_dir: Directory under which the ``frozen`` sub-directory is
            created.
        task_name: One of ``cola``, ``mnli``, ``mrpc`` or ``xnli``
            (case-insensitive).
        bert_config: Path of the BERT configuration JSON file.
        max_seq_length: Second dimension of the input placeholders.

    Raises:
        ValueError: If ``task_name`` is not a known task.
    """
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor
    }
    task_name = task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # create model for CPU/dGPU, not TPU
    use_one_hot_embeddings = False

    bert_config = BertConfig.from_json_file(bert_config)
    bert_config.experimental_gelu = FLAGS.experimental_gelu
    if FLAGS.precision:
        bert_config.precision = FLAGS.precision

    self.session = tf.compat.v1.Session()

    placeholder = tf.compat.v1.placeholder
    input_shape = [None, max_seq_length]
    self.label_ids = placeholder(tf.int32, [None], name='label_ids')
    self.input_ids = placeholder(tf.int32, input_shape, name='input_ids')
    self.input_mask = placeholder(tf.int32, input_shape, name='input_mask')
    self.segment_ids = placeholder(tf.int32, input_shape, name='segment_ids')

    self.loss, self.per_example_loss, self.logits, self.probabilities = \
        create_model_top(bert_config, False,  # is training
                         self.input_ids, self.input_mask, self.segment_ids,
                         self.label_ids, num_labels, use_one_hot_embeddings,
                         None)  # frozen graph path

    # NOTE(review): the checkpoint is looked up in FLAGS.output_dir, not in
    # the ``output_dir`` argument -- confirm this is intended.
    latest_model = tf.train.latest_checkpoint(FLAGS.output_dir)
    saver = tf.compat.v1.train.Saver()
    saver.restore(self.session, latest_model)

    self.output_dir = output_dir
    self.dest_dir = os.path.join(self.output_dir, "frozen")
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race and also
    # creates a missing parent directory.
    os.makedirs(self.dest_dir, exist_ok=True)
def init_models():
    """Load every configured QA model and return them as a list.

    ``MODEL_PATHS``, ``MODEL_NUMS`` and ``CONFIG_PATHS`` are iterated in
    lockstep; ``MODEL_NUMS`` selects the model class from
    ``BertForQuestionAnswerings``. If a state dict does not fit the bare
    model, the model is wrapped in ``nn.DataParallel`` and the load is
    retried.
    """
    models = []
    for model_path, model_num, config_path in zip(MODEL_PATHS, MODEL_NUMS, CONFIG_PATHS):
        config = BertConfig(config_path)
        model = BertForQuestionAnswerings[model_num](config)
        # Read the checkpoint once and reuse it for the fallback attempt.
        state_dict = torch.load(model_path)
        try:
            model.load_state_dict(state_dict)
        except Exception:
            # Retry with the model wrapped in DataParallel.
            model = nn.DataParallel(model)
            model.load_state_dict(state_dict)
        models.append(model)
    return models
def __init__(self):
    """Create the placeholders, BERT model, prediction ops, tokenizer and saver.

    ``bert_config``, ``vocab_file`` and ``entity_types`` are read from the
    enclosing module scope.
    """
    # Inputs to the encoder.
    self.input_ids = tf.placeholder(tf.int32, [None, None])
    self.input_mask = tf.placeholder(tf.int32, [None, None])

    encoder_config = BertConfig.from_json_file(bert_config)
    self.model = BertModel(config=encoder_config,
                           is_training=True,
                           input_ids=self.input_ids,
                           input_mask=self.input_mask)

    self.is_training = tf.placeholder(tf.bool, [])

    # Prediction head and the arg-max over its last axis.
    self.predictions = self.construct_model(self.model)
    self.id_predictions = tf.argmax(self.predictions, axis=2)

    # Targets: one float per entry of ``entity_types`` on the last axis.
    num_entity_types = len(entity_types)
    self.Y = tf.placeholder(tf.float32, [None, None, num_entity_types])

    self.tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=False)
    self.model_path = "/scratch/sanjay/bert/bert_ner_model.ckpt"
    self.saver = tf.train.Saver()
def init_modle(self, model):
    """Instantiate ``model``, load its weights and prepare it for inference.

    Args:
        model: Callable invoked as ``model(bert_config, 2)`` to build the
            network.

    Files are read from ``self.model_path`` (``vocab.txt``,
    ``bert_config.json``, ``pytorch_model.bin``). Sets ``self.bert_config``,
    ``self.model``, ``self.tokenizer`` and ``self.device``.
    """
    print('starting to init model')
    vocab_path = os.path.join(self.model_path, 'vocab.txt')
    bert_config_file = os.path.join(self.model_path, 'bert_config.json')
    self.bert_config = BertConfig.from_json_file(bert_config_file)
    self.model = model(self.bert_config, 2)

    weight_path = os.path.join(self.model_path, 'pytorch_model.bin')
    # Load onto the CPU first so a GPU-saved checkpoint also works on a
    # CPU-only host; the model is moved to its final device below.
    state_dict = torch.load(weight_path, map_location='cpu')
    # Drop the "module." prefix (7 characters) from each key that has it.
    # Test for the full prefix including the dot so unrelated names that
    # merely begin with "module" are left intact.
    new_state_dict = {
        (k[7:] if k.startswith('module.') else k): v
        for k, v in state_dict.items()
    }
    self.model.load_state_dict(new_state_dict)

    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path)
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.model.to(self.device)
    self.model.eval()
    print(f'init {model} model finished')
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    """Convert a TensorFlow BERT checkpoint into a saved PyTorch state dict.

    Args:
        tf_checkpoint_path: TensorFlow checkpoint handed to
            ``load_tf_weights_in_bert``.
        bert_config_file: JSON file with the model hyper-parameters.
        pytorch_dump_path: Destination handed to ``torch.save``.
    """
    # Load the model hyper-parameters.
    bert_config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(bert_config)))

    # Build the model.
    bert_model = BertForPreTraining(bert_config)

    # Load the checkpoint weights into the model. The return value is not
    # used; presumably the helper updates ``bert_model`` in place (TODO
    # confirm against its implementation).
    load_tf_weights_in_bert(bert_model, tf_checkpoint_path)

    # Save the PyTorch checkpoint.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(bert_model.state_dict(), pytorch_dump_path)
def test_encode_context(self):
    """Run ``encode_context`` on two zero-padded token rows and print the outputs."""
    padded_length = 312
    token_rows = (
        [151, 12553, 8997, 8792, 10086, 8168, 10481, 9356, 8174, 10404, 9066, 10003, 10610, 10879],
        [8670, 11136, 8997, 10564, 8303, 8228, 8373, 10003, 8307, 119, 151, 12553, 8233, 8815],
    )
    # Each row holds 14 ids and is right-padded with zeros to 312 entries.
    x_context_value = [row + [0] * (padded_length - 14) for row in token_rows]

    bert_scope = tf.VariableScope(name="bert", reuse=tf.AUTO_REUSE)
    bert_config = BertConfig.from_json_file(self._poly_encoder_config.bert_config)
    x_context = tf.convert_to_tensor(value=x_context_value, dtype=tf.int32)

    context_vecs, poly_code_mask = self._encoder_inst.encode_context(
        x_context=x_context,
        bert_config=bert_config,
        bert_scope=bert_scope)

    print(context_vecs)
    print(poly_code_mask)
def test_encode_candidate(self):
    """Run ``encode_candidate`` on two zero-padded token rows and print the outputs."""
    padded_length = 512
    token_rows = (
        [10378, 119, 119, 151, 8815, 8281, 8211, 10425, 8154, 0, 0, 0, 0, 0],
        [165, 8991, 8181, 8184, 131, 120, 120, 8134, 11300, 10540, 8735, 8207, 0, 0],
    )
    # Each row holds 14 entries and is right-padded with zeros to 512 entries.
    x_response_value = [row + [0] * (padded_length - 14) for row in token_rows]

    bert_scope = tf.VariableScope(name="bert", reuse=tf.AUTO_REUSE)
    bert_config = BertConfig.from_json_file(self._poly_encoder_config.bert_config)
    x_response = tf.convert_to_tensor(value=x_response_value, dtype=tf.int32)

    x_response_emb, x_response_mask = self._encoder_inst.encode_candidate(
        x_response=x_response,
        bert_config=bert_config,
        bert_scope=bert_scope)

    print(x_response_emb)
    print(x_response_mask)
def make_global_options(task_specific_parsers=None):
    """Merge default, config-file and command-line options into one dict.

    Precedence, lowest to highest: parser defaults, values from the
    configuration file named by the ``config`` command-line argument, then
    the remaining command-line arguments.

    Args:
        task_specific_parsers: Optional iterable of callables, each taking
            the options parser and returning the (extended) parser.

    Returns:
        Dict mapping option names to their final values.

    Exits the process after printing help when help was requested or no
    configuration file was given, and with ``os.EX_USAGE`` when the
    configuration file contains options the parser does not know.
    """
    # Parse command-line arguments
    command_line_parser = create_command_line_parser()
    all_options_parser = create_all_options_parser()
    # ``None`` instead of a mutable ``[]`` default; any iterable is accepted.
    for task_parser in task_specific_parsers or ():
        all_options_parser = task_parser(all_options_parser)
    known_command_line_args, unknown_command_line_args = \
        command_line_parser.parse_known_args()

    if known_command_line_args.help or known_command_line_args.config is None:
        all_options_parser.print_help()
        sys.exit(os.EX_OK)

    # Parse options specified in the configuration file
    config_file_path = known_command_line_args.config
    opts_from_config_file = BertConfig.from_json_file(config_file_path)

    # Build the global options structure from the default options
    current_options = vars(all_options_parser.parse_args())

    unknown_options = [
        opt for opt in opts_from_config_file.keys()
        if opt not in current_options
    ]
    if unknown_options:
        logging.error(f"Unknown options: {unknown_options}")
        sys.exit(os.EX_USAGE)

    # Overwrite global options by those specified in the config file.
    current_options.update(opts_from_config_file)
    options_namespace = argparse.Namespace(**current_options)

    # Overwrite with command-line arguments
    all_options_namespace = all_options_parser.parse_args(
        unknown_command_line_args, options_namespace)
    logging.info(
        f"Overwrite configuration parameters: {', '.join(unknown_command_line_args)}"
    )

    # argparse.Namespace -> dict()
    return vars(all_options_namespace)
def create_stock_bert_graph(bert_config_file, max_seq_len):
    """Build the stock BERT inference graph over fixed-shape placeholders.

    Args:
        bert_config_file: Path of the BERT configuration JSON file.
        max_seq_len: Sequence length of the ``(1, max_seq_len)`` placeholders.

    Returns:
        Tuple ``(model, input_ids, mask, token_type_ids)`` where the last
        three are the int32 input placeholders.
    """
    from modeling import BertModel, BertConfig

    def int_input():
        return tf.compat.v1.placeholder(tf.int32, shape=(1, max_seq_len))

    pl_input_ids = int_input()
    pl_mask = int_input()
    pl_token_type_ids = int_input()

    s_model = BertModel(config=BertConfig.from_json_file(bert_config_file),
                        is_training=False,
                        input_ids=pl_input_ids,
                        input_mask=pl_mask,
                        token_type_ids=pl_token_type_ids,
                        use_one_hot_embeddings=False)

    return s_model, pl_input_ids, pl_mask, pl_token_type_ids
def feature_extactor(self, dummy=False):
    """Compute BERT embeddings for ``self.all_input_ids`` and evaluate them.

    Args:
        dummy: When true, a randomly initialised model is used instead of the
            pre-trained one from ``self.model_dir``.

    Sets ``self.bert_embeddings`` to the model output and
    ``self.all_encoder_layers`` to the evaluated values of every output
    except the last.
    """
    if dummy:
        print("Using randomly initialized model...")
        model_pre_trained = BERTModel(config=BertConfig(),
                                      Verbose=True,
                                      trainable=False)
        # NOTE(review): the explicit build is taken to belong to the
        # randomly initialised branch only -- confirm.
        model_pre_trained.build(
            input_shape=(self.batch_size, self.max_seq_length))
    else:
        model_pre_trained = PreTrainedBertModel(self.model_dir, Verbose=True)

    print("Computing embeddings...")
    input_ids = K.variable(self.all_input_ids)
    self.bert_embeddings = model_pre_trained(input_ids)

    print("Evaluating...")
    evaluated = [K.eval(emb) for emb in self.bert_embeddings[:-1]]
    self.all_encoder_layers = np.array(evaluated)
    print("Output BERT shape: ", self.all_encoder_layers.shape)
def __init__(self, model_path):
    """Load the two-sentence classifier used to obtain sentence embeddings.

    Args:
        model_path: Directory holding ``vocab.txt``, ``bert_config.json`` and
            the initial weights ``pytorch_model.bin``.
    """
    vocab_path = os.path.join(model_path, 'vocab.txt')
    bert_config_file = os.path.join(model_path, 'bert_config.json')
    self.bert_config = BertConfig.from_json_file(bert_config_file)
    print('starting to init model')
    self.model = TwoSentenceClassifier(self.bert_config, 2)

    weight_path = os.path.join(model_path, 'pytorch_model.bin')
    # Load onto the CPU first so a GPU-saved checkpoint also works on a
    # CPU-only host; the model is moved to its final device below.
    state_dict = torch.load(weight_path, map_location='cpu')
    # Drop the "module." prefix (7 characters) from each key that has it.
    # Test for the full prefix including the dot so unrelated names that
    # merely begin with "module" are left intact.
    new_state_dict = {
        (k[7:] if k.startswith('module.') else k): v
        for k, v in state_dict.items()
    }
    self.model.load_state_dict(new_state_dict)

    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path)
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.model.to(self.device)
    self.model.eval()
    print('init model finished')
def main():
    """Fine-tune and/or evaluate a BERT sequence classifier from the CLI.

    Parses command-line arguments, builds a ``BertForSequenceClassification``
    model from ``--bert_config_file`` (optionally loading
    ``--init_checkpoint`` into ``model.bert``), trains it when ``--do_train``
    is given, evaluates it on the dev set when ``--do_eval`` is given, and
    writes the evaluation metrics to ``<output_dir>/eval_results.txt``.

    Raises:
        ValueError: If ``--gradient_accumulation_steps`` is below 1, if
            neither ``--do_train`` nor ``--do_eval`` is set, if
            ``--max_seq_length`` exceeds the model's
            ``max_position_embeddings``, if the output directory already
            exists and is not empty, or if the task name is unknown.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained BERT model. \n"
                             "This specifies the model architecture.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--vocab_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")
    parser.add_argument('--optimize_on_cpu',
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=128,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    args = parser.parse_args()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "news": NewsProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        # torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # From here on train_batch_size is the per-forward-pass (micro) batch size.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
                args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    label_list = processor.get_labels()
    print("label_list.size:%d\n" % (len(label_list)))

    # Prepare model
    model = BertForSequenceClassification(bert_config, len(label_list))
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    if args.fp16:
        model.half()
    model.to(device)
    # NOTE: DistributedDataParallel / DataParallel wrapping is intentionally
    # not applied here; the model runs unwrapped on `device`.

    # Prepare optimizer
    if args.fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_())
                           for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    # Parameter names are dotted paths, so the no-decay markers have to be
    # matched as substrings. An exact `n in no_decay` test never matches and
    # would apply weight decay to every parameter.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
    optimizer = BERTAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0
    # Bound up front so that an eval-only run can still build its result dict.
    tr_loss, nb_tr_steps = 0, 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # A DistributedSampler is not wired up; random sampling is used
        # regardless of local_rank.
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            # Running totals are reset per epoch, so the reported training
            # loss is the average over the last epoch only.
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1

    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # A DistributedSampler is not wired up; evaluation is sequential
        # regardless of local_rank.
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        # No training step may have run (eval-only, or zero epochs); report
        # None rather than failing on an unbound name or dividing by zero.
        train_loss = tr_loss / nb_tr_steps if nb_tr_steps else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))