import os
import re

import numpy as np
import tensorflow as tf
import torch

from modeling import BertConfig, BertForPreTraining


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    config_path = os.path.abspath(bert_config_file)
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {} with config at {}".format(
        tf_path, config_path))

    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
        if any(n in ["adam_v", "adam_m"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            # TF stores dense kernels transposed relative to PyTorch
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
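# A minimal usage sketch for the converter above. The checkpoint prefix, config
# path, and output path are hypothetical placeholders; point them at a real
# Google-released BERT checkpoint directory.
if __name__ == "__main__":
    convert_tf_checkpoint_to_pytorch(
        tf_checkpoint_path="uncased_L-12_H-768_A-12/bert_model.ckpt",
        bert_config_file="uncased_L-12_H-768_A-12/bert_config.json",
        pytorch_dump_path="pytorch_model.bin",
    )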
def convert_tmp_to_pytorch(bert_config_file, pytorch_dump_path):
    import pickle
    import re

    import numpy as np
    import torch

    from modeling import BertConfig, BertForPreTraining

    # Unpickling; if the pickles were written with Python 2,
    # pass encoding='iso-8859-1' to pickle.load
    with open("tmp_names", "rb") as fp:
        names = pickle.load(fp)
    with open("tmp_arrays", "rb") as fp:
        arrays = pickle.load(fp)

    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
        if name[-1] in ["adam_v", "adam_m", 'global_step']:
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
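# A sketch of how the "tmp_names"/"tmp_arrays" pickles consumed above could be
# produced, assuming TensorFlow is available in a separate environment (useful
# when TF and PyTorch cannot be installed together). The file names match what
# convert_tmp_to_pytorch reads; the helper name is an assumption.
import pickle

import tensorflow as tf


def dump_tf_variables_to_pickles(tf_checkpoint_path):
    names, arrays = [], []
    for name, shape in tf.train.list_variables(tf_checkpoint_path):
        names.append(name)
        arrays.append(tf.train.load_variable(tf_checkpoint_path, name))
    with open("tmp_names", "wb") as fp:
        pickle.dump(names, fp)
    with open("tmp_arrays", "wb") as fp:
        pickle.dump(arrays, fp)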
import torch

from modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
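# A hedged command-line wrapper for the converter above, in the argparse style
# such conversion scripts commonly use; the flag names are assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", type=str, required=True,
                        help="Path to the TensorFlow checkpoint prefix.")
    parser.add_argument("--bert_config_file", type=str, required=True,
                        help="JSON file specifying the BERT architecture.")
    parser.add_argument("--pytorch_dump_path", type=str, required=True,
                        help="Where to write the converted PyTorch weights.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                     args.bert_config_file,
                                     args.pytorch_dump_path)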
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Load the model configuration
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    # Build the model
    model = BertForPreTraining(config)
    # Load the checkpoint parameters into the model. load_tf_weights_in_bert
    # mutates the model in place, which is why it works whether or not the
    # returned model is captured.
    load_tf_weights_in_bert(model, tf_checkpoint_path)
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    # Save the PyTorch checkpoint
    torch.save(model.state_dict(), pytorch_dump_path)
# Fragment from a comparison script: nn, BertForPreTraining,
# load_params_from_lazy, pickle, and lazy_model_path are provided by the
# surrounding script.
bert_module = BertForPreTraining(
    30522,
    128,
    768,
    12,
    12,
    3072,
    nn.GELU(),
    0.0,
    0.0,
    512,
    2,
)
load_params_from_lazy(bert_module.state_dict(), lazy_model_path)

# The MLM decoder weight must stay tied to the word embedding weight
assert id(bert_module.cls.predictions.decoder.weight) == id(
    bert_module.bert.embeddings.word_embeddings.weight)

with open(
        "../../OneFlow-Benchmark/LanguageModeling/BERT/lazy_input_output_1.pickle",
        "rb") as handle:
    lazy_info = pickle.load(handle)

total_loss = lazy_info["total_loss"]
mlm_loss = lazy_info["mlm_loss"]
nsp_loss = lazy_info["nsp_loss"]
mlm_logit_prob = lazy_info["mlm_logit_prob"]
ns_logit_prob = lazy_info["ns_logit_prob"]
input_ids = lazy_info["input_ids"]
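# A hedged comparison sketch: run the eager module on the recorded input_ids
# and compare against the pickled lazy outputs with np.allclose. The all-ones
# mask and all-zeros segment ids are placeholder assumptions; a real
# comparison would restore the exact inputs used on the lazy side, and the
# tolerance is an arbitrary choice.
import numpy as np
import oneflow as flow

ids = flow.tensor(input_ids, dtype=flow.int64)
mask = flow.ones_like(ids)       # assumption: no padding
segments = flow.zeros_like(ids)  # assumption: single-sentence input

bert_module.eval()
with flow.no_grad():
    mlm_scores, ns_scores = bert_module(ids, mask, segments)

print("mlm close:", np.allclose(mlm_scores.numpy(), mlm_logit_prob, atol=1e-4))
print("nsp close:", np.allclose(ns_scores.numpy(), ns_logit_prob, atol=1e-4))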
import time

import numpy as np
import oneflow as flow
from oneflow import nn

from modeling import BertForPreTraining  # project-local module


def inference(args):
    start_t = time.time()
    bert_module = BertForPreTraining(
        args.vocab_size,
        args.seq_length,
        args.hidden_size,
        args.num_hidden_layers,
        args.num_attention_heads,
        args.intermediate_size,
        nn.GELU(),
        args.hidden_dropout_prob,
        args.attention_probs_dropout_prob,
        args.max_position_embeddings,
        args.type_vocab_size,
        args.vocab_size,
    )
    end_t = time.time()
    print("Initialize model using time: {:.3f}s".format(end_t - start_t))

    start_t = time.time()
    if args.use_lazy_model:
        from utils.compare_lazy_outputs import load_params_from_lazy

        load_params_from_lazy(
            bert_module.state_dict(),
            args.model_path,
        )
    else:
        bert_module.load_state_dict(flow.load(args.model_path))
    end_t = time.time()
    print("Loading parameters using time: {:.3f}s".format(end_t - start_t))

    bert_module.eval()
    bert_module.to(args.device)

    class BertEvalGraph(nn.Graph):
        def __init__(self):
            super().__init__()
            self.bert = bert_module

        def build(self, input_ids, input_masks, segment_ids):
            input_ids = input_ids.to(device=args.device)
            input_masks = input_masks.to(device=args.device)
            segment_ids = segment_ids.to(device=args.device)

            with flow.no_grad():
                # 1. forward the next_sentence_prediction and masked_lm model
                _, seq_relationship_scores = self.bert(input_ids, input_masks,
                                                       segment_ids)

            return seq_relationship_scores

    bert_eval_graph = BertEvalGraph()

    start_t = time.time()
    inputs = [np.random.randint(0, 20, size=args.seq_length)]
    inputs = flow.Tensor(inputs, dtype=flow.int64, device=flow.device(args.device))
    mask = flow.cast(inputs > 0, dtype=flow.int64)
    segment_info = flow.zeros_like(inputs)
    prediction = bert_eval_graph(inputs, mask, segment_info)
    print(prediction.numpy())
    end_t = time.time()
    print("Inference using time: {:.3f}s".format(end_t - start_t))
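# A hypothetical invocation of inference(). The field names mirror exactly
# what the function reads from args; the values follow the BERT-base constants
# used earlier in this file, and model_path is a placeholder.
if __name__ == "__main__":
    from types import SimpleNamespace

    args = SimpleNamespace(
        vocab_size=30522,
        seq_length=128,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        max_position_embeddings=512,
        type_vocab_size=2,
        use_lazy_model=False,
        model_path="path/to/eager_model",  # placeholder
        device="cuda",
    )
    inference(args)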