def convert_tf_2_ms(tf_ckpt_path, ms_ckpt_path, new_ckpt_path):
    """Convert a TF checkpoint to a MindSpore checkpoint."""
    tf2ms_param_dict = dict(zip(ms2tf_param_dict.values(), ms2tf_param_dict.keys()))

    # load the MindSpore checkpoint to obtain the target parameter shapes
    ms_param_dict = load_checkpoint(ms_ckpt_path)

    new_params_list = []
    session = tf.compat.v1.Session()
    for ms_name, tf_name in tf2ms_param_dict.items():
        data = tf.train.load_variable(tf_ckpt_path, tf_name)
        ms_shape = ms_param_dict[ms_name].data.shape
        tf_shape = data.shape

        # 2-D weights may be stored transposed relative to MindSpore
        if len(ms_shape) == 2:
            if ms_shape != tf_shape or ms_shape[0] == ms_shape[1]:
                data = tf.transpose(data, (1, 0))
                data = data.eval(session=session)

        param_dict = {'name': ms_name, 'data': Tensor(data)}
        new_params_list.append(param_dict)

    print("start saving checkpoint ...")
    save_checkpoint(new_params_list, new_ckpt_path)
    print("ms checkpoint was saved in:", new_ckpt_path)
    return True
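# Usage sketch (not from the source; the three paths are hypothetical).
# ms2tf_param_dict must already map MindSpore parameter names to their
# TensorFlow counterparts for the model being converted.
if __name__ == "__main__":
    convert_tf_2_ms(tf_ckpt_path="./tf_model/model.ckpt",
                    ms_ckpt_path="./ms_model/init.ckpt",
                    new_ckpt_path="./ms_model/converted.ckpt")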
def test_save_checkpoint():
    """ test_save_checkpoint """
    parameter_list = []
    one_param = {}
    param1 = {}
    param2 = {}
    one_param['name'] = "param_test"
    one_param['data'] = Tensor(np.random.randint(0, 255, [1, 3, 224, 224]), dtype=mstype.float32)
    param1['name'] = "param"
    param1['data'] = Tensor(np.random.randint(0, 255, [12, 1024]), dtype=mstype.float32)
    param2['name'] = "new_param"
    param2['data'] = Tensor(np.random.randint(0, 255, [12, 1024, 1]), dtype=mstype.float32)
    parameter_list.append(one_param)
    parameter_list.append(param1)
    parameter_list.append(param2)

    if os.path.exists('./parameters.ckpt'):
        os.chmod('./parameters.ckpt', stat.S_IWRITE)
        os.remove('./parameters.ckpt')

    ckpt_file_name = os.path.join(_cur_dir, './parameters.ckpt')
    save_checkpoint(parameter_list, ckpt_file_name)
def pt_to_ckpt(pt, ckpt, out_ckpt):
    """Convert a PyTorch .pt file to a MindSpore .ckpt file."""
    state_dict_torch = load_model(pt)
    state_dict_ms = load_model_ms(ckpt)
    name_relate = name_map(state_dict_ms)

    new_params_list = []
    for key in state_dict_torch:
        param_dict = {}
        parameter = state_dict_torch[key].numpy()

        # depthwise conv: pytorch [cout, 1, k, k] -> ms [1, cin, k, k], cin == cout
        if state_dict_ms[name_relate[key]].data.shape != parameter.shape:
            parameter = parameter.transpose(1, 0, 2, 3)
            print('ms=', state_dict_ms[name_relate[key]].data.shape,
                  'pytorch=', parameter.shape, 'name=', key)

        param_dict['name'] = name_relate[key]
        param_dict['data'] = Tensor(parameter)
        new_params_list.append(param_dict)

    save_checkpoint(new_params_list, out_ckpt)
    return state_dict_ms
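# Sanity check (illustrative, not from the source): transpose(1, 0, 2, 3)
# only relabels which axis carries the channels, mapping the PyTorch
# depthwise layout [cout, 1, k, k] onto the MindSpore layout [1, cin, k, k].
if __name__ == "__main__":
    import numpy as np
    w_pt = np.zeros((32, 1, 3, 3), dtype=np.float32)  # cout == cin == 32
    w_ms = w_pt.transpose(1, 0, 2, 3)
    assert w_ms.shape == (1, 32, 3, 3)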
def convert(weights_file, output_file):
    """Convert weights to a MindSpore ckpt."""
    params = build_network()
    weights = load_weight(weights_file)
    index = 0
    param_list = []
    # parameters come in groups of five: conv weight plus its BN statistics
    for i in range(0, len(params), 5):
        weight = params[i]
        mean = params[i + 1]
        var = params[i + 2]
        gamma = params[i + 3]
        beta = params[i + 4]
        beta_data = weights[index: index + beta.size].reshape(beta.shape)
        index += beta.size
        gamma_data = weights[index: index + gamma.size].reshape(gamma.shape)
        index += gamma.size
        mean_data = weights[index: index + mean.size].reshape(mean.shape)
        index += mean.size
        var_data = weights[index: index + var.size].reshape(var.shape)
        index += var.size
        weight_data = weights[index: index + weight.size].reshape(weight.shape)
        index += weight.size

        param_list.append({'name': weight.name, 'type': weight.dtype,
                           'shape': weight.shape, 'data': Tensor(weight_data)})
        param_list.append({'name': mean.name, 'type': mean.dtype,
                           'shape': mean.shape, 'data': Tensor(mean_data)})
        param_list.append({'name': var.name, 'type': var.dtype,
                           'shape': var.shape, 'data': Tensor(var_data)})
        param_list.append({'name': gamma.name, 'type': gamma.dtype,
                           'shape': gamma.shape, 'data': Tensor(gamma_data)})
        param_list.append({'name': beta.name, 'type': beta.dtype,
                           'shape': beta.shape, 'data': Tensor(beta_data)})

    save_checkpoint(param_list, output_file)
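# load_weight is defined elsewhere; a minimal sketch of what it might do,
# assuming a darknet-style file: a short int32 header followed by a flat
# blob of float32 values (the header length varies by format version).
def load_weight_sketch(weights_file, header_ints=5):
    """Illustrative only: skip the header and return the float32 weights."""
    with open(weights_file, 'rb') as f:
        np.fromfile(f, dtype=np.int32, count=header_ints)  # skip the header
        return np.fromfile(f, dtype=np.float32)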
def save(self, signum, frame):
    """Save the current checkpoint when an error occurs."""
    print(f"process sig {signum} and frame content {frame}")
    if self.cb_params is None:
        return
    prefix = _check_bpckpt_file_name_if_same_exist(self._directory, self._prefix)
    step_num_in_epoch = int((self.cb_params.cur_step_num - 1) % self.cb_params.batch_num + 1)
    cur_ckpt_file = f"{prefix}-{self.cb_params.cur_epoch_num}_{step_num_in_epoch}_breakpoint.ckpt"
    cur_file = os.path.join(self._directory, cur_ckpt_file)

    if "epoch_num" in self._append_dict:
        self._append_dict["epoch_num"] = self._append_epoch_num + self.cb_params.cur_epoch_num
    if "step_num" in self._append_dict:
        self._append_dict["step_num"] = self._append_step_num + self.cb_params.cur_step_num

    network = self._config.saved_network if self._config.saved_network is not None else self.cb_params.train_network
    save_checkpoint(network, cur_file, self._config.integrated_save,
                    self._config.async_save, self._append_dict,
                    self._config.enc_key, self._config.enc_mode)
    raise RuntimeError("Term exception happened.")
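# The (signum, frame) signature matches Python's signal-handler convention,
# so a breakpoint callback like the one above is presumably registered along
# these lines (bp_cb is a hypothetical instance of the callback class):
#
#     import signal
#     signal.signal(signal.SIGTERM, bp_cb.save)
#     signal.signal(signal.SIGINT, bp_cb.save)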
def step_end(self, run_context):
    """step end and do evaluation"""
    cb_params = run_context.original_args()
    if cb_params.cur_step_num % self.eval_ckpt_step == 0:
        # snapshot the full-precision weights, then convert the network for evaluation
        params_dict = save_params(self.network)
        convert_network(self.network, self.embedding_bits, self.weight_bits, self.clip_value)
        self.network.set_train(False)

        callback = self.metrics()
        columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
        for data in self.dataset:
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, token_type_id, label_ids = input_data
            _, _, logits, _ = self.network(input_ids, token_type_id, input_mask)
            callback.update(logits, label_ids)

        metrics = callback.get_metrics()
        if metrics > self.global_metrics:
            self.global_metrics = metrics
            eval_model_ckpt_file = os.path.join(self.save_ckpt_dir, 'eval_model.ckpt')
            if os.path.exists(eval_model_ckpt_file):
                os.remove(eval_model_ckpt_file)
            save_checkpoint(self.network, eval_model_ckpt_file)
        print('step {}, {} {}, best_{} {}'.format(cb_params.cur_step_num, callback.name,
                                                  metrics, callback.name, self.global_metrics))

        # restore the full-precision weights and resume training
        restore_params(self.network, params_dict)
        self.network.set_train(True)
def adaptive_weight(ckpt_file, ms_model):
    """Adapt the weight shapes."""
    parameter_dict = load_checkpoint(ckpt_file)
    net_parameter = ms_model.parameters_and_names()
    new_ms_params_list = []
    for net_para_name, net_para in net_parameter:
        net_para_shape = net_para.data.shape
        if net_para_name not in parameter_dict:
            continue
        init_weight = parameter_dict[net_para_name].data
        init_para_shape = init_weight.shape
        if net_para_shape != init_para_shape:
            if "conv" in net_para_name:
                new_weight = _adaptive_conv(init_weight, net_para_shape)
            elif "batch_norm" in net_para_name:
                new_weight = _adaptive_bn(init_weight, net_para_shape)
            else:
                continue
            logging.debug("parameter shape not match, para name: {}, init_shape: {}, net_para_shape: {}".format(
                net_para_name, init_para_shape, net_para_shape))
        param_dict = {}
        param_dict['name'] = net_para_name
        param_dict['data'] = init_weight if net_para_shape == init_para_shape else new_weight
        new_ms_params_list.append(param_dict)

    save_path = os.path.dirname(ckpt_file)
    save_file_name = os.path.join(save_path, "adaptive_" + uuid.uuid1().hex[:8] + ".ckpt")
    save_checkpoint(new_ms_params_list, save_file_name)
    return save_file_name
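# _adaptive_conv and _adaptive_bn are not shown; a minimal sketch of the
# kind of reshaping they might perform (illustrative only): grow each axis
# with zeros or shrink it by slicing until the weight matches the target.
def _adaptive_conv_sketch(init_weight, target_shape):
    w = init_weight.asnumpy()
    for axis, target in enumerate(target_shape):
        if w.shape[axis] < target:
            pad = [(0, 0)] * w.ndim
            pad[axis] = (0, target - w.shape[axis])
            w = np.pad(w, pad)  # zero-pad the missing channels
        elif w.shape[axis] > target:
            w = np.take(w, range(target), axis=axis)  # truncate extras
    return Tensor(w)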
def extract_and_convert(input_dir, output_dir):
    """Extract Paddle ERNIE weights and convert them to a MindSpore ckpt."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    config = json.load(
        open(os.path.join(input_dir, 'ernie_config.json'), 'rt', encoding='utf-8'))

    print('=' * 20 + 'save vocab file' + '=' * 20)
    shutil.copyfile(os.path.join(input_dir, 'vocab.txt'),
                    os.path.join(output_dir, 'vocab.txt'))

    print('=' * 20 + 'extract weights' + '=' * 20)
    state_dict = []
    weight_map = build_params_map(attention_num=config['num_hidden_layers'])
    with fluid.dygraph.guard():
        paddle_paddle_params, _ = D.load_dygraph(os.path.join(input_dir, 'params'))
        for weight_name, weight_value in paddle_paddle_params.items():
            if weight_name not in weight_map.keys():
                continue
            # print(weight_name, weight_value.shape)
            # these weights are stored transposed relative to MindSpore
            if 'w_0' in weight_name \
                    or 'post_att_layer_norm_scale' in weight_name \
                    or 'post_ffn_layer_norm_scale' in weight_name \
                    or 'cls_out_w' in weight_name:
                weight_value = weight_value.transpose()
            state_dict.append({'name': weight_map[weight_name], 'data': Tensor(weight_value)})
            print(weight_name, '->', weight_map[weight_name], weight_value.shape)

    save_checkpoint(state_dict, os.path.join(output_dir, "ernie.ckpt"))
def step_end(self, run_context):
    """step end and do evaluation"""
    cb_params = run_context.original_args()
    if cb_params.cur_step_num % 100 == 0:
        callback = Accuracy()
        columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
        self.network.set_train(False)
        for data in self.dataset.create_dict_iterator(num_epochs=1):
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, token_type_id, label_ids = input_data
            logits = self.network(input_ids, token_type_id, input_mask)
            callback.update(logits[3], label_ids)

        acc = callback.acc_num / callback.total_num
        with open("./eval.log", "a+") as f:
            f.write("acc_num {}, total_num {}, accuracy {:.6f}".format(
                callback.acc_num, callback.total_num, acc))
            f.write('\n')

        if acc > self.global_acc:
            self.global_acc = acc
            print("The best acc is {}".format(acc))
            eval_model_ckpt_file = "eval_model.ckpt"
            if os.path.exists(eval_model_ckpt_file):
                os.remove(eval_model_ckpt_file)
            save_checkpoint(self.network, eval_model_ckpt_file)
def run(args):
    ms.context.set_context(
        mode=ms.context.GRAPH_MODE,
        device_target=args.device,
        save_graphs=False,
    )
    net = LeNet5(
        num_class=10,
        num_channel=3,
        use_bn=args.use_bn,
        dbg_log_tensor=args.log_tensor,
    )
    loss = ms.nn.loss.SoftmaxCrossEntropyWithLogits(
        sparse=True,
        reduction='mean',
    )
    opt = build_optimizer(args, net)

    if args.mode == 'init':
        save_checkpoint(
            net,
            ckpt_file_name=os.path.join('seeds', '%d.ckpt' % (time.time())),
        )

    if args.mode == 'train':
        ds_train = create_dataset(
            args=args,
            data_path=os.path.join(args.data_path, 'train'),
            batch_size=args.device_batch_size,
        )
        if args.init_ckpt:
            print('using init checkpoint %s' % (args.init_ckpt))
            load_ckpt(net, args.init_ckpt)
        train(args, net, loss, opt, ds_train)

    if args.mode == 'test':
        if args.use_kungfu:
            rank = kfops.kungfu_current_rank()
            if rank > 0:
                return
        ds_test = create_dataset(
            args=args,
            data_path=os.path.join(args.data_path, 'test'),
            batch_size=args.device_batch_size,
        )
        if args.ckpt_files:
            checkpoints = args.ckpt_files.split(',')
        else:
            checkpoint_dir = get_ckpt_dir(args)
            print('checkpoint_dir: %s' % (checkpoint_dir))
            checkpoints = list(sorted(glob.glob(checkpoint_dir + '/*.ckpt')))
        print('will test %d checkpoints' % (len(checkpoints)))
        # for i, n in enumerate(checkpoints):
        #     print('[%d]=%s' % (i, n))
        test(args, net, loss, opt, ds_test, checkpoints)
def pytorch2mindspore(pth_file):
    """Convert a PyTorch weight file to a MindSpore checkpoint."""
    torch_para_dict = torch.load(pth_file)
    torch_weight_list = []
    torch_paras_name_list = []
    ms_params_list = []
    ms_para_name_list = []

    for index, name in enumerate(torch_para_dict):
        torch_paras_name_list.append(name)
        torch_weight = torch_para_dict[name]

        # if name == "fc.weight":
        #     ms_name = "fc.linear.weight"
        # elif name == "fc.bias":
        #     ms_name = "fc.linear.bias"
        if name.endswith("weight"):
            name = name[:name.rfind("weight")]
            ms_name = "backbone." + name + "conv2d.weight"
        elif name.endswith('bias'):
            name = name[:name.rfind('bias')]
            ms_name = "backbone." + name + 'batch_norm.beta'
        elif name.endswith('.running_mean'):
            # fix batch_norm name: the weight recorded two entries earlier is
            # actually this batch norm's gamma, not a conv2d weight
            old_name_gamma = ms_para_name_list[index - 2]
            new_name_gamma = old_name_gamma[:old_name_gamma.rfind('conv2d.weight')] + "batch_norm.gamma"
            ms_para_name_list[index - 2] = new_name_gamma
            name = name[:name.rfind('.running_mean')]
            ms_name = "backbone." + name + '.batch_norm.moving_mean'
        elif name.endswith('.running_var'):
            name = name[:name.rfind('.running_var')]
            ms_name = "backbone." + name + '.batch_norm.moving_variance'
        elif name.endswith(".num_batches_tracked"):
            ms_name = name

        torch_weight_list.append(torch_weight)
        ms_para_name_list.append(ms_name)

    for index, name in enumerate(ms_para_name_list):
        logging.debug('========================py_name: {}'.format(torch_paras_name_list[index]))
        logging.debug('========================ms_name: {}'.format(name))
        param_dict = {}
        param_dict['name'] = name
        parameter = torch_weight_list[index]
        param_dict['data'] = Tensor(parameter.detach().numpy())
        ms_params_list.append(param_dict)

    save_path = os.path.dirname(pth_file)
    save_file_name = os.path.join(save_path, "torch2ms_" + uuid.uuid1().hex[:8] + ".ckpt")
    save_checkpoint(ms_params_list, save_file_name)
    return save_file_name
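# Usage sketch (the .pth path is hypothetical): the converted checkpoint is
# written next to the input file with a "torch2ms_" prefix and random suffix.
if __name__ == "__main__":
    ms_ckpt_path = pytorch2mindspore("./weights/resnet.pth")
    print("converted checkpoint:", ms_ckpt_path)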
def pytorch2mindspore_extend(pth_file, model):
    """Convert torchvision weights to vega weights for MindSpore."""
    init_para_dict = torch.load(pth_file)
    init_names_list = []
    init_weights_list = []
    for name in init_para_dict:
        init_names_list.append(name)
        init_weights_list.append(init_para_dict[name])

    vega_names_list = []
    vega_weights_list = []
    valid_names_list = []

    for name in model.parameters_dict():
        if not name.endswith("num_batches_tracked"):
            vega_names_list.append(name)

    # pair vega parameters with torch parameters by position and keep only
    # the pairs whose name patterns are compatible
    for index, name in enumerate(vega_names_list):
        init_name = init_names_list[index]
        if name.endswith("weight") and "conv" in name and init_name.endswith("weight") and (
                "conv" in init_name or "downsample" in init_name):
            valid_names_list.append(name)
            vega_weights_list.append(init_weights_list[index])
        elif name.endswith("moving_mean") and ("bn" in name or "batch" in name) \
                and init_name.endswith("running_mean"):
            valid_names_list.append(name)
            vega_weights_list.append(init_weights_list[index])
        elif name.endswith("moving_variance") and ("bn" in name or "batch" in name) \
                and init_name.endswith("running_var"):
            valid_names_list.append(name)
            vega_weights_list.append(init_weights_list[index])
        elif name.endswith("gamma") and ("bn" in name or "batch" in name) and init_name.endswith("weight") and (
                "bn" in init_name or "downsample" in init_name):
            valid_names_list.append(name)
            vega_weights_list.append(init_weights_list[index])
        elif name.endswith("beta") and ("bn" in name or "batch" in name) and init_name.endswith("bias") and (
                "bn" in init_name or "downsample" in init_name):
            valid_names_list.append(name)
            vega_weights_list.append(init_weights_list[index])
        else:
            continue

    ms_params_list = []
    for index, name in enumerate(valid_names_list):
        param_dict = {}
        param_dict['name'] = name
        parameter = vega_weights_list[index]
        param_dict['data'] = Tensor(parameter.detach().numpy())
        ms_params_list.append(param_dict)

    save_path = os.path.dirname(pth_file)
    save_file_name = os.path.join(save_path, "torch2ms_" + uuid.uuid1().hex[:8] + ".ckpt")
    save_checkpoint(ms_params_list, save_file_name)
    return save_file_name
def test_save_checkpoint_for_network():
    """ test save_checkpoint for network """
    net = Net()
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    opt = Momentum(net.trainable_params(), 0.0, 0.9, 0.0001, 1024)

    loss_net = WithLossCell(net, loss)
    train_network = TrainOneStepCell(loss_net, opt)
    save_checkpoint(train_network, ckpt_file_name="./new_ckpt.ckpt")

    load_checkpoint("new_ckpt.ckpt")
def step_end(self, run_context):
    """Evaluate at step end and keep the best checkpoint."""
    cb_params = run_context.original_args()
    epoch_num = cb_params.cur_epoch_num
    result = self.model.eval(self.eval_dataset)
    print("epoch", epoch_num, " top_1_accuracy:", result['top_1_accuracy'])

    if result['top_1_accuracy'] > self.acc:
        self.acc = result['top_1_accuracy']
        file_name = os.path.join(self.ckpt_path, "max.ckpt")
        save_checkpoint(save_obj=cb_params.train_network, ckpt_file_name=file_name)
        print("Save the maximum accuracy checkpoint, the accuracy is", self.acc)
def epoch_end(self, net):
    """
    Print logs and save checkpoints at epoch end.

    Args:
        net (layers.Layer): TrainOneStepG instance.
    """
    epoch_cost = (time.time() - self.epoch_start_time) * 1000
    per_step_time = epoch_cost / self.dataset_size
    mean_loss_G = sum(self.G_loss) / self.dataset_size
    mean_loss_D = sum(self.D_loss) / self.dataset_size
    self.info(
        "Epoch [{}] total cost: {:.2f} ms, per step: {:.2f} ms, G_loss: {:.2f}, D_loss: {:.2f}"
        .format(self.epoch, epoch_cost, per_step_time, mean_loss_G, mean_loss_D))

    if self.epoch % self.save_checkpoint_epochs == 0 and self.rank == 0:
        save_checkpoint(net.G.generator.G_A,
                        os.path.join(self.ckpts_dir, f"G_A_{self.epoch}.ckpt"))
        save_checkpoint(net.G.generator.G_B,
                        os.path.join(self.ckpts_dir, f"G_B_{self.epoch}.ckpt"))
        save_checkpoint(net.G.D_A,
                        os.path.join(self.ckpts_dir, f"D_A_{self.epoch}.ckpt"))
        save_checkpoint(net.G.D_B,
                        os.path.join(self.ckpts_dir, f"D_B_{self.epoch}.ckpt"))
def step_end(self, run_context):
    """step end and save ckpt"""
    cb_params = run_context.original_args()
    if cb_params.cur_step_num % self.save_ckpt_step == 0:
        saved_ckpt_num = cb_params.cur_step_num / self.save_ckpt_step
        if saved_ckpt_num > self.max_ckpt_num:
            # drop the oldest checkpoint to keep at most max_ckpt_num files
            oldest_ckpt_index = saved_ckpt_num - self.max_ckpt_num
            path = os.path.join(self.output_dir,
                                "tiny_bert_{}_{}.ckpt".format(int(oldest_ckpt_index), self.save_ckpt_step))
            if os.path.exists(path):
                os.remove(path)
        save_checkpoint(self.network,
                        os.path.join(self.output_dir,
                                     "tiny_bert_{}_{}.ckpt".format(int(saved_ckpt_num), self.save_ckpt_step)))
def epoch_end(self, run_context):
    """evaluate the model and ema-model at the end of each epoch"""
    cb_params = run_context.original_args()
    cur_epoch = cb_params.cur_epoch_num + self._start_epoch - 1
    save_ckpt = (cur_epoch % self.save_epoch == 0)

    acc = self.model.eval(self.eval_dataset, dataset_sink_mode=self.dataset_sink_mode)
    print("Model Accuracy:", acc)

    # evaluate with the shadow (EMA) weights loaded into the ema network
    load_nparray_into_net(self.ema_network, self.shadow)
    self.ema_network.set_train(False)
    model_ema = Model(self.ema_network, loss_fn=self.loss_fn, metrics=self.eval_metrics)
    ema_acc = model_ema.eval(self.eval_dataset, dataset_sink_mode=self.dataset_sink_mode)
    print("EMA-Model Accuracy:", ema_acc)
    self.ema_accuracy[cur_epoch] = ema_acc["Top1-Acc"]

    output = [{"name": k, "data": Tensor(v)} for k, v in self.shadow.items()]
    if self.best_ema_accuracy < ema_acc["Top1-Acc"]:
        self.best_ema_accuracy = ema_acc["Top1-Acc"]
        self.best_ema_epoch = cur_epoch
        save_checkpoint(output, "ema_best.ckpt")
    if self.best_accuracy < acc["Top1-Acc"]:
        self.best_accuracy = acc["Top1-Acc"]
        self.best_epoch = cur_epoch

    print("Best Model Accuracy: %s, at epoch %s" % (self.best_accuracy, self.best_epoch))
    print("Best EMA-Model Accuracy: %s, at epoch %s" % (self.best_ema_accuracy, self.best_ema_epoch))

    if save_ckpt:
        # save the ema-model checkpoints
        ckpt = "{}-{}.ckpt".format("ema", cur_epoch)
        save_checkpoint(output, ckpt)
        save_checkpoint(output, "ema_last.ckpt")
        # save the model checkpoints
        save_checkpoint(cb_params.train_network, "last.ckpt")

    print("Top 10 EMA-Model Accuracies:")
    count = 0
    for epoch in sorted(self.ema_accuracy, key=self.ema_accuracy.get, reverse=True):
        if count == 10:
            break
        print("epoch: %s, Top-1: %s" % (epoch, self.ema_accuracy[epoch]))
        count += 1
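# self.shadow holds exponential moving averages of the network weights; its
# update is not shown above. The standard per-step rule is sketched here
# (the decay value is a typical choice, not taken from the source):
def update_shadow_sketch(shadow, network, decay=0.999):
    for param in network.get_parameters():
        shadow[param.name] = (decay * shadow[param.name]
                              + (1 - decay) * param.data.asnumpy())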
def run(args):
    ms.context.set_context(
        mode=ms.context.GRAPH_MODE,
        device_target=args.device,
        save_graphs=False,
    )
    net = LeNet5(
        num_class=10,
        num_channel=3,
        use_bn=args.use_bn,
    )
    loss = ms.nn.loss.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    opt = build_optimizer(args, net)

    if args.mode == 'init':
        save_checkpoint(
            net,
            ckpt_file_name=os.path.join('seeds', '%d.ckpt' % (time.time())),
        )

    if args.mode == 'train':
        ds_train = create_dataset(
            data_path=os.path.join(args.data_path, 'train'),
            batch_size=args.device_batch_size,
        )
        if args.init_ckpt:
            print('using init checkpoint %s' % (args.init_ckpt))
            load_ckpt(net, args.init_ckpt)
        train(args, net, loss, opt, ds_train)

    if args.mode == 'test':
        ds_test = create_dataset(
            data_path=os.path.join(args.data_path, 'test'),
            batch_size=args.device_batch_size,
        )
        if args.ckpt_files:
            checkpoints = args.ckpt_files.split(',')
        else:
            steps = [10, 20, 30, 40]
            checkpoints = [get_ckpt_file_name(args, i) for i in steps]
        print('will test %d checkpoints' % (len(checkpoints)))
        # for i, n in enumerate(checkpoints):
        #     print('[%d]=%s' % (i, n))
        test(args, net, loss, opt, ds_test, checkpoints)
def trans_model_para():
    """Load parameters from .npy files and save them as a MindSpore checkpoint."""
    # find all files with the '.npy' suffix in the current directory
    file_names = [name for name in os.listdir() if name.endswith(".npy")]

    new_params_list = []
    for file_name in file_names:
        var_name = file_name[:-4]  # strip the '.npy' suffix
        param_dict = {"name": var_name, "data": Tensor(np.load(file_name))}
        if var_name in trans_dict.values():
            new_params_list.append(param_dict)
            print(var_name + " has been saved")

    save_checkpoint(new_params_list, "ms_model_medium.ckpt")
    print("Finished: the parameters have been saved into a MindSpore checkpoint.")
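# trans_dict is defined elsewhere; a purely hypothetical fragment of its
# shape, mapping source parameter names to the MindSpore names that the
# .npy files are stored under (only stems among the values get saved):
#
#     trans_dict = {
#         "embedding.weight": "backbone.embedding.embedding_table",
#         "layer_norm.weight": "backbone.layernorm.gamma",
#     }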
def _save_ckpt(self, cb_params, force_to_save=False):
    """Save checkpoint files."""
    if cb_params.cur_step_num == self._last_triggered_step:
        return

    # if the parameters are cache-enabled, flush data from cache to host before saving
    if self._need_flush_from_cache:
        self._flush_from_cache(cb_params)

    save_ckpt = self._check_save_ckpt(cb_params, force_to_save)
    step_num_in_epoch = int((cb_params.cur_step_num - 1) % cb_params.batch_num + 1)

    if save_ckpt:
        cur_ckpoint_file = self._prefix + "-" + str(cb_params.cur_epoch_num) + "_" \
            + str(step_num_in_epoch) + ".ckpt"
        # update the checkpoint file list.
        self._manager.update_ckpoint_filelist(self._directory, self._prefix)
        # keep the number of checkpoint files no larger than the maximum.
        if self._config.keep_checkpoint_max and 0 < self._config.keep_checkpoint_max <= self._manager.ckpoint_num:
            self._manager.remove_oldest_ckpoint_file()
        elif self._config.keep_checkpoint_per_n_minutes and self._config.keep_checkpoint_per_n_minutes > 0:
            self._cur_time_for_keep = time.time()
            if (self._cur_time_for_keep - self._last_time_for_keep) \
                    < self._config.keep_checkpoint_per_n_minutes * 60:
                self._manager.keep_one_ckpoint_per_minutes(
                    self._config.keep_checkpoint_per_n_minutes, self._cur_time_for_keep)

        # generate the new checkpoint file and rename it.
        global _save_dir
        _save_dir = self._directory
        cur_file = os.path.join(self._directory, cur_ckpoint_file)
        self._last_time_for_keep = time.time()
        self._last_triggered_step = cb_params.cur_step_num

        if context.get_context("enable_ge"):
            set_cur_net(cb_params.train_network)
            cb_params.train_network.exec_checkpoint_graph()

        network = self._config.saved_network if self._config.saved_network is not None else cb_params.train_network
        save_checkpoint(network, cur_file, self._config.integrated_save,
                        self._config.async_save, self._config.enc_key, self._config.enc_mode)
        self._latest_ckpt_file_name = cur_file
def _save_ckpt(self, cb_params, force_to_save=False):
    """Save checkpoint files."""
    if cb_params.cur_step_num == self._last_triggered_step:
        return

    save_ckpt = self._check_save_ckpt(cb_params, force_to_save)
    step_num_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1

    if save_ckpt:
        cur_ckpoint_file = self._prefix + "-" + str(cb_params.cur_epoch_num) + "_" \
            + str(step_num_in_epoch) + ".ckpt"
        if _is_role_pserver():
            cur_ckpoint_file = "PServer_" + str(_get_ps_mode_rank()) + "_" + cur_ckpoint_file
        # update the checkpoint file list.
        self._manager.update_ckpoint_filelist(self._directory, self._prefix)
        # keep the number of checkpoint files no larger than the maximum.
        if self._config.keep_checkpoint_max and 0 < self._config.keep_checkpoint_max <= self._manager.ckpoint_num:
            self._manager.remove_oldest_ckpoint_file()
        elif self._config.keep_checkpoint_per_n_minutes and self._config.keep_checkpoint_per_n_minutes > 0:
            self._cur_time_for_keep = time.time()
            if (self._cur_time_for_keep - self._last_time_for_keep) \
                    < self._config.keep_checkpoint_per_n_minutes * 60:
                self._manager.keep_one_ckpoint_per_minutes(
                    self._config.keep_checkpoint_per_n_minutes, self._cur_time_for_keep)

        # generate the new checkpoint file and rename it.
        global _save_dir
        _save_dir = self._directory
        cur_file = os.path.join(self._directory, cur_ckpoint_file)
        self._last_time_for_keep = time.time()
        self._last_triggered_step = cb_params.cur_step_num

        if context.get_context("enable_ge"):
            set_cur_net(cb_params.train_network)
            cb_params.train_network.exec_checkpoint_graph()

        save_checkpoint(cb_params.train_network, cur_file,
                        self._config.integrated_save, self._config.async_save)
        self._latest_ckpt_file_name = cur_file
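# Both _save_ckpt variants above are internal to MindSpore's ModelCheckpoint
# callback; user code normally drives them through CheckpointConfig, roughly
# as follows (prefix and directory values are illustrative):
if __name__ == "__main__":
    from mindspore.train.callback import ModelCheckpoint, CheckpointConfig

    config_ck = CheckpointConfig(save_checkpoint_steps=100,
                                 keep_checkpoint_max=5,
                                 integrated_save=True,
                                 async_save=False)
    ckpt_cb = ModelCheckpoint(prefix="lenet", directory="./ckpt", config=config_ck)
    # model.train(epoch_num, ds_train, callbacks=[ckpt_cb])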
def epoch_end(self, net):
    """Print logs and save checkpoints at epoch end."""
    epoch_cost = (time.time() - self.epoch_start_time) * 1000
    per_step_time = epoch_cost / self.dataset_size
    mean_loss = sum(self.contrastive_loss) / self.dataset_size
    self.info("Epoch [{}] total cost: {:.2f} ms, per step: {:.2f} ms, mean_loss: {:.2f}"
              .format(self.epoch, epoch_cost, per_step_time, mean_loss))

    if self.epoch % self.save_checkpoint_epochs == 0:
        if self.linear_eval:
            save_checkpoint(net, os.path.join(self.ckpts_dir, f"linearClassifier_{self.epoch}.ckpt"))
        else:
            save_checkpoint(net, os.path.join(self.ckpts_dir, f"simclr_{self.epoch}.ckpt"))
def train_process(self, epoch, train_dataset, mini_steps=None):
    """
    Training process. The data is passed to the network directly.
    """
    dataset_helper = DatasetHelper(train_dataset, dataset_sink_mode=False, epoch_num=epoch)
    for i in range(epoch):
        step = 0
        for k, next_element in enumerate(dataset_helper):
            loss = self._train_forward_backward(*next_element)
            # apply the accumulated gradients once every mini_steps batches
            if (k + 1) % mini_steps == 0:
                step += 1
                print("epoch:", i + 1, "step:", step, "loss is ", loss)
                self._train_optim()
                self._train_clear()
        train_dataset.reset()

    save_checkpoint(self._train_forward_backward, "gradient_accumulation.ckpt")
def pt_to_ckpt(pt, ckpt, out_ckpt):
    """Convert a PyTorch .pt file to a MindSpore .ckpt file."""
    state_dict_torch = load_model(pt)
    state_dict_ms = load_model_ms(ckpt)
    name_relate = name_map(state_dict_ms)

    new_params_list = []
    for key in state_dict_torch:
        param_dict = {}
        parameter = state_dict_torch[key].numpy()
        param_dict['name'] = name_relate[key]
        param_dict['data'] = Tensor(parameter)
        new_params_list.append(param_dict)

    save_checkpoint(new_params_list, out_ckpt)
    return state_dict_ms
def train(self):
    """Trainer"""
    losses = 0
    for batch_idx, imgs in enumerate(self.trainloader):
        lr = imgs["LR"]
        hr = imgs["HR"]
        lr = Tensor(sub_mean(lr), mstype.float32)
        hr = Tensor(sub_mean(hr), mstype.float32)
        idx = Tensor(np.ones(imgs["idx"][0]), mstype.int32)
        t1 = time.time()
        loss = self.bp(lr, hr, idx)
        t2 = time.time()
        losses += loss.asnumpy()
        print('Task: %g, Step: %g, loss: %f, time: %f s' %
              (idx.shape[0], batch_idx, loss.asnumpy(), t2 - t1), flush=True)

    os.makedirs(self.args.save, exist_ok=True)
    if self.args.rank == 0:
        # os.path.join avoids depending on a trailing slash in args.save
        save_checkpoint(self.bp, os.path.join(self.args.save, "model_" + str(self.epoch) + '.ckpt'))
def test_save_and_load_checkpoint_for_network_with_encryption():
    """ test save and load checkpoint for network with encryption """
    net = Net()
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    opt = Momentum(net.trainable_params(), 0.0, 0.9, 0.0001, 1024)
    loss_net = WithLossCell(net, loss)
    train_network = TrainOneStepCell(loss_net, opt)

    key = os.urandom(16)
    mode = "AES-GCM"
    ckpt_path = "./encrypt_ckpt.ckpt"
    if platform.system().lower() == "windows":
        # checkpoint encryption is not implemented on Windows
        with pytest.raises(NotImplementedError):
            save_checkpoint(train_network, ckpt_file_name=ckpt_path, enc_key=key, enc_mode=mode)
            param_dict = load_checkpoint(ckpt_path, dec_key=key, dec_mode="AES-GCM")
            load_param_into_net(net, param_dict)
    else:
        save_checkpoint(train_network, ckpt_file_name=ckpt_path, enc_key=key, enc_mode=mode)
        param_dict = load_checkpoint(ckpt_path, dec_key=key, dec_mode="AES-GCM")
        load_param_into_net(net, param_dict)

    if os.path.exists(ckpt_path):
        os.remove(ckpt_path)
def step_end(self, run_context):
    """step end and save ckpt"""
    cb_params = run_context.original_args()
    if cb_params.cur_step_num % self.save_ckpt_step == 0:
        saved_ckpt_num = cb_params.cur_step_num / self.save_ckpt_step
        if saved_ckpt_num > self.max_ckpt_num:
            # drop the oldest checkpoint to keep at most max_ckpt_num files
            oldest_ckpt_index = saved_ckpt_num - self.max_ckpt_num
            path = os.path.join(self.output_dir,
                                "ternary_bert_{}_{}.ckpt".format(int(oldest_ckpt_index), self.save_ckpt_step))
            if os.path.exists(path):
                os.remove(path)

        # save the converted (quantized) network, then restore full-precision weights
        params_dict = save_params(self.network)
        convert_network(self.network, self.embedding_bits, self.weight_bits, self.clip_value)
        save_checkpoint(self.network,
                        os.path.join(self.output_dir,
                                     "ternary_bert_{}_{}.ckpt".format(int(saved_ckpt_num), self.save_ckpt_step)))
        restore_params(self.network, params_dict)
# Fragment of a fine-tune script; loss, lr, step_size, epoch_size, net and
# start are defined earlier in the full script.
opt = Momentum(filter(lambda x: x.requires_grad, head_net.get_parameters()),
               lr, config.momentum, config.weight_decay)
network = WithLossCell(head_net, loss)
network = TrainOneStepCell(network, opt)
network.set_train()

features_path = args_opt.dataset_path + '_features'
idx_list = list(range(step_size))
rank = 0
if config.run_distribute:
    rank = get_rank()
save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(rank) + '/')
if not os.path.isdir(save_ckpt_path):
    os.mkdir(save_ckpt_path)

for epoch in range(epoch_size):
    random.shuffle(idx_list)
    epoch_start = time.time()
    losses = []
    for j in idx_list:
        feature = Tensor(np.load(os.path.join(features_path, f"feature_{j}.npy")))
        label = Tensor(np.load(os.path.join(features_path, f"label_{j}.npy")))
        losses.append(network(feature, label).asnumpy())
    epoch_mseconds = (time.time() - epoch_start) * 1000
    per_step_mseconds = epoch_mseconds / step_size
    print("epoch[{}/{}], iter[{}] cost: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}"
          .format(epoch + 1, epoch_size, step_size, epoch_mseconds,
                  per_step_mseconds, np.mean(np.array(losses))))
    if (epoch + 1) % config.save_checkpoint_epochs == 0:
        save_checkpoint(net, os.path.join(save_ckpt_path, f"mobilenetv2_{epoch+1}.ckpt"))

print("total cost {:5.4f} s".format(time.time() - start))
def load_weights(model_path, use_fp16_weight):
    """
    Returns:
        parameter list (list): pretrain model weight list.
    """
    ms_ckpt = load_checkpoint(model_path)
    weights = {}
    for msname in ms_ckpt:
        if msname.startswith("layer") or msname.startswith("conv1") or msname.startswith("bn"):
            param_name = "backbone." + msname
        else:
            param_name = msname
        if "down_sample_layer.0" in param_name:
            param_name = param_name.replace("down_sample_layer.0", "conv_down_sample")
        if "down_sample_layer.1" in param_name:
            param_name = param_name.replace("down_sample_layer.1", "bn_down_sample")
        weights[param_name] = ms_ckpt[msname].data.asnumpy()

    if use_fp16_weight:
        dtype = mstype.float16
    else:
        dtype = mstype.float32

    parameter_dict = {}
    for name in weights:
        parameter_dict[name] = Parameter(Tensor(weights[name], dtype), name=name)

    param_list = []
    for key, value in parameter_dict.items():
        param_list.append({"name": key, "data": value})
    return param_list


if __name__ == "__main__":
    parameter_list = load_weights(args_opt.ckpt_file, use_fp16_weight=False)
    save_checkpoint(parameter_list, "resnet50_backbone.ckpt")
def train():
    """Train model."""
    parser = argparse.ArgumentParser(description='GCN')
    parser.add_argument('--data_dir', type=str, default='./data/cora/cora_mr', help='Dataset directory')
    parser.add_argument('--seed', type=int, default=0, help='Random seed')
    parser.add_argument('--train_nodes_num', type=int, default=140, help='Nodes numbers for training')
    parser.add_argument('--eval_nodes_num', type=int, default=500, help='Nodes numbers for evaluation')
    parser.add_argument('--test_nodes_num', type=int, default=1000, help='Nodes numbers for test')
    parser.add_argument('--save_TSNE', type=ast.literal_eval, default=False, help='Whether to save t-SNE graph')
    args_opt = parser.parse_args()

    if not os.path.exists("ckpts"):
        os.mkdir("ckpts")

    set_seed(args_opt.seed)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    config = ConfigGCN()
    adj, feature, label_onehot, label = get_adj_features_labels(args_opt.data_dir)

    nodes_num = label_onehot.shape[0]
    train_mask = get_mask(nodes_num, 0, args_opt.train_nodes_num)
    eval_mask = get_mask(nodes_num, args_opt.train_nodes_num,
                         args_opt.train_nodes_num + args_opt.eval_nodes_num)
    test_mask = get_mask(nodes_num, nodes_num - args_opt.test_nodes_num, nodes_num)

    class_num = label_onehot.shape[1]
    gcn_net = GCN(config, adj, feature, class_num)
    gcn_net.add_flags_recursive(fp16=True)

    eval_net = LossAccuracyWrapper(gcn_net, label_onehot, eval_mask, config.weight_decay)
    train_net = TrainNetWrapper(gcn_net, label_onehot, train_mask, config)

    loss_list = []

    if args_opt.save_TSNE:
        out_feature = gcn_net()
        tsne_result = t_SNE(out_feature.asnumpy(), 2)
        graph_data = []
        graph_data.append(tsne_result)
        fig = plt.figure()
        scat = plt.scatter(tsne_result[:, 0], tsne_result[:, 1], s=2, c=label, cmap='rainbow')
        plt.title('t-SNE visualization of Epoch:0', fontsize='large', fontweight='bold', verticalalignment='center')

    for epoch in range(config.epochs):
        t = time.time()

        train_net.set_train()
        train_result = train_net()
        train_loss = train_result[0].asnumpy()
        train_accuracy = train_result[1].asnumpy()

        eval_net.set_train(False)
        eval_result = eval_net()
        eval_loss = eval_result[0].asnumpy()
        eval_accuracy = eval_result[1].asnumpy()

        loss_list.append(eval_loss)
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(train_loss),
              "train_acc=", "{:.5f}".format(train_accuracy), "val_loss=", "{:.5f}".format(eval_loss),
              "val_acc=", "{:.5f}".format(eval_accuracy), "time=", "{:.5f}".format(time.time() - t))

        if args_opt.save_TSNE:
            out_feature = gcn_net()
            tsne_result = t_SNE(out_feature.asnumpy(), 2)
            graph_data.append(tsne_result)

        # early stopping: stop once the validation loss exceeds its recent mean
        if epoch > config.early_stopping and loss_list[-1] > np.mean(loss_list[-(config.early_stopping + 1):-1]):
            print("Early stopping...")
            break

    save_checkpoint(gcn_net, "ckpts/gcn.ckpt")

    gcn_net_test = GCN(config, adj, feature, class_num)
    load_checkpoint("ckpts/gcn.ckpt", net=gcn_net_test)
    gcn_net_test.add_flags_recursive(fp16=True)

    test_net = LossAccuracyWrapper(gcn_net_test, label_onehot, test_mask, config.weight_decay)
    t_test = time.time()
    test_net.set_train(False)
    test_result = test_net()
    test_loss = test_result[0].asnumpy()
    test_accuracy = test_result[1].asnumpy()
    print("Test set results:", "loss=", "{:.5f}".format(test_loss),
          "accuracy=", "{:.5f}".format(test_accuracy), "time=", "{:.5f}".format(time.time() - t_test))

    if args_opt.save_TSNE:
        ani = animation.FuncAnimation(fig, update_graph, frames=range(config.epochs + 1),
                                      fargs=(graph_data, scat, plt))
        ani.save('t-SNE_visualization.gif', writer='imagemagick')