Example #1
0
def convert_tf_2_ms(tf_ckpt_path, ms_ckpt_path, new_ckpt_path):
    """
    Convert a TensorFlow checkpoint to a MindSpore checkpoint.

    The module-level ``ms2tf_param_dict`` is inverted; for each entry the
    variable is read from the TensorFlow checkpoint and stored under the
    paired name. 2-D weights are transposed when their shape differs from the
    MindSpore parameter's shape, or when the matrix is square.

    Args:
        tf_ckpt_path: Path of the TensorFlow checkpoint to read variables from.
        ms_ckpt_path: Path of an existing MindSpore checkpoint; it is only
            used to look up the shape of each target parameter.
        new_ckpt_path: Path the converted MindSpore checkpoint is written to.

    Returns:
        True once the new checkpoint has been saved.
    """
    # NOTE(review): the keys of the inverted mapping are used as MindSpore
    # names and its values as TensorFlow names, i.e. ``ms2tf_param_dict`` is
    # expected to map TF name -> MS name despite its name -- confirm against
    # its definition.
    tf2ms_param_dict = dict(zip(ms2tf_param_dict.values(), ms2tf_param_dict.keys()))

    # load MS checkpoint
    ms_param_dict = load_checkpoint(ms_ckpt_path)

    new_params_list = []
    session = tf.compat.v1.Session()
    try:
        for ms_name, tf_name in tf2ms_param_dict.items():
            data = tf.train.load_variable(tf_ckpt_path, tf_name)
            ms_shape = ms_param_dict[ms_name].data.shape
            tf_shape = data.shape

            # Square matrices cannot be told apart by shape, so they are
            # always transposed.
            if len(ms_shape) == 2 and (ms_shape != tf_shape or ms_shape[0] == ms_shape[1]):
                data = tf.transpose(data, (1, 0))
                data = data.eval(session=session)

            new_params_list.append({'name': ms_name, 'data': Tensor(data)})
    finally:
        # Release the session even when a variable is missing or conversion fails.
        session.close()

    print("start saving checkpoint ...")
    save_checkpoint(new_params_list, new_ckpt_path)
    print("ms checkpoint was save in :", new_ckpt_path)

    return True
Example #2
0
def test_save_checkpoint():
    """Save three random float32 parameters of different ranks to a checkpoint.

    Any checkpoint left over from a previous run is made writable and removed
    first. The cleanup targets the same path that is written afterwards
    (under ``_cur_dir``), not a path relative to the current working directory.
    """
    parameter_list = [
        {
            'name': "param_test",
            'data': Tensor(np.random.randint(0, 255, [1, 3, 224, 224]),
                           dtype=mstype.float32),
        },
        {
            'name': "param",
            'data': Tensor(np.random.randint(0, 255, [12, 1024]),
                           dtype=mstype.float32),
        },
        {
            'name': "new_param",
            'data': Tensor(np.random.randint(0, 255, [12, 1024, 1]),
                           dtype=mstype.float32),
        },
    ]

    ckpt_file_name = os.path.join(_cur_dir, './parameters.ckpt')
    if os.path.exists(ckpt_file_name):
        # The file may have been left read-only; make it writable before deleting.
        os.chmod(ckpt_file_name, stat.S_IWRITE)
        os.remove(ckpt_file_name)

    save_checkpoint(parameter_list, ckpt_file_name)
def pt_to_ckpt(pt, ckpt, out_ckpt):
    """
    Convert a PyTorch weight file into a MindSpore checkpoint.

    Args:
        pt: Source handed to ``load_model`` to obtain the PyTorch state dict.
        ckpt: Source handed to ``load_model_ms`` to obtain the MindSpore
            reference parameters (used for names and shapes).
        out_ckpt: File name of the checkpoint that is written.

    Returns:
        The MindSpore state dict loaded from ``ckpt``.
    """
    torch_state = load_model(pt)
    state_dict_ms = load_model_ms(ckpt)
    name_relate = name_map(state_dict_ms)
    converted = []

    for torch_name in torch_state:
        array = torch_state[torch_name].numpy()
        ms_name = name_relate[torch_name]

        # depthwise conv: pytorch [cout, 1, k, k] -> ms [1, cin, k, k], cin = cout
        if state_dict_ms[ms_name].data.shape != array.shape:
            array = array.transpose(1, 0, 2, 3)
            print('ms=', state_dict_ms[ms_name].data.shape, 'pytorch=', array.shape, 'name=', torch_name)

        converted.append({'name': ms_name, 'data': Tensor(array)})

    save_checkpoint(converted, out_ckpt)
    return state_dict_ms
Example #4
0
def convert(weights_file, output_file):
    """Convert a flat weight array into a MindSpore checkpoint.

    The parameters returned by ``build_network`` are consumed in groups of
    five (weight, mean, var, gamma, beta). Inside the flat array each group is
    laid out as beta, gamma, mean, var, weight, so slices are taken in that
    order while checkpoint entries are emitted in the network order.

    Args:
        weights_file: Source handed to ``load_weight``.
        output_file: File name of the checkpoint that is written.
    """
    params = build_network()
    weights = load_weight(weights_file)
    offset = 0
    param_list = []

    for start in range(0, len(params), 5):
        weight, mean, var, gamma, beta = (params[start + k] for k in range(5))

        # Consume the flat array in its on-disk order.
        chunks = []
        for param in (beta, gamma, mean, var, weight):
            chunks.append(weights[offset: offset + param.size].reshape(param.shape))
            offset += param.size
        beta_data, gamma_data, mean_data, var_data, weight_data = chunks

        # Emit entries in the network's parameter order.
        ordered = (
            (weight, weight_data),
            (mean, mean_data),
            (var, var_data),
            (gamma, gamma_data),
            (beta, beta_data),
        )
        for param, values in ordered:
            param_list.append({'name': param.name, 'type': param.dtype, 'shape': param.shape,
                               'data': Tensor(values)})

    save_checkpoint(param_list, output_file)
    def save(self, signum, frame):
        """
        Write a breakpoint checkpoint for the current step, then abort.

        Has the ``(signum, frame)`` signature of a signal handler. Nothing is
        saved when no callback parameters have been recorded yet.

        Args:
            signum: Signal number, only printed.
            frame: Frame object, only printed.

        Raises:
            RuntimeError: Always, after the checkpoint has been written.
        """
        print(f"process sig {signum} and frame content {frame}")
        params = self.cb_params
        if params is None:
            return

        prefix = _check_bpckpt_file_name_if_same_exist(self._directory, self._prefix)
        step_num_in_epoch = int((params.cur_step_num - 1) % params.batch_num + 1)
        cur_file = os.path.join(
            self._directory,
            f"{prefix}-{params.cur_epoch_num}_{step_num_in_epoch}_breakpoint.ckpt")

        # Refresh the counters stored alongside the weights, if they are tracked.
        append_info = self._append_dict
        if "epoch_num" in append_info:
            append_info["epoch_num"] = self._append_epoch_num + params.cur_epoch_num
        if "step_num" in append_info:
            append_info["step_num"] = self._append_step_num + params.cur_step_num

        network = self._config.saved_network
        if network is None:
            network = params.train_network

        save_checkpoint(network, cur_file, self._config.integrated_save,
                        self._config.async_save, append_info,
                        self._config.enc_key, self._config.enc_mode)
        raise RuntimeError("Term exception happened.")
Example #6
0
    def step_end(self, run_context):
        """Evaluate every ``eval_ckpt_step`` steps and keep the best checkpoint.

        The current parameters are backed up with ``save_params``, the network
        is converted with ``convert_network`` and evaluated on ``self.dataset``.
        When the metric beats ``self.global_metrics`` the network is saved as
        ``eval_model.ckpt``. Afterwards the backed-up parameters are restored
        and the network is put back into training mode.
        """
        cb_params = run_context.original_args()
        if cb_params.cur_step_num % self.eval_ckpt_step != 0:
            return

        params_dict = save_params(self.network)
        convert_network(self.network, self.embedding_bits,
                        self.weight_bits, self.clip_value)
        self.network.set_train(False)

        callback = self.metrics()
        columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
        for data in self.dataset:
            input_ids, input_mask, token_type_id, label_ids = [
                data[column] for column in columns_list
            ]
            _, _, logits, _ = self.network(input_ids, token_type_id, input_mask)
            callback.update(logits, label_ids)
        metrics = callback.get_metrics()

        if metrics > self.global_metrics:
            self.global_metrics = metrics
            best_ckpt = os.path.join(self.save_ckpt_dir, 'eval_model.ckpt')
            if os.path.exists(best_ckpt):
                os.remove(best_ckpt)
            save_checkpoint(self.network, best_ckpt)

        print('step {}, {} {}, best_{} {}'.format(cb_params.cur_step_num,
                                                  callback.name, metrics,
                                                  callback.name,
                                                  self.global_metrics))
        # Undo the conversion so training continues with the original weights.
        restore_params(self.network, params_dict)
        self.network.set_train(True)
Example #7
0
def adaptive_weight(ckpt_file, ms_model):
    """Adapt checkpoint weights to the parameter shapes of ``ms_model``.

    Parameters found in both the checkpoint and the model are kept. When the
    shapes differ, "conv" parameters go through ``_adaptive_conv`` and
    "batch_norm" parameters through ``_adaptive_bn``; any other mismatching
    parameter is dropped. The result is saved next to ``ckpt_file``.

    Args:
        ckpt_file: Path of the checkpoint to adapt.
        ms_model: Network providing ``parameters_and_names()``.

    Returns:
        Path of the newly written ``adaptive_<hex>.ckpt`` file.
    """
    parameter_dict = load_checkpoint(ckpt_file)
    adapted_params = []

    for net_para_name, net_para in ms_model.parameters_and_names():
        net_para_shape = net_para.data.shape
        if net_para_name not in parameter_dict:
            continue

        init_weight = parameter_dict[net_para_name].data
        init_para_shape = init_weight.shape
        weight = init_weight

        if net_para_shape != init_para_shape:
            if "conv" in net_para_name:
                weight = _adaptive_conv(init_weight, net_para_shape)
            elif "batch_norm" in net_para_name:
                weight = _adaptive_bn(init_weight, net_para_shape)
            else:
                # No adaptation rule for this parameter kind: leave it out.
                continue
            logging.debug("parameter shape not match,para name: {}, init_shape:{}, net_para_shape:{}".
                          format(net_para_name, init_para_shape, net_para_shape))

        adapted_params.append({'name': net_para_name, 'data': weight})

    save_file_name = os.path.join(os.path.dirname(ckpt_file),
                                  "adaptive_" + uuid.uuid1().hex[:8] + ".ckpt")
    save_checkpoint(adapted_params, save_file_name)
    return save_file_name
Example #8
0
def extract_and_convert(input_dir, output_dir):
    """Convert the Paddle ERNIE weights in ``input_dir`` to a MindSpore checkpoint.

    Reads ``ernie_config.json``, copies ``vocab.txt`` to ``output_dir`` and
    writes every parameter listed in the map from ``build_params_map`` to
    ``ernie.ckpt`` under its mapped name. Weights whose name contains ``w_0``,
    ``post_att_layer_norm_scale``, ``post_ffn_layer_norm_scale`` or
    ``cls_out_w`` are transposed first.

    Args:
        input_dir: Directory holding ``ernie_config.json``, ``vocab.txt`` and
            the ``params`` dygraph file.
        output_dir: Directory receiving ``vocab.txt`` and ``ernie.ckpt``; it is
            created when missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Close the config file deterministically instead of leaking the handle.
    config_path = os.path.join(input_dir, 'ernie_config.json')
    with open(config_path, 'rt', encoding='utf-8') as config_file:
        config = json.load(config_file)

    print('=' * 20 + 'save vocab file' + '=' * 20)
    shutil.copyfile(os.path.join(input_dir, 'vocab.txt'),
                    os.path.join(output_dir, 'vocab.txt'))

    print('=' * 20 + 'extract weights' + '=' * 20)
    state_dict = []
    weight_map = build_params_map(attention_num=config['num_hidden_layers'])
    with fluid.dygraph.guard():
        paddle_paddle_params, _ = D.load_dygraph(
            os.path.join(input_dir, 'params'))

    transposed_markers = ('w_0', 'post_att_layer_norm_scale',
                          'post_ffn_layer_norm_scale', 'cls_out_w')
    for weight_name, weight_value in paddle_paddle_params.items():
        if weight_name not in weight_map:
            continue
        if any(marker in weight_name for marker in transposed_markers):
            weight_value = weight_value.transpose()
        state_dict.append({
            'name': weight_map[weight_name],
            'data': Tensor(weight_value)
        })
        print(weight_name, '->', weight_map[weight_name], weight_value.shape)
    save_checkpoint(state_dict, os.path.join(output_dir, "ernie.ckpt"))
Example #9
0
    def step_end(self, run_context):
        """Evaluate every 100 steps, log the accuracy and keep the best model.

        The accuracy is appended to ``./eval.log``. When it exceeds
        ``self.global_acc`` the network is saved as ``eval_model.ckpt`` in the
        current working directory, replacing any previous file.
        """
        cb_params = run_context.original_args()
        if cb_params.cur_step_num % 100 != 0:
            return

        callback = Accuracy()
        columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
        for data in self.dataset.create_dict_iterator(num_epochs=1):
            input_ids, input_mask, token_type_id, label_ids = [
                data[column] for column in columns_list
            ]
            # NOTE(review): training mode is not switched back on in this
            # method -- confirm that self.network is an evaluation-only cell.
            self.network.set_train(False)
            logits = self.network(input_ids, token_type_id, input_mask)
            callback.update(logits[3], label_ids)

        acc = callback.acc_num / callback.total_num
        with open("./eval.log", "a+") as f:
            f.write("acc_num {}, total_num{}, accuracy{:.6f}".format(callback.acc_num, callback.total_num,
                                                                     acc))
            f.write('\n')

        if not acc > self.global_acc:
            return

        self.global_acc = acc
        print("The best acc is {}".format(acc))
        eval_model_ckpt_file = "eval_model.ckpt"
        if os.path.exists(eval_model_ckpt_file):
            os.remove(eval_model_ckpt_file)
        save_checkpoint(self.network, eval_model_ckpt_file)
Example #10
0
def run(args):
    """Entry point: build LeNet5 and run the action selected by ``args.mode``.

    * ``init``  - save the freshly built network under ``seeds/<timestamp>.ckpt``.
    * ``train`` - optionally load ``args.init_ckpt`` and train on the train split.
    * ``test``  - evaluate a list of checkpoints on the test split; with
      ``args.use_kungfu`` only rank 0 performs the evaluation.
    """
    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device,
                           save_graphs=False)

    net = LeNet5(num_class=10,
                 num_channel=3,
                 use_bn=args.use_bn,
                 dbg_log_tensor=args.log_tensor)
    loss = ms.nn.loss.SoftmaxCrossEntropyWithLogits(sparse=True,
                                                    reduction='mean')
    opt = build_optimizer(args, net)

    if args.mode == 'init':
        seed_file = os.path.join('seeds', '%d.ckpt' % (time.time()))
        save_checkpoint(net, ckpt_file_name=seed_file)

    if args.mode == 'train':
        train_path = os.path.join(args.data_path, 'train')
        ds_train = create_dataset(args=args,
                                  data_path=train_path,
                                  batch_size=args.device_batch_size)
        if args.init_ckpt:
            print('using init checkpoint %s' % (args.init_ckpt))
            load_ckpt(net, args.init_ckpt)
        train(args, net, loss, opt, ds_train)

    if args.mode == 'test':
        # Only the first worker evaluates when running under KungFu.
        if args.use_kungfu:
            rank = kfops.kungfu_current_rank()
            if rank > 0:
                return

        test_path = os.path.join(args.data_path, 'test')
        ds_test = create_dataset(args=args,
                                 data_path=test_path,
                                 batch_size=args.device_batch_size)

        if args.ckpt_files:
            checkpoints = args.ckpt_files.split(',')
        else:
            checkpoint_dir = get_ckpt_dir(args)
            print('checkpoint_dir: %s' % (checkpoint_dir))
            checkpoints = sorted(glob.glob(checkpoint_dir + '/*.ckpt'))
        print('will test %d checkpoints' % (len(checkpoints)))
        test(args, net, loss, opt, ds_test, checkpoints)
Example #11
0
def pytorch2mindspore(pth_file):
    """Convert pytorch weight to mindspore checkpoint.

    Parameter names are rewritten by suffix:

    * ``...weight``        -> ``backbone....conv2d.weight``
    * ``...bias``          -> ``backbone....batch_norm.beta``
    * ``....running_mean`` -> ``backbone.....batch_norm.moving_mean``; the
      entry two positions earlier is renamed from ``conv2d.weight`` to
      ``batch_norm.gamma``
    * ``....running_var``  -> ``backbone.....batch_norm.moving_variance``
    * anything else (including ``.num_batches_tracked``) keeps its name.

    Args:
        pth_file: Path of the PyTorch weight file.

    Returns:
        Path of the written ``torch2ms_<hex>.ckpt`` file, located next to
        ``pth_file``.
    """
    # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
    torch_para_dict = torch.load(pth_file)
    torch_weight_list = []
    torch_paras_name_list = []
    ms_params_list = []
    ms_para_name_list = []

    for index, name in enumerate(torch_para_dict):
        torch_paras_name_list.append(name)
        torch_weight = torch_para_dict[name]

        if name.endswith("weight"):
            name = name[:name.rfind("weight")]
            ms_name = "backbone." + name + "conv2d.weight"
        elif name.endswith('bias'):
            name = name[:name.rfind('bias')]
            ms_name = "backbone." + name + 'batch_norm.beta'
        elif name.endswith('.running_mean'):
            # fix batch_norm name: the "weight" seen two entries ago belongs
            # to this batch norm, so it is gamma rather than a conv weight.
            # NOTE(review): relies on the weight, bias, running_mean key order.
            old_name_gamma = ms_para_name_list[index - 2]
            new_name_gamma = old_name_gamma[:old_name_gamma.rfind(
                'conv2d.weight')] + "batch_norm.gamma"
            ms_para_name_list[index - 2] = new_name_gamma

            name = name[:name.rfind('.running_mean')]
            ms_name = "backbone." + name + '.batch_norm.moving_mean'
        elif name.endswith('.running_var'):
            name = name[:name.rfind('.running_var')]
            ms_name = "backbone." + name + '.batch_norm.moving_variance'
        else:
            # Covers ".num_batches_tracked" and any unrecognised key. Without
            # this branch an unrecognised key silently reused the previous
            # ms_name (or raised UnboundLocalError on the first key).
            ms_name = name

        torch_weight_list.append(torch_weight)
        ms_para_name_list.append(ms_name)

    for torch_name, ms_name, parameter in zip(torch_paras_name_list,
                                              ms_para_name_list,
                                              torch_weight_list):
        logging.debug('========================py_name: {}'.format(torch_name))
        logging.debug('========================ms_name: {}'.format(ms_name))
        # .cpu() is a no-op for CPU tensors and makes device tensors convertible.
        ms_params_list.append({
            'name': ms_name,
            'data': Tensor(parameter.detach().cpu().numpy()),
        })

    save_path = os.path.dirname(pth_file)
    save_file_name = os.path.join(save_path,
                                  "torch2ms_" + uuid.uuid1().hex[:8] + ".ckpt")
    save_checkpoint(ms_params_list, save_file_name)
    return save_file_name
Example #12
0
def pytorch2mindspore_extend(pth_file, model):
    """Convert torchvision weights to a vega MindSpore checkpoint.

    The torch entries and the model's parameters (without
    ``num_batches_tracked``) are paired by position. A pair is kept only when
    both names describe the same kind of tensor (conv weight, BN moving mean,
    BN moving variance, BN gamma or BN beta).

    Args:
        pth_file: Path of the PyTorch weight file.
        model: Network providing ``parameters_dict()``.

    Returns:
        Path of the written ``torch2ms_<hex>.ckpt`` file, located next to
        ``pth_file``.
    """
    init_para_dict = torch.load(pth_file)
    init_names_list = list(init_para_dict)
    init_weights_list = [init_para_dict[key] for key in init_names_list]

    vega_names_list = [
        name for name in model.parameters_dict()
        if not name.endswith("num_batches_tracked")
    ]

    valid_names_list = []
    vega_weights_list = []
    for index, name in enumerate(vega_names_list):
        init_name = init_names_list[index]
        is_bn = "bn" in name or "batch" in name
        init_is_bn = "bn" in init_name or "downsample" in init_name

        matched = (
            # convolution weight
            (name.endswith("weight") and "conv" in name and init_name.endswith("weight")
             and ("conv" in init_name or "downsample" in init_name))
            # batch norm running statistics
            or (name.endswith("moving_mean") and is_bn and init_name.endswith("running_mean"))
            or (name.endswith("moving_variance") and is_bn and init_name.endswith("running_var"))
            # batch norm affine parameters
            or (name.endswith("gamma") and is_bn and init_name.endswith("weight") and init_is_bn)
            or (name.endswith("beta") and is_bn and init_name.endswith("bias") and init_is_bn)
        )
        if matched:
            valid_names_list.append(name)
            vega_weights_list.append(init_weights_list[index])

    ms_params_list = [
        {'name': name, 'data': Tensor(parameter.detach().numpy())}
        for name, parameter in zip(valid_names_list, vega_weights_list)
    ]

    save_file_name = os.path.join(os.path.dirname(pth_file),
                                  "torch2ms_" + uuid.uuid1().hex[:8] + ".ckpt")
    save_checkpoint(ms_params_list, save_file_name)
    return save_file_name
Example #13
0
def test_save_checkpoint_for_network():
    """Save a full training cell to a checkpoint and load the file back."""
    network = Net()
    loss_fn = SoftmaxCrossEntropyWithLogits(sparse=True)
    optimizer = Momentum(network.trainable_params(), 0.0, 0.9, 0.0001, 1024)

    # Wrap network + loss + optimizer the same way a training loop would.
    train_cell = TrainOneStepCell(WithLossCell(network, loss_fn), optimizer)
    save_checkpoint(train_cell, ckpt_file_name="./new_ckpt.ckpt")

    load_checkpoint("new_ckpt.ckpt")
Example #14
0
 def step_end(self, run_context):
     """Evaluate the model and save ``max.ckpt`` whenever top-1 accuracy improves."""
     cb_params = run_context.original_args()
     epoch_num = cb_params.cur_epoch_num
     result = self.model.eval(self.eval_dataset)
     print("epoch", epoch_num, " top_1_accuracy:", result['top_1_accuracy'])

     top_1 = result['top_1_accuracy']
     if not top_1 > self.acc:
         return

     # New best accuracy: remember it and overwrite the best checkpoint.
     self.acc = top_1
     file_name = os.path.join(self.ckpt_path, "max.ckpt")
     save_checkpoint(save_obj=cb_params.train_network,
                     ckpt_file_name=file_name)
     print("Save the maximum accuracy checkpoint,the accuracy is",
           self.acc)
Example #15
0
    def epoch_end(self, net):
        """
        Print log and save checkpoints when epoch end.

        Checkpoints of both generators and both discriminators are written
        every ``save_checkpoint_epochs`` epochs, and only on rank 0.

        Args:
            net (layers.Layer): TrainOneStepG instance.
        """
        epoch_cost = (time.time() - self.epoch_start_time) * 1000
        pre_step_time = epoch_cost / self.dataset_size
        mean_loss_G = sum(self.G_loss) / self.dataset_size
        mean_loss_D = sum(self.D_loss) / self.dataset_size
        message = "Epoch [{}] total cost: {:.2f} ms, pre step: {:.2f} ms, G_loss: {:.2f}, D_loss: {:.2f}"
        self.info(message.format(self.epoch, epoch_cost, pre_step_time,
                                 mean_loss_G, mean_loss_D))

        if not (self.epoch % self.save_checkpoint_epochs == 0 and self.rank == 0):
            return

        def _store(cell, file_name):
            # Write one sub-network into the checkpoint directory.
            save_checkpoint(cell, os.path.join(self.ckpts_dir, file_name))

        _store(net.G.generator.G_A, f"G_A_{self.epoch}.ckpt")
        _store(net.G.generator.G_B, f"G_B_{self.epoch}.ckpt")
        _store(net.G.D_A, f"D_A_{self.epoch}.ckpt")
        _store(net.G.D_B, f"D_B_{self.epoch}.ckpt")
Example #16
0
 def step_end(self, run_context):
     """Save a checkpoint every ``save_ckpt_step`` steps, keeping at most ``max_ckpt_num`` files."""
     cb_params = run_context.original_args()
     if cb_params.cur_step_num % self.save_ckpt_step != 0:
         return

     name_pattern = "tiny_bert_{}_{}.ckpt"
     saved_ckpt_num = cb_params.cur_step_num / self.save_ckpt_step

     # Drop the checkpoint that falls out of the retention window.
     if saved_ckpt_num > self.max_ckpt_num:
         oldest_ckpt_index = int(saved_ckpt_num - self.max_ckpt_num)
         stale_path = os.path.join(
             self.output_dir,
             name_pattern.format(oldest_ckpt_index, self.save_ckpt_step))
         if os.path.exists(stale_path):
             os.remove(stale_path)

     current_path = os.path.join(
         self.output_dir,
         name_pattern.format(int(saved_ckpt_num), self.save_ckpt_step))
     save_checkpoint(self.network, current_path)
Example #17
0
    def epoch_end(self, run_context):
        """Evaluate the model and the EMA model at the end of each epoch.

        The EMA weights in ``self.shadow`` are loaded into ``self.ema_network``
        for evaluation. ``ema_best.ckpt`` is written when the EMA model sets a
        new best Top1-Acc; every ``save_epoch`` epochs the EMA weights
        (``ema-<epoch>.ckpt``, ``ema_last.ckpt``) and the training network
        (``last.ckpt``) are saved as well. Finally the ten best EMA epochs are
        printed.
        """
        cb_params = run_context.original_args()
        cur_epoch = cb_params.cur_epoch_num + self._start_epoch - 1
        save_ckpt = (cur_epoch % self.save_epoch == 0)

        # Plain model.
        acc = self.model.eval(self.eval_dataset,
                              dataset_sink_mode=self.dataset_sink_mode)
        print("Model Accuracy:", acc)

        # EMA model.
        load_nparray_into_net(self.ema_network, self.shadow)
        self.ema_network.set_train(False)
        model_ema = Model(self.ema_network,
                          loss_fn=self.loss_fn,
                          metrics=self.eval_metrics)
        ema_acc = model_ema.eval(self.eval_dataset,
                                 dataset_sink_mode=self.dataset_sink_mode)
        print("EMA-Model Accuracy:", ema_acc)
        self.ema_accuracy[cur_epoch] = ema_acc["Top1-Acc"]

        output = []
        for key, value in self.shadow.items():
            output.append({"name": key, "data": Tensor(value)})

        if self.best_ema_accuracy < ema_acc["Top1-Acc"]:
            self.best_ema_accuracy = ema_acc["Top1-Acc"]
            self.best_ema_epoch = cur_epoch
            save_checkpoint(output, "ema_best.ckpt")

        if self.best_accuracy < acc["Top1-Acc"]:
            self.best_accuracy = acc["Top1-Acc"]
            self.best_epoch = cur_epoch

        print("Best Model Accuracy: %s, at epoch %s" %
              (self.best_accuracy, self.best_epoch))
        print("Best EMA-Model Accuracy: %s, at epoch %s" %
              (self.best_ema_accuracy, self.best_ema_epoch))

        if save_ckpt:
            # Save the ema_model checkpoints
            save_checkpoint(output, "{}-{}.ckpt".format("ema", cur_epoch))
            save_checkpoint(output, "ema_last.ckpt")

            # Save the model checkpoints
            save_checkpoint(cb_params.train_network, "last.ckpt")

        print("Top 10 EMA-Model Accuracies: ")
        ranked_epochs = sorted(self.ema_accuracy,
                               key=self.ema_accuracy.get,
                               reverse=True)
        for epoch in ranked_epochs[:10]:
            print("epoch: %s, Top-1: %s)" % (epoch, self.ema_accuracy[epoch]))
Example #18
0
def run(args):
    """Entry point: build LeNet5 and run the action selected by ``args.mode``.

    * ``init``  - save the freshly built network under ``seeds/<timestamp>.ckpt``.
    * ``train`` - optionally load ``args.init_ckpt`` and train on the train split.
    * ``test``  - evaluate either ``args.ckpt_files`` (comma separated) or the
      checkpoints of steps 10, 20, 30 and 40 on the test split.
    """
    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device,
                           save_graphs=False)

    net = LeNet5(num_class=10, num_channel=3, use_bn=args.use_bn)
    loss = ms.nn.loss.SoftmaxCrossEntropyWithLogits(sparse=True,
                                                    reduction='mean')
    opt = build_optimizer(args, net)

    if args.mode == 'init':
        seed_file = os.path.join('seeds', '%d.ckpt' % (time.time()))
        save_checkpoint(net, ckpt_file_name=seed_file)

    if args.mode == 'train':
        train_dir = os.path.join(args.data_path, 'train')
        ds_train = create_dataset(data_path=train_dir,
                                  batch_size=args.device_batch_size)

        if args.init_ckpt:
            print('using init checkpoint %s' % (args.init_ckpt))
            load_ckpt(net, args.init_ckpt)
        train(args, net, loss, opt, ds_train)

    if args.mode == 'test':
        test_dir = os.path.join(args.data_path, 'test')
        ds_test = create_dataset(data_path=test_dir,
                                 batch_size=args.device_batch_size)

        if args.ckpt_files:
            checkpoints = args.ckpt_files.split(',')
        else:
            # Fall back to the checkpoints of a fixed set of steps.
            checkpoints = [
                get_ckpt_file_name(args, step) for step in (10, 20, 30, 40)
            ]
        print('will test %d checkpoints' % (len(checkpoints)))
        test(args, net, loss, opt, ds_test, checkpoints)
Example #19
0
def trans_model_para():
    """Pack the wanted ``.npy`` files of the current directory into a checkpoint.

    A file ``<name>.npy`` is included when ``<name>`` is one of the values of
    the module-level ``trans_dict``. The result is written to
    ``ms_model_medium.ckpt``. Files that are not wanted are skipped before
    being read, so they are never loaded into memory.
    """
    # to find all file names with suffix '.npy' in the current path.
    file_names = [name for name in os.listdir() if name.endswith(".npy")]
    wanted_names = trans_dict.values()

    new_params_list = []
    for file_name in file_names:
        var_name = file_name[:-4]
        if var_name not in wanted_names:
            continue
        new_params_list.append({"name": var_name, "data": Tensor(np.load(file_name))})
        print(var_name + " has been saved")

    # to load the parameters from npy files and save them as mindspore checkpoint
    save_checkpoint(new_params_list, "ms_model_medium.ckpt")

    print("Finished:the parameters have been saved into mindspore checkpoint.")
Example #20
0
    def _save_ckpt(self, cb_params, force_to_save=False):
        """Save a checkpoint file for the current step when one is due.

        Nothing happens when this step has already produced a checkpoint or
        when ``_check_save_ckpt`` declines. Before writing, the retention
        policy (maximum file count, or one file per N minutes) is applied to
        the checkpoints already on disk.

        Args:
            cb_params: Callback parameters of the running training job.
            force_to_save: Forwarded to ``_check_save_ckpt``.
        """
        global _save_dir

        if cb_params.cur_step_num == self._last_triggered_step:
            return

        # if param is cache enable, flush data from cache to host before save_ckpt
        if self._need_flush_from_cache:
            self._flush_from_cache(cb_params)

        should_save = self._check_save_ckpt(cb_params, force_to_save)
        step_num_in_epoch = int((cb_params.cur_step_num - 1) %
                                cb_params.batch_num + 1)
        if not should_save:
            return

        ckpt_name = self._prefix + "-" + str(cb_params.cur_epoch_num) + "_" \
            + str(step_num_in_epoch) + ".ckpt"
        # update checkpoint file list.
        self._manager.update_ckpoint_filelist(self._directory, self._prefix)

        # keep checkpoint files number equal max number.
        config = self._config
        keep_max = config.keep_checkpoint_max
        if keep_max and 0 < keep_max <= self._manager.ckpoint_num:
            self._manager.remove_oldest_ckpoint_file()
        else:
            keep_minutes = config.keep_checkpoint_per_n_minutes
            if keep_minutes and keep_minutes > 0:
                self._cur_time_for_keep = time.time()
                elapsed = self._cur_time_for_keep - self._last_time_for_keep
                if elapsed < keep_minutes * 60:
                    self._manager.keep_one_ckpoint_per_minutes(
                        keep_minutes, self._cur_time_for_keep)

        # generate the new checkpoint file and rename it.
        _save_dir = self._directory
        cur_file = os.path.join(self._directory, ckpt_name)
        self._last_time_for_keep = time.time()
        self._last_triggered_step = cb_params.cur_step_num

        if context.get_context("enable_ge"):
            set_cur_net(cb_params.train_network)
            cb_params.train_network.exec_checkpoint_graph()

        network = config.saved_network
        if network is None:
            network = cb_params.train_network
        save_checkpoint(network, cur_file, config.integrated_save,
                        config.async_save, config.enc_key, config.enc_mode)

        self._latest_ckpt_file_name = cur_file
Example #21
0
    def _save_ckpt(self, cb_params, force_to_save=False):
        """Save the training network to a checkpoint file when one is due.

        The file is named ``<prefix>-<epoch>_<step in epoch>.ckpt`` and gets a
        ``PServer_<rank>_`` prefix on a parameter-server role. Existing
        checkpoints are pruned by the configured retention policy before the
        new file is written.

        Args:
            cb_params: Callback parameters of the running training job.
            force_to_save: Forwarded to ``_check_save_ckpt``.
        """
        global _save_dir

        already_saved = cb_params.cur_step_num == self._last_triggered_step
        if already_saved:
            return

        save_ckpt = self._check_save_ckpt(cb_params, force_to_save)
        step_num_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
        if not save_ckpt:
            return

        file_name = self._prefix + "-" + str(cb_params.cur_epoch_num) + "_" \
            + str(step_num_in_epoch) + ".ckpt"
        if _is_role_pserver():
            file_name = "PServer_" + str(_get_ps_mode_rank()) + "_" + file_name

        # update checkpoint file list.
        self._manager.update_ckpoint_filelist(self._directory, self._prefix)

        # keep checkpoint files number equal max number.
        max_files = self._config.keep_checkpoint_max
        if max_files and 0 < max_files <= self._manager.ckpoint_num:
            self._manager.remove_oldest_ckpoint_file()
        else:
            per_minutes = self._config.keep_checkpoint_per_n_minutes
            if per_minutes and per_minutes > 0:
                self._cur_time_for_keep = time.time()
                if (self._cur_time_for_keep - self._last_time_for_keep) < per_minutes * 60:
                    self._manager.keep_one_ckpoint_per_minutes(
                        per_minutes, self._cur_time_for_keep)

        # generate the new checkpoint file and rename it.
        _save_dir = self._directory
        cur_file = os.path.join(self._directory, file_name)
        self._last_time_for_keep = time.time()
        self._last_triggered_step = cb_params.cur_step_num

        if context.get_context("enable_ge"):
            set_cur_net(cb_params.train_network)
            cb_params.train_network.exec_checkpoint_graph()

        save_checkpoint(cb_params.train_network, cur_file,
                        self._config.integrated_save,
                        self._config.async_save)

        self._latest_ckpt_file_name = cur_file
Example #22
0
    def epoch_end(self, net):
        """Print log and save checkpoints when epoch end.

        Every ``save_checkpoint_epochs`` epochs ``net`` is saved into
        ``ckpts_dir``; the file name depends on whether linear evaluation is
        active.
        """
        epoch_cost = (time.time() - self.epoch_start_time) * 1000
        pre_step_time = epoch_cost / self.dataset_size
        mean_loss = sum(self.contrastive_loss) / self.dataset_size

        message = "Epoch [{}] total cost: {:.2f} ms, pre step: {:.2f} ms, mean_loss: {:.2f}"
        self.info(message.format(self.epoch, epoch_cost, pre_step_time, mean_loss))

        if self.epoch % self.save_checkpoint_epochs != 0:
            return

        if self.linear_eval:
            ckpt_name = f"linearClassifier_{self.epoch}.ckpt"
        else:
            ckpt_name = f"simclr_{self.epoch}.ckpt"
        save_checkpoint(net, os.path.join(self.ckpts_dir, ckpt_name))
Example #23
0
    def train_process(self, epoch, train_dataset, mini_steps=None):
        """
        Training process. The data would be passed to network directly.

        The forward/backward cell runs on every batch; the optimizer step and
        the clearing step run once every ``mini_steps`` batches. After all
        epochs the forward/backward cell is saved to
        ``gradient_accumulation.ckpt``.

        Args:
            epoch: Number of epochs to run.
            train_dataset: Dataset to iterate; ``reset()`` is called on it
                after every epoch.
            mini_steps: Number of batches per optimizer step. ``None`` (the
                default) means 1, i.e. an optimizer step after every batch.
        """
        # The default used to fail with a TypeError on ``% None``.
        if mini_steps is None:
            mini_steps = 1

        dataset_helper = DatasetHelper(train_dataset, dataset_sink_mode=False, epoch_num=epoch)

        for i in range(epoch):
            step = 0
            for k, next_element in enumerate(dataset_helper):
                loss = self._train_forward_backward(*next_element)
                if (k + 1) % mini_steps == 0:
                    step += 1
                    print("epoch:", i + 1, "step:", step, "loss is ", loss)
                    self._train_optim()
                    self._train_clear()

            train_dataset.reset()

        save_checkpoint(self._train_forward_backward, "gradient_accumulation.ckpt")
def pt_to_ckpt(pt, ckpt, out_ckpt):
    """
    Write the parameters loaded from ``pt`` into a new checkpoint ``out_ckpt``.

    Each entry loaded via ``load_model(pt)`` is converted with ``.numpy()``,
    wrapped in a ``Tensor`` and renamed through the mapping that ``name_map``
    builds from the parameters loaded via ``load_model_ms(ckpt)``.

    Returns:
        The object returned by ``load_model_ms(ckpt)`` (not the converted list).
    """
    torch_state = load_model(pt)
    state_dict_ms = load_model_ms(ckpt)
    torch_to_ms_name = name_map(state_dict_ms)

    converted = []
    for torch_name in torch_state:
        weights = torch_state[torch_name].numpy()
        converted.append({
            'name': torch_to_ms_name[torch_name],
            'data': Tensor(weights),
        })

    save_checkpoint(converted, out_ckpt)
    return state_dict_ms
Example #25
0
 def train(self):
     """Run one pass over ``self.trainloader`` and save a checkpoint on rank 0.

     For each batch the ``"LR"`` and ``"HR"`` entries are passed through
     ``sub_mean`` and converted to float32 tensors, an int32 tensor of ones
     of size ``imgs["idx"][0]`` is built, and ``self.bp`` is called on the
     three tensors. Its result and the call's wall-clock time are printed.

     Afterwards ``self.args.save`` is created if missing and, when
     ``self.args.rank == 0``, ``self.bp`` is saved inside that directory as
     ``model_<self.epoch>.ckpt``.
     """
     for batch_idx, imgs in enumerate(self.trainloader):
         lr = Tensor(sub_mean(imgs["LR"]), mstype.float32)
         hr = Tensor(sub_mean(imgs["HR"]), mstype.float32)
         idx = Tensor(np.ones(imgs["idx"][0]), mstype.int32)
         t1 = time.time()
         loss = self.bp(lr, hr, idx)
         t2 = time.time()
         print('Task: %g, Step: %g, loss: %f, time: %f s' %
               (idx.shape[0], batch_idx, loss.asnumpy(), t2 - t1),
               flush=True)
     os.makedirs(self.args.save, exist_ok=True)
     if self.args.rank == 0:
         # Join instead of concatenating so the file lands inside the
         # directory created above even when `save` has no trailing separator.
         ckpt_name = "model_" + str(self.epoch) + '.ckpt'
         save_checkpoint(self.bp, os.path.join(self.args.save, ckpt_name))
Example #26
0
def test_save_and_load_checkpoint_for_network_with_encryption():
    """Test saving and loading a checkpoint of a network with encryption.

    A ``TrainOneStepCell`` is saved with a random 16-byte key in "AES-GCM"
    mode. On Windows the save is expected to raise ``NotImplementedError``;
    elsewhere the checkpoint is loaded back with the same key and mode and
    applied to the network. The checkpoint file is removed afterwards, even
    when the test fails.
    """
    net = Net()
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    opt = Momentum(net.trainable_params(), 0.0, 0.9, 0.0001, 1024)

    loss_net = WithLossCell(net, loss)
    train_network = TrainOneStepCell(loss_net, opt)
    key = os.urandom(16)
    mode = "AES-GCM"
    ckpt_path = "./encrypt_ckpt.ckpt"
    try:
        if platform.system().lower() == "windows":
            # Only the call expected to raise belongs in the raises-block;
            # anything after it would never execute.
            # NOTE(review): assumes save_checkpoint is the call that raises on
            # Windows - confirm against the serialization implementation.
            with pytest.raises(NotImplementedError):
                save_checkpoint(train_network, ckpt_file_name=ckpt_path, enc_key=key, enc_mode=mode)
        else:
            save_checkpoint(train_network, ckpt_file_name=ckpt_path, enc_key=key, enc_mode=mode)
            param_dict = load_checkpoint(ckpt_path, dec_key=key, dec_mode=mode)
            load_param_into_net(net, param_dict)
    finally:
        # Clean up regardless of the outcome so a failure does not leave the
        # file behind for later runs.
        if os.path.exists(ckpt_path):
            os.remove(ckpt_path)
Example #27
0
 def step_end(self, run_context):
     """Save a converted checkpoint every ``self.save_ckpt_step`` steps.

     On a checkpoint step the method:
       1. removes the checkpoint written ``self.max_ckpt_num`` saves ago,
          if that file exists;
       2. snapshots the network parameters with ``save_params``;
       3. applies ``convert_network`` and saves the network as
          ``ternary_bert_<save index>_<save_ckpt_step>.ckpt`` in
          ``self.output_dir``;
       4. puts the snapshot back with ``restore_params``.

     Args:
         run_context: Callback context; ``original_args()`` must provide
             ``cur_step_num``.
     """
     cb_params = run_context.original_args()
     if cb_params.cur_step_num % self.save_ckpt_step != 0:
         return

     # The modulo check above makes this division exact.
     saved_ckpt_num = int(cb_params.cur_step_num // self.save_ckpt_step)
     name_template = "ternary_bert_{}_{}.ckpt"

     if saved_ckpt_num > self.max_ckpt_num:
         oldest_ckpt_index = saved_ckpt_num - self.max_ckpt_num
         path = os.path.join(
             self.output_dir,
             name_template.format(int(oldest_ckpt_index), self.save_ckpt_step))
         if os.path.exists(path):
             os.remove(path)

     params_dict = save_params(self.network)
     try:
         convert_network(self.network, self.embedding_bits,
                         self.weight_bits, self.clip_value)
         save_checkpoint(
             self.network,
             os.path.join(
                 self.output_dir,
                 name_template.format(saved_ckpt_num, self.save_ckpt_step)))
     finally:
         # Put the snapshot back even if conversion or saving fails, so the
         # network is not left in its converted state.
         restore_params(self.network, params_dict)
Example #28
0
        opt = Momentum(filter(lambda x: x.requires_grad, head_net.get_parameters()), lr, config.momentum, config.weight_decay)

        network = WithLossCell(head_net, loss)
        network = TrainOneStepCell(network, opt)
        network.set_train()

        features_path = args_opt.dataset_path + '_features'
        idx_list = list(range(step_size))
        rank = 0
        if config.run_distribute:
            rank = get_rank()
        save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(rank) + '/')
        if not os.path.isdir(save_ckpt_path):
            os.mkdir(save_ckpt_path)

        for epoch in range(epoch_size):
            random.shuffle(idx_list)
            epoch_start = time.time()
            losses = []
            for j in idx_list:
                feature = Tensor(np.load(os.path.join(features_path, f"feature_{j}.npy")))
                label = Tensor(np.load(os.path.join(features_path, f"label_{j}.npy")))
                losses.append(network(feature, label).asnumpy())
            epoch_mseconds = (time.time()-epoch_start) * 1000
            per_step_mseconds = epoch_mseconds / step_size
            print("epoch[{}/{}], iter[{}] cost: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}"\
            .format(epoch + 1, epoch_size, step_size, epoch_mseconds, per_step_mseconds, np.mean(np.array(losses))))
            if (epoch + 1) % config.save_checkpoint_epochs == 0:
                save_checkpoint(net, os.path.join(save_ckpt_path, f"mobilenetv2_{epoch+1}.ckpt"))
        print("total cost {:5.4f} s".format(time.time() - start))
Example #29
0
    Returns:
        parameter list(list): pretrain model weight list.
    """
    ms_ckpt = load_checkpoint(model_path)
    weights = {}
    for msname in ms_ckpt:
        if msname.startswith("layer") or msname.startswith("conv1") or msname.startswith("bn"):
            param_name = "backbone." + msname
        else:
            param_name = msname
        if "down_sample_layer.0" in param_name:
            param_name = param_name.replace("down_sample_layer.0", "conv_down_sample")
        if "down_sample_layer.1" in param_name:
            param_name = param_name.replace("down_sample_layer.1", "bn_down_sample")
        weights[param_name] = ms_ckpt[msname].data.asnumpy()
    if use_fp16_weight:
        dtype = mstype.float16
    else:
        dtype = mstype.float32
    parameter_dict = {}
    for name in weights:
        parameter_dict[name] = Parameter(Tensor(weights[name], dtype), name=name)
    param_list = []
    for key, value in parameter_dict.items():
        param_list.append({"name": key, "data": value})
    return param_list

if __name__ == "__main__":
    # Script entry point: re-key the checkpoint named by args_opt.ckpt_file via
    # load_weights (float32, because use_fp16_weight=False) and write the
    # resulting parameter list to "resnet50_backbone.ckpt".
    # NOTE(review): args_opt is not defined in the visible code - presumably
    # parsed at module level; confirm.
    parameter_list = load_weights(args_opt.ckpt_file, use_fp16_weight=False)
    save_checkpoint(parameter_list, "resnet50_backbone.ckpt")
Example #30
0
def train():
    """Train model.

    Parses the command-line options, builds a GCN on the dataset found in
    ``--data_dir`` (graph mode, Ascend target), and runs up to
    ``config.epochs`` epochs. Each epoch calls the training wrapper and then
    the evaluation wrapper and prints their loss/accuracy. Training stops
    early once the latest validation loss exceeds the mean of the previous
    ``config.early_stopping`` validation losses.

    The trained network is saved to ``ckpts/gcn.ckpt``, loaded into a fresh
    GCN and scored on the test mask. With ``--save_TSNE`` a t-SNE snapshot of
    the network output is recorded before training and after every epoch, and
    the snapshots are written to ``t-SNE_visualization.gif``.
    """
    parser = argparse.ArgumentParser(description='GCN')
    parser.add_argument('--data_dir', type=str, default='./data/cora/cora_mr', help='Dataset directory')
    parser.add_argument('--seed', type=int, default=0, help='Random seed')
    parser.add_argument('--train_nodes_num', type=int, default=140, help='Nodes numbers for training')
    parser.add_argument('--eval_nodes_num', type=int, default=500, help='Nodes numbers for evaluation')
    parser.add_argument('--test_nodes_num', type=int, default=1000, help='Nodes numbers for test')
    parser.add_argument('--save_TSNE', type=ast.literal_eval, default=False, help='Whether to save t-SNE graph')
    args_opt = parser.parse_args()
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("ckpts", exist_ok=True)

    set_seed(args_opt.seed)
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend", save_graphs=False)
    config = ConfigGCN()
    adj, feature, label_onehot, label = get_adj_features_labels(args_opt.data_dir)

    # Train nodes come first, eval nodes follow them, test nodes are the last
    # `test_nodes_num` entries.
    nodes_num = label_onehot.shape[0]
    train_mask = get_mask(nodes_num, 0, args_opt.train_nodes_num)
    eval_mask = get_mask(nodes_num, args_opt.train_nodes_num, args_opt.train_nodes_num + args_opt.eval_nodes_num)
    test_mask = get_mask(nodes_num, nodes_num - args_opt.test_nodes_num, nodes_num)

    class_num = label_onehot.shape[1]
    gcn_net = GCN(config, adj, feature, class_num)
    gcn_net.add_flags_recursive(fp16=True)

    eval_net = LossAccuracyWrapper(gcn_net, label_onehot, eval_mask, config.weight_decay)
    train_net = TrainNetWrapper(gcn_net, label_onehot, train_mask, config)

    loss_list = []

    if args_opt.save_TSNE:
        # Snapshot of the untrained network; becomes frame 0 of the animation.
        out_feature = gcn_net()
        tsne_result = t_SNE(out_feature.asnumpy(), 2)
        graph_data = []
        graph_data.append(tsne_result)
        fig = plt.figure()
        scat = plt.scatter(tsne_result[:, 0], tsne_result[:, 1], s=2, c=label, cmap='rainbow')
        plt.title('t-SNE visualization of Epoch:0', fontsize='large', fontweight='bold', verticalalignment='center')

    for epoch in range(config.epochs):
        t = time.time()

        train_net.set_train()
        train_result = train_net()
        train_loss = train_result[0].asnumpy()
        train_accuracy = train_result[1].asnumpy()

        eval_net.set_train(False)
        eval_result = eval_net()
        eval_loss = eval_result[0].asnumpy()
        eval_accuracy = eval_result[1].asnumpy()

        loss_list.append(eval_loss)
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(train_loss),
              "train_acc=", "{:.5f}".format(train_accuracy), "val_loss=", "{:.5f}".format(eval_loss),
              "val_acc=", "{:.5f}".format(eval_accuracy), "time=", "{:.5f}".format(time.time() - t))

        if args_opt.save_TSNE:
            out_feature = gcn_net()
            tsne_result = t_SNE(out_feature.asnumpy(), 2)
            graph_data.append(tsne_result)

        # Stop once the newest validation loss is worse than the average of
        # the `early_stopping` losses that preceded it.
        if epoch > config.early_stopping and loss_list[-1] > np.mean(loss_list[-(config.early_stopping+1):-1]):
            print("Early stopping...")
            break
    save_checkpoint(gcn_net, "ckpts/gcn.ckpt")
    gcn_net_test = GCN(config, adj, feature, class_num)
    load_checkpoint("ckpts/gcn.ckpt", net=gcn_net_test)
    gcn_net_test.add_flags_recursive(fp16=True)

    test_net = LossAccuracyWrapper(gcn_net_test, label_onehot, test_mask, config.weight_decay)
    t_test = time.time()
    test_net.set_train(False)
    test_result = test_net()
    test_loss = test_result[0].asnumpy()
    test_accuracy = test_result[1].asnumpy()
    print("Test set results:", "loss=", "{:.5f}".format(test_loss),
          "accuracy=", "{:.5f}".format(test_accuracy), "time=", "{:.5f}".format(time.time() - t_test))

    if args_opt.save_TSNE:
        # One frame per recorded snapshot. Early stopping leaves fewer than
        # config.epochs + 1 snapshots, so a fixed frame range could run past
        # the end of graph_data.
        # NOTE(review): assumes update_graph indexes graph_data by frame - confirm.
        ani = animation.FuncAnimation(fig, update_graph, frames=range(len(graph_data)), fargs=(graph_data, scat, plt))
        ani.save('t-SNE_visualization.gif', writer='imagemagick')