Beispiel #1
0
    def _run_network(self, dataset_sink_mode=True):
        """Train LeNet5 for one epoch, evaluate it, then check the summaries.

        A ``SummaryCollector`` writing into a fresh temporary directory under
        ``self.base_summary_dir`` is attached to both the training run and the
        evaluation run. The directory is finally passed to
        ``self._check_summary_result``.

        Args:
            dataset_sink_mode: Forwarded to ``Model.train`` and ``Model.eval``.
        """
        network = LeNet5()
        loss_fn = nn.SoftmaxCrossEntropyWithLogits(
            is_grad=False, sparse=True, reduction="mean")
        optimizer = Momentum(
            network.trainable_params(), learning_rate=0.1, momentum=0.9)
        model = Model(
            network,
            loss_fn=loss_fn,
            optimizer=optimizer,
            metrics={'acc': Accuracy()},
        )

        # Each invocation gets its own summary directory.
        summary_dir = tempfile.mkdtemp(dir=self.base_summary_dir)
        collector = SummaryCollector(summary_dir=summary_dir, collect_freq=1)

        train_set = create_dataset(os.path.join(self.mnist_path, "train"))
        model.train(
            1,
            train_set,
            callbacks=[collector],
            dataset_sink_mode=dataset_sink_mode,
        )

        eval_set = create_dataset(os.path.join(self.mnist_path, "test"))
        model.eval(
            eval_set,
            dataset_sink_mode=dataset_sink_mode,
            callbacks=[collector],
        )

        self._check_summary_result(summary_dir)
Beispiel #2
0
    def _run_network(self, dataset_sink_mode=False, num_samples=2, **kwargs):
        """Train and evaluate LeNet5 with a summary collector attached.

        Args:
            dataset_sink_mode: Forwarded to ``Model.train`` and ``Model.eval``.
            num_samples: Forwarded to ``create_dataset`` for the training set.
            **kwargs: Extra keyword arguments passed to ``SummaryCollector``.

        Returns:
            The temporary directory (created under ``self.base_summary_dir``)
            that the summary collector was configured with.
        """
        network = LeNet5()
        loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        optimizer = Momentum(
            network.trainable_params(), learning_rate=0.1, momentum=0.9)
        model = Model(
            network,
            loss_fn=loss_fn,
            optimizer=optimizer,
            metrics={'loss': Loss()},
        )

        summary_dir = tempfile.mkdtemp(dir=self.base_summary_dir)
        collector = SummaryCollector(
            summary_dir=summary_dir, collect_freq=2, **kwargs)

        train_set = create_dataset(
            os.path.join(self.mnist_path, "train"), num_samples=num_samples)
        model.train(
            1,
            train_set,
            callbacks=[collector],
            dataset_sink_mode=dataset_sink_mode,
        )

        # The evaluation set is created without the num_samples limit.
        eval_set = create_dataset(os.path.join(self.mnist_path, "test"))
        model.eval(
            eval_set,
            dataset_sink_mode=dataset_sink_mode,
            callbacks=[collector],
        )
        return summary_dir
Beispiel #3
0
def test_compile_model_train_O2():
    """Check that a model with ``amp_level="O2"`` compiles and trains.

    Training runs for two epochs without dataset sinking. The following
    ``eval`` call is expected to raise ``ValueError``.
    """
    sample_shape = (16, 16)
    dataset = MindDataSet(
        (np.float32, np.float32),
        (sample_shape, sample_shape),
    )

    network = NetNoLoss(16, 16)
    loss_fn = nn.MSELoss()
    momentum_opt = nn.Momentum(
        network.trainable_params(), learning_rate=0.1, momentum=0.9)

    model = Model(
        network,
        loss_fn=loss_fn,
        optimizer=momentum_opt,
        metrics={"acc"},
        amp_level="O2",
    )
    model.train(2, dataset, dataset_sink_mode=False)

    # Evaluation is not really executed: the metrics step fails, which is
    # enough to confirm that compilation itself succeeded.
    with pytest.raises(ValueError):
        model.eval(dataset)
Beispiel #4
0
def mnist_train(epoch_size, batch_size, lr, momentum):
    """Train LeNet5 on MNIST, then evaluate a saved checkpoint.

    Args:
        epoch_size: Number of epochs passed to ``Model.train``.
        batch_size: Batch size for both the training and the test dataset.
        lr: Learning rate for the Momentum optimizer.
        momentum: Momentum value for the Momentum optimizer.

    NOTE(review): the checkpoint loaded for evaluation has a fixed file name
    (``checkpoint_lenet-10_1875.ckpt``) that does not depend on
    ``epoch_size`` or ``batch_size`` -- presumably it only exists for a
    10-epoch run with 1875 steps per epoch; confirm with callers.
    """
    data_root = "./MNIST_unzip/"
    train_set = generate_mnist_dataset(
        os.path.join(data_root, "train"),
        batch_size=batch_size,
        repeat_size=1,
    )

    net = LeNet5()
    criterion = nn.SoftmaxCrossEntropyWithLogits(
        is_grad=False, sparse=True, reduction="mean")
    optimizer = nn.Momentum(net.trainable_params(), lr, momentum)

    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=1875, keep_checkpoint_max=10)
    ckpt_callback = ModelCheckpoint(
        prefix="checkpoint_lenet",
        directory="./trained_ckpt_file/",
        config=ckpt_config,
    )
    model = Model(net, criterion, optimizer, metrics={"Accuracy": Accuracy()})

    LOGGER.info(TAG, "============== Starting Training ==============")
    model.train(
        epoch_size,
        train_set,
        callbacks=[ckpt_callback, LossMonitor()],
        dataset_sink_mode=False,
    )

    LOGGER.info(TAG, "============== Starting Testing ==============")
    saved_ckpt = "trained_ckpt_file/checkpoint_lenet-10_1875.ckpt"
    load_param_into_net(net, load_checkpoint(saved_ckpt))
    test_set = generate_mnist_dataset(
        os.path.join(data_root, "test"), batch_size=batch_size)
    accuracy = model.eval(test_set, dataset_sink_mode=False)
    LOGGER.info(TAG, "============== Accuracy: %s ==============", accuracy)
Beispiel #5
0
def train(data_dir, lr=0.01, momentum=0.9, num_epochs=2, ckpt_name="lenet"):
    """Train LeNet5, checkpointing along the way, then print eval metrics.

    Dataset sink mode is enabled only when the configured device target is
    ``'Ascend'``; in that case the training dataset is created with
    ``repeat=num_epochs``, otherwise with ``repeat=1``.

    Args:
        data_dir: Dataset location handed to ``create_dataset``.
        lr: Learning rate for the Momentum optimizer.
        momentum: Momentum value for the Momentum optimizer.
        num_epochs: Number of epochs passed to ``Model.train``.
        ckpt_name: Prefix of the checkpoint files written to ``ckpt``.
    """
    use_sink = context.get_context('device_target') == 'Ascend'
    if use_sink:
        repeat_count = num_epochs
    else:
        repeat_count = 1

    train_set = create_dataset(data_dir, repeat=repeat_count)
    eval_set = create_dataset(data_dir, training=False)
    batches_per_epoch = train_set.get_dataset_size()

    network = LeNet5()
    criterion = nn.loss.SoftmaxCrossEntropyWithLogits(
        is_grad=False, sparse=True, reduction='mean')
    optimizer = nn.Momentum(network.trainable_params(), lr, momentum)

    # Checkpoint interval equals the training dataset size reported above.
    checkpoint_cb = ModelCheckpoint(
        prefix=ckpt_name,
        directory='ckpt',
        config=CheckpointConfig(
            save_checkpoint_steps=batches_per_epoch,
            keep_checkpoint_max=5,
        ),
    )
    monitor_cb = LossMonitor(batches_per_epoch)

    model = Model(network, criterion, optimizer, metrics={'acc', 'loss'})
    model.train(
        num_epochs,
        train_set,
        callbacks=[checkpoint_cb, monitor_cb],
        dataset_sink_mode=use_sink,
    )
    results = model.eval(eval_set, dataset_sink_mode=use_sink)
    print('Metrics:', results)
Beispiel #6
0
def calibration():
    """Run calibration to produce the scale/offset record file.

    A single batch of the evaluation dataset is pushed through the
    quantized network via ``Model.eval``; afterwards the network is saved
    with ``amct.save_model`` under ``./resnet50_quant_calibration``.
    """
    eval_set = create_dataset(
        dataset_path=ARGS_OPT.eval_dataset,
        do_train=False,
        batch_size=config.batch_size,  # pylint: disable=no-member
        target=ARGS_OPT.device_target,
    )
    # Only the first batch is used for calibration.
    eval_set = eval_set.take(1)
    criterion = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

    net = resnet(10)
    net.set_train(False)
    load_param_into_net(net, load_checkpoint(ARGS_OPT.pre_trained))

    sample_shape = [32, 3, 224, 224]
    input_data = np.random.uniform(0.0, 1.0, size=sample_shape)
    input_data = input_data.astype(np.float32)

    quant_config_path = os.path.join(CUR_DIR, './config.json')
    amct.create_quant_config(quant_config_path, net, input_data)
    quant_net = amct.quantize_model(quant_config_path, net, input_data)

    model = Model(
        quant_net,
        loss_fn=criterion,
        metrics={'top_1_accuracy', 'top_5_accuracy'},
    )
    _ = model.eval(eval_set)
    amct.save_model('./resnet50_quant_calibration', quant_net, input_data)
def eval_quant():
    """Evaluate the quantization-aware LeNet5 checkpoint on the test set.

    Raises:
        ValueError: If ``load_param_into_net`` reports parameters that were
            not loaded.
        AssertionError: If the measured accuracy is not above 0.98.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    settings = quant_cfg
    test_set = create_dataset(
        os.path.join(data_path, "test"), settings.batch_size, 1)
    checkpoint_file = './ckpt_lenet_quant-10_937.ckpt'

    # Build the fusion network and convert it to a quantization-aware one.
    fusion_net = LeNet5Fusion(settings.num_classes)
    qat = QuantizationAwareTraining(
        quant_delay=0,
        bn_fold=False,
        freeze_bn=10000,
        per_channel=[True, False],
        symmetric=[True, False],
    )
    fusion_net = qat.quantize(fusion_net)

    # Loss and optimizer are required to construct the Model wrapper.
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optimizer = nn.Momentum(
        fusion_net.trainable_params(), settings.lr, settings.momentum)
    model = Model(
        fusion_net, criterion, optimizer, metrics={"Accuracy": Accuracy()})

    # Restore the quantization-aware checkpoint; a truthy return value from
    # load_param_into_net is treated as a failure.
    unloaded = load_param_into_net(fusion_net, load_checkpoint(checkpoint_file))
    if unloaded:
        raise ValueError("Load param into net fail!")

    print("============== Starting Testing ==============")
    result = model.eval(test_set, dataset_sink_mode=True)
    print("============== {} ==============".format(result))
    assert result['Accuracy'] > 0.98
Beispiel #8
0
def eval_alexnet():
    """Evaluate an AlexNet checkpoint on CIFAR-10 or ImageNet.

    The dataset is selected by ``config.dataset_name``. For ``'cifar10'``
    the model reports ``Accuracy``; for ``'imagenet'`` it reports top-1 and
    top-5 accuracy. The result of ``Model.eval`` is printed.

    Raises:
        ValueError: If the dataset name is unsupported or the evaluation
            dataset reports a size of zero.
    """
    print("============== Starting Testing ==============")

    if get_device_num() > 1:
        # NOTE(review): the device target is hard-coded to 'Davinci' here
        # rather than taken from config.device_target -- confirm intent.
        context.set_context(
            mode=context.GRAPH_MODE,
            device_target='Davinci',
            save_graphs=False,
        )
        if config.device_target == "Ascend":
            context.set_context(device_id=get_device_id())
            init()
        elif config.device_target == "GPU":
            init()

    if config.dataset_name == 'cifar10':
        net = AlexNet(config.num_classes, phase='test')
        criterion = nn.SoftmaxCrossEntropyWithLogits(
            sparse=True, reduction="mean")
        optimizer = nn.Momentum(
            net.trainable_params(), config.learning_rate, config.momentum)
        ds_eval = create_dataset_cifar10(
            config.data_path,
            config.batch_size,
            status="test",
            target=config.device_target,
        )
        params = load_checkpoint(load_path)
        print("load checkpoint from [{}].".format(load_path))
        load_param_into_net(net, params)
        net.set_train(False)
        model = Model(
            net, criterion, optimizer, metrics={"Accuracy": Accuracy()})

    elif config.dataset_name == 'imagenet':
        net = AlexNet(config.num_classes, phase='test')
        criterion = nn.SoftmaxCrossEntropyWithLogits(
            sparse=True, reduction="mean")
        ds_eval = create_dataset_imagenet(
            config.data_path, config.batch_size, training=False)
        params = load_checkpoint(load_path)
        print("load checkpoint from [{}].".format(load_path))
        load_param_into_net(net, params)
        net.set_train(False)
        # No optimizer is passed for the ImageNet evaluation model.
        model = Model(
            net,
            loss_fn=criterion,
            metrics={'top_1_accuracy', 'top_5_accuracy'},
        )

    else:
        raise ValueError("Unsupported dataset.")

    if ds_eval.get_dataset_size() == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    outcome = model.eval(ds_eval, dataset_sink_mode=config.dataset_sink_mode)
    print("result : {}".format(outcome))
Beispiel #9
0
def train(data_dir, lr=0.01, momentum=0.9, num_epochs=3):
    """Train LeNet5 on the dataset in ``data_dir`` and print eval metrics.

    Args:
        data_dir: Dataset location handed to ``create_dataset``.
        lr: Learning rate for the Momentum optimizer.
        momentum: Momentum value for the Momentum optimizer.
        num_epochs: Number of epochs passed to ``Model.train``.
    """
    train_set = create_dataset(data_dir)
    eval_set = create_dataset(data_dir, training=False)

    network = LeNet5()
    criterion = nn.loss.SoftmaxCrossEntropyWithLogits(
        sparse=True, reduction='mean')
    optimizer = nn.Momentum(network.trainable_params(), lr, momentum)
    # The loss monitor interval is set to the training dataset size.
    monitor = LossMonitor(per_print_times=train_set.get_dataset_size())

    model = Model(network, criterion, optimizer, metrics={'acc', 'loss'})
    # Sink mode is kept off here; it can be True when using Ascend.
    model.train(
        num_epochs,
        train_set,
        callbacks=[monitor],
        dataset_sink_mode=False,
    )
    results = model.eval(eval_set, dataset_sink_mode=False)
    print('Metrics:', results)
Beispiel #10
0
def test_train_and_eval_lenet():
    """Train LeNet5 for one epoch on GPU, then evaluate and print the result.

    Both runs use dataset sink mode and read MNIST from a fixed workspace
    path.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

    net = LeNet5(10)
    criterion = nn.SoftmaxCrossEntropyWithLogits(
        is_grad=False, sparse=True, reduction="mean")
    optimizer = nn.Momentum(net.trainable_params(), 0.01, 0.9)
    model = Model(net, criterion, optimizer, metrics={"Accuracy": Accuracy()})

    mnist_root = '/home/workspace/mindspore_dataset/mnist'

    print("============== Starting Training ==============")
    train_set = create_dataset(os.path.join(mnist_root, "train"), 32, 1)
    model.train(
        1, train_set, callbacks=[LossMonitor()], dataset_sink_mode=True)

    print("============== Starting Testing ==============")
    test_set = create_dataset(os.path.join(mnist_root, "test"), 32, 1)
    result = model.eval(test_set, dataset_sink_mode=True)
    print("============== {} ==============".format(result))
Beispiel #11
0
def eval_lenet5():
    """Evaluate a LeNet5 checkpoint on the test split and print the result.

    Raises:
        ValueError: If the evaluation dataset reports a size of zero.
    """
    context.set_context(
        mode=context.GRAPH_MODE, device_target=config.device_target)

    net = LeNet5(config.num_classes)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optimizer = nn.Momentum(
        net.trainable_params(), config.lr, config.momentum)
    model = Model(net, criterion, optimizer, metrics={"Accuracy": Accuracy()})

    print("============== Starting Testing ==============")
    # The network is passed to load_checkpoint directly here.
    load_checkpoint(config.ckpt_path, net)
    test_dir = os.path.join(config.data_path, "test")
    test_set = create_lenet_dataset(test_dir, config.batch_size, 1)
    if test_set.get_dataset_size() == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    result = model.eval(test_set)
    print("============== {} ==============".format(result))
Beispiel #12
0
def test_original_resnet50():
    """Evaluate the original (non-quantized) resnet50 and print the result."""
    eval_set = create_dataset(
        dataset_path=ARGS_OPT.eval_dataset,
        do_train=False,
        batch_size=config.batch_size,  # pylint: disable=no-member
        target=ARGS_OPT.device_target,
    )

    net = resnet(10)
    net.set_train(False)
    # Restore the pre-trained weights named on the command-line options.
    load_param_into_net(net, load_checkpoint(ARGS_OPT.pre_trained))

    criterion = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    model = Model(
        net,
        loss_fn=criterion,
        metrics={'top_1_accuracy', 'top_5_accuracy'},
    )
    outcome = model.eval(eval_set)
    print("result for original resnet50:", outcome, "ckpt=",
          ARGS_OPT.pre_trained)
Beispiel #13
0
def train(Net):
    """Train a network for 30 epochs, print its eval metrics, return the model.

    Args:
        Net: Callable that builds the network; it is called with
            ``cfg.num_classes``.

    Returns:
        The ``Model`` wrapper that was trained.
    """
    train_set, test_set = create_dataset()

    # Build the network.
    net = Net(cfg.num_classes)

    # Define the loss function and the optimizer.
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optimizer = nn.Adam(net.trainable_params(), cfg.lr)

    # Train the model.
    model = Model(
        net,
        loss_fn=criterion,
        optimizer=optimizer,
        metrics={'acc': Accuracy()},
    )
    monitor = LossMonitor()
    print("============== Starting Training ==============")
    model.train(30, train_set, callbacks=[monitor], dataset_sink_mode=True)

    # Validate on the test split.
    results = model.eval(test_set)
    print(results)

    return model
Beispiel #14
0
def quant_resnet50(network, dataset, loss, input_data):
    """Quantize resnet50, calibrate it on ``dataset`` and export the result.

    Args:
        network: Network to quantize.
        dataset: Dataset evaluated once for activation calibration.
        loss: Loss function handed to the ``Model`` wrapper.
        input_data: Sample input passed to the quantization helpers.
    """
    quant_config = './config.json'

    # Step 2: create the quantization config JSON file.
    create_quant_config(quant_config, network, input_data)

    # Step 3: obtain the modified network and put it in inference mode.
    quant_net = quantize_model(quant_config, network, input_data)
    quant_net.set_train(False)

    # Step 4: evaluate once so that activation calibration is performed.
    model = Model(
        quant_net,
        loss_fn=loss,
        metrics={'top_1_accuracy', 'top_5_accuracy'},
    )
    _ = model.eval(dataset, dataset_sink_mode=False)

    # Step 5: export the AIR file.
    save_model('results/resnet50_quant', quant_net, input_data)
    print("[INFO] the quantized AIR file has been stored at: \n {}".format(
        'results/resnet50_quant.air'))
Beispiel #15
0
def eval_lenet():
    """Evaluate the non-quantized LeNet5 checkpoint on the test set.

    Raises:
        ValueError: If ``load_param_into_net`` reports parameters that were
            not loaded.
        AssertionError: If the measured accuracy is not above 0.98.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    settings = nonquant_cfg
    test_set = create_dataset(
        os.path.join(data_path, "test"), settings.batch_size, 1)
    checkpoint_file = './ckpt_lenet_noquant-10_1875.ckpt'

    # Plain LeNet5 network, with the loss and optimizer the Model needs.
    net = LeNet5(settings.num_classes)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optimizer = nn.Momentum(
        net.trainable_params(), settings.lr, settings.momentum)
    model = Model(net, criterion, optimizer, metrics={"Accuracy": Accuracy()})

    # Restore the checkpoint; a truthy return value from
    # load_param_into_net is treated as a failure.
    unloaded = load_param_into_net(net, load_checkpoint(checkpoint_file))
    if unloaded:
        raise ValueError("Load param into net fail!")

    print("============== Starting Testing ==============")
    result = model.eval(test_set, dataset_sink_mode=True)
    print("============== {} ==============".format(result))
    assert result['Accuracy'] > 0.98
Beispiel #16
0
def mnist_train(epoch_size, batch_size, lr, momentum):
    """Train LeNet5 on MNIST on Ascend, then evaluate a saved checkpoint.

    Args:
        epoch_size: Number of epochs passed to ``Model.train``.
        batch_size: Batch size for both the training and the test dataset.
        lr: Learning rate for the Momentum optimizer.
        momentum: Momentum value for the Momentum optimizer.

    NOTE(review): the checkpoint loaded for evaluation has a fixed file name
    (``checkpoint_lenet-10_1875.ckpt``) that does not depend on
    ``epoch_size`` or ``batch_size`` -- presumably it only exists for a
    10-epoch run with 1875 steps per epoch; confirm with callers.
    """
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        enable_mem_reuse=False)

    mnist_path = "./MNIST_unzip/"
    ds = generate_mnist_dataset(os.path.join(mnist_path, "train"),
                                batch_size=batch_size,
                                repeat_size=1)

    network = LeNet5()
    network.set_train()
    net_loss = CrossEntropyLoss()
    net_opt = nn.Momentum(network.trainable_params(), lr, momentum)
    config_ck = CheckpointConfig(save_checkpoint_steps=1875,
                                 keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory='./trained_ckpt_file/',
                                 config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    LOGGER.info(TAG, "============== Starting Training ==============")
    model.train(epoch_size,
                ds,
                callbacks=[ckpoint_cb, LossMonitor()],
                dataset_sink_mode=False)

    LOGGER.info(TAG, "============== Starting Testing ==============")
    # Reload the weights from the checkpoint written during training.
    param_dict = load_checkpoint(
        "trained_ckpt_file/checkpoint_lenet-10_1875.ckpt")
    load_param_into_net(network, param_dict)
    ds_eval = generate_mnist_dataset(os.path.join(mnist_path, "test"),
                                     batch_size=batch_size)
    acc = model.eval(ds_eval)
    LOGGER.info(TAG, "============== Accuracy: %s ==============", acc)
Beispiel #17
0
class TrainerMs(TrainerBase):
    """Trainer implementation for the MindSpore backend."""

    def build(self):
        """Assemble optimizer, loss, metrics and the MindSpore model wrapper."""
        super().build()

        # Optimizer: a dynamic learning rate is only built when the
        # lr_scheduler config carries parameters.
        if not self.config.lr_scheduler.params:
            self.optimizer = Optimizer()(model=self.model)
        else:
            self.lr_scheduler = LrScheduler()
            make_lr = self.lr_scheduler()
            dynamic_lr = make_lr(
                base_lr=self.config.optimizer.params["lr"],
                global_step=self.config.epochs * len(self.train_loader),
                total_epoch=self.config.epochs)
            self.optimizer = Optimizer()(model=self.model,
                                         dynamic_lr=dynamic_lr)

        # Loss: models exposing ``add_loss`` combine it into an overall loss.
        if not hasattr(self.model, 'add_loss'):
            self.loss = Loss()()
        else:
            extra_loss = Loss()()
            self.model.add_loss(extra_loss)
            self.loss = self.model.overall_loss()

        self.metric_name = self.config.metric.type

        # Some trainer has different train batch size from valid batch
        self.train_metrics = None
        self.valid_metrics = self._init_metrics()
        # The metrics handed to the model must be a dict; a non-dict result
        # is wrapped under the configured metric name.
        if isinstance(self.valid_metrics(), dict):
            self.ms_metrics = self.valid_metrics()
        else:
            self.ms_metrics = {self.metric_name: self.valid_metrics()}

        self.ms_model = MsModel(
            network=self.model,
            loss_fn=self.loss,
            optimizer=self.optimizer,
            metrics=self.ms_metrics,
        )

    def _set_condition(self):
        """Initialise the MindSpore context, then the distributed setting."""
        self._init_ms_context()
        self._init_distributed_setting()

    def _train_epoch(self):
        """Run the full training through the MindSpore model wrapper.

        A ``RuntimeError`` raised by training is logged as a warning and not
        propagated.
        """
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=self.config.save_steps,
            keep_checkpoint_max=1)
        # save the network model and parameters for subsequence fine-tuning
        ckpt_dir = self.get_local_worker_path(self.step_name, self.worker_id)
        checkpoint_cb = ModelCheckpoint(config=ckpt_config, directory=ckpt_dir)
        monitor_cb = LossMonitor(per_print_times=1)
        evaluate_cb = EvalCallBack(self.ms_model, self.valid_loader,
                                   self.dataset_sink_mode, self)

        # The evaluation callback is left out when mixup is enabled.
        if self.config.mixup:
            callback_list = [checkpoint_cb, monitor_cb]
        else:
            callback_list = [checkpoint_cb, monitor_cb, evaluate_cb]

        try:
            self.ms_model.train(
                epoch=self.epochs,
                train_dataset=self.train_loader,
                callbacks=callback_list,
                dataset_sink_mode=self.dataset_sink_mode,
            )
        except RuntimeError as e:
            logging.warning(
                f"failed to train the model, skip it, message: {str(e)}")

    def _valid_epoch(self):
        """Evaluate the model and report the results to the callbacks.

        A ``RuntimeError`` raised during evaluation is logged as a warning
        and not propagated.
        """
        uses_ce_loss = self.config.loss.type == 'CrossEntropyLoss'
        if self.config.mixup and uses_ce_loss:
            # With mixup, the model wrapper is rebuilt with a sparse softmax
            # cross-entropy loss before evaluating.
            from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
            self.ms_model = MsModel(
                network=self.model,
                loss_fn=SoftmaxCrossEntropyWithLogits(sparse=True),
                optimizer=self.optimizer,
                metrics=self.ms_metrics,
            )
        self.callbacks.before_valid()

        try:
            eval_metrics = self.ms_model.eval(
                valid_dataset=self.valid_loader,
                dataset_sink_mode=self.dataset_sink_mode)
            self.valid_metrics.update(eval_metrics)
            valid_logs = {'cur_valid_perfs': self.valid_metrics.results}
            self.callbacks.after_valid(valid_logs)
        except RuntimeError as exc:
            logging.warning(
                "RuntimeError occurred when eval the model. Skip eval this model."
            )
            logging.warning("The RuntimeError message is : {}.".format(exc))

    def _init_distributed_setting(self):
        """Set up data-parallel context and HCCL when running distributed."""
        if not self.distributed:
            return
        logging.info("init hccl ...")
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True)
        hccl_init()

    def _init_ms_context(self):
        """Configure the MindSpore execution mode and device target.

        PyNative mode is used only when the config has an ``execute_mode``
        attribute equal to ``"PYNATIVE_MODE"``; otherwise graph mode is used.
        Dataset sink mode is enabled only on an NPU device.
        """
        wants_pynative = (hasattr(self.config, "execute_mode")
                          and self.config.execute_mode == "PYNATIVE_MODE")
        mode = context.PYNATIVE_MODE if wants_pynative else context.GRAPH_MODE

        if not vega.is_npu_device():
            context.set_context(mode=mode, device_target="CPU")
        else:
            context.set_context(mode=mode,
                                device_target="Ascend",
                                device_id=int(os.environ["DEVICE_ID"]))

        self.dataset_sink_mode = bool(vega.is_npu_device())
Beispiel #18
0
                           batch_size=cfg.batch_size)

    loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    opt = nn.Momentum(network.trainable_params(), cfg.learning_rate,
                      cfg.momentum)
    loss_cb = LossMonitor()
    model = Model(network, loss, opt, {'acc': Accuracy()})

    if args.mode == 'train':
        print("============== Starting Training ==============")
        ds_train = create_dataset(args.preprocess_path, cfg.batch_size,
                                  cfg.num_epochs, True)
        config_ck = CheckpointConfig(
            save_checkpoint_steps=cfg.save_checkpoint_steps,
            keep_checkpoint_max=cfg.keep_checkpoint_max)
        ckpoint_cb = ModelCheckpoint(prefix="lstm",
                                     directory=args.ckpt_path,
                                     config=config_ck)
        model.train(cfg.num_epochs, ds_train, callbacks=[ckpoint_cb, loss_cb])
    elif args.mode == 'test':
        print("============== Starting Testing ==============")
        ds_eval = create_dataset(args.preprocess_path, cfg.batch_size, 1,
                                 False)
        param_dict = load_checkpoint(args.ckpt_path)
        load_param_into_net(network, param_dict)
        acc = model.eval(ds_eval)
        print("============== Accuracy:{} ==============".format(acc))
    else:
        raise RuntimeError(
            'mode should be train or test, rather than {}'.format(args.mode))
Beispiel #19
0
class Trainer(DistributedWorker):
    """Trainer class.

    :param model: input model, defaults to None
    :type model: tf model, optional
    :param id: id of the model, defaults to None
    :type id: int, optional
    :param hps: hyperparameters, defaults to None
    :type hps: dict, optional
    """

    config = TrainerConfig()

    def __init__(self, model=None, id=None, hps=None,
                 load_ckpt_flag=False, model_desc=None,
                 lazy_build=True, **kwargs):
        """Initialise trainer state from the arguments and ``self.config``.

        :param model: input model, stored as ``self.model``
        :param id: worker id; when None, the class-level worker counter
            (incremented on every construction) is used instead
        :param hps: hyperparameters, stored as ``self.hps``
        :param load_ckpt_flag: stored as ``self.load_ckpt_flag``
        :param model_desc: stored as ``self.model_desc``
        :param lazy_build: when False, ``init_trainer()`` is called at the
            end of construction
        :param kwargs: stored on ``self.config.kwargs``; the optional
            ``loss_input`` entry is also kept as ``self.loss_input``
        """
        super(Trainer, self).__init__()
        self.worker_type = WorkerTypes.TRAINER
        # The class-level counter is advanced even when an explicit id is given.
        Trainer.__worker_id__ += 1
        if id is not None:
            self._worker_id = id
        else:
            self._worker_id = Trainer.__worker_id__

        # Data members of Trainer
        self.is_chief = True
        self.use_cuda = self.config.cuda
        self.epochs = self.config.epochs
        self.do_validation = True
        self.auto_save_ckpt = True
        self.auto_save_perf = True
        self.skip_train = False
        self.valid_interval = self.config.valid_interval
        self.hps = hps
        self.model = model
        self.model_desc = model_desc
        self.optimizer = None
        self.lr_scheduler = None
        self.loss = None
        self.use_syncbn = self.config.syncbn
        self.use_amp = self.config.amp
        self.train_metrics = None
        self.valid_metrics = None
        self.call_metrics_on_train = self.config.call_metrics_on_train
        self.train_verbose = self.config.train_verbose
        self.valid_verbose = self.config.valid_verbose
        self.train_report_steps = self.config.train_report_steps
        self.valid_report_steps = self.config.valid_report_steps
        self.train_loader = None
        self.valid_loader = None
        self.train_step = None
        self.valid_step = None
        self.make_batch = None
        self.model_fn = None
        self.train_input_fn = None
        self.valid_input_fn = None
        self.callbacks = None
        self.performance = None
        self.runtime = None
        self.visual_data = {}
        self.load_ckpt_flag = load_ckpt_flag
        self.distributed = self.config.distributed
        # Used by TimmTrainerCallbacks since it builds its trainer in
        # the before_train callback.
        # NOTE(review): this is read from the config, not from the
        # ``lazy_build`` argument -- confirm the two are meant to differ.
        self.lazy_built = self.config.lazy_built
        # Indicates whether the necessary components of a trainer
        # have been built for running.
        # NOTE(review): no matching flag is assigned here; this comment may
        # be stale.
        self._world_size = 1
        self._rank_id = 0
        self._local_rank_id = 0
        # NOTE(review): ``config`` is assigned at class level, so this write
        # presumably affects the shared config object -- confirm.
        self.config.kwargs = kwargs
        self.checkpoint_file_name = 'checkpoint.pth'
        self.model_pickle_file_name = 'model.pkl'
        # All output file paths are derived from the local worker path.
        worker_path = self.get_local_worker_path()
        self.model_path = FileOps.join_path(worker_path, self.model_pickle_file_name)
        self.checkpoint_file = FileOps.join_path(worker_path, self.checkpoint_file_name)
        self.weights_file = FileOps.join_path(worker_path, "model_{}.pth".format(self.worker_id))
        self.loss_input = kwargs.get('loss_input', None)
        if not lazy_build:
            self.init_trainer()

    def _set_default_funcs(self):
        """Install the default step/input functions for the active backend.

        The torch backend gets the batch and step functions, the TF backend
        gets the model and input functions; any other backend is left
        untouched.
        """
        if zeus.is_torch_backend():
            self.make_batch = self._default_make_batch
            self.train_step = self._default_train_step
            self.valid_step = self._default_valid_step
            return
        if zeus.is_tf_backend():
            self.model_fn = self._default_model_fn
            self.train_input_fn = self._default_train_input_fn
            self.valid_input_fn = self._default_valid_input_fn

    def _set_condition(self):
        """Run the environment set-up hooks, in this fixed order.

        NOTE(review): every hook is called regardless of the active backend;
        presumably each one is a no-op when its backend is not in use --
        confirm against the hook implementations, which are not visible here.
        """
        self._init_tf_session()
        self._init_distributed_setting()
        self._init_cuda_setting()
        self._init_tf_estimator()
        self._init_ms_context()

    def train_process(self):
        """Run the whole training process of this worker.

        Logging, default functions, environment conditions and callbacks are
        initialised first; the trainer is built here unless ``lazy_built`` is
        set, and then the training loop runs.

        NOTE(review): saving of the model and validation results to the
        local worker path and s3 path presumably happens inside the training
        loop or its callbacks; it is not visible in this method.
        """
        log_level = General.logger.level
        log_name = "log_worker_{}.txt".format(self.worker_id)
        init_log(level=log_level,
                 log_file=log_name,
                 log_path=self.local_log_path)

        self._set_default_funcs()
        self._set_condition()
        self._init_callbacks()
        self.callbacks.init_trainer()

        # When lazy_built is set, building is left to someone else.
        if not self.lazy_built:
            self.build()
        self._train_loop()

    def build(self):
        """Assemble data loaders, optimizer, loss, scheduler and metrics."""
        self._init_hps(self.hps)
        logging.debug("Trainer Config: {}".format(self.config))
        self.do_validation = self.config.with_valid
        self.use_syncbn = self.config.syncbn
        if self.use_syncbn:
            if zeus.is_torch_backend():
                self.model = apex.parallel.convert_syncbn_model(self.model)

        self.train_loader = self._init_dataloader(mode='train')
        self.valid_loader = self._init_dataloader(mode='val')

        # MindSpore datasets report their size through get_dataset_size();
        # the other backends use len().
        if zeus.is_ms_backend():
            self.batch_num_train = self.train_loader.get_dataset_size()
        else:
            self.batch_num_train = len(self.train_loader)
        if zeus.is_ms_backend():
            self.batch_num_valid = self.valid_loader.get_dataset_size()
        else:
            self.batch_num_valid = len(self.valid_loader)

        if zeus.is_torch_backend():
            self.optimizer = Optimizer()(model=self.model,
                                         distributed=self.distributed)
            if not hasattr(self.model, 'add_loss'):
                self.loss = Loss()()
            else:
                extra_loss = Loss()()
                self.model.add_loss(extra_loss)
                self.loss = self.model.overall_loss()
            self.lr_scheduler = LrScheduler()(self.optimizer)
        elif zeus.is_ms_backend():
            self.optimizer = Optimizer()(model=self.model)
            if not hasattr(self.model, 'add_loss'):
                self.loss = Loss()()
            else:
                extra_loss = Loss()()
                self.model.add_loss(extra_loss)
                self.loss = self.model.overall_loss()
            self.metric_name = self.config.metric().type

        # Some trainer has different train batch size from valid batch
        if zeus.is_torch_backend():
            self.train_metrics = self._init_metrics()
        else:
            self.train_metrics = None
        self.valid_metrics = self._init_metrics()
        self._init_horovod_setting()

        # Mixed precision is only wired up for the torch backend.
        if self.use_amp and zeus.is_torch_backend():
            amp_model, amp_optimizer = amp.initialize(
                self.model, self.optimizer, opt_level='O1')
            self.model, self.optimizer = amp_model, amp_optimizer

    def init_trainer(self):
        """Initialise logging, conditions and callbacks, then the train op."""
        log_level = General.logger.level
        log_name = "log_worker_{}.txt".format(self.worker_id)
        init_log(level=log_level,
                 log_file=log_name,
                 log_path=self.local_log_path)

        self._set_default_funcs()
        self._set_condition()
        self._init_callbacks()
        self.callbacks.init_trainer()

        self.init_train_op()

    def init_train_op(self):
        """Create the train op inside the trainer's graph (TF backend only)."""
        if not zeus.is_tf_backend():
            return
        with self.graph.as_default():
            self._init_train_op()

    def train(self, inputs, labels):
        """Run one training step on the TF backend and return its loss.

        ``inputs`` and ``labels`` are fed positionally to ``self.inputs`` and
        ``self.labels``. On any other backend nothing happens and None is
        returned.
        """
        if not zeus.is_tf_backend():
            return None

        feed = {}
        with self.graph.as_default():
            for idx in range(len(inputs)):
                feed[self.inputs[idx]] = inputs[idx]

            for idx in range(len(labels)):
                feed[self.labels[idx]] = labels[idx]

            # Only the loss value of the (train_op, loss) pair is returned.
            results = self.sess.run([self.train_op, self.loss], feed)
            _, loss = results
            return loss

    def predict(self, input):
        """Evaluate ``self.logits`` for ``input`` fed into ``self.input``.

        :param input: value fed to the ``self.input`` placeholder
        :return: result of the session run; ``None`` when the backend is not TensorFlow
        """
        if not zeus.is_tf_backend():
            return None
        with self.graph.as_default():
            return self.sess.run(self.logits, {self.input: input})

    def save(self, file_name):
        """Save the weights to ``file_name + ".npz"`` and return that path.

        :param file_name: target path without the ``.npz`` suffix
        :return: the path written to; ``None`` when the backend is not TensorFlow
        """
        if not zeus.is_tf_backend():
            return None
        npz_path = file_name + ".npz"
        with self.graph.as_default():
            self.actor_var.save_weights(npz_path)
        return npz_path

    def load(self, model_name, by_name):
        """Load weights from the npz file ``model_name`` (TensorFlow backend only).

        :param model_name: path handed to ``set_weights_with_npz``
        :param by_name: accepted but not used by this implementation
        """
        if not zeus.is_tf_backend():
            return
        with self.graph.as_default():
            self.actor_var.set_weights_with_npz(model_name)

    def set_weights(self, weights):
        """Pass in-memory ``weights`` on to ``self.actor_var`` (TensorFlow backend only)."""
        if not zeus.is_tf_backend():
            return
        with self.graph.as_default():
            self.actor_var.set_weights(weights)

    def get_weights(self):
        """Return the weights held by ``self.actor_var``; ``None`` unless the backend is TensorFlow."""
        if not zeus.is_tf_backend():
            return None
        with self.graph.as_default():
            return self.actor_var.get_weights()

    def _create_tensor(self, tensor_list):
        """Create one placeholder with a leading ``None`` dimension per tensor description.

        :param tensor_list: iterable of dicts with the keys ``'type'``, ``'shape'`` and ``'name'``;
            ``'shape'`` is either a sequence of dimensions (list or tuple) or a single dimension
        :return: list of placeholders in the order of ``tensor_list``
        """
        placeholders = []

        for tensor in tensor_list:
            tensor_type = tensor['type']
            tensor_shape = tensor['shape']
            tensor_name = tensor['name']

            # Tuples are flattened the same way as lists; previously a tuple shape
            # ended up nested as (None, (a, b)) instead of (None, a, b).
            if isinstance(tensor_shape, (list, tuple)):
                full_shape = (None, ) + tuple(tensor_shape)
            else:
                full_shape = (None, tensor_shape)
            placeholders.append(
                tf.placeholder(tensor_type, name=tensor_name, shape=full_shape))

        return placeholders

    def _init_train_op(self):
        """Build placeholders, logits, loss, optimizer and the train op.

        Does nothing when ``self.loss_input`` is ``None``. Otherwise it sets
        ``self.inputs``, ``self.labels``, ``self.input``, ``self.logits``,
        ``self.actor_var``, ``self.loss``, ``self.optimizer`` and ``self.train_op``,
        and finally runs the variable initializer in ``self.sess``.
        """
        if self.loss_input is not None:
            # Placeholders described by the 'inputs' / 'labels' entries of loss_input.
            self.inputs = self._create_tensor(self.loss_input['inputs'])
            self.labels = self._create_tensor(self.loss_input['labels'])

            # Only the first input placeholder is fed through the model.
            self.input = self.inputs[0]
            logits = self.model(self.input)
            self.logits = logits
            self.actor_var = TFVariables(logits, self.sess)

            # The loss callable receives the whole list of label placeholders.
            loss = Loss()()
            self.loss = loss(logits, self.labels)

            self.optimizer = Optimizer()(distributed=self.distributed)
            grads_and_var = self.optimizer.compute_gradients(self.loss)
            # Split into gradients and variables and pair them up again as a list.
            grads, var = zip(*grads_and_var)
            grads_and_var = list(zip(grads, var))
            self.train_op = self.optimizer.apply_gradients(grads_and_var)
            self.sess.run(tf.initialize_all_variables())

    def _init_cuda_setting(self):
        """Derive ``config.device`` from ``config.cuda`` and seed CUDA (torch backend only)."""
        if not zeus.is_torch_backend():
            return
        cuda_option = self.config.cuda
        if not cuda_option:
            # CUDA disabled: device is marked as -1 and nothing else is touched.
            self.config.device = -1
            return
        # A plain ``True`` selects device 0; any other truthy value is used as given.
        if cuda_option is True:
            self.config.device = 0
        else:
            self.config.device = cuda_option
        self.use_cuda = True
        if self.distributed:
            torch.cuda.set_device(self._local_rank_id)
        torch.cuda.manual_seed(self.config.seed)

    def _init_distributed_setting(self):
        """Record world size, rank and local rank for a distributed run.

        Does nothing when ``self.distributed`` is false. On an NPU device the
        system init/shutdown ops are created first and the init op is run.
        """
        if not self.distributed:
            return
        if zeus.is_npu_device():
            self.npu_init = npu_ops.initialize_system()
            self.npu_shutdown = npu_ops.shutdown_system()
            self.sess.run(self.npu_init)
        # GPU devices take the values from horovod, everything else from the rank helpers.
        if zeus.is_gpu_device():
            self._world_size = hvd.size()
            self._rank_id = hvd.rank()
            self._local_rank_id = hvd.local_rank()
        else:
            self._world_size = get_rank_size()
            self._rank_id = get_rank_id()
            self._local_rank_id = get_local_rank_id()

    def _init_horovod_setting(self):
        """Broadcast model and optimizer state from rank 0 and set ``self.is_chief``."""
        self.is_chief = True
        if not (self.distributed and zeus.is_torch_backend()):
            return
        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.optimizer, root_rank=0)
        # Only rank 0 is the chief.
        self.is_chief = hvd.rank() == 0

    def _init_hps(self, hps=None):
        """Set ``self.hps`` and apply its ``'trainer'`` section to the config.

        The source is chosen in this order: the ``hps`` argument, ``config.hps_file``,
        then the first ``desc_*.json`` file found in ``config.hps_folder``.

        :param hps: ready-made hyper-parameters; used as-is when not ``None``
        """
        desc_file = None
        if hps is not None:
            self.hps = hps
        elif self.config.hps_file is not None:
            desc_file = self.config.hps_file.replace("{local_base_path}", self.local_base_path)
        elif self.config.hps_folder is not None:
            folder = self.config.hps_folder.replace("{local_base_path}", self.local_base_path)
            candidates = glob.glob(FileOps.join_path(folder, "desc_*.json"))
            desc_file = candidates[0]
        if desc_file is not None:
            self.hps = Config(desc_file)

        if not (self.hps and self.hps.get('trainer')):
            return
        self.config.from_json(self.hps.get('trainer'))
        self.epochs = self.config.epochs

    def _init_metrics(self, metrics=None):
        """Return ``metrics`` when given, otherwise a newly created ``Metrics()``."""
        return Metrics() if metrics is None else metrics

    def _init_dataloader(self, mode, loader=None):
        """Return ``loader`` if given, otherwise build a loader for ``mode``.

        :param mode: dataset mode; ``"train"`` additionally enables the dataset
            hyper-parameters and the distributed setup
        :param loader: ready-made loader returned unchanged when not ``None``
        :return: the loader produced by the backend adapter
        """
        if loader is not None:
            return loader

        # Dataset hyper-parameters are only applied to the training dataset.
        with_hps = (mode == "train"
                    and self.hps is not None
                    and self.hps.get("dataset") is not None)
        dataset_cls = ClassFactory.get_cls(ClassType.DATASET)
        if with_hps:
            dataset = dataset_cls(mode=mode, hps=self.hps.get("dataset"))
        else:
            dataset = dataset_cls(mode=mode)

        if self.distributed and mode == "train":
            dataset.set_distributed(self._world_size, self._rank_id)
        # adapt the dataset to specific backend
        return Adapter(dataset).loader

    def _train_loop(self):
        """Drive training: per-epoch callbacks, the train epoch and optional validation."""
        # Allow user to build trainer in before_train() callback, but they
        # should set lazy_built in configuration file to True
        self.callbacks.before_train()
        if self.skip_train:
            return

        # The MindSpore backend runs the loop body once; the others run it self.epochs times.
        total_rounds = 1 if zeus.is_ms_backend() else self.epochs
        for epoch in range(total_rounds):
            epoch_logs = {'train_num_batches': self.batch_num_train}
            if self.do_validation:
                epoch_logs['valid_num_batches'] = self.batch_num_valid
            self.callbacks.before_epoch(epoch, epoch_logs)

            self._train_epoch()
            if self.do_validation and self._should_run_validation(epoch):
                self._valid_epoch()

            self.callbacks.after_epoch(epoch)

        self.callbacks.after_train()
        if self.distributed:
            self._shutdown_distributed()

    def _train_epoch(self):
        """Run the training work of one loop iteration for the active backend.

        * torch: iterate ``self.train_loader`` and call ``self.train_step`` per
          batch, wrapped by the before/after train-step callbacks.
        * TensorFlow: train ``self.estimator`` for ``len(self.train_loader)`` steps.
        * MindSpore: wrap the network in an ``MsModel`` (stored as ``self.ms_model``)
          and call its ``train`` with ``epoch=self.epochs``.
        """
        if zeus.is_torch_backend():
            self.model.train()
            for batch_index, batch in enumerate(self.train_loader):
                batch = self.make_batch(batch)
                batch_logs = {'train_batch': batch}
                self.callbacks.before_train_step(batch_index, batch_logs)
                train_batch_output = self.train_step(batch)
                # Merge the step result into the logs passed to the after-step callbacks.
                batch_logs.update(train_batch_output)
                if self.config.is_detection_trainer:
                    batch_logs.update({'is_detection_trainer': True})
                self.callbacks.after_train_step(batch_index, batch_logs)
        elif zeus.is_tf_backend():
            self.estimator.train(input_fn=self.train_input_fn,
                                 steps=len(self.train_loader),
                                 hooks=self._init_logging_hook())
        elif zeus.is_ms_backend():
            # The validation metrics are registered under the configured metric name.
            self.ms_model = MsModel(network=self.model,
                                    loss_fn=self.loss,
                                    optimizer=self.optimizer,
                                    metrics={self.metric_name: self.valid_metrics()})
            config_ck = CheckpointConfig(save_checkpoint_steps=self.config.save_steps)
            # save the network model and parameters for subsequence fine-tuning
            save_path = self.get_local_worker_path(self.step_name, self.worker_id)
            ckpoint_cb = ModelCheckpoint(config=config_ck, directory=save_path)
            loss_cb = LossMonitor(per_print_times=self.config.report_freq)
            eval_cb = EvalCallBack(self.ms_model, self.valid_loader)
            # All self.epochs epochs are handed to MindSpore in this single call.
            self.ms_model.train(epoch=self.epochs,
                                train_dataset=self.train_loader,
                                callbacks=[ckpoint_cb, loss_cb, eval_cb],
                                dataset_sink_mode=self.dataset_sink_mode)

    def _valid_epoch(self):
        """Run one validation pass for the active backend, wrapped by the valid callbacks.

        The torch path reports per-step results through the step callbacks and
        passes ``None`` to ``after_valid``; the TensorFlow and MindSpore paths
        pass the aggregated results under ``'cur_valid_perfs'``.
        """
        self.callbacks.before_valid()
        valid_logs = None
        if zeus.is_torch_backend():
            self.model.eval()
            with torch.no_grad():
                for step, raw_batch in enumerate(self.valid_loader):
                    prepared = self.make_batch(raw_batch)
                    self.callbacks.before_valid_step(step, {'valid_batch': prepared})
                    step_output = self.valid_step(prepared)
                    self.callbacks.after_valid_step(step, step_output)
        elif zeus.is_tf_backend():
            eval_metrics = self.estimator.evaluate(input_fn=self.valid_input_fn,
                                                   steps=len(self.valid_loader))
            self.valid_metrics.update(eval_metrics)
            valid_logs = {'cur_valid_perfs': self.valid_metrics.results}
        elif zeus.is_ms_backend():
            eval_metrics = self.ms_model.eval(valid_dataset=self.valid_loader,
                                              dataset_sink_mode=self.dataset_sink_mode)
            self.valid_metrics.update(eval_metrics)
            valid_logs = {'cur_valid_perfs': self.valid_metrics.results}
        self.callbacks.after_valid(valid_logs)

    def _default_make_batch(self, batch):
        """Split ``batch`` into ``(input, target)``, moving both to CUDA when enabled.

        The CUDA transfer is skipped for detection trainers.
        """
        data, target = batch
        if not self.use_cuda or self.config.is_detection_trainer:
            return (data, target)
        return (data.cuda(), target.cuda())

    def _default_train_step(self, batch):
        """Run forward, backward and optimizer step on one ``(input, target)`` batch.

        :param batch: pair of model input and target
        :return: dict with ``'loss'`` (``loss.item()``), ``'train_batch_output'``
            (the model output) and ``'lr'`` (``self.lr_scheduler.get_lr()``)
        """
        features, target = batch
        self.optimizer.zero_grad()
        prediction = self.model(features)
        loss = self.loss(prediction, target)

        if not self.use_amp:
            loss.backward()
            # Gradient clipping is only applied on this non-AMP path.
            if self.config.grad_clip:
                torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(), self.config.grad_clip)
            self.optimizer.step()
        else:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
                self.optimizer.synchronize()
            with self.optimizer.skip_synchronize():
                self.optimizer.step()

        step_result = {'loss': loss.item(),
                       'train_batch_output': prediction,
                       'lr': self.lr_scheduler.get_lr()}
        return step_result

    def _default_valid_step(self, batch):
        """Run the model on the input of ``batch`` and return it as ``'valid_batch_output'``.

        Detection trainers call the model with ``forward_train=False``.
        """
        features, _target = batch
        extra_kwargs = {'forward_train': False} if self.config.is_detection_trainer else {}
        return {'valid_batch_output': self.model(features, **extra_kwargs)}

    def _init_minimize_op(self, loss, global_step, var_list=None):
        """Build the op that applies gradients of ``loss``, with optional loss scaling.

        With AMP enabled and a loss scale other than 1, gradients are computed on
        the scaled loss and divided by the scale again before being applied.

        :param loss: loss to differentiate
        :param global_step: passed through to ``apply_gradients``
        :param var_list: optional variables handed to ``compute_gradients``
        :return: result of ``self.optimizer.apply_gradients``
        """
        loss_scale = self.config.loss_scale if self.use_amp else 1.
        if loss_scale == 1:
            grad_vars = self.optimizer.compute_gradients(loss, var_list=var_list)
        else:
            scaled = self.optimizer.compute_gradients(loss * loss_scale, var_list=var_list)
            # Missing gradients (None) are kept as they are.
            grad_vars = [(grad, var) if grad is None else (grad / loss_scale, var)
                         for grad, var in scaled]
        return self.optimizer.apply_gradients(grad_vars, global_step)

    def _default_train_input_fn(self):
        """Return the result of ``input_fn()`` of the train loader."""
        loader = self.train_loader
        return loader.input_fn()

    def _default_valid_input_fn(self):
        """Return the result of ``input_fn()`` of the valid loader."""
        loader = self.valid_loader
        return loader.input_fn()

    def _default_model_fn(self, features, labels, mode):
        """Define model_fn used by TensorFlow Estimator.

        Builds the logits and the loss for every mode, the train op only in
        TRAIN mode and the metric ops only in EVAL mode. As a side effect it
        sets ``self.loss`` and, in TRAIN mode, ``self.optimizer`` and
        ``self.lr_scheduler``.

        :params features: input features
        :type features: tensorflow tensors
        :params labels: label data
        :type labels: tensorflow tensors
        :params mode: mode of estimator
        :type mode: tf.estimator.ModeKeys
        :return: tensorflow EstimatorSpec
        :rtype: tf.estimator.EstimatorSpec
        """
        logging.info('model function action')
        # Tell the model whether it is being built for training.
        self.model.training = mode == tf.estimator.ModeKeys.TRAIN
        logits = self.model(features)
        # The loss is always computed on float32 logits.
        logits = tf.cast(logits, tf.float32)
        # Models exposing add_loss combine the configured loss into their overall loss.
        if hasattr(self.model, 'add_loss'):
            loss_cls = Loss()()
            self.model.add_loss(loss_cls)
            self.loss = self.model.overall_loss()
        else:
            self.loss = Loss()()
        loss = self.loss(logits, labels)
        train_op = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            global_step = tf.compat.v1.train.get_or_create_global_step()
            # Fractional epoch: global step divided by the number of batches in train_loader.
            epoch = tf.cast(global_step, tf.float32) / tf.cast(len(self.train_loader), tf.float32)
            self.optimizer = Optimizer()(distributed=self.distributed)
            self.lr_scheduler = LrScheduler()(optimizer=self.optimizer)
            self.lr_scheduler.step(epoch)
            update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
            # Loss scaling is only used together with AMP.
            loss_scale = self.config.loss_scale if self.use_amp else 1
            minimize_op = self.optimizer.step(loss, loss_scale, global_step)
            # Group the minimize op with the ops collected under UPDATE_OPS.
            train_op = tf.group(minimize_op, update_ops)

        eval_metric_ops = None
        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metric_ops = self.valid_metrics(logits, labels)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op,
                                          eval_metric_ops=eval_metric_ops)

    def _should_run_validation(self, epoch):
        """Tell whether validation runs after ``epoch`` (0-based).

        Validation runs every ``self.valid_interval`` epochs and after the last epoch.
        """
        # Zero valid_interval means doesn't run _valid_loop of the trainer
        # and user may provide _valid_loop in other callbacks
        if self.valid_interval == 0:
            return False
        on_interval = epoch % self.valid_interval == 0
        return on_interval or (epoch + 1) == self.epochs

    def _init_callbacks(self):
        """Create ``self.callbacks`` from the configured callbacks and attach this trainer."""
        custom_callbacks = self.config.callbacks or []
        # A single configured callback is wrapped into a list.
        if custom_callbacks and not isinstance(custom_callbacks, list):
            custom_callbacks = [custom_callbacks]
        disabled = [] if self.config.model_statistics else ['ModelStatistics']
        self.callbacks = CallbackList(custom_callbacks, disabled)
        self.callbacks.set_trainer(self)

    def _metric_average(self, val, name):
        """Reduce a metric value across workers with ``hvd.allreduce``.

        :param val: input value
        :param name: metric name, passed to ``hvd.allreduce`` as ``name``
        :return: the reduced value as returned by ``.item()``
        """
        reduced = hvd.allreduce(torch.tensor(val), name=name)
        return reduced.item()

    @property
    def _first_rank(self):
        """Whether this is rank 0; always ``True`` for non-distributed runs."""
        return not (self.distributed and hvd.rank() != 0)

    def _backup(self):
        """Copy the local worker folder below ``self.backup_base_path``.

        Nothing is copied unless ``self.need_backup`` is exactly ``True`` and a
        backup base path is set.
        """
        if self.need_backup is not True or self.backup_base_path is None:
            return
        destination = FileOps.join_path(
            self.backup_base_path, self.get_worker_subpath())
        source = self.get_local_worker_path(self.step_name, self.worker_id)
        FileOps.copy_folder(source, destination)

    def _save_visual_data(self, is_train=True, pfms=None, loss=None, lr=None):
        """Flatten performance values, loss and lr into ``self.visual_data``.

        Keys are prefixed with ``t_`` for training and ``v_`` for validation
        values. List values are stored as ``<prefix>_<name>_<index>``, dict
        values as ``<prefix>_<name>_<key>``, other non-``None`` values as
        ``<prefix>_<name>``.

        :param is_train: selects the ``t`` (True) or ``v`` (False) key prefix
        :param pfms: mapping of metric name to value; ``None`` is treated as empty
        :param loss: stored under ``"loss"`` when not ``None``
        :param lr: stored under ``"lr"`` when not ``None``
        """
        # TODO Will move to metric base class later.
        prefix = "t" if is_train else "v"
        for metric_name, value in (pfms or {}).items():
            base_name = "{}_{}".format(prefix, metric_name)
            if isinstance(value, list):
                # Build each key from the base name so suffixes do not pile up
                # ("t_x_0", "t_x_1" rather than "t_x_0", "t_x_0_1").
                for i, _item in enumerate(value):
                    self.visual_data["{}_{}".format(base_name, i)] = _item.data.item()
            elif isinstance(value, dict):
                # items() is required to get (key, value) pairs.
                for k, v in value.items():
                    self.visual_data["{}_{}".format(base_name, k)] = v
            elif value is not None:
                self.visual_data[base_name] = value.data.item()
        if loss is not None:
            self.visual_data["loss"] = loss
        if lr is not None:
            self.visual_data["lr"] = lr

    def _init_tf_estimator(self):
        """Create the estimator matching the device type (TensorFlow backend only)."""
        if not zeus.is_tf_backend():
            return
        sess_config = self._init_session_config()
        # GPU is checked first; devices that are neither GPU nor NPU get no estimator.
        if zeus.is_gpu_device():
            build_estimator = self._init_gpu_estimator
        elif zeus.is_npu_device():
            build_estimator = self._init_npu_estimator
        else:
            return
        build_estimator(sess_config)

    def _init_tf_session(self):
        """Create ``self.graph`` and a session bound to it (TensorFlow backend only)."""
        if not zeus.is_tf_backend():
            return
        session_config = self._init_session_config()
        self.graph = tf.Graph()
        # The session is created while the new graph is the default graph.
        with self.graph.as_default():
            self.sess = tf.compat.v1.Session(config=session_config)

    def _init_session_config(self):
        """Return the GPU session config on GPU devices, otherwise the NPU one."""
        if zeus.is_gpu_device():
            return self._init_gpu_session_config()
        return self._init_npu_session_config()

    def _init_logging_hook(self):
        """Return the training hooks: a rank-0 broadcast hook for distributed GPU runs, else none."""
        hooks = []
        if zeus.is_gpu_device() and self.distributed:
            hooks.append(hvd.BroadcastGlobalVariablesHook(0))
        return hooks

    def _init_gpu_estimator(self, sess_config):
        """Create ``self.estimator`` for GPU devices.

        A ``MirroredStrategy`` is used for non-distributed parallel runs with
        more than one device per trainer; in that case no session config is
        passed to the run config.

        :param sess_config: session config used when no distribution strategy is set
        """
        use_mirrored = (not self.distributed
                        and General._parallel
                        and General.devices_per_trainer > 1)
        distribution = tf.contrib.distribute.MirroredStrategy() if use_mirrored else None
        run_config = tf.estimator.RunConfig(
            model_dir=self.get_local_worker_path(),
            save_checkpoints_steps=self.config.save_steps,
            log_step_count_steps=self.config.report_freq,
            session_config=None if distribution else sess_config,
            train_distribute=distribution)
        self.estimator = tf.estimator.Estimator(model_fn=self.model_fn,
                                                config=run_config)

    def _init_npu_estimator(self, sess_config):
        """Create ``self.estimator`` as an ``NPUEstimator`` writing to the local worker path.

        :param sess_config: session config passed on to the NPU run config
        """
        run_config = NPURunConfig(
            model_dir=self.get_local_worker_path(),
            save_checkpoints_steps=self.config.save_steps,
            log_step_count_steps=self.config.report_freq,
            session_config=sess_config,
            enable_data_pre_proc=True,
            iterations_per_loop=1)
        self.estimator = NPUEstimator(model_fn=self.model_fn, config=run_config)

    def _init_gpu_session_config(self):
        """Return a session config with GPU memory growth enabled.

        For distributed runs the visible device list is set to the local horovod rank.
        """
        gpu_config = tf.compat.v1.ConfigProto()
        gpu_options = gpu_config.gpu_options
        gpu_options.allow_growth = True
        if self.distributed:
            gpu_options.visible_device_list = str(hvd.local_rank())
        return gpu_config

    def _init_npu_session_config(self):
        """Return a session config with the ``NpuOptimizer`` custom optimizer registered.

        Remapping is switched off, ``use_off_line`` is enabled, and mixed
        precision is allowed when AMP is in use.
        """
        npu_config = tf.ConfigProto()
        rewrite_options = npu_config.graph_options.rewrite_options
        rewrite_options.remapping = RewriterConfig.OFF
        npu_optimizer = rewrite_options.custom_optimizers.add()
        npu_optimizer.name = "NpuOptimizer"
        if self.use_amp:
            npu_optimizer.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
        npu_optimizer.parameter_map["use_off_line"].b = True
        # Further options that are currently left disabled:
        # custom_op.parameter_map['hcom_parallel'].b = True
        # custom_op.parameter_map["enable_data_pre_proc"].b = True
        # custom_op.parameter_map["mix_compile_mode"].b = True  # mixed calculation
        # custom_op.parameter_map["min_group_size"].b = 1
        return npu_config

    def _init_ms_context(self):
        """Select the MindSpore device target and the dataset sink mode (MindSpore backend only).

        NPU devices use the ``"Ascend"`` target with sink mode on; everything
        else uses ``"CPU"`` with sink mode off. Graph mode is used in both cases.
        """
        if not zeus.is_ms_backend():
            return
        target = "Ascend" if zeus.is_npu_device() else "CPU"
        context.set_context(mode=context.GRAPH_MODE, device_target=target)
        self.dataset_sink_mode = bool(zeus.is_npu_device())

    def _shutdown_distributed(self):
        """Run the NPU shutdown op and close the session for distributed NPU runs."""
        if not (zeus.is_npu_device() and self.distributed):
            return
        self.sess.run(self.npu_shutdown)
        self.sess.close()
Beispiel #20
0
    def valid(self, valid_loader):
        """Evaluate ``self.model`` on ``valid_loader`` with the active backend.

        :param valid_loader: valid data loader
        :return: the metric results; ``"latency"`` (elapsed seconds divided by the
            number of samples) is added when ``self.config.evaluate_latency`` is set.
            The latency is reported as ``0.0`` when no sample was evaluated.
        :raises ValueError: torch backend only, when a batch is neither a list nor a tuple
        """
        if zeus.is_torch_backend():
            import torch
            from zeus.metrics.pytorch import Metrics
            metrics = Metrics(self.config.metric)
            self.model.eval()
            data_num = 0
            latency_sum = 0.0
            with torch.no_grad():
                for step, batch in enumerate(valid_loader):
                    if not isinstance(batch, (list, tuple)):
                        raise ValueError("The dataset format must be tuple or list,"
                                         "but get {}.".format(type(batch)))
                    data, target = batch[0], batch[1]
                    if self.config.cuda:
                        data, target = data.cuda(), target.cuda()
                        self.model = self.model.cuda()
                    # Only the forward pass counts towards the latency.
                    time_start = time.time()
                    logits = self.model(data)
                    latency_sum += time.time() - time_start
                    metrics(logits, target)
                    data_num += data.size(0)
                    if step % self.config.report_freq == 0:
                        logging.info("step [{}/{}], valid metric [{}]".format(
                            step + 1, len(valid_loader), str(metrics.results)))
            # An empty loader must not end in a division by zero.
            latency = latency_sum / data_num if data_num else 0.0
        elif zeus.is_tf_backend():
            from zeus.metrics.tensorflow.metrics import Metrics
            metrics = Metrics(self.config.metric)
            estimator = self._init_tf_estimator()
            time_start = time.time()
            eval_metrics = estimator.evaluate(input_fn=valid_loader.input_fn, steps=len(valid_loader))
            elapsed = time.time() - time_start
            sample_num = len(valid_loader) * valid_loader.args.batch_size
            latency = elapsed / sample_num if sample_num else 0.0
            metrics.update(eval_metrics)
        elif zeus.is_ms_backend():
            from zeus.metrics.mindspore.metrics import Metrics
            from mindspore.train import Model as MsModel
            from .utils import FakeLoss
            metrics = Metrics(self.config.metric)
            metric_name = self.config.metric().type
            dataset_sink_mode = True if zeus.is_npu_device() else False
            # The loss function is not needed for evaluation, but the model
            # cannot be initialized with loss_fn=None, hence the placeholder loss.
            ms_model = MsModel(network=self.model,
                               loss_fn=FakeLoss(),
                               metrics={metric_name: metrics()})
            time_start = time.time()
            eval_metrics = ms_model.eval(valid_dataset=valid_loader,
                                         callbacks=None,
                                         dataset_sink_mode=dataset_sink_mode)
            # The batch size is read from the first batch; it stays 0 for an empty dataset.
            batch_size = 0
            for batch in valid_loader.create_dict_iterator():
                batch_size = batch["image"].shape[0]
                break
            elapsed = time.time() - time_start
            sample_num = valid_loader.get_dataset_size() * batch_size
            latency = elapsed / sample_num if sample_num else 0.0
            metrics.update(eval_metrics)
        pfms = metrics.results
        if self.config.evaluate_latency:
            pfms["latency"] = latency
        logging.info("evaluate performance: {}".format(pfms))
        return pfms
Beispiel #21
0
# Train the model.
model = Model(network, loss_fn=net_loss, optimizer=net_opt, metrics={"acc"})
# per_print_times is cfg.train_size / cfg.batch_size, truncated to an int.
loss_cb = LossMonitor(per_print_times=int(cfg.train_size / cfg.batch_size))
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
                             keep_checkpoint_max=cfg.keep_checkpoint_max)
ckpoint_cb = ModelCheckpoint(prefix=cfg.output_prefix,
                             directory=cfg.output_directory,
                             config=config_ck)
print("============== Starting Training ==============")
model.train(cfg.epoch_size,
            ds_train,
            callbacks=[ckpoint_cb, loss_cb],
            dataset_sink_mode=True)

# Evaluate the model on the test set and print the overall accuracy.
metric = model.eval(ds_test)
print(metric)

# Predict.
# Take the first item of the test set's dict iterator; 'x' holds the inputs, 'y' the labels.
test_ = ds_test.create_dict_iterator().get_next()
test = Tensor(test_['x'], mindspore.float32)
predictions = model.predict(test)
# Apply softmax to the model output before picking the arg-max class.
softmax = nn.Softmax()
predictions = softmax(predictions)
predictions = predictions.asnumpy()
# Print predicted vs. true label for 15 samples.
# NOTE(review): assumes the item holds at least 15 rows — confirm against the batch size.
for i in range(15):
    p_np = predictions[i, :]
    p_list = p_np.tolist()
    print('第' + str(i) + '个sample预测结果:', p_list.index(max(p_list)), '   真实结果:',
          test_['y'][i])
Beispiel #22
0
                        device_target=args.device_target)
    ds_eval = create_dataset(os.path.join(args.data_path, "test"),
                             cfg.batch_size, 1)

    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)
    # convert fusion network to quantization aware network
    network = quant.convert_quant_network(network,
                                          quant_delay=0,
                                          bn_fold=False,
                                          freeze_bn=10000,
                                          per_channel=[True, False])

    # define loss
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # define network optimization
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)

    # call back and monitor
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    # load quantization aware network checkpoint
    param_dict = load_checkpoint(args.ckpt_path)
    not_load_param = load_param_into_net(network, param_dict)
    if not_load_param:
        raise ValueError("Load param into net fail!")

    print("============== Starting Testing ==============")
    acc = model.eval(ds_eval, dataset_sink_mode=True)
    print("============== {} ==============".format(acc))
Beispiel #23
0
class TrainerMs(TrainerBase):
    """Trainer mindspore class."""

    def build(self):
        """Build the trainer: base-class build, then optimizer, loss and metrics."""
        super().build()

        self.optimizer = Optimizer()(model=self.model)
        # Models exposing add_loss combine the configured loss into their overall loss.
        if not hasattr(self.model, 'add_loss'):
            self.loss = Loss()()
        else:
            self.model.add_loss(Loss()())
            self.loss = self.model.overall_loss()
        self.metric_name = self.config.metric.type

        # Some trainer has different train batch size from valid batch
        self.train_metrics = None
        self.valid_metrics = self._init_metrics()

    def _set_condition(self):
        """Apply the distributed setting, then the MindSpore context."""
        self._init_distributed_setting()
        self._init_ms_context()

    def _train_epoch(self):
        """Wrap the network in an ``MsModel`` and train it for ``self.epochs`` epochs."""
        self.ms_model = MsModel(network=self.model,
                                loss_fn=self.loss,
                                optimizer=self.optimizer,
                                metrics={self.metric_name: self.valid_metrics()})
        checkpoint_config = CheckpointConfig(save_checkpoint_steps=self.config.save_steps)
        # save the network model and parameters for subsequence fine-tuning
        save_path = self.get_local_worker_path(self.step_name, self.worker_id)
        train_callbacks = [
            ModelCheckpoint(config=checkpoint_config, directory=save_path),
            LossMonitor(per_print_times=self.config.report_freq),
            EvalCallBack(self.ms_model, self.valid_loader, self.dataset_sink_mode),
        ]
        self.ms_model.train(epoch=self.epochs,
                            train_dataset=self.train_loader,
                            callbacks=train_callbacks,
                            dataset_sink_mode=self.dataset_sink_mode)

    def _valid_epoch(self):
        """Evaluate ``self.ms_model`` and report the results to the valid callbacks."""
        self.callbacks.before_valid()

        eval_metrics = self.ms_model.eval(valid_dataset=self.valid_loader,
                                          dataset_sink_mode=self.dataset_sink_mode)
        self.valid_metrics.update(eval_metrics)

        self.callbacks.after_valid({'cur_valid_perfs': self.valid_metrics.results})

    def _init_distributed_setting(self):
        """Distributed hook; it performs no setup in either case."""
        if not self.distributed:
            return
        # No additional setup is performed for distributed runs here.

    def _init_ms_context(self):
        """Select the device target (``"Ascend"`` on NPU, else ``"CPU"``) and the sink mode."""
        target = "Ascend" if zeus.is_npu_device() else "CPU"
        context.set_context(mode=context.GRAPH_MODE, device_target=target)
        self.dataset_sink_mode = bool(zeus.is_npu_device())
Beispiel #24
0
                        path where the trained ckpt file')
    parser.add_argument('--dataset_sink_mode',
                        type=bool,
                        default=False,
                        help='dataset_sink_mode is False or True')

    args = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)

    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                                sparse=True,
                                                reduction="mean")
    repeat_size = cfg.epoch_size
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Testing ==============")
    param_dict = load_checkpoint(args.ckpt_path)
    load_param_into_net(network, param_dict)
    ds_eval = create_dataset(os.path.join(args.data_path, "test"),
                             cfg.batch_size, 1)
    acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode)
    print("============== {} ==============".format(acc))
Beispiel #25
0
    # apply DatasetOps
    buffer_size = 10000
    mnist_ds = mnist_ds.shuffle(
        buffer_size=buffer_size)  # 10000 as in LeNet train script
    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
    mnist_ds = mnist_ds.repeat(repeat_size)

    return mnist_ds


# Script entry: train LeNet5 for one epoch, evaluate it and check the accuracy.
if __name__ == "__main__":
    # LeNet5 constructed with 10 as its argument.
    network = LeNet5(10)
    network.set_param_ps()
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                                sparse=True,
                                                reduction="mean")
    # Momentum optimizer; positional arguments 0.01 and 0.9 (presumably learning rate and momentum).
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    # Train for one epoch without dataset sink mode.
    ds_train = create_dataset(os.path.join(dataset_path, "train"), 32, 1)
    model.train(1,
                ds_train,
                callbacks=[LossMonitor()],
                dataset_sink_mode=False)

    # Evaluate on the "test" split of the same dataset path.
    ds_eval = create_dataset(os.path.join(dataset_path, "test"), 32, 1)
    acc = model.eval(ds_eval, dataset_sink_mode=False)

    print("Accuracy:", acc['Accuracy'])
    # The run only counts as successful above 0.93 accuracy.
    assert acc['Accuracy'] > 0.93