Example #1
 def _get_aleatoric_uncertainty_model(self):
     """
     Get the model which can obtain the aleatoric uncertainty.
     """
     if self.ale_uncer_model is None:
         self.ale_uncer_model = AleatoricUncertaintyModel(
             self.ale_model, self.num_classes, self.task_type)
         net_loss = AleatoricLoss(self.task_type)
         net_opt = Adam(self.ale_uncer_model.trainable_params())
         if self.task_type == 'classification':
             model = Model(self.ale_uncer_model,
                           net_loss,
                           net_opt,
                           metrics={"Accuracy": Accuracy()})
         else:
             model = Model(self.ale_uncer_model,
                           net_loss,
                           net_opt,
                           metrics={"MSE": MSE()})
         if self.save_model:
             config_ck = CheckpointConfig(keep_checkpoint_max=self.epochs)
             ckpoint_cb = ModelCheckpoint(
                 prefix='checkpoint_ale_uncer_model',
                 directory=self.ale_uncer_model_path,
                 config=config_ck)
             model.train(self.epochs,
                         self.ale_train_dataset,
                         callbacks=[ckpoint_cb, LossMonitor()])
         elif self.ale_uncer_model_path is None:
             model.train(self.epochs,
                         self.ale_train_dataset,
                         callbacks=[LossMonitor()])
         else:
             uncer_param_dict = load_checkpoint(self.ale_uncer_model_path)
             load_param_into_net(self.ale_uncer_model, uncer_param_dict)
 def _get_epistemic_uncertainty_model(self):
     """
     Get the model which can obtain the epistemic uncertainty.
     """
     if self.epi_uncer_model is None:
         self.epi_uncer_model = EpistemicUncertaintyModel(self.epi_model)
         if self.epi_uncer_model.drop_count == 0 and self.epi_train_dataset is not None:
             if self.task_type == 'classification':
                 net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
                 net_opt = Adam(self.epi_uncer_model.trainable_params())
                 model = Model(self.epi_uncer_model, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
             else:
                 net_loss = MSELoss()
                 net_opt = Adam(self.epi_uncer_model.trainable_params())
                 model = Model(self.epi_uncer_model, net_loss, net_opt, metrics={"MSE": MSE()})
             if self.save_model:
                 config_ck = CheckpointConfig(keep_checkpoint_max=self.epochs)
                 ckpoint_cb = ModelCheckpoint(prefix='checkpoint_epi_uncer_model',
                                              directory=self.epi_uncer_model_path,
                                              config=config_ck)
                 model.train(self.epochs, self.epi_train_dataset, dataset_sink_mode=False,
                             callbacks=[ckpoint_cb, LossMonitor()])
             elif self.epi_uncer_model_path is None:
                 model.train(self.epochs, self.epi_train_dataset, dataset_sink_mode=False,
                             callbacks=[LossMonitor()])
             else:
                 uncer_param_dict = load_checkpoint(self.epi_uncer_model_path)
                 load_param_into_net(self.epi_uncer_model, uncer_param_dict)
Example #3
def train_bert():
    """train bert"""
    context.set_context(mode=context.GRAPH_MODE)
    context.set_context(device_target="Ascend")
    context.set_context(enable_task_sink=True)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    ds = create_train_dataset(bert_net_cfg.batch_size)
    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)
    optimizer = Lamb(netwithloss.trainable_params(),
                     decay_steps=bert_train_cfg.decay_steps,
                     start_learning_rate=bert_train_cfg.start_learning_rate,
                     end_learning_rate=bert_train_cfg.end_learning_rate,
                     power=bert_train_cfg.power,
                     warmup_steps=bert_train_cfg.num_warmup_steps,
                     decay_filter=lambda x: False)
    netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    config_ck = CheckpointConfig(
        save_checkpoint_steps=bert_train_cfg.save_checkpoint_steps,
        keep_checkpoint_max=bert_train_cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=bert_train_cfg.checkpoint_prefix,
                                 config=config_ck)
    model.train(ds.get_repeat_count(),
                ds,
                callbacks=[LossMonitor(), ckpoint_cb],
                dataset_sink_mode=False)
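Most examples on this page share the same wiring: build a Model from a network, loss function, and optimizer, then pass ModelCheckpoint (configured through CheckpointConfig) and LossMonitor as callbacks to model.train. The following is a minimal, self-contained sketch of that pattern; the Dense network, the random NumpySlicesDataset, the checkpoint settings, and the MindSpore 1.x-style import paths are illustrative assumptions, not code taken from any example here.

import numpy as np
import mindspore.nn as nn
import mindspore.dataset as ds
from mindspore.train import Model
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor

def toy_dataset(num_samples=64, batch_size=8):
    """Random 10-feature samples with integer labels in [0, 3)."""
    data = np.random.randn(num_samples, 10).astype(np.float32)
    label = np.random.randint(0, 3, num_samples).astype(np.int32)
    return ds.NumpySlicesDataset({"data": data, "label": label}, shuffle=True).batch(batch_size)

net = nn.Dense(10, 3)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)
model = Model(net, net_loss, net_opt)

# save a checkpoint every 4 steps, keep at most 3 checkpoint files, and print the loss
config_ck = CheckpointConfig(save_checkpoint_steps=4, keep_checkpoint_max=3)
ckpoint_cb = ModelCheckpoint(prefix="toy", directory="./ckpt", config=config_ck)
model.train(2, toy_dataset(), callbacks=[ckpoint_cb, LossMonitor()], dataset_sink_mode=False)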
Example #4
def train_lenet_quant():
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = quant_cfg
    ckpt_path = './ckpt_lenet_noquant-10_1875.ckpt'
    ds_train = create_dataset(os.path.join(data_path, "train"), cfg.batch_size, 1)
    step_size = ds_train.get_dataset_size()

    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)

    # load quantization aware network checkpoint
    param_dict = load_checkpoint(ckpt_path)
    load_nonquant_param_into_quant_net(network, param_dict)

    # convert fusion network to quantization aware network
    network = quant.convert_quant_network(network, quant_delay=900, bn_fold=False, per_channel=[True, False],
                                          symmetric=[False, False])

    # define network loss
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # define network optimization
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)

    # call back and monitor
    config_ckpt = CheckpointConfig(save_checkpoint_steps=cfg.epoch_size * step_size,
                                   keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_callback = ModelCheckpoint(prefix="ckpt_lenet_quant", config=config_ckpt)

    # define model
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training ==============")
    model.train(cfg['epoch_size'], ds_train, callbacks=[ckpt_callback, LossMonitor()],
                dataset_sink_mode=True)
    print("============== End Training ==============")
Example #5
def train_net(network, model, args, ckpoint_cb, sink_mode):
    """Define the training method."""
    print("============== Starting Training ==============")
    # load training dataset
    ds_train = create_dataset(os.path.join(args.data_dir, "train"),
                              args.batch_size, args.repeat_size)

    callbacks = [
        # ckpoint_cb,
        LossMonitor(per_print_times=20),
    ]

    if args.use_kungfu:
        if args.use_kungfu_elastic:
            from kungfu_mindspore_callbacks import KungFuElasticCallback
            schedule = {
                10: 2,
                20: 3,
                30: 4,
                40: 1,
                50: 2,
                60: 3,
                70: 4,
                80: 1,
            }
            kungfu_elastic_callback = KungFuElasticCallback(schedule)
            callbacks.append(kungfu_elastic_callback)

    log_callbacks(callbacks)
    print('sink_mode: %s' % (sink_mode))
    model.train(args.epoch_size,
                ds_train,
                callbacks=callbacks,
                dataset_sink_mode=sink_mode)
Example #6
 def _train_epoch(self):
     if zeus.is_torch_backend():
         self.model.train()
         for batch_index, batch in enumerate(self.train_loader):
             batch = self.make_batch(batch)
             batch_logs = {'train_batch': batch}
             self.callbacks.before_train_step(batch_index, batch_logs)
             train_batch_output = self.train_step(batch)
             batch_logs.update(train_batch_output)
             if self.config.is_detection_trainer:
                 batch_logs.update({'is_detection_trainer': True})
             self.callbacks.after_train_step(batch_index, batch_logs)
     elif zeus.is_tf_backend():
         self.estimator.train(input_fn=self.train_input_fn,
                              steps=len(self.train_loader),
                              hooks=self._init_logging_hook())
     elif zeus.is_ms_backend():
         self.ms_model = MsModel(network=self.model,
                                 loss_fn=self.loss,
                                 optimizer=self.optimizer,
                                 metrics={self.metric_name: self.valid_metrics()})
         config_ck = CheckpointConfig(save_checkpoint_steps=self.config.save_steps)
         # save the network model and parameters for subsequence fine-tuning
         save_path = self.get_local_worker_path(self.step_name, self.worker_id)
         ckpoint_cb = ModelCheckpoint(config=config_ck, directory=save_path)
         loss_cb = LossMonitor(per_print_times=self.config.report_freq)
         eval_cb = EvalCallBack(self.ms_model, self.valid_loader)
         self.ms_model.train(epoch=self.epochs,
                             train_dataset=self.train_loader,
                             callbacks=[ckpoint_cb, loss_cb, eval_cb],
                             dataset_sink_mode=self.dataset_sink_mode)
Example #7
def train_net(model, epoch_size, data_path, ckpoint_cb, sink_mode):
    """train_net"""
    ds_train = create_dataset(os.path.join(data_path, "train"), 32)
    model.train(epoch_size,
                ds_train,
                callbacks=[ckpoint_cb, LossMonitor(125)],
                dataset_sink_mode=sink_mode)
Example #8
def mnist_train(epoch_size, batch_size, lr, momentum):
    mnist_path = "./MNIST_unzip/"
    ds = generate_mnist_dataset(os.path.join(mnist_path, "train"),
                                batch_size=batch_size,
                                repeat_size=1)

    network = LeNet5()
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                                sparse=True,
                                                reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), lr, momentum)
    config_ck = CheckpointConfig(save_checkpoint_steps=1875,
                                 keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory="./trained_ckpt_file/",
                                 config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    LOGGER.info(TAG, "============== Starting Training ==============")
    model.train(epoch_size,
                ds,
                callbacks=[ckpoint_cb, LossMonitor()],
                dataset_sink_mode=False)

    LOGGER.info(TAG, "============== Starting Testing ==============")
    ckpt_file_name = "trained_ckpt_file/checkpoint_lenet-10_1875.ckpt"
    param_dict = load_checkpoint(ckpt_file_name)
    load_param_into_net(network, param_dict)
    ds_eval = generate_mnist_dataset(os.path.join(mnist_path, "test"),
                                     batch_size=batch_size)
    acc = model.eval(ds_eval, dataset_sink_mode=False)
    LOGGER.info(TAG, "============== Accuracy: %s ==============", acc)
Example #9
def train(data_dir, lr=0.01, momentum=0.9, num_epochs=2, ckpt_name="lenet"):
    dataset_sink = context.get_context('device_target') == 'Ascend'
    repeat = num_epochs if dataset_sink else 1
    ds_train = create_dataset(data_dir, repeat=repeat)
    ds_eval = create_dataset(data_dir, training=False)
    steps_per_epoch = ds_train.get_dataset_size()

    net = LeNet5()
    loss = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                                 sparse=True,
                                                 reduction='mean')
    opt = nn.Momentum(net.trainable_params(), lr, momentum)

    ckpt_cfg = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                keep_checkpoint_max=5)
    ckpt_cb = ModelCheckpoint(prefix=ckpt_name,
                              directory='ckpt',
                              config=ckpt_cfg)
    loss_cb = LossMonitor(steps_per_epoch)

    model = Model(net, loss, opt, metrics={'acc', 'loss'})
    model.train(num_epochs,
                ds_train,
                callbacks=[ckpt_cb, loss_cb],
                dataset_sink_mode=dataset_sink)
    metrics = model.eval(ds_eval, dataset_sink_mode=dataset_sink)
    print('Metrics:', metrics)
Example #10
def train(model,
          dataset_direct,
          filename,
          columns_list,
          num_consumer=4,
          batch=16,
          epoch=50,
          save_checkpoint_steps=2172,
          keep_checkpoint_max=50,
          prefix="model",
          directory='./'):
    """
    train network
    """
    config_ck = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                 keep_checkpoint_max=keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=prefix,
                                 directory=directory,
                                 config=config_ck)
    data_train = create_dataset(dataset_direct, filename, batch, columns_list,
                                num_consumer)

    model.train(epoch,
                data_train,
                callbacks=[
                    ckpoint_cb,
                    LossMonitor(per_print_times=181),
                    TimeMonitor()
                ],
                dataset_sink_mode=True)
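A call to this wrapper might look like the sketch below; the model object, the MindRecord directory, the file name, and the column names are hypothetical placeholders for illustration only.

# hypothetical invocation of the train() wrapper above; paths and column names are placeholders
train(model,
      dataset_direct="./mindrecord_data",
      filename="train.mindrecord",
      columns_list=["feature", "label"],
      num_consumer=4,
      batch=16,
      epoch=50,
      save_checkpoint_steps=2172,
      keep_checkpoint_max=50,
      prefix="model",
      directory="./ckpt")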
Example #11
def train():
    context.set_context(
        mode=context.GRAPH_MODE,
        device_target="Ascend",
        #save_graphs=True,
        #save_graphs_path="/home/work/user-job-dir/EAST/",
        #enable_reduce_precision=False,
        #device_id=5
    )

    epoch = 600

    my_dataset.download_dataset()
    train_img_path = os.path.abspath('/cache/train_img')
    train_gt_path = os.path.abspath('/cache/train_gt')
    #my_dataset.data_to_mindrecord_byte_image(train_img_path, train_gt_path, mindrecord_dir='/cache', prefix='icdar_train.mindrecord',file_num=1)
    #dataset = my_dataset.create_icdar_train_dataset(mindrecord_file=['icdar_train.mindrecord0','icdar_train.mindrecord1','icdar_train.mindrecord2','icdar_train.mindrecord3'], batch_size=32, repeat_num=epoch,
    #                            is_training=True, num_parallel_workers=8, length=512, scale=0.25)
    #dataset = my_dataset.create_icdar_train_dataset(mindrecord_file='/cache/icdar_train.mindrecord', batch_size=32, repeat_num=epoch,
    #                            is_training=True, num_parallel_workers=24, length=512, scale=0.25)
    #dataset = my_dataset.create_demo_dataset(batch_size=21, repeat_num=2)
    #train_img_path = os.path.abspath('/home/licheng/gpzlx1/ICDAR_2015/train/img')
    #train_gt_path  = os.path.abspath('/home/licheng/gpzlx1/ICDAR_2015/train/gt')
    dataset = datasetV2.create_icdar_train_dataset(train_img_path,
                                                   train_gt_path,
                                                   batch_size=14,
                                                   repeat_num=1,
                                                   is_training=True,
                                                   num_parallel_workers=24)
    #dataset = datasetV3.create_icdar_train_dataset(train_img_path, train_gt_path, batch_size=14, repeat_num=1, is_training=True, num_parallel_workers=24)
    dataset_size = dataset.get_dataset_size()

    print("Create dataset done!, dataset_size: ", dataset_size)

    #east = EAST.EAST()
    net = EAST_VGG.EAST()

    #ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * 20)
    #ckpoint_cb = ModelCheckpoint(prefix='EAST', directory='/cache', config=ckpt_config)

    milestone = [100, 300]
    learning_rates = [1e-3, 1e-4]
    lr = piecewise_constant_lr(milestone, learning_rates)
    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()),
                  learning_rate=lr)
    net = my_loss.EASTWithLossCell(net)
    net = my_loss.TrainingWrapper(net, opt)
    net.set_train(True)

    callback = [TimeMonitor(data_size=dataset_size),
                LossMonitor()]  #, ckpoint_cb]

    model = Model(net)
    dataset_sink_mode = False
    print("start trainig")
    model.train(epoch,
                dataset,
                callbacks=callback,
                dataset_sink_mode=dataset_sink_mode)
Example #12
def resnet50_train(args_opt):
    device_id = 0
    device_num = 1
    epoch_size = args_opt.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/home/share/dataset/cifar-10-batches-bin/' # your cifar10 path

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    context.set_context(device_id=device_id)
    if device_num > 1:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=1, batch_size=batch_size)
    eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False,
                                   repeat_num=1, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' applies mean reduction to the loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" means that the hybrid precision of O2 mode is used for training
    # the whole network except that batchnoram will be cast into float16 format and dynamic loss scale will be used
    # 'keep_batchnorm_fp32 = False' means that use the float16 format
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # define a performance callback to show ips and a loss callback to show the loss for every epoch
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [performance_cb, loss_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)
    if device_num == 1 or device_id == 0:
        print(f'=================================Start run evaluation.=================================')
        output = model.eval(eval_dataset)
        print(f'Evaluation result: {output}.')
Example #13
def train_net(args, model, epoch_size, mnist_path, repeat_size, ckpoint_cb):
    """Define the training method."""
    print("============== Starting Training ==============")
    # load training dataset
    ds_train = create_dataset(os.path.join(mnist_path, "train"), 32,
                              repeat_size)
    model.train(epoch_size,
                ds_train,
                callbacks=[ckpoint_cb, LossMonitor()],
                dataset_sink_mode=False)
Example #14
def main():
    # We currently support pynative mode with device GPU
    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')
    epoch_size = 1
    batch_size = 32
    mnist_path = "/data/chengzi/zhusuan-mindspore/data/MNIST"
    repeat_size = 1

    # Define model parameters
    z_dim = 40
    x_dim = 32 * 32

    # create the network
    generator = Generator(x_dim, z_dim, batch_size)
    variational = Variational(x_dim, z_dim, batch_size)
    network = zs.variational.ELBO(generator, variational)

    # define loss
    # learning rate setting
    lr = 0.001
    net_loss = ReduceMeanLoss()

    # define the optimizer
    print(network.trainable_params()[0])
    net_opt = nn.Adam(network.trainable_params(), lr)

    model = Model(network, net_loss, net_opt)

    ds_train = create_dataset(os.path.join(mnist_path, "train"), batch_size,
                              repeat_size)
    model.train(epoch_size,
                ds_train,
                callbacks=[LossMonitor()],
                dataset_sink_mode=False)

    print(network.trainable_params()[0])

    iterator = ds_train.create_tuple_iterator()
    for item in iterator:
        batch_x = item[0].reshape(32, 32 * 32)
        break
    z, _ = network.variational(Tensor(batch_x), None, None)
    sample, _, _, _ = network.generator(None, z, None)
    sample = sample.asnumpy()
    save_img(batch_x, 'result/origin_x.png')
    save_img(sample, 'result/reconstruct_x.png')

    for i in range(4):
        sample, _, _, _ = network.generator(None, None, None)
        sample = sample.asnumpy()
        samples = sample if i == 0 else np.concatenate([samples, sample],
                                                       axis=0)
    save_img(samples, 'result/sample_x.png', num=4 * batch_size)
Example #15
def train_net(data_dir, seg_dir, run_distribute, config=None):
    if run_distribute:
        init()
        rank_id = get_rank()
        rank_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=rank_size,
                                          gradients_mean=True)
    else:
        rank_id = 0
        rank_size = 1
    # train_dataset = create_dataset(data_path=data_dir, seg_path=seg_dir, config=config, \
    #                                 rank_size=rank_size, rank_id=rank_id, is_training=True)
    train_dataset = create_dataset_diy()
    # for item in train_dataset:
    #     print(item)
    # exit(0)

    train_data_size = train_dataset.get_dataset_size()
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)

    loss = SoftmaxCrossEntropyWithLogits()
    # loss = nn.DiceLoss()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale,
                                          drop_overflow_update=False)
    network.set_train()

    model = Model(network,
                  loss_fn=loss,
                  optimizer=optimizer,
                  loss_scale_manager=scale_manager,
                  amp_level='O3')

    time_cb = TimeMonitor(data_size=train_data_size)
    loss_cb = LossMonitor(per_print_times=2)
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=train_data_size,
        keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(config.model),
                                 directory='./ckpt_{}/'.format(rank_size),
                                 config=ckpt_config)
    callbacks_list = [loss_cb, time_cb, ckpoint_cb]
    print("============== Starting Training ==============")
    model.train(config.epoch_size,
                train_dataset,
                callbacks=callbacks_list,
                dataset_sink_mode=False)
    print("============== End Training ==============")
Example #16
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = cfg.batch_size
    class_num = cfg.class_num
    loss_scale_num = cfg.loss_scale
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' applies mean reduction to the loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" means that the hybrid precision of O2 mode is used for training
    # the whole network except that batchnorm will be cast into float16 format and dynamic loss scale will be used
    # 'keep_batchnorm_fp32 = False' means that use the float16 format
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss,
                  optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # define a performance callback to show ips and a loss callback to show the loss for every epoch
    time_cb = TimeMonitor(data_size=train_step_size)
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [time_cb, performance_cb, loss_cb]
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet", directory=local_ckpt_path, config=config_ck)
    cb += [ckpt_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args_opt.train_url)
Example #17
def train_net(network_model, epoch_size, data_path, repeat_size, ckpoint_cb,
              sink_mode):
    """Define the training method."""
    print("============== Starting Training ==============")
    # load training dataset
    ds_train = dm.create_dataset(os.path.join(
        data_path, "./MindSpore_train_images_dataset/train"),
                                 do_train=True,
                                 repeat_num=1)
    network_model.train(epoch_size,
                        ds_train,
                        callbacks=[ckpoint_cb, LossMonitor()],
                        dataset_sink_mode=sink_mode)
Example #18
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/cache/data'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    context.set_context(enable_task_sink=True, device_id=device_id)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    if device_num > 1:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True)
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size, batch_size=batch_size)
    eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False,
                                   repeat_num=1, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # define a performance callback to show ips and a loss callback to show the loss for every epoch
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [performance_cb, loss_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)
    if device_num == 1 or device_id == 0:
        print(f'Start run evaluation.')
        output = model.eval(eval_dataset)
        print(f'Evaluation result: {output}.')
Example #19
def logistic_regression(ds_train, X_test, Y_test):
    net = nn.Dense(4, 1)
    loss = Loss()
    opt = nn.optim.SGD(net.trainable_params(), learning_rate=0.003)

    model = ms.train.Model(net, loss, opt)
    model.train(5, ds_train, callbacks=[LossMonitor(per_print_times=ds_train.get_dataset_size())], dataset_sink_mode=False)

    # compute accuracy on the test set
    x = model.predict(ms.Tensor(X_test)).asnumpy()
    pred = np.round(1 / (1 + np.exp(-x)))
    correct = np.equal(pred, Y_test)
    acc = np.mean(correct)
    print('Test accuracy is', acc)
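The Loss() class used above is not shown in this example. Given that the prediction step applies a sigmoid to the single logit produced by nn.Dense(4, 1), a plausible definition is a sigmoid cross-entropy loss; the sketch below is an assumption for illustration, not the original class.

import mindspore.nn as nn
import mindspore.ops as ops

class Loss(nn.LossBase):
    """Hypothetical reconstruction: sigmoid cross-entropy over the single logit."""
    def __init__(self):
        super().__init__()
        self.sigmoid_ce = ops.SigmoidCrossEntropyWithLogits()

    def construct(self, logits, labels):
        # labels are expected as float32 tensors with the same shape as logits
        return self.get_loss(self.sigmoid_ce(logits, labels))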
Example #20
def test_all_trains():
    ds_train = create_dataset(
        os.path.join('/home/workspace/mindspore_dataset/mnist', "train"), 32,
        1)

    network = LeNet5(10)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training ==============")
    model.train(1, ds_train, callbacks=[time_cb, LossMonitor()])
Example #21
def train(data_dir, lr=0.01, momentum=0.9, num_epochs=3):
    ds_train = create_dataset(data_dir)
    ds_eval = create_dataset(data_dir, training=False)

    net = LeNet5()
    loss = nn.loss.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    opt = nn.Momentum(net.trainable_params(), lr, momentum)
    loss_cb = LossMonitor(per_print_times=ds_train.get_dataset_size())

    model = Model(net, loss, opt, metrics={'acc', 'loss'})
    # dataset_sink_mode can be True when using Ascend
    model.train(num_epochs, ds_train, callbacks=[loss_cb], dataset_sink_mode=False)
    metrics = model.eval(ds_eval, dataset_sink_mode=False)
    print('Metrics:', metrics)
Example #22
    def get_tensor_evolution_data(self,
                                  indices,
                                  ckpt_file,
                                  data_type="activation"):
        indices = [1]
        dataset = create_dataset(indices)
        load_checkpoint(ckpt_file, net=self.model._network)
        data_evolution_callback = DataEvolutionCallback(data_type=data_type)

        self.model.train(1,
                         dataset,
                         callbacks=[LossMonitor(), data_evolution_callback],
                         dataset_sink_mode=False)
        return data_evolution_callback.result
Example #23
def test_train_cifar(num_classes=10, epoch_size=10):
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      mirror_mean=True)
    loss_cb = LossMonitor()
    dataset = create_dataset(epoch_size)
    net = resnet50(32, num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt)
    model.train(epoch_size,
                dataset,
                callbacks=[loss_cb],
                dataset_sink_mode=False)
Example #24
def test_train_and_eval_lenet():
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    network = LeNet5(10)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training ==============")
    ds_train = create_dataset(os.path.join('/home/workspace/mindspore_dataset/mnist', "train"), 32, 1)
    model.train(1, ds_train, callbacks=[LossMonitor()], dataset_sink_mode=True)

    print("============== Starting Testing ==============")
    ds_eval = create_dataset(os.path.join('/home/workspace/mindspore_dataset/mnist', "test"), 32, 1)
    acc = model.eval(ds_eval, dataset_sink_mode=True)
    print("============== {} ==============".format(acc))
Example #25
def train_net(args, epoch_size, data_path, eval_per_epoch, repeat_size,
              ckpoint_cb, sink_mode):
    """define the training method"""
    print("============== Starting Training ==============")
    # Create training dataset
    ds_train = create_dataset(args, True, training_path, 32, repeat_size)
    # Initialise model
    model = Model(resnet, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    # model = Model(resnet, net_loss, net_opt, metrics={"Accuracy": Accuracy()}, amp_level="O3") # this will not work for CPU
    epoch_per_eval = {"epoch": [], "acc": []}
    eval_cb = Evalcb(model, ds_train, eval_per_epoch, epoch_per_eval)
    model.train(epoch_size,
                ds_train,
                callbacks=[ckpoint_cb, LossMonitor(), eval_cb],
                dataset_sink_mode=sink_mode)
Example #26
    def get_tensor_from_training(self,
                                 indices,
                                 ckpt_file="./logs/resnet/weights/-1_30.ckpt",
                                 node_name="fc",
                                 data_type="activation"):
        dataset = create_dataset(indices)
        load_checkpoint(ckpt_file, net=self.model._network)
        data_inception_callback = DataInterceptionCallback(node_name=node_name,
                                                           data_type=data_type)

        self.model.train(1,
                         dataset,
                         callbacks=[LossMonitor(), data_inception_callback],
                         dataset_sink_mode=False)
        return data_inception_callback.result, data_inception_callback.labels
Example #27
    def train(self, trainset, *args):
        """The main training loop in a federated learning workload.

        Arguments:
        trainset: The training dataset.
        """
        self.start_training()

        self.mindspore_model.train(
            Config().trainer.epochs,
            trainset,
            callbacks=[LossMonitor(per_print_times=300)],
            dataset_sink_mode=False)

        self.pause_training()
Example #28
def train_net(args, model, epoch_size, data_home, repeat_size, ckpoint_cb,
              sink_mode):
    """define the training method"""
    print("============== Starting Training ==============")
    # init weight

    # load training dataset
    ds_train = create_dataset(os.path.join(data_home, "cifar-10-batches-bin"),
                              do_train=True,
                              batch_size=32,
                              repeat_num=1)
    model.train(epoch_size,
                ds_train,
                callbacks=[ckpoint_cb, LossMonitor()],
                dataset_sink_mode=sink_mode)  # cifar-10-batches-bin
Example #29
def test_build_callbacks():
    """Test_build_callbacks."""
    ck_obj = ModelCheckpoint()
    loss_cb_1 = LossMonitor(1)

    callbacks = [None]
    with pytest.raises(TypeError):
        callbacks = _build_callbacks(callbacks)

    callbacks = ['Error']
    with pytest.raises(TypeError):
        callbacks = _build_callbacks(callbacks)

    callbacks = [ck_obj, loss_cb_1, 'Error', None]
    with pytest.raises(TypeError):
        callback_list = _build_callbacks(callbacks)
Example #30
def test_CallbackManager():
    """TestCallbackManager."""
    ck_obj = ModelCheckpoint()
    loss_cb_1 = LossMonitor(1)

    callbacks = [None]
    with pytest.raises(TypeError):
        _CallbackManager(callbacks)

    callbacks = ['Error']
    with pytest.raises(TypeError):
        _CallbackManager(callbacks)

    callbacks = [ck_obj, loss_cb_1, 'Error', None]
    with pytest.raises(TypeError):
        _CallbackManager(callbacks)
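These two tests only check that invalid entries in a callback list are rejected with a TypeError; valid entries are instances of the Callback base class, like the custom callbacks used by several examples on this page (EvalCallBack, DataEvolutionCallback, DataInterceptionCallback). Below is a minimal sketch of such a callback; the per-epoch evaluation it performs is illustrative and not taken from any of those implementations.

from mindspore.train.callback import Callback

class SimpleEvalCallback(Callback):
    """Illustrative custom callback: evaluate the model at the end of every epoch."""
    def __init__(self, model, eval_dataset):
        super().__init__()
        self.model = model
        self.eval_dataset = eval_dataset

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()
        metrics = self.model.eval(self.eval_dataset, dataset_sink_mode=False)
        print("epoch {}: {}".format(cb_params.cur_epoch_num, metrics))

Such a callback would be appended to the callbacks list passed to model.train(), alongside ModelCheckpoint and LossMonitor.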