def train_process(device_id, epoch_size, num_classes, device_num, batch_size):
    """Single-device training worker: trains ResNet-50 on CIFAR-10 in a
    per-device working directory and checkpoints once per epoch.

    Args:
        device_id (int): Ascend device to bind; also used as the work-dir name.
        epoch_size (int): number of training epochs.
        num_classes (int): classifier output width.
        device_num (int): total device count (currently unused here).
        batch_size (int): mini-batch size for the dataset and network.
    """
    # os.makedirs replaces `os.system("mkdir " + ...)`: no shell involved and
    # an already-existing directory is not an error.
    os.makedirs(str(device_id), exist_ok=True)
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    context.set_context(device_id=device_id)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    # NOTE: the original code set mode=GRAPH_MODE a second time here; the
    # duplicate call was redundant and has been dropped.
    net = resnet50(batch_size, num_classes)
    loss = CrossEntropyLoss()
    # Optimize only parameters that require gradients.
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
    dataset = create_dataset(epoch_size, training=True, batch_size=batch_size)
    batch_num = dataset.get_dataset_size()
    # Save a checkpoint every full pass over the dataset, keeping only the latest.
    config_ck = CheckpointConfig(save_checkpoint_steps=batch_num,
                                 keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="train_resnet_cifar10_device_id_" + str(device_id),
                                 directory="./", config=config_ck)
    loss_cb = LossGet()
    model.train(epoch_size, dataset, callbacks=[ckpoint_cb, loss_cb])
def test_batchnorm_batch_parallel():
    """Semi-auto-parallel smoke test: trains a batchnorm network for two
    epochs with the softmax-cross-entropy loss sharded across `dev_num`
    devices (dev_num is defined at module level — TODO confirm).
    """
    num_classes = 1001
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    # NOTE: the original declared `rank_size = 0` but never used it; removed.
    predict = Tensor(np.ones([batch_size, 3, 224, 224]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)
    dataset = DatasetLenet(predict, label, 2)
    net = batchnorm_net(num_classes)
    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    # Shard the loss input along the batch dimension over all devices.
    loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1)))
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   learning_rate, momentum)
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num)
    context.set_context(mode=context.GRAPH_MODE)
    model = Model(net, loss, opt)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
def loss_scale_manager_common(strategy1):
    """Auto-parallel training with a DynamicLossScaleManager; expects the
    train call to raise TypeError in this environment (see comment below).

    Args:
        strategy1: shard strategy forwarded to `all_to_all_net`.
    """
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=8)
    predict = Tensor(np.ones([32, 128]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = all_to_all_net(strategy1)
    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    # Shard the loss input along the batch dimension across all 8 devices.
    loss.softmax_cross_entropy.set_strategy(((8, 1), (8, 1)))
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    scale_manager = DynamicLossScaleManager(32, 2, 2000)
    model = Model(net, loss, opt, loss_scale_manager=scale_manager)
    # Without a GE backend, `outputs = self._train_network(*next_element)`
    # returns the input tensors instead of a scalar loss, so training is
    # expected to fail with TypeError; reaching the `else` branch means the
    # expected failure did not occur.
    try:
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    except TypeError:
        pass
    else:
        assert False
def test_nad():
    """UT for natural adversarial defense."""
    cls_count = 10
    sample_count = 32
    use_sparse = False
    context.set_context(mode=context.GRAPH_MODE)
    context.set_context(device_target='Ascend')

    # Synthesize random images plus labels (one-hot when not sparse).
    test_images = np.random.rand(sample_count, 1, 32, 32).astype(np.float32)
    test_labels = np.random.randint(cls_count, size=sample_count).astype(np.int32)
    if not use_sparse:
        test_labels = np.eye(cls_count)[test_labels].astype(np.float32)

    model_net = Net()
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=use_sparse)
    momentum_opt = Momentum(model_net.trainable_params(), 0.001, 0.9)

    # Run one defense pass and check the returned loss is non-negative.
    defense = NaturalAdversarialDefense(model_net, loss_fn=criterion,
                                        optimizer=momentum_opt)
    LOGGER.set_level(logging.DEBUG)
    LOGGER.debug(TAG, '---start natural adversarial defense--')
    loss = defense.defense(test_images, test_labels)
    LOGGER.debug(TAG, '---end natural adversarial defense--')
    assert np.any(loss >= 0.0)
def train_common(net):
    """Train `net` under 4-device semi-auto-parallel with all-reduce fusion
    enabled, then return the resulting all-reduce fusion dictionary.

    Args:
        net: the network cell to train.

    Returns:
        dict: fusion mapping extracted from the compiled train network.
    """
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    device_num = 4
    context.reset_auto_parallel_context()
    auto_parallel_context().set_enable_all_reduce_fusion(
        enable_all_reduce_fusion=True)
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=device_num,
        parameter_broadcast=False)
    context.set_context(mode=context.GRAPH_MODE)
    predict = Tensor(np.ones([batch_size, 128]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    model = Model(net, loss, opt)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
    # Inspect the compiled graph (private executor API) for fusion groups.
    allreduce_fusion_dict = _executor._get_allreduce_fusion(
        model._train_network)
    print(allreduce_fusion_dict)
    return allreduce_fusion_dict
def test_train_64k_8p(epoch_size=3, batch_size=32, num_classes=65536):  #1048576 #131072 #32768 #8192
    """Auto-parallel strategy-search test: trains ResNet-50 with a 64k-class
    head on 8 devices, then asserts the searched shard strategies.

    Args:
        epoch_size (int): declared epochs (the train call below uses 5 steps
            regardless — kept as-is to preserve the original behavior).
        batch_size (int): synthetic batch size.
        num_classes (int): classifier width (other sizes listed above).
    """
    dev_num = 8
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=dev_num)
    cost_model_context.set_cost_model_context(costmodel_gamma=0.001,
                                              costmodel_beta=260.0)
    set_algo_parameters(elementwise_op_strategy_follow=True)
    resset_op_id()
    np.random.seed(6)
    input_np = np.ones([batch_size, 3, 224, 224]).astype(np.float32)
    label_np = np.zeros([batch_size]).astype(np.int32)
    for i in range(0, batch_size):
        label_np[i] = i % num_classes
    dataset = DatasetLenet(Tensor(input_np), Tensor(label_np), 1)
    net = resnet50(num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt)
    model.train(5, dataset, dataset_sink_mode=False)
    strategies = _executor._get_strategy(model._train_network)
    for (k, v) in strategies.items():
        # BUG FIX: the original called re.match(k, 'Conv2D-op') etc. with the
        # pattern/string arguments swapped (k is the op key, 'Conv2D-op' the
        # pattern), so no key ever matched and the asserts were skipped.
        # re.search(pattern, key) matches the op name anywhere in the key,
        # consistent with test_train_4k_8p_gpu below.
        if re.search('Conv2D-op', k) is not None:
            assert v[0][0] == dev_num
        elif re.search('MatMul-op', k) is not None:
            assert v == [[1, 1], [dev_num, 1]]
        elif re.search('ReduceSum-op', k) is not None:
            assert v == [[1, dev_num]]
def train_process(q, device_id, epoch_size, num_classes, device_num, batch_size, enable_hccl):
    """Multi-process data-parallel training worker; pushes the recorded loss
    onto `q` when done.

    Args:
        q: multiprocessing queue used to return the final loss to the parent.
        device_id (int): Ascend device / rank id; also the work-dir name.
        epoch_size (int): number of epochs to train.
        num_classes (int): classifier output width.
        device_num (int): total number of ranks.
        batch_size (int): per-device mini-batch size.
        enable_hccl (bool): when True, configure HCCL data-parallel and init().
    """
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        save_graphs=False)
    context.set_context(device_id=device_id)
    # HCCL rank bookkeeping is carried through environment variables.
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(
            device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
        # Split all-reduce fusion at parameter index 140 — presumably tuned
        # for ResNet-50; TODO confirm.
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()
    context.set_context(mode=context.GRAPH_MODE)
    net = resnet50(batch_size, num_classes)
    loss = CrossEntropyLoss()
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
    dataset = create_dataset(1, training=True, batch_size=batch_size,
                             rank_id=device_id, rank_size=device_num,
                             enable_hccl=enable_hccl)
    loss_cb = LossGet()
    model.train(epoch_size, dataset, callbacks=[loss_cb])
    # Report this rank's loss back to the launching process.
    q.put(loss_cb.get_loss())
def test_deeplabv3_1p():
    """Single-device DeepLabV3 training test: fine-tunes for 100 epochs and
    asserts the final loss and per-step time stay under fixed thresholds.
    """
    start_time = time.time()
    epoch_size = 100
    # The Namespace defaults are immediately overwritten from `config` below.
    args_opt = argparse.Namespace(base_size=513, crop_size=513, batch_size=2)
    args_opt.base_size = config.crop_size
    args_opt.crop_size = config.crop_size
    args_opt.batch_size = config.batch_size
    # NOTE(review): usage="eval" for the training dataset looks intentional
    # here (small fixed split) — confirm against create_dataset's semantics.
    train_dataset = create_dataset(args_opt, data_url, 1, config.batch_size,
                                   usage="eval")
    dataset_size = train_dataset.get_dataset_size()
    callback = LossCallBack(dataset_size)
    net = deeplabv3_resnet50(config.seg_num_classes,
                             [config.batch_size, 3, args_opt.crop_size, args_opt.crop_size],
                             infer_scale_sizes=config.eval_scales,
                             atrous_rates=config.atrous_rates,
                             decoder_output_stride=config.decoder_output_stride,
                             output_stride=config.output_stride,
                             fine_tune_batch_norm=config.fine_tune_batch_norm,
                             image_pyramid=config.image_pyramid)
    net.set_train()
    model_fine_tune(net, 'layer')
    loss = OhemLoss(config.seg_num_classes, config.ignore_label)
    # Exclude batchnorm (beta/gamma), depth and bias parameters from the
    # optimizer by name.
    opt = Momentum(filter(lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'depth' not in x.name and 'bias' not in x.name,
                          net.trainable_params()),
                   learning_rate=config.learning_rate,
                   momentum=config.momentum,
                   weight_decay=config.weight_decay)
    model = Model(net, loss, opt)
    model.train(epoch_size, train_dataset, callback)
    print(time.time() - start_time)
    print("expect loss: ", callback.loss)
    print("expect time: ", callback.time)
    expect_loss = 0.92
    expect_time = 40
    assert callback.loss.asnumpy() <= expect_loss
    assert callback.time <= expect_time
def test_pynative_resnet50():
    """PyNative-mode ResNet-50 performance test: runs up to 21 manual
    train steps and asserts that fewer than 20 steps exceed 0.25 s each.
    """
    context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend")

    batch_size = 32
    num_classes = 10
    net = resnet50(batch_size, num_classes)
    criterion = CrossEntropyLoss()
    optimizer = Momentum(learning_rate=0.01, momentum=0.9,
                         params=filter(lambda x: x.requires_grad,
                                       net.get_parameters()))

    net_with_criterion = WithLossCell(net, criterion)
    net_with_criterion.set_grad()
    train_network = GradWrap(net_with_criterion)
    train_network.set_train()

    step = 0
    max_step = 21
    exceed_num = 0  # count of steps slower than the 0.25 s budget
    data_set = create_dataset(repeat_num=1, training=True,
                              batch_size=batch_size)
    for element in data_set.create_dict_iterator(num_epochs=1):
        step = step + 1
        if step > max_step:
            break
        start_time = time.time()
        input_data = element["image"]
        input_label = element["label"]
        # Manual forward, backward and optimizer apply (no Model wrapper).
        loss_output = net_with_criterion(input_data, input_label)
        grads = train_network(input_data, input_label)
        optimizer(grads)
        end_time = time.time()
        cost_time = end_time - start_time
        print("======step: ", step, " loss: ", loss_output.asnumpy(),
              " cost time: ", cost_time)
        # Skip step 1: it includes one-off compilation/warm-up cost.
        if step > 1 and cost_time > 0.25:
            exceed_num = exceed_num + 1

    assert exceed_num < 20
def test(cloud_args=None):
    """Evaluate a pre-trained VGG16 checkpoint on the CIFAR-100 test split
    and print the accuracy metrics.

    Args:
        cloud_args: optional cloud-environment arguments merged by parse_args.
    """
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target, save_graphs=False)
    # Bind to a specific device only when DEVICE_ID is a valid integer.
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # Timestamped output directory for logs.
    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = LogUtil.get_instance()
    args.logger.set_level(20)  # 20 == logging.INFO

    net = vgg16(num_classes=args.num_classes, args=args)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, args.momentum, weight_decay=args.weight_decay)
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

    # Restore the pre-trained weights, then switch to inference mode.
    param_dict = load_checkpoint(args.pre_trained)
    load_param_into_net(net, param_dict)
    net.set_train(False)
    dataset_test = vgg_create_dataset100(args.data_path, args.image_size,
                                         args.per_batch_size, training=False)
    res = model.eval(dataset_test)
    print("result: ", res)
def test_ead():
    """UT for ensemble adversarial defense."""
    cls_count = 10
    sample_count = 64
    use_sparse = False
    context.set_context(mode=context.GRAPH_MODE)
    context.set_context(device_target='Ascend')

    # Synthesize random images plus labels (one-hot when not sparse).
    test_images = np.random.rand(sample_count, 1, 32, 32).astype(np.float32)
    test_labels = np.random.randint(cls_count, size=sample_count).astype(np.int32)
    if not use_sparse:
        test_labels = np.eye(cls_count)[test_labels].astype(np.float32)

    model_net = Net()
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=use_sparse)
    momentum_opt = Momentum(model_net.trainable_params(), 0.001, 0.9)

    # A second, fresh network is attacked/defended (matches original code).
    model_net = Net()
    fgsm_attack = FastGradientSignMethod(model_net, loss_fn=criterion)
    pgd_attack = ProjectedGradientDescent(model_net, loss_fn=criterion)
    defense = EnsembleAdversarialDefense(model_net, [fgsm_attack, pgd_attack],
                                         loss_fn=criterion,
                                         optimizer=momentum_opt)
    LOGGER.set_level(logging.DEBUG)
    LOGGER.debug(TAG, '---start ensemble adversarial defense--')
    loss = defense.defense(test_images, test_labels)
    LOGGER.debug(TAG, '---end ensemble adversarial defense--')
    assert np.any(loss >= 0.0)
def test_resnet_model_parallel():
    """Semi-auto-parallel training test for a model-parallel ResNet variant
    with the loss sharded over `dev_num` devices (dev_num is defined at
    module level — TODO confirm).
    """
    num_classes = 1024
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=dev_num, global_rank=0)
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num)
    context.set_context(mode=context.GRAPH_MODE)
    predict = Tensor(np.ones([batch_size, 64, 112, 112]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)
    dataset = DatasetLenet(predict, label, 2)
    net = resnet_model_parallel_net(num_classes)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # Shard the loss input along the batch dimension over all devices.
    loss.softmax_cross_entropy.shard(((dev_num, 1), (dev_num, 1)))
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   learning_rate, momentum)
    model = Model(net, loss, opt)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
def mix_parallel_matmul_trains(self):
    """Train MatmulNet under auto-parallel with explicit mixed shard
    strategies for every loss sub-op, and return the recorded per-step losses
    as a numpy array.

    Note: `device_num` is not defined in this method — presumably a
    module-level constant; TODO confirm.
    """
    parallel_callback = ModelCallback()
    # Per-operator shard strategies: first tuple element is the sharding of
    # the op's first input, etc. Shapes follow each op's input signature.
    matmul_stra = ((device_num, 1), (1, 1))
    reduce_max_stra = ((1, device_num), )
    sub_stra = ((device_num, 1), (device_num, 1))
    exp_stra = ((1, device_num), )
    reduce_sum_stra = ((1, device_num), )
    div_stra = ((1, device_num), (1, 1))
    log_stra = ((1, device_num), )
    mul_stra = ((1, device_num), (1, device_num))
    sum_cross_entropy_stra = ((1, device_num), )
    mul2_stra = ((), (device_num, ))
    reduce_mean_stra = ((device_num, ), )
    onehot_stra = ((1, device_num), (), ())
    # Order matters: MatmulNet consumes this list positionally.
    loss_stra_list = [
        exp_stra, reduce_sum_stra, onehot_stra, div_stra, log_stra,
        sum_cross_entropy_stra, mul_stra, mul2_stra, reduce_mean_stra,
        reduce_max_stra, sub_stra
    ]
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = MatmulNet(matmul_stra=matmul_stra, loss_stra_list=loss_stra_list)
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1,
                         momentum=0.9)
    model = Model(net, optimizer=optimizer)
    epoch_size = 6
    dataset = Dataset(self.input_part, self.label_part)
    model.train(epoch_size, dataset, callbacks=parallel_callback,
                dataset_sink_mode=False)
    loss_value = np.array(parallel_callback.loss_list)
    return loss_value
def test_train_4k_8p_gpu(batch_size=32, num_classes=4096):
    """GPU auto-parallel strategy-search test: trains ResNet-50 with a
    4k-class head on 8 devices, then asserts the searched shard strategies.

    Args:
        batch_size (int): synthetic batch size.
        num_classes (int): classifier output width.
    """
    dev_num = 8
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=dev_num)
    set_algo_parameters(elementwise_op_strategy_follow=True)
    resset_op_id()
    np.random.seed(6)
    input_np = np.ones([batch_size, 3, 224, 224]).astype(np.float32)
    label_np = np.zeros([batch_size]).astype(np.int32)
    # Deterministic labels cycling through the class range.
    for i in range(0, batch_size):
        label_np[i] = i % num_classes
    dataset = DatasetLenet(Tensor(input_np), Tensor(label_np), 1)
    net = resnet50(num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt)
    model.train(5, dataset, dataset_sink_mode=False)
    # Verify the auto-searched strategies by op-name pattern.
    strategies = _executor._get_shard_strategy(model._train_network)
    for (k, v) in strategies.items():
        if re.search('Conv2D-op', k) is not None:
            assert v[0][0] == dev_num
        elif re.search('MatMul-op', k) is not None:
            assert v == [[dev_num, 1], [1, 1]]
        elif re.search('ReduceSum-op', k) is not None:
            assert v == [[dev_num, 1]]
def test_multi_grads():
    """PyNative test exercising two gradient paths on one LeNet instance:
    a direct grad-of-loss call and a TrainOneStepCell update.
    """
    context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend")
    sparse = False
    # Two batches of different sizes (32 and 64) for the two phases below.
    inputs_np = np.random.rand(32, 1, 32, 32).astype(np.float32)
    labels_np = np.random.randint(10, size=32).astype(np.int32)
    inputs_np_2 = np.random.rand(64, 1, 32, 32).astype(np.float32)
    labels_np_2 = np.random.randint(10, size=64).astype(np.int32)
    if not sparse:
        # One-hot encode labels when the loss is not in sparse mode.
        labels_np = np.eye(10)[labels_np].astype(np.float32)
        labels_np_2 = np.eye(10)[labels_np_2].astype(np.float32)

    net = LeNet()

    # grad operation: gradients of the loss w.r.t. inputs must be non-zero.
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=sparse)
    with_loss_cell = WithLossCell(net, loss_fn)
    grad_all = GradWrapWithLoss(with_loss_cell)
    grad_out = grad_all(Tensor(inputs_np), Tensor(labels_np)).asnumpy()
    assert np.any(grad_out != 0), 'grad result can not be all zeros'

    # train-one-step operation: one optimizer step must run without error.
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=sparse)
    optimizer = Momentum(
        filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)
    loss_net = WithLossCell(net, loss_fn)
    train_net = TrainOneStepCell(loss_net, optimizer)
    train_net.set_train()
    train_net(Tensor(inputs_np_2), Tensor(labels_np_2))
def bn_common(parallel_mode, train_flag, strategy_loss=None):
    """Shared driver for batchnorm parallel tests: builds the network, loss
    and optimizer, then either trains or runs a single predict.

    Args:
        parallel_mode: one of the ParallelMode values to configure.
        train_flag (bool): True to train two epochs, False to predict once.
        strategy_loss: optional shard strategy for the loss op.
    """
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      device_num=8)
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    rank_size = 8

    predict = Tensor(np.ones([32, 512]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = bn_net()

    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    loss.softmax_cross_entropy.shard(strategy_loss)
    # Momentum with weight decay 0.0001 and loss scale 1024 * rank_size.
    opt = Momentum(net.trainable_params(), learning_rate, momentum, 0.0001,
                   1024 * rank_size)

    if not train_flag:
        # Predict path evaluates the loss-wrapped net, not raw logits.
        net = WithLossCell(net, loss)
        net.set_train()

    if parallel_mode == ParallelMode.DATA_PARALLEL:
        context.set_auto_parallel_context(parameter_broadcast=True)
    model = Model(net, loss, opt)
    if train_flag:
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    else:
        model._predict(predict, label)
def test_ad():
    """UT for adversarial defense."""
    cls_count = 10
    sample_count = 32
    use_sparse = False
    context.set_context(mode=context.GRAPH_MODE)
    context.set_context(device_target='Ascend')

    # Synthesize random images plus labels (one-hot when not sparse).
    test_images = np.random.rand(sample_count, 1, 32, 32).astype(np.float32)
    test_labels = np.random.randint(cls_count, size=sample_count).astype(np.int32)
    if not use_sparse:
        test_labels = np.eye(cls_count)[test_labels].astype(np.float32)

    model_net = Net()
    criterion = nn.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                                 sparse=use_sparse)
    # Learning rate supplied as a one-element float32 tensor, as before.
    momentum_opt = Momentum(
        learning_rate=Tensor(np.array([0.001], np.float32)),
        momentum=0.9,
        params=model_net.trainable_params())

    defense = AdversarialDefense(model_net, loss_fn=criterion,
                                 optimizer=momentum_opt)
    LOGGER.set_level(logging.DEBUG)
    LOGGER.debug(TAG, '--start adversarial defense--')
    loss = defense.defense(test_images, test_labels)
    LOGGER.debug(TAG, '--end adversarial defense--')
    assert np.any(loss >= 0.0)
def test_auto_parallel_arithmetic_model():
    """Auto-parallel test for a small MatMul + OneHot + MatMul network with
    the OneHot op sharded over 8 devices.
    """
    class NetOneHot(nn.Cell):
        # Minimal cell: x @ w, one-hot encode b, then multiply the results.
        def __init__(self):
            super().__init__()
            self.matmul = P.MatMul()
            # Shard OneHot output along its second dimension over 8 devices.
            self.one_hot = P.OneHot().shard(((1, 8), (), ()))
            self.on_value = Tensor(1.0, ms.float32)
            self.off_value = Tensor(0.0, ms.float32)
            self.matmul2 = P.MatMul()
            self.w = Parameter(Tensor(np.zeros([32, 64]).astype(np.float32)),
                               "weight", requires_grad=True)

        def construct(self, x, b):
            out = self.matmul(x, self.w)
            out1 = self.one_hot(b, 64, self.on_value, self.off_value)
            out2 = self.matmul2(out, out1)
            return out2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=8, global_rank=0,
                                      parallel_mode=ParallelMode.AUTO_PARALLEL)
    net = NetOneHot()

    x = Tensor(np.ones([8, 32]), dtype=ms.float32)
    b = Tensor(np.ones([8]), dtype=ms.int32)
    dataset = Dataset(x, b, 2)

    opt = Momentum(net.trainable_params(), 0.1, 0.9)
    model = Model(net, optimizer=opt)

    model.train(2, dataset, dataset_sink_mode=False)
def resnet50_train(args_opt):
    """Train ResNet-50 on CIFAR-10 with O2 mixed precision, then evaluate
    on device 0 (or when running single-device).

    Args:
        args_opt: parsed arguments; uses `epoch_size` and `data_url`.
    """
    device_id = 0
    device_num = 1
    epoch_size = args_opt.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/home/share/dataset/cifar-10-batches-bin/' # your cifar10 path

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        save_graphs=False)
    context.set_context(device_id=device_id)
    if device_num > 1:
        # Unreachable with the hard-coded device_num = 1 above; kept for the
        # multi-device configuration.
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=1, batch_size=batch_size)
    eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False,
                                  repeat_num=1, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num = class_num)
    # reduction='mean' means that apply reduction of mean to loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size,
                       steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9,
                   weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" means that the hybrid precision of O2 mode is used for training
    # the whole network except that batchnoram will be cast into float16 format and dynamic loss scale will be used
    # 'keep_batchnorm_fp32 = False' means that use the float16 format
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss,
                  optimizer=opt, loss_scale_manager=loss_scale,
                  metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss
    # for every epoch
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [performance_cb, loss_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)
    # Only rank 0 (or single-device runs) performs evaluation.
    if device_num == 1 or device_id == 0:
        print(f'=================================Start run evaluation.=================================')
        output = model.eval(eval_dataset)
        print(f'Evaluation result: {output}.')
def test_exec_save_checkpoint():
    """Round-trip test: save a train network's checkpoint, then reload it."""
    network = Net()
    criterion = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    optimizer = Momentum(network.trainable_params(), 0.0, 0.9, 0.0001, 1024)

    wrapped = WithLossCell(network, criterion)
    one_step = TrainOneStepCell(wrapped, optimizer)
    _exec_save_checkpoint(one_step, ckpoint_file_name="./new_ckpt.ckpt")

    load_checkpoint("new_ckpt.ckpt")
def single_matmul_trains(self):
    """Train MatmulNet stand-alone (no parallel context) for six epochs and
    return the recorded per-step losses as a numpy array.
    """
    recorder = ModelCallback()
    network = MatmulNet()
    opt = Momentum(network.trainable_params(), learning_rate=0.1,
                   momentum=0.9)
    trainer = Model(network, optimizer=opt)
    data = Dataset(self.input_full, self.label_full)
    trainer.train(6, data, callbacks=recorder, dataset_sink_mode=False)
    return np.array(recorder.loss_list)
def data_parallel_matmul_trains(self):
    """Train MatmulNet under semi-auto-parallel for six epochs and return
    the recorded per-step losses as a numpy array.
    """
    recorder = ModelCallback()
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
    network = MatmulNet()
    opt = Momentum(network.trainable_params(), learning_rate=0.1,
                   momentum=0.9)
    trainer = Model(network, optimizer=opt)
    data = Dataset(self.input_part, self.label_part)
    trainer.train(6, data, callbacks=recorder, dataset_sink_mode=False)
    return np.array(recorder.loss_list)
def __init__(self):
    """Build a ResNet-50 Model and run one training epoch immediately.

    Note: `batch_size` and `num_classes` are not parameters here —
    presumably module-level constants; TODO confirm.
    """
    # Keep graph node scopes short (drop class names) for readability.
    context.set_context(reserve_class_name_in_scope=False)
    net = resnet50(batch_size, num_classes)
    ls = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)
    model = Model(net, loss_fn=ls, optimizer=opt, metrics={'acc'})
    self.model = model
    # Train one epoch on a synthetic 32-element dataset at construction time.
    self.model.train(1, create_dataset(list(range(32))),
                     dataset_sink_mode=False)
def test_save_checkpoint_for_network():
    """ test save_checkpoint for network"""
    network = Net()
    criterion = SoftmaxCrossEntropyWithLogits(sparse=True)
    optimizer = Momentum(network.trainable_params(), 0.0, 0.9, 0.0001, 1024)

    wrapped = WithLossCell(network, criterion)
    one_step = TrainOneStepCell(wrapped, optimizer)
    save_checkpoint(one_step, ckpt_file_name="./new_ckpt.ckpt")

    # Reloading must succeed for the checkpoint to be considered valid.
    load_checkpoint("new_ckpt.ckpt")
def resnet50_train(args_opt):
    """Cloud training entry: download data via MoXing, train ResNet-50 with
    O2 mixed precision and periodic checkpoints, then upload checkpoints.

    Args:
        args_opt: parsed arguments; uses `epoch_size`, `data_url`, `train_url`.
    """
    epoch_size = args_opt.epoch_size
    batch_size = cfg.batch_size
    class_num = cfg.class_num
    loss_scale_num = cfg.loss_scale
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        save_graphs=False)

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size,
                                   batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' means that apply reduction of mean to loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size,
                       steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9,
                   weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" means that the hybrid precision of O2 mode is used for training
    # the whole network except that batchnorm will be cast into float16 format and dynamic loss scale will be used
    # 'keep_batchnorm_fp32 = False' means that use the float16 format
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss,
                  optimizer=opt, loss_scale_manager=loss_scale,
                  metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    time_cb = TimeMonitor(data_size=train_step_size)
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [time_cb, performance_cb, loss_cb]
    # Checkpoint every `save_checkpoint_epochs` epochs worth of steps.
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet", directory=local_ckpt_path,
                              config=config_ck)
    cb += [ckpt_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args_opt.train_url)
def loss_scale_manager_sens(strategy1, sens):
    """Run one TrainOneStepCell step under semi-auto-parallel with an
    explicit sensitivity (loss scale) value.

    Args:
        strategy1: shard strategy forwarded to `all_to_all_net`.
        sens: sensitivity tensor passed as the second train-step input.
    """
    learning_rate = 0.1
    momentum = 0.9
    device_num = 8
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=device_num)
    # Global batch: 32 samples per device across all devices.
    predict = Tensor(np.ones([32 * device_num, 128]), dtype=ms.float32)
    net = all_to_all_net(strategy1)
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    train_net = TrainOneStepCell(net, opt)
    train_net.set_train()
    train_net(predict, sens)
def resnet50_train(args_opt):
    """Cloud training entry (task-sink variant): download data via MoXing,
    train ResNet-50 with a fixed loss scale, and evaluate on rank 0.

    Note: `device_id` and `device_num` are not defined in this function —
    presumably module-level globals; TODO confirm.

    Args:
        args_opt: parsed arguments; uses `epoch_size` and `data_url`.
    """
    epoch_size = args_opt.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/cache/data'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        save_graphs=False)
    context.set_context(enable_task_sink=True, device_id=device_id)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    if device_num > 1:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True)
        # Each rank reads from its own sub-directory of the dataset.
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size,
                                   batch_size=batch_size)
    eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False,
                                  repeat_num=1, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num = class_num)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size,
                       steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9,
                   weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    model = Model(net, loss_fn=loss, optimizer=opt,
                  loss_scale_manager=loss_scale, metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [performance_cb, loss_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)
    # Only rank 0 (or single-device runs) performs evaluation.
    if device_num == 1 or device_id == 0:
        print(f'Start run evaluation.')
        output = model.eval(eval_dataset)
        print(f'Evaluation result: {output}.')
def __init__(self, network, loss_fn=None, optimizer=None):
    """Build the adversarial-defense train step around `network`.

    Args:
        network (Cell): model to be defended; validated via check_model.
        loss_fn: loss cell; defaults to sparse SoftmaxCrossEntropyWithLogits.
        optimizer: optimizer; defaults to Momentum(lr=0.01, momentum=0.9)
            over the network's trainable parameters.
    """
    super(AdversarialDefense, self).__init__(network)
    network = check_model('network', network, Cell)
    if loss_fn is None:
        loss_fn = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)

    if optimizer is None:
        optimizer = Momentum(params=network.trainable_params(),
                             learning_rate=0.01, momentum=0.9)

    # Wrap network + loss + optimizer into a single-step train cell.
    loss_net = WithLossCell(network, loss_fn)
    self._train_net = TrainOneStepCell(loss_net, optimizer)
    self._train_net.set_train()
def test_train_cifar(num_classes=10, epoch_size=10):
    """Auto-parallel CIFAR training smoke test for ResNet-50."""
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      mirror_mean=True)
    monitor = LossMonitor()
    data = create_dataset(epoch_size)
    network = resnet50(32, num_classes)
    criterion = SoftmaxCrossEntropyExpand(sparse=True)
    trainable = filter(lambda x: x.requires_grad, network.get_parameters())
    optimizer = Momentum(trainable, 0.01, 0.9)
    trainer = Model(network, loss_fn=criterion, optimizer=optimizer)
    trainer.train(epoch_size, data, callbacks=[monitor],
                  dataset_sink_mode=False)
def test_loss_scale2():
    """Semi-auto-parallel training with TrainOneStepWithLossScaleCell.

    Note: `update_cell` is not defined in this function — presumably a
    module-level loss-scale update cell; TODO confirm.
    """
    context.set_context(mode=context.GRAPH_MODE, save_graphs=True)
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=8)
    predict = Tensor(np.ones([64, 64]), dtype=ms.float32)
    label = Tensor(np.ones([64, ]), dtype=ms.int32)
    dataset = DatasetLenet(predict, label)
    net = Net2()
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)
    # Dynamic loss scaling driven by the externally-defined update_cell.
    net = nn.TrainOneStepWithLossScaleCell(net, opt, update_cell)
    model = Model(network=net)
    model.train(2, dataset, dataset_sink_mode=False)