# Constructor of a synthetic (fake-data) dataset class used for parallel-training
# tests; it needs the following imports at module level:
import numpy as np
from mindspore.communication import init, get_rank, get_group_size


def __init__(self, size=256, batch_size=16, image_size=(96,), num_classes=16, random_offset=0):
    """init"""
    self.size = size
    self.rank_batch_size = batch_size
    self.total_batch_size = self.rank_batch_size
    self.random_offset = random_offset
    self.image_size = image_size
    self.num_classes = num_classes
    self.num_epochs = -1
    self.rank_size = 1
    self.rank_id = 0
    self.batch_index = 0
    self.image_data_type = np.float32
    self.label_data_type = np.float32
    self.is_onehot = True
    # Join the HCCL communication group, then scale the per-rank batch size up to
    # the global batch size and derive the shape of one global batch of images.
    init(backend_name='hccl')
    self.rank_size = get_group_size()
    self.rank_id = get_rank()
    self.total_batch_size = self.rank_batch_size * self.rank_size
    self.total_batch_data_size = (self.rank_size, self.rank_batch_size) + image_size
    self.do_copy = False
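
# A hypothetical companion __getitem__ (not part of the original snippet) sketching
# how the fields above could serve one deterministic batch per step: every rank
# generates the same global batch from the seed, then keeps only its own shard.
def __getitem__(self, batch_index):
    rng = np.random.RandomState(batch_index + self.random_offset)
    images = rng.standard_normal(self.total_batch_data_size).astype(self.image_data_type)
    labels = rng.randint(0, self.num_classes, size=(self.rank_size, self.rank_batch_size))
    if self.is_onehot:
        labels = np.eye(self.num_classes, dtype=self.label_data_type)[labels]
    else:
        labels = labels.astype(self.label_data_type)
    return images[self.rank_id], labels[self.rank_id]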

# From MindSpore's gradient definitions for communication ops (grad_comm_ops.py);
# AllReduce, ReduceScatter, ReduceOp, P (operations), F (functional), get_rank
# and get_group_size come from that module's imports.
def get_bprop_all_gather(self):
    """Generate bprop for AllGather"""
    fusion = self.get_attr_dict()["fusion"]
    if fusion == 0:
        # Non-fused case: the gradient of AllGather is a ReduceScatter(SUM).
        reduce_scatter = ReduceScatter(ReduceOp.SUM, self.group)
        if self.instance_name:
            instance_name = "grad_" + self.instance_name
            reduce_scatter.set_prim_instance_name(instance_name)
    else:
        # Fused case: emulate ReduceScatter with AllReduce + Split so the
        # AllReduce can carry the fusion attribute.
        all_reduce = AllReduce(ReduceOp.SUM, self.group).add_prim_attr("fusion", fusion)
        if self.instance_name:
            instance_name = "grad_" + self.instance_name
            all_reduce.set_prim_instance_name(instance_name)
        rank = get_rank(self.group)
        dev_num = get_group_size(self.group)
        split = P.Split(output_num=dev_num)
    mean_flag = self.get_attr_dict()["mean_flag"]
    scale = 1 / self.rank_size

    def bprop(x, out, dout):
        if fusion == 0:
            dx = reduce_scatter(dout)
        else:
            grad = all_reduce(dout)
            dx = split(grad)[rank]
        if mean_flag:
            dx = F.tensor_mul(dx, scale)
        return (dx,)

    return bprop
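
# Why the fused branch is equivalent: AllReduce-ing the full gradient and then
# taking this rank's slice equals a ReduceScatter(SUM). A minimal NumPy
# simulation of a two-rank group (no MindSpore required):
import numpy as np

dev_num = 2
douts = [np.arange(4.0) + 10 * r for r in range(dev_num)]  # per-rank dout

# Fused path: "AllReduce(SUM)" across ranks, each rank keeps its own chunk.
summed = np.sum(douts, axis=0)
fused = [np.split(summed, dev_num)[r] for r in range(dev_num)]

# Direct path: "ReduceScatter(SUM)" sums the r-th chunk of every rank's dout.
scattered = [np.sum([np.split(d, dev_num)[r] for d in douts], axis=0)
             for r in range(dev_num)]

for r in range(dev_num):
    assert np.allclose(fused[r], scattered[r])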

# Also from grad_comm_ops.py; _MiniStepAllGather is the AllGather variant used
# with gradient accumulation, where z is the accumulation buffer.
def get_bprop_mini_step_all_gather(self):
    """Generate bprop for _MiniStepAllGather"""
    fusion = self.get_attr_dict()["fusion"]
    mean_flag = self.get_attr_dict()["mean_flag"]
    do_mirror = self.get_attr_dict()["do_mirror"]
    scale = 1 / self.rank_size
    all_reduce = AllReduce(ReduceOp.SUM, self.group).add_prim_attr("fusion", fusion)
    if self.instance_name:
        instance_name = "grad_" + self.instance_name
        all_reduce.set_prim_instance_name(instance_name)
    rank = get_rank(self.group)
    dev_num = get_group_size(self.group)
    split = P.Split(output_num=dev_num)

    def bprop(x, z, out, dout):
        if do_mirror:
            if mean_flag:
                # Fold the incoming gradient into the accumulation buffer, then
                # reduce across the group and keep this rank's (averaged) slice.
                z = F.depend(z, F.assign_add(z, dout))
                grad = all_reduce(z)
                dx = split(grad)[rank]
                dx = F.tensor_mul(dx, scale)
            else:
                z = F.depend(z, F.assign_add(z, dout))
                grad = all_reduce(z)
                dx = split(grad)[rank]
        else:
            # No mirroring on this mini-step: pass the gradient through.
            dx = dout
        return (dx, zeros_like(z))

    return bprop
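
# A hypothetical single-rank walkthrough of the accumulation schedule (plain
# NumPy; "AllReduce(SUM)" is simulated by multiplying by rank_size, which
# assumes every rank holds an identical buffer z):
import numpy as np

rank_size, rank = 2, 0
z = np.zeros(4)                                  # accumulation buffer
mini_step_grads = [np.ones(4), 2 * np.ones(4)]   # two mini-steps

for step, dout in enumerate(mini_step_grads):
    do_mirror = step == len(mini_step_grads) - 1  # communicate on the last step
    z += dout                                     # F.assign_add(z, dout)
    if do_mirror:
        grad = rank_size * z                              # stand-in AllReduce(SUM)
        dx = np.split(grad, rank_size)[rank] / rank_size  # split + mean scaling
    else:
        dx = dout
print(dx)  # accumulated, averaged gradient slice for this rank: [3. 3.]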

# MindSpore 1.x import paths for the dataset pipeline below:
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C
import mindspore.dataset.vision.c_transforms as vision
from mindspore.communication import get_rank, get_group_size


def create_dataset(data_path, repeat_num=1, batch_size=32, rank_id=0, rank_size=1):
    """create dataset"""
    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # get rank_id and rank_size from the initialized communication group; the
    # values passed as arguments are overridden so each process reads its own shard
    rank_id = get_rank()
    rank_size = get_group_size()
    data_set = ds.Cifar10Dataset(data_path, num_shards=rank_size, shard_id=rank_id)

    # define map operations
    random_crop_op = vision.RandomCrop((32, 32), (4, 4, 4, 4))
    random_horizontal_op = vision.RandomHorizontalFlip()
    resize_op = vision.Resize((resize_height, resize_width))
    rescale_op = vision.Rescale(rescale, shift)
    # standard CIFAR-10 per-channel (R, G, B) mean/std after rescaling to [0, 1]
    normalize_op = vision.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    changeswap_op = vision.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    data_set = data_set.map(operations=type_cast_op, input_columns="label")
    data_set = data_set.map(operations=c_trans, input_columns="image")

    # apply shuffle operations (note: a buffer of 10 gives only very weak shuffling)
    data_set = data_set.shuffle(buffer_size=10)

    # apply batch operations
    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)

    # apply repeat operations
    data_set = data_set.repeat(repeat_num)

    return data_set
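
# A hedged usage sketch: each process must join the communication group before
# building its shard (e.g. launched with mpirun on GPU, or with a rank table on
# Ascend); the CIFAR-10 path below is a placeholder.
from mindspore.communication import init

init()  # get_rank()/get_group_size() inside create_dataset require this
dataset = create_dataset("/path/to/cifar-10-batches-bin", repeat_num=1, batch_size=32)
for images, labels in dataset.create_tuple_iterator():
    print(images.shape, labels.shape)  # (32, 3, 224, 224) (32,)
    break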

# `args`, `config`, `device_num` and the model-side helpers (Inceptionv4,
# create_dataset, generate_cosine_lr) are module-level names from the
# surrounding training script; see the sketch after this function.
def inception_v4_train():
    """
    Train Inceptionv4 in data parallelism
    """
    print('epoch_size: {} batch_size: {} class_num {}'.format(
        config.epoch_size, config.batch_size, config.num_classes))

    context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)
    if args.platform == "Ascend":
        context.set_context(device_id=args.device_id)
        context.set_context(enable_graph_kernel=False)

    rank = 0
    if device_num > 1:
        if args.platform == "Ascend":
            init(backend_name='hccl')
        elif args.platform == "GPU":
            init()
        else:
            raise ValueError("Unsupported device target.")
        rank = get_rank()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          all_reduce_fusion_config=[200, 400])

    # create dataset
    train_dataset = create_dataset(dataset_path=args.dataset_path, do_train=True,
                                   repeat_num=1, batch_size=config.batch_size, shard_id=rank)
    train_step_size = train_dataset.get_dataset_size()

    # create model
    net = Inceptionv4(classes=config.num_classes)
    # loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # learning rate
    lr = Tensor(generate_cosine_lr(steps_per_epoch=train_step_size, total_epochs=config.epoch_size))

    # apply weight decay to conv/matmul weights only, and re-initialize them with Xavier
    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            param.set_data(initializer(XavierUniform(), param.data.shape, param.data.dtype))

    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                    {'params': no_decayed_params},
                    {'order_params': net.trainable_params()}]
    opt = RMSProp(group_params, lr, decay=config.decay, epsilon=config.epsilon,
                  weight_decay=config.weight_decay, momentum=config.momentum,
                  loss_scale=config.loss_scale)

    if args.device_id == 0:
        print(lr)
        print(train_step_size)

    if args.resume:
        ckpt = load_checkpoint(args.resume)
        load_param_into_net(net, ckpt)

    loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    if args.platform == "Ascend":
        model = Model(net, loss_fn=loss, optimizer=opt,
                      metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager, amp_level=config.amp_level)
    elif args.platform == "GPU":
        model = Model(net, loss_fn=loss, optimizer=opt,
                      metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager, amp_level='O0')
    else:
        raise ValueError("Unsupported device target.")

    # define callbacks
    performance_cb = TimeMonitor(data_size=train_step_size)
    loss_cb = LossMonitor(per_print_times=train_step_size)
    ckp_save_step = config.save_checkpoint_epochs * train_step_size
    config_ck = CheckpointConfig(save_checkpoint_steps=ckp_save_step,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionV4-train-rank{rank}",
                                 directory='ckpts_rank_' + str(rank), config=config_ck)
    callbacks = [performance_cb, loss_cb]
    # when distributed and saving on master only, write checkpoints from device 0
    if device_num > 1 and config.is_save_on_master:
        if args.device_id == 0:
            callbacks.append(ckpoint_cb)
    else:
        callbacks.append(ckpoint_cb)

    # train model
    model.train(config.epoch_size, train_dataset, callbacks=callbacks, dataset_sink_mode=True)
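
# A minimal sketch of the module-level glue the function assumes; the flag names
# beyond those read above (platform, device_id, dataset_path, resume) and the
# RANK_SIZE environment variable are assumptions, and `config` is taken to be the
# repository's hyperparameter configuration object.
import argparse
import os

parser = argparse.ArgumentParser(description='InceptionV4 training')
parser.add_argument('--platform', type=str, default='Ascend', choices=('Ascend', 'GPU'))
parser.add_argument('--device_id', type=int, default=0)
parser.add_argument('--dataset_path', type=str, required=True)
parser.add_argument('--resume', type=str, default='')
args = parser.parse_args()

# number of participating devices, typically exported by the distributed launcher
device_num = int(os.getenv('RANK_SIZE', '1'))

if __name__ == '__main__':
    inception_v4_train()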