target = args_opt.device_target
ckpt_save_dir = config.save_checkpoint_path

# init context
context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
if args_opt.parameter_server:
    context.set_ps_context(enable_ps=True)
if args_opt.run_distribute:
    if target == "Ascend":
        device_id = int(os.getenv('DEVICE_ID'))
        context.set_context(device_id=device_id, enable_auto_mixed_precision=True)
        context.set_auto_parallel_context(device_num=args_opt.device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        if args_opt.net == "resnet50" or args_opt.net == "se-resnet50":
            context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
        else:
            context.set_auto_parallel_context(all_reduce_fusion_config=[180, 313])
        init()
    # GPU target
    else:
        init()
        context.set_auto_parallel_context(device_num=get_group_size(),
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
def test():
    """The function of eval."""
    start_time = time.time()
    args = parse_args()

    # logger
    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    rank_id = int(os.environ.get('RANK_ID'))
    args.logger = get_logger(args.outputs_dir, rank_id)

    context.reset_auto_parallel_context()
    parallel_mode = ParallelMode.STAND_ALONE
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      gradients_mean=True, device_num=1)

    args.logger.info('Creating Network....')
    network = YOLOV3DarkNet53(is_training=False)

    config = ConfigYOLOV3DarkNet53()
    if args.testing_shape:
        config.test_img_shape = conver_testing_shape(args)

    # convert fusion network to quantization aware network
    if config.quantization_aware:
        quantizer = QuantizationAwareTraining(bn_fold=True,
                                              per_channel=[True, False],
                                              symmetric=[True, False])
        network = quantizer.quantize(network)

    args.logger.info(args.pretrained)
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('yolo_network.'):
                param_dict_new[key[13:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load_model {} success'.format(args.pretrained))
    else:
        args.logger.info('{} not exists or not a pre-trained file'.format(args.pretrained))
        raise FileNotFoundError(
            '{} not exists or not a pre-trained file'.format(args.pretrained))

    data_root = args.data_root
    ann_file = args.annFile

    ds, data_size = create_yolo_dataset(data_root, ann_file,
                                        is_training=False,
                                        batch_size=args.per_batch_size,
                                        max_epoch=1, device_num=1,
                                        rank=rank_id, shuffle=False,
                                        config=config)

    args.logger.info('testing shape : {}'.format(config.test_img_shape))
    args.logger.info('total {} images to eval'.format(data_size))

    network.set_train(False)

    # init detection engine
    detection = DetectionEngine(args)

    input_shape = Tensor(tuple(config.test_img_shape), ms.float32)
    args.logger.info('Start inference....')
    for i, data in enumerate(ds.create_dict_iterator(num_epochs=1)):
        image = data["image"]
        image_shape = data["image_shape"]
        image_id = data["img_id"]

        prediction = network(image, input_shape)
        output_big, output_me, output_small = prediction
        output_big = output_big.asnumpy()
        output_me = output_me.asnumpy()
        output_small = output_small.asnumpy()
        image_id = image_id.asnumpy()
        image_shape = image_shape.asnumpy()

        detection.detect([output_small, output_me, output_big],
                         args.per_batch_size, image_shape, image_id)
        if i % 1000 == 0:
            args.logger.info('Processing... {:.2f}% '.format(
                i * args.per_batch_size / data_size * 100))

    args.logger.info('Calculating mAP...')
    detection.do_nms_for_results()
    result_file_path = detection.write_result()
    args.logger.info('result file path: {}'.format(result_file_path))
    eval_result = detection.get_eval_result()

    cost_time = time.time() - start_time
    args.logger.info('\n=============coco eval result=========\n' + eval_result)
    args.logger.info('testing cost time {:.2f}h'.format(cost_time / 3600.))
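# --- Illustrative helper (not from the original script) ---
# The checkpoint-key remapping loop above is a recurring pattern in these
# scripts (it also appears with a 'network.' prefix later). A minimal generic
# sketch of the same remapping; the function name and default prefix are
# assumptions for illustration only.
def strip_prefix_sketch(param_dict, prefix='yolo_network.'):
    """Drop optimizer moments and strip a wrapper-cell prefix from checkpoint keys."""
    new_dict = {}
    for key, value in param_dict.items():
        if key.startswith('moments.'):
            # optimizer state, not needed for inference
            continue
        if key.startswith(prefix):
            new_dict[key[len(prefix):]] = value
        else:
            new_dict[key] = value
    return new_dict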
def test_yolov3_darknet53():
    devid = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target="Ascend", device_id=devid)

    rank = 0
    device_num = 1
    lr_init = 0.001
    epoch_size = 3
    batch_size = 32
    loss_scale = 1024
    mindrecord_dir = DATA_DIR

    # It will generate mindrecord files in args_opt.mindrecord_dir,
    # named yolo.mindrecord0, 1, ... file_num.
    if not os.path.isdir(mindrecord_dir):
        raise KeyError("mindrecord path does not exist.")
    data_root = os.path.join(mindrecord_dir, 'train2014')
    annFile = os.path.join(mindrecord_dir, 'annotations/instances_train2014.json')
    # print("yolov3 mindrecord is ", mindrecord_file)
    if not os.path.exists(annFile):
        print("instances_train2014 file does not exist.")
        assert False

    loss_meter = AverageMeter('loss')
    context.reset_auto_parallel_context()
    parallel_mode = ParallelMode.STAND_ALONE
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      gradients_mean=True, device_num=1)

    network = YOLOV3DarkNet53(is_training=True)
    # default is kaiming-normal
    default_recurisive_init(network)
    network = YoloWithLossCell(network)
    print('finish get network')

    config = ConfigYOLOV3DarkNet53()
    label_smooth = 0
    label_smooth_factor = 0.1
    config.label_smooth = label_smooth
    config.label_smooth_factor = label_smooth_factor

    # When creating MindDataset, use the first mindrecord file, such as yolo.mindrecord0.
    print("Create dataset begin!")
    training_shape = [int(416), int(416)]
    config.multi_scale = [training_shape]
    num_samples = 256
    ds, data_size = create_yolo_dataset(image_dir=data_root,
                                        anno_path=annFile, is_training=True,
                                        batch_size=batch_size,
                                        max_epoch=epoch_size,
                                        device_num=device_num, rank=rank,
                                        config=config,
                                        num_samples=num_samples)
    print("Create dataset done!")
    per_batch_size = batch_size
    group_size = 1
    print("data_size:", data_size)
    steps_per_epoch = int(data_size / per_batch_size / group_size)
    print("steps_per_epoch:", steps_per_epoch)

    warmup_epochs = 0.
    max_epoch = epoch_size
    T_max = 1
    eta_min = 0
    lr = warmup_cosine_annealing_lr(lr_init, steps_per_epoch, warmup_epochs,
                                    max_epoch, T_max, eta_min)
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr), momentum=0.9,
                   weight_decay=0.0005, loss_scale=loss_scale)

    network = TrainingWrapper(network, opt)
    network.set_train()

    old_progress = -1
    t_end = time.time()
    data_loader = ds.create_dict_iterator(output_numpy=True)

    train_starttime = time.time()
    time_used_per_epoch = 0
    print("time:", time.time())
    for i, data in enumerate(data_loader):
        images = data["image"]
        input_shape = images.shape[2:4]
        print('iter[{}], shape{}'.format(i, input_shape[0]))
        images = Tensor.from_numpy(images)
        batch_y_true_0 = Tensor.from_numpy(data['bbox1'])
        batch_y_true_1 = Tensor.from_numpy(data['bbox2'])
        batch_y_true_2 = Tensor.from_numpy(data['bbox3'])
        batch_gt_box0 = Tensor.from_numpy(data['gt_box1'])
        batch_gt_box1 = Tensor.from_numpy(data['gt_box2'])
        batch_gt_box2 = Tensor.from_numpy(data['gt_box3'])
        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
        loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2,
                       batch_gt_box0, batch_gt_box1, batch_gt_box2, input_shape)
        loss_meter.update(loss.asnumpy())

        if (i + 1) % steps_per_epoch == 0:
            time_used = time.time() - t_end
            epoch = int(i / steps_per_epoch)
            fps = per_batch_size * (i - old_progress) * group_size / time_used
            if rank == 0:
                print('epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}, time_used:{}'
                      .format(epoch, i, loss_meter, fps, lr[i], time_used))
            t_end = time.time()
            loss_meter.reset()
            old_progress = i
            time_used_per_epoch = time_used

    train_endtime = time.time() - train_starttime
    print('train_time_used:{}'.format(train_endtime))
    expect_loss_value = 3210.0
    loss_value = re.findall(r"\d+\.?\d*", str(loss_meter))
    print('loss_value:{}'.format(loss_value[0]))
    assert float(loss_value[0]) < expect_loss_value
    export_time_used = 20.0
    print('time_used_per_epoch:{}'.format(time_used_per_epoch))
    assert time_used_per_epoch < export_time_used
    print('==========test case passed===========')
                    type=int, default=0, help="Rank id, default: 0.")
args_opt = parser.parse_args()

context.set_context(mode=context.GRAPH_MODE,
                    device_target=args_opt.device_target,
                    device_id=args_opt.device_id)

if __name__ == '__main__':
    if args_opt.run_distribute:
        if args_opt.device_target == "Ascend":
            rank = args_opt.rank_id
            device_num = args_opt.device_num
            context.set_auto_parallel_context(device_num=device_num,
                                              parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True)
            init()
        else:
            init("nccl")
            context.reset_auto_parallel_context()
            rank = get_rank()
            device_num = get_group_size()
            context.set_auto_parallel_context(device_num=device_num,
                                              parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True)
    else:
        rank = 0
        device_num = 1
import os
import sys

from mindspore import Model, context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.train import ParallelMode
from mindspore.communication.management import get_rank, get_group_size, init
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
from src.callbacks import LossCallBack, EvalCallBack
from src.datasets import create_dataset
from src.metrics import AUCMetric
from src.config import WideDeepConfig

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True)
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True)
init()


def get_WideDeep_net(config):
    WideDeep_net = WideDeepModel(config)
    loss_net = NetWithLossClass(WideDeep_net, config)
    train_net = TrainStepWrap(loss_net)
    eval_net = PredictWithSigmoid(WideDeep_net)
    return train_net, eval_net


class ModelBuilder():
    """
    ModelBuilder
    """
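# --- Illustrative usage (not from the original script) ---
# A minimal sketch of how the train/eval cells returned by get_WideDeep_net
# are typically wired into a Model. WideDeepConfig() defaults, the helper
# name, and the "auc" metric key are assumptions for illustration.
def build_model_sketch():
    wide_deep_config = WideDeepConfig()
    train_net, eval_net = get_WideDeep_net(wide_deep_config)
    train_net.set_train()
    auc_metric = AUCMetric()
    # eval_network runs the sigmoid head; the AUC metric consumes its outputs
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    return model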
def main():
    parser = argparse.ArgumentParser(description="YOLOv3 train")
    parser.add_argument("--only_create_dataset", type=bool, default=False,
                        help="If set it true, only create Mindrecord, default is false.")
    parser.add_argument("--distribute", type=bool, default=False,
                        help="Run distribute, default is false.")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--lr", type=float, default=0.001,
                        help="Learning rate, default is 0.001.")
    parser.add_argument("--mode", type=str, default="sink",
                        help="Run sink mode or not, default is sink.")
    parser.add_argument("--epoch_size", type=int, default=10,
                        help="Epoch size, default is 10.")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size, default is 32.")
    parser.add_argument("--pre_trained", type=str, default=None,
                        help="Pretrained checkpoint file path.")
    parser.add_argument("--pre_trained_epoch_size", type=int, default=0,
                        help="Pretrained epoch size.")
    parser.add_argument("--save_checkpoint_epochs", type=int, default=5,
                        help="Save checkpoint epochs, default is 5.")
    parser.add_argument("--loss_scale", type=int, default=1024,
                        help="Loss scale, default is 1024.")
    parser.add_argument("--mindrecord_dir", type=str, default="./Mindrecord",
                        help="Mindrecord directory. If the mindrecord_dir is empty, it will "
                             "generate mindrecord files by image_dir and anno_path. Note if "
                             "mindrecord_dir isn't empty, it will use mindrecord_dir rather "
                             "than image_dir and anno_path. Default is ./Mindrecord.")
    parser.add_argument('--data_url', type=str, default=None, help='Dataset path')
    parser.add_argument('--train_url', type=str, default=None, help='Train output path')
    parser.add_argument("--anno_path", type=str, default="", help="Annotation path.")
    args_opt = parser.parse_args()

    device_id = int(os.getenv('DEVICE_ID'))
    device_num = int(os.getenv('RANK_SIZE'))
    rankid = int(os.getenv('RANK_ID'))
    local_data_url = '/cache/data'
    local_train_url = '/cache/ckpt'
    local_anno_url = '/cache/anno'
    local_mindrecord_url = '/cache/mindrecord'
    mox.file.copy_parallel(args_opt.mindrecord_dir, local_mindrecord_url)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
    if args_opt.distribute:
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True, device_num=device_num)
        init()
        rank = rankid
        local_train_url = os.path.join(local_train_url, str(device_id))
    else:
        rank = 0
        device_num = 1

    print("Start create dataset!")

    # It will generate mindrecord files in args_opt.mindrecord_dir,
    # named yolo.mindrecord0, 1, ... file_num.
    if not os.path.isdir(local_mindrecord_url):
        os.makedirs(local_mindrecord_url)
    prefix = "train.mindrecord"
    mindrecord_file = os.path.join(local_mindrecord_url, prefix + "0")
    if not os.path.exists(mindrecord_file):
        mox.file.copy_parallel(args_opt.data_url, local_data_url)
        if args_opt.anno_path:
            anno_file = os.path.join(local_anno_url, os.path.split(args_opt.anno_path)[1])
            mox.file.copy_parallel(args_opt.anno_path, anno_file)
        if os.path.isdir(local_data_url) or os.path.exists(anno_file):
            print("Create Mindrecord.")
            data_to_mindrecord_byte_image(local_data_url, anno_file, local_mindrecord_url,
                                          prefix=prefix, file_num=8)
            print("Create Mindrecord Done, at {}".format(args_opt.mindrecord_dir))
            mox.file.copy_parallel(local_mindrecord_url, args_opt.mindrecord_dir)
        else:
            print("image_dir or anno_path does not exist.")

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)
        # When creating MindDataset, use the first mindrecord file, such as yolo.mindrecord0.
        dataset = create_yolo_dataset(mindrecord_file,
                                      repeat_num=args_opt.epoch_size,
                                      batch_size=args_opt.batch_size,
                                      device_num=device_num, rank=rank)
        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        net = yolov3_resnet18(ConfigYOLOV3ResNet18())
        net = YoloWithLossCell(net, ConfigYOLOV3ResNet18())
        init_net_param(net, "XavierUniform")

        # checkpoint
        ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size *
                                       args_opt.save_checkpoint_epochs)
        ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=local_train_url,
                                     config=ckpt_config)

        if args_opt.pre_trained:
            if args_opt.pre_trained_epoch_size <= 0:
                raise KeyError("pre_trained_epoch_size must be greater than 0.")
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)

        total_epoch_size = 60
        if args_opt.distribute:
            total_epoch_size = 160
        lr = Tensor(get_lr(learning_rate=args_opt.lr,
                           start_step=args_opt.pre_trained_epoch_size * dataset_size,
                           global_step=total_epoch_size * dataset_size,
                           decay_step=1000, decay_rate=0.95, steps=True))
        opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()),
                      lr, loss_scale=loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]

        model = Model(net)
        dataset_sink_mode = False
        if args_opt.mode == "sink":
            print("In sink mode, one epoch returns one loss.")
            dataset_sink_mode = True
        print("Start train YOLOv3, the first epoch will be slower because of the graph compilation.")
        model.train(args_opt.epoch_size, dataset, callbacks=callback,
                    dataset_sink_mode=dataset_sink_mode)
    if device_id == 1:
        mox.file.copy_parallel(local_train_url, args_opt.train_url)
def test_range3():
    context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=dev_num,
                                      global_rank=2)
    net = Net(_w1, 0.0, 4.0, 0.5)
    compile_net(net)
def train(args):
    '''train'''
    print('=============yolov3 start training==================')

    # init distributed
    if args.world_size != 1:
        init()
        args.local_rank = get_rank()
        args.world_size = get_group_size()

    args.batch_size = config.batch_size
    args.warmup_lr = config.warmup_lr
    args.lr_rates = config.lr_rates
    args.lr_steps = config.lr_steps
    args.gamma = config.gamma
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.max_epoch = config.max_epoch
    args.log_interval = config.log_interval
    args.ckpt_path = config.ckpt_path
    args.ckpt_interval = config.ckpt_interval

    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    print('args.outputs_dir', args.outputs_dir)
    args.logger = get_logger(args.outputs_dir, args.local_rank)

    if args.world_size != 8:
        args.lr_steps = [i * 8 // args.world_size for i in args.lr_steps]
    if args.world_size == 1:
        args.weight_decay = 0.
    if args.world_size != 1:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      device_num=args.world_size, gradients_mean=True)
    mindrecord_path = args.mindrecord_path
    num_classes = config.num_classes
    anchors = config.anchors
    anchors_mask = config.anchors_mask
    num_anchors_list = [len(x) for x in anchors_mask]

    momentum = args.momentum
    args.logger.info('train opt momentum:{}'.format(momentum))
    weight_decay = args.weight_decay * float(args.batch_size)
    args.logger.info('real weight_decay:{}'.format(weight_decay))
    lr_scale = args.world_size / 8
    args.logger.info('lr_scale:{}'.format(lr_scale))

    # dataloader
    args.logger.info('start create dataloader')
    epoch = args.max_epoch
    # per-head mask/target/gt columns, in the same order as the original
    # explicit column list (head 0, head 1, head 2)
    target_columns = ["image", "annotation"]
    for head in range(3):
        target_columns += ['coord_mask_{}'.format(head), 'conf_pos_mask_{}'.format(head),
                           'conf_neg_mask_{}'.format(head), 'cls_mask_{}'.format(head),
                           't_coord_{}'.format(head), 't_conf_{}'.format(head),
                           't_cls_{}'.format(head), 'gt_list_{}'.format(head)]
    ds = de.MindDataset(mindrecord_path + "0",
                        columns_list=["image", "annotation"],
                        num_shards=args.world_size,
                        shard_id=args.local_rank)
    ds = ds.map(input_columns=["image", "annotation"],
                output_columns=target_columns,
                column_order=target_columns,
                operations=compose_map_func,
                num_parallel_workers=16,
                python_multiprocessing=True)

    ds = ds.batch(args.batch_size, drop_remainder=True, num_parallel_workers=8)
    args.steps_per_epoch = ds.get_dataset_size()
    lr = warmup_step_new(args, lr_scale=lr_scale)
    ds = ds.repeat(epoch)
    args.logger.info('args.steps_per_epoch:{}'.format(args.steps_per_epoch))
    args.logger.info('args.world_size:{}'.format(args.world_size))
    args.logger.info('args.local_rank:{}'.format(args.local_rank))
    args.logger.info('end create dataloader')
    args.logger.save_args(args)
    args.logger.important_info('start create network')
    create_network_start = time.time()

    # backbone and loss
    network = backbone_HwYolov3(num_classes, num_anchors_list, args)
    criterion0 = YoloLoss(num_classes, anchors, anchors_mask[0], 64, 0, head_idx=0.0)
    criterion1 = YoloLoss(num_classes, anchors, anchors_mask[1], 32, 0, head_idx=1.0)
    criterion2 = YoloLoss(num_classes, anchors, anchors_mask[2], 16, 0, head_idx=2.0)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    train_net = BuildTrainNetworkV2(network, criterion0, criterion1, criterion2, args)

    # optimizer
    opt = Momentum(params=train_net.trainable_params(), learning_rate=Tensor(lr),
                   momentum=momentum, weight_decay=weight_decay)

    # package training process
    train_net = TrainOneStepWithLossScaleCell(train_net, opt)
    train_net.set_broadcast_flag()

    # checkpoint
    ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
    train_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                    keep_checkpoint_max=ckpt_max_num)
    ckpt_cb = ModelCheckpoint(config=train_config, directory=args.outputs_dir,
                              prefix='{}'.format(args.local_rank))
    cb_params = _InternalCallbackParam()
    cb_params.train_network = train_net
    cb_params.epoch_num = ckpt_max_num
    cb_params.cur_epoch_num = 1
    run_context = RunContext(cb_params)
    ckpt_cb.begin(run_context)

    train_net.set_train()
    t_end = time.time()
    t_epoch = time.time()
    old_progress = -1
    i = 0
    scale_manager = DynamicLossScaleManager(init_loss_scale=2 ** 10, scale_factor=2,
                                            scale_window=2000)
    for data in ds.create_tuple_iterator(output_numpy=True):
        batch_images = data[0]
        batch_labels = data[1]
        # columns 2..25 are the per-head masks, targets and gt lists,
        # in the same order as target_columns above
        input_list = [Tensor(batch_images, mstype.float32)]
        for idx in range(2, 26):
            input_list.append(Tensor(data[idx].astype(np.float32)))

        scaling_sens = Tensor(scale_manager.get_loss_scale(), dtype=mstype.float32)
        loss0, overflow, _ = train_net(*input_list, scaling_sens)
        overflow = np.all(overflow.asnumpy())
        if overflow:
            scale_manager.update_loss_scale(overflow)
        else:
            scale_manager.update_loss_scale(False)
        args.logger.info('rank[{}], iter[{}], loss[{}], overflow:{}, loss_scale:{}, lr:{}, '
                         'batch_images:{}, batch_labels:{}'.format(
                             args.local_rank, i, loss0, overflow, scaling_sens, lr[i],
                             batch_images.shape, batch_labels.shape))

        # save ckpt
        cb_params.cur_step_num = i + 1  # current step number
        cb_params.batch_num = i + 2
        if args.local_rank == 0:
            ckpt_cb.step_end(run_context)

        # save log
        if i == 0:
            time_for_graph_compile = time.time() - create_network_start
            args.logger.important_info('Yolov3, graph compile time={:.2f}s'.format(
                time_for_graph_compile))
        if i % args.steps_per_epoch == 0:
            cb_params.cur_epoch_num += 1
        if i % args.log_interval == 0 and args.local_rank == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * (i - old_progress) * args.world_size / time_used
            args.logger.info('epoch[{}], iter[{}], loss:[{}], {:.2f} imgs/sec'.format(
                epoch, i, loss0, fps))
            t_end = time.time()
            old_progress = i
        if i % args.steps_per_epoch == 0 and args.local_rank == 0:
            epoch_time_used = time.time() - t_epoch
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * args.world_size * args.steps_per_epoch / epoch_time_used
            args.logger.info('=================================================')
            args.logger.info('epoch time: epoch[{}], iter[{}], {:.2f} imgs/sec'.format(
                epoch, i, fps))
            args.logger.info('=================================================')
            t_epoch = time.time()
        i = i + 1

    args.logger.info('=============yolov3 training finished==================')
def main():
    """Main entrance for training"""
    args = parser.parse_args()
    print(sys.argv)

    devid, args.rank_id, args.rank_size = 0, 0, 1
    context.set_context(mode=context.GRAPH_MODE)

    if args.distributed:
        if args.GPU:
            init("nccl")
            context.set_context(device_target='GPU')
        else:
            init()
            devid = int(os.getenv('DEVICE_ID'))
            context.set_context(device_target='Ascend', device_id=devid,
                                reserve_class_name_in_scope=False)
        context.reset_auto_parallel_context()
        args.rank_id = get_rank()
        args.rank_size = get_group_size()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          device_num=args.rank_size)
    else:
        if args.GPU:
            context.set_context(device_target='GPU')

    is_master = not args.distributed or (args.rank_id == 0)

    # parse model argument
    assert args.model.startswith("tinynet"), "Only Tinynet models are supported."
    _, sub_name = args.model.split("_")
    net = tinynet(sub_model=sub_name,
                  num_classes=args.num_classes,
                  drop_rate=args.drop,
                  drop_connect_rate=args.drop_connect,
                  global_pool="avg",
                  bn_tf=args.bn_tf,
                  bn_momentum=args.bn_momentum,
                  bn_eps=args.bn_eps)

    if is_master:
        print("Total number of parameters:", count_params(net))
    # input image size of the network
    input_size = net.default_cfg['input_size'][1]

    train_dataset = val_dataset = None
    train_data_url = os.path.join(args.data_path, 'train')
    val_data_url = os.path.join(args.data_path, 'val')
    val_dataset = create_dataset_val(args.batch_size,
                                     val_data_url,
                                     workers=args.workers,
                                     distributed=False,
                                     input_size=input_size)

    if args.train:
        train_dataset = create_dataset(args.batch_size,
                                       train_data_url,
                                       workers=args.workers,
                                       distributed=args.distributed,
                                       input_size=input_size)
        batches_per_epoch = train_dataset.get_dataset_size()

    loss = LabelSmoothingCrossEntropy(smooth_factor=args.smoothing,
                                      num_classes=args.num_classes)
    time_cb = TimeMonitor(data_size=batches_per_epoch)
    loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    lr_array = get_lr(base_lr=args.lr, total_epochs=args.epochs,
                      steps_per_epoch=batches_per_epoch,
                      decay_epochs=args.decay_epochs,
                      decay_rate=args.decay_rate,
                      warmup_epochs=args.warmup_epochs,
                      warmup_lr_init=args.warmup_lr,
                      global_epoch=0)
    lr = Tensor(lr_array)

    loss_cb = LossMonitor(lr_array, args.epochs,
                          per_print_times=args.per_print_times,
                          start_epoch=0)

    param_group = add_weight_decay(net, weight_decay=args.weight_decay)

    if args.opt == 'sgd':
        if is_master:
            print('Using SGD optimizer')
        optimizer = SGD(param_group, learning_rate=lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay,
                        loss_scale=args.loss_scale)
    elif args.opt == 'rmsprop':
        if is_master:
            print('Using rmsprop optimizer')
        optimizer = RMSProp(param_group, learning_rate=lr,
                            decay=0.9,
                            weight_decay=args.weight_decay,
                            momentum=args.momentum,
                            epsilon=args.opt_eps,
                            loss_scale=args.loss_scale)

    loss.add_flags_recursive(fp32=True, fp16=False)
    eval_metrics = {'Validation-Loss': Loss(),
                    'Top1-Acc': Top1CategoricalAccuracy(),
                    'Top5-Acc': Top5CategoricalAccuracy()}

    if args.ckpt:
        ckpt = load_checkpoint(args.ckpt)
        load_param_into_net(net, ckpt)
        net.set_train(False)

    model = Model(net, loss, optimizer, metrics=eval_metrics,
                  loss_scale_manager=loss_scale_manager,
                  amp_level=args.amp_level)

    net_ema = copy.deepcopy(net)
    net_ema.set_train(False)
    assert args.ema_decay > 0, "EMA should be used in tinynet training."

    ema_cb = EmaEvalCallBack(model=model,
                             ema_network=net_ema,
                             loss_fn=loss,
                             eval_dataset=val_dataset,
                             decay=args.ema_decay,
                             save_epoch=args.ckpt_save_epoch,
                             dataset_sink_mode=args.dataset_sink,
                             start_epoch=0)

    callbacks = [loss_cb, ema_cb, time_cb] if is_master else []

    if is_master:
        print("Training on " + args.model + " with " + str(args.num_classes) + " classes")

    model.train(args.epochs, train_dataset, callbacks=callbacks,
                dataset_sink_mode=args.dataset_sink)
def test_set_auto_parallel_context():
    context.set_auto_parallel_context(device_num=4, global_rank=3, gradients_mean=True,
                                      gradient_fp32_sync=False, parallel_mode="auto_parallel",
                                      parameter_broadcast=False)
    device_num = context.get_auto_parallel_context("device_num")
    global_rank = context.get_auto_parallel_context("global_rank")
    gradients_mean = context.get_auto_parallel_context("gradients_mean")
    gradient_fp32_sync = context.get_auto_parallel_context("gradient_fp32_sync")
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    parameter_broadcast = context.get_auto_parallel_context("parameter_broadcast")
    assert device_num == 4
    assert global_rank == 3
    assert gradients_mean
    assert not gradient_fp32_sync
    assert parallel_mode == "auto_parallel"
    assert not parameter_broadcast

    auto_parallel_context().set_device_num(4)
    device_num = auto_parallel_context().get_device_num()
    device_num_is_set = auto_parallel_context().get_device_num_is_set()
    assert device_num == 4
    assert device_num_is_set

    auto_parallel_context().set_global_rank(4)
    global_rank = auto_parallel_context().get_global_rank()
    assert global_rank == 4

    auto_parallel_context().set_gradients_mean(True)
    gradients_mean = auto_parallel_context().get_gradients_mean()
    assert gradients_mean

    auto_parallel_context().set_gradient_fp32_sync(False)
    gradient_fp32_sync = auto_parallel_context().get_gradient_fp32_sync()
    assert not gradient_fp32_sync

    parameter_broadcast_is_set = auto_parallel_context().get_parameter_broadcast_is_set()
    assert parameter_broadcast_is_set

    with pytest.raises(ValueError):
        context.set_auto_parallel_context(device_num=0)
    with pytest.raises(ValueError):
        context.set_auto_parallel_context(device_num=4097)
    with pytest.raises(ValueError):
        context.set_auto_parallel_context(global_rank=-1)
    with pytest.raises(ValueError):
        context.set_auto_parallel_context(parallel_mode="wrong_mode")
    with pytest.raises(ValueError):
        context.set_auto_parallel_context(global_rank=4096)
    with pytest.raises(ValueError):
        set_algo_parameters(tensor_slice_align_size=0)
    with pytest.raises(ValueError):
        set_algo_parameters(tensor_slice_align_size=1025)

    context.set_auto_parallel_context(enable_parallel_optimizer=True)
    assert context.get_auto_parallel_context("enable_parallel_optimizer")
    assert not auto_parallel_context().get_all_reduce_fusion_split_indices()
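# --- Illustrative follow-up (not part of the test above) ---
# A minimal sketch, assuming MindSpore's documented defaults: resetting the
# auto parallel context restores device_num=1 and global_rank=0, which is how
# tests avoid leaking parallel configuration into each other.
def reset_context_sketch():
    context.reset_auto_parallel_context()
    assert context.get_auto_parallel_context("device_num") == 1
    assert context.get_auto_parallel_context("global_rank") == 0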
if args_opt.device_target == "Ascend": device_id = int(os.getenv('DEVICE_ID', '0')) rank_id = int(os.getenv('RANK_ID', '0')) rank_size = int(os.getenv('RANK_SIZE', '1')) run_distribute = rank_size > 1 context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id, save_graphs=False) elif args_opt.device_target == "GPU": context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False) init() context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) else: raise ValueError("Unsupported device target.") class CrossEntropyWithLabelSmooth(_Loss): """ CrossEntropyWith LabelSmooth. Args: smooth_factor (float): smooth factor, default=0. num_classes (int): num classes Returns: None.
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument('--device_target', type=str, default='Ascend',
                        choices=['Ascend', 'GPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute", type=str, default="false", choices=["true", "false"],
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default=1,
                        help="Epoch size, default is 1.")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt", type=str, default="true",
                        choices=["true", "false"],
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale", type=str, default="true",
                        choices=["true", "false"],
                        help="Use lossscale or not, default is true.")
    parser.add_argument("--do_shuffle", type=str, default="true", choices=["true", "false"],
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true",
                        choices=["true", "false"],
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default=1,
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--accumulation_steps", type=int, default=1,
                        help="Accumulating gradients N times before weight update, default is 1.")
    parser.add_argument("--save_checkpoint_path", type=str, default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path", type=str, default="",
                        help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps", type=int, default=1000,
                        help="Save checkpoint steps, default is 1000.")
    parser.add_argument("--train_steps", type=int, default=-1,
                        help="Training Steps, default is -1, "
                             "meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num", type=int, default=1,
                        help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="",
                        help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init()
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init()
            device_num = D.get_group_size()
            rank = D.get_rank()
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                          gradients_mean=True, device_num=device_num,
                                          auto_parallel_search_mode="recursive_programming")
        _set_bert_all_reduce_split()
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('Gpu only support fp32 temporarily, run with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir,
                             args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = args_opt.epoch_size * ds.get_dataset_size() // args_opt.data_sink_steps
    if args_opt.train_steps > 0:
        train_steps = args_opt.train_steps * args_opt.accumulation_steps
        new_repeat_count = min(new_repeat_count, train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size() \
            // args_opt.accumulation_steps
        logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(args_opt, net_with_loss)
    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(ds.get_dataset_size())]
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=None if ckpt_save_dir == "" else ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)
        if args_opt.accumulation_steps <= 1:
            net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss,
                                                               optimizer=optimizer,
                                                               scale_update_cell=update_cell)
        else:
            accumulation_steps = args_opt.accumulation_steps
            net_with_grads = BertTrainAccumulateStepsWithLossScaleCell(
                net_with_loss, optimizer=optimizer, scale_update_cell=update_cell,
                accumulation_steps=accumulation_steps,
                enable_global_norm=cfg.enable_global_norm)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)

    model = Model(net_with_grads)
    model.train(new_repeat_count, ds, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
    open("bert_4gpu.txt", "w").write(str(_executor._get_shard_strategy(model._train_network)))
def test_auto_parallel_activation():
    context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=16,
                                      global_rank=0)
    strategy1 = ((4, 4), (4, 4))
    strategy2 = None
    net = Net(_w1, strategy1, strategy2)
    compile_net(net)
def test_train():
    """train entry method"""
    if args.is_distributed:
        if args.device_target == "Ascend":
            init()
            context.set_context(device_id=args.device_id)
        elif args.device_target == "GPU":
            init()

        args.rank = get_rank()
        args.group_size = get_group_size()
        device_num = args.group_size
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          parameter_broadcast=True, gradients_mean=True)
    else:
        context.set_context(device_id=args.device_id)
    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)

    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    layers = cfg.layers
    num_factors = cfg.num_factors
    epochs = args.train_epochs

    ds_train, num_train_users, num_train_items = create_dataset(test_train=True,
                                                                data_dir=args.data_path,
                                                                dataset=args.dataset,
                                                                train_epochs=1,
                                                                batch_size=args.batch_size,
                                                                num_neg=args.num_neg)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))

    ncf_net = NCFModel(num_users=num_train_users,
                       num_items=num_train_items,
                       num_factors=num_factors,
                       model_layers=layers,
                       mf_regularization=0,
                       mlp_reg_layers=[0.0, 0.0, 0.0, 0.0],
                       mf_dim=16)
    loss_net = NetWithLossClass(ncf_net)
    train_net = TrainStepWrap(loss_net, ds_train.get_dataset_size() * (epochs + 1))

    train_net.set_train()

    model = Model(train_net)
    callback = LossMonitor(per_print_times=ds_train.get_dataset_size())
    ckpt_config = CheckpointConfig(save_checkpoint_steps=(4970845 + args.batch_size - 1) //
                                   args.batch_size,
                                   keep_checkpoint_max=100)
    ckpoint_cb = ModelCheckpoint(prefix='NCF', directory=args.checkpoint_path,
                                 config=ckpt_config)
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()), callback, ckpoint_cb],
                dataset_sink_mode=True)
def train_process_thor(q, device_id, epoch_size, device_num, enable_hccl):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    context.set_context(device_id=device_id)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH_2
    os.environ['RANK_ID'] = str(device_id - 4)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          all_reduce_fusion_config=[107])
        init()

    # network
    damping = get_model_damping(0, 0.03, 0.87, 50, 5004)
    net = resnet50_thor(class_num=thor_config.class_num, damping=damping,
                        loss_scale=thor_config.loss_scale,
                        frequency=thor_config.frequency)

    # evaluation network
    dist_eval_network = ClassifyCorrectCell(net)

    if not thor_config.label_smooth:
        thor_config.label_smooth_factor = 0.0

    # loss
    loss = CrossEntropySmooth(sparse=True, reduction="mean",
                              smooth_factor=thor_config.label_smooth_factor,
                              num_classes=thor_config.class_num)

    # train dataset
    dataset = create_dataset(dataset_path=dataset_path, do_train=True,
                             repeat_num=1, batch_size=thor_config.batch_size)
    step_size = dataset.get_dataset_size()
    eval_interval = thor_config.eval_interval

    # evaluation dataset
    eval_dataset = create_dataset(dataset_path=eval_path, do_train=False,
                                  repeat_num=1, batch_size=thor_config.eval_batch_size)

    # loss scale
    loss_scale = FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)

    # learning rate
    lr = Tensor(get_model_lr(0, 0.045, 6, 70, 5004))

    # optimizer
    opt = THOR(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
               thor_config.momentum,
               filter(lambda x: 'matrix_A' in x.name, net.get_parameters()),
               filter(lambda x: 'matrix_G' in x.name, net.get_parameters()),
               filter(lambda x: 'A_inv_max' in x.name, net.get_parameters()),
               filter(lambda x: 'G_inv_max' in x.name, net.get_parameters()),
               thor_config.weight_decay, thor_config.loss_scale)

    # model
    model = THOR_Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
                       amp_level="O2", keep_batchnorm_fp32=False,
                       metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size,
                                                    device_num=device_num)},
                       eval_network=dist_eval_network, frequency=thor_config.frequency)

    # model init
    print("init_start", device_id)
    model.init(dataset, eval_dataset)
    print("init_stop", device_id)

    # callbacks
    loss_cb = LossGet(1, step_size)

    # train and eval
    acc = 0.0
    time_cost = 0.0
    print("run_start", device_id)
    for epoch_idx in range(0, int(epoch_size / eval_interval)):
        model.train(eval_interval, dataset, callbacks=loss_cb)
        eval_start = time.time()
        output = model.eval(eval_dataset)
        eval_cost = (time.time() - eval_start) * 1000
        acc = float(output["acc"])
        time_cost = loss_cb.get_per_step_time()
        loss = loss_cb.get_loss()
        print("the {} epoch's resnet result:\n "
              "device{}, training loss {}, acc {}, "
              "training per step cost {:.2f} ms, eval cost {:.2f} ms, total_cost {:.2f} ms"
              .format(epoch_idx, device_id, loss, acc, time_cost, eval_cost,
                      time_cost * step_size + eval_cost))
    q.put({'acc': acc, 'cost': time_cost})
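# --- Illustrative launcher (not from the original test) ---
# A minimal sketch of the queue-based protocol above: each device runs
# train_process_thor in its own process and reports accuracy and per-step
# time through the queue. The concrete arguments (device_id=4, epoch_size=1,
# device_num=8, enable_hccl=True) are assumptions for illustration.
#
#     from multiprocessing import Process, Queue
#     q = Queue()
#     p = Process(target=train_process_thor, args=(q, 4, 1, 8, True))
#     p.start()
#     p.join()
#     result = q.get()  # {'acc': ..., 'cost': ...}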
def test_expand_dims_auto_parallel():
    context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=16,
                                      global_rank=0)
    net = Net(_w1)
    compile_net(net)
def train():
    args = parse_args()

    # init multicards training
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True,
                                          device_num=args.group_size)

    # dataset
    dataset = data_generator.SegDataset(image_mean=args.image_mean,
                                        image_std=args.image_std,
                                        data_file=args.data_file,
                                        batch_size=args.batch_size,
                                        crop_size=args.crop_size,
                                        max_scale=args.max_scale,
                                        min_scale=args.min_scale,
                                        ignore_label=args.ignore_label,
                                        num_classes=args.num_classes,
                                        num_readers=2,
                                        num_parallel_calls=4,
                                        shard_id=args.rank,
                                        shard_num=args.group_size)
    dataset = dataset.get_dataset(repeat=1)

    # network
    if args.model == 'deeplab_v3_s16':
        network = net_factory.nets_map[args.model]('train', args.num_classes, 16, args.freeze_bn)
    elif args.model == 'deeplab_v3_s8':
        network = net_factory.nets_map[args.model]('train', args.num_classes, 8, args.freeze_bn)
    else:
        raise NotImplementedError('model [{:s}] not recognized'.format(args.model))

    # loss
    loss_ = loss.SoftmaxCrossEntropyLoss(args.num_classes, args.ignore_label)
    loss_.add_flags_recursive(fp32=True)
    train_net = BuildTrainNetwork(network, loss_)

    # load pretrained model
    if args.ckpt_pre_trained:
        param_dict = load_checkpoint(args.ckpt_pre_trained)
        load_param_into_net(train_net, param_dict)

    # optimizer
    iters_per_epoch = dataset.get_dataset_size()
    total_train_steps = iters_per_epoch * args.train_epochs
    if args.lr_type == 'cos':
        lr_iter = learning_rates.cosine_lr(args.base_lr, total_train_steps, total_train_steps)
    elif args.lr_type == 'poly':
        lr_iter = learning_rates.poly_lr(args.base_lr, total_train_steps, total_train_steps,
                                         end_lr=0.0, power=0.9)
    elif args.lr_type == 'exp':
        lr_iter = learning_rates.exponential_lr(args.base_lr, args.lr_decay_step,
                                                args.lr_decay_rate, total_train_steps,
                                                staircase=True)
    else:
        raise ValueError('unknown learning rate type')
    opt = nn.Momentum(params=train_net.trainable_params(), learning_rate=lr_iter,
                      momentum=0.9, weight_decay=0.0001, loss_scale=args.loss_scale)

    # loss scale
    manager_loss_scale = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
    model = Model(train_net, optimizer=opt, amp_level="O3",
                  loss_scale_manager=manager_loss_scale)

    # callback for saving ckpts
    time_cb = TimeMonitor(data_size=iters_per_epoch)
    loss_cb = LossMonitor()
    cbs = [time_cb, loss_cb]

    if args.rank == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args.save_steps,
                                     keep_checkpoint_max=args.keep_checkpoint_max)
        ckpoint_cb = ModelCheckpoint(prefix=args.model, directory=args.train_dir,
                                     config=config_ck)
        cbs.append(ckpoint_cb)

    model.train(args.train_epochs, dataset, callbacks=cbs)
def train(args):
    '''train'''
    print('=============yolov3 start training==================')
    devid = int(os.getenv('DEVICE_ID', '0')) if args.run_platform != 'CPU' else 0
    context.set_context(mode=context.GRAPH_MODE, device_target=args.run_platform,
                        save_graphs=False, device_id=devid)

    # init distributed
    if args.world_size != 1:
        init()
        args.local_rank = get_rank()
        args.world_size = get_group_size()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          device_num=args.world_size, gradients_mean=True)
    args.logger = get_logger(args.outputs_dir, args.local_rank)

    # dataloader
    ds = create_dataset(args)

    args.logger.important_info('start create network')
    create_network_start = time.time()

    train_net = define_network(args)

    # checkpoint
    ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
    train_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                    keep_checkpoint_max=ckpt_max_num)
    ckpt_cb = ModelCheckpoint(config=train_config, directory=args.outputs_dir,
                              prefix='{}'.format(args.local_rank))
    cb_params = _InternalCallbackParam()
    cb_params.train_network = train_net
    cb_params.epoch_num = ckpt_max_num
    cb_params.cur_epoch_num = 1
    run_context = RunContext(cb_params)
    ckpt_cb.begin(run_context)

    train_net.set_train()
    t_end = time.time()
    t_epoch = time.time()
    old_progress = -1
    i = 0
    if args.use_loss_scale:
        scale_manager = DynamicLossScaleManager(init_loss_scale=2**10, scale_factor=2,
                                                scale_window=2000)
    for data in ds.create_tuple_iterator(output_numpy=True):
        batch_images = data[0]
        batch_labels = data[1]
        input_list = [Tensor(batch_images, mstype.float32)]
        for idx in range(2, 26):
            input_list.append(Tensor(data[idx], mstype.float32))
        if args.use_loss_scale:
            scaling_sens = Tensor(scale_manager.get_loss_scale(), dtype=mstype.float32)
            loss0, overflow, _ = train_net(*input_list, scaling_sens)
            overflow = np.all(overflow.asnumpy())
            if overflow:
                scale_manager.update_loss_scale(overflow)
            else:
                scale_manager.update_loss_scale(False)
            args.logger.info(
                'rank[{}], iter[{}], loss[{}], overflow:{}, loss_scale:{}, lr:{}, '
                'batch_images:{}, batch_labels:{}'.format(args.local_rank, i, loss0,
                                                          overflow, scaling_sens, args.lr[i],
                                                          batch_images.shape,
                                                          batch_labels.shape))
        else:
            loss0 = train_net(*input_list)
            args.logger.info(
                'rank[{}], iter[{}], loss[{}], lr:{}, batch_images:{}, '
                'batch_labels:{}'.format(args.local_rank, i, loss0, args.lr[i],
                                         batch_images.shape, batch_labels.shape))

        # save ckpt
        cb_params.cur_step_num = i + 1  # current step number
        cb_params.batch_num = i + 2
        if args.local_rank == 0:
            ckpt_cb.step_end(run_context)

        # save log
        if i == 0:
            time_for_graph_compile = time.time() - create_network_start
            args.logger.important_info('Yolov3, graph compile time={:.2f}s'.format(
                time_for_graph_compile))
        if i % args.steps_per_epoch == 0:
            cb_params.cur_epoch_num += 1
        if i % args.log_interval == 0 and args.local_rank == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * (i - old_progress) * args.world_size / time_used
            args.logger.info('epoch[{}], iter[{}], loss:[{}], {:.2f} imgs/sec'.format(
                epoch, i, loss0, fps))
            t_end = time.time()
            old_progress = i
        if i % args.steps_per_epoch == 0 and args.local_rank == 0:
            epoch_time_used = time.time() - t_epoch
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * args.world_size * args.steps_per_epoch / epoch_time_used
            args.logger.info('=================================================')
            args.logger.info('epoch time: epoch[{}], iter[{}], {:.2f} imgs/sec'.format(
                epoch, i, fps))
            args.logger.info('=================================================')
            t_epoch = time.time()
        i = i + 1

    args.logger.info('=============yolov3 training finished==================')
def main():
    parser = argparse.ArgumentParser(description="retinanet training")
    parser.add_argument("--only_create_dataset", type=ast.literal_eval, default=False,
                        help="If set it true, only create Mindrecord, default is False.")
    parser.add_argument("--distribute", type=ast.literal_eval, default=False,
                        help="Run distribute, default is False.")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--lr", type=float, default=0.1,
                        help="Learning rate, default is 0.1.")
    parser.add_argument("--mode", type=str, default="sink",
                        help="Run sink mode or not, default is sink.")
    parser.add_argument("--dataset", type=str, default="coco",
                        help="Dataset, default is coco.")
    parser.add_argument("--epoch_size", type=int, default=500,
                        help="Epoch size, default is 500.")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size, default is 32.")
    parser.add_argument("--pre_trained", type=str, default=None,
                        help="Pretrained Checkpoint file path.")
    parser.add_argument("--pre_trained_epoch_size", type=int, default=0,
                        help="Pretrained epoch size.")
    parser.add_argument("--save_checkpoint_epochs", type=int, default=1,
                        help="Save checkpoint epochs, default is 1.")
    parser.add_argument("--loss_scale", type=int, default=1024,
                        help="Loss scale, default is 1024.")
    parser.add_argument("--filter_weight", type=ast.literal_eval, default=False,
                        help="Filter weight parameters, default is False.")
    parser.add_argument("--run_platform", type=str, default="Ascend", choices=("Ascend",),
                        help="run platform, only support Ascend.")
    args_opt = parser.parse_args()

    if args_opt.run_platform == "Ascend":
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
        if args_opt.distribute:
            if os.getenv("DEVICE_ID", "not_set").isdigit():
                context.set_context(device_id=int(os.getenv("DEVICE_ID")))
            init()
            device_num = args_opt.device_num
            rank = get_rank()
            context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True, device_num=device_num)
        else:
            rank = 0
            device_num = 1
            context.set_context(device_id=args_opt.device_id)
    else:
        raise ValueError("Unsupported platform.")

    mindrecord_file = create_mindrecord(args_opt.dataset, "retinanet.mindrecord", True)

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)
        # When creating MindDataset, use the first mindrecord file, such as retinanet.mindrecord0.
        dataset = create_retinanet_dataset(mindrecord_file, repeat_num=1,
                                           batch_size=args_opt.batch_size,
                                           device_num=device_num, rank=rank)
        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        backbone = resnet50(config.num_classes)
        retinanet = retinanet50(backbone, config)
        net = retinanetWithLossCell(retinanet, config)
        net.to_float(mindspore.float16)
        init_net_param(net)

        if args_opt.pre_trained:
            if args_opt.pre_trained_epoch_size <= 0:
                raise KeyError("pre_trained_epoch_size must be greater than 0.")
            param_dict = load_checkpoint(args_opt.pre_trained)
            if args_opt.filter_weight:
                filter_checkpoint_parameter(param_dict)
            load_param_into_net(net, param_dict)

        lr = Tensor(get_lr(global_step=config.global_step,
                           lr_init=config.lr_init,
                           lr_end=config.lr_end_rate * args_opt.lr,
                           lr_max=args_opt.lr,
                           warmup_epochs1=config.warmup_epochs1,
                           warmup_epochs2=config.warmup_epochs2,
                           warmup_epochs3=config.warmup_epochs3,
                           warmup_epochs4=config.warmup_epochs4,
                           warmup_epochs5=config.warmup_epochs5,
                           total_epochs=args_opt.epoch_size,
                           steps_per_epoch=dataset_size))
        opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                          config.momentum, config.weight_decay, loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        model = Model(net)
        print("Start train retinanet, the first epoch will be slower because of the graph compilation.")
        cb = [TimeMonitor(), LossMonitor()]
        cb += [Monitor(lr_init=lr.asnumpy())]
        config_ck = CheckpointConfig(save_checkpoint_steps=dataset_size *
                                     args_opt.save_checkpoint_epochs,
                                     keep_checkpoint_max=config.keep_checkpoint_max)
        ckpt_cb = ModelCheckpoint(prefix="retinanet", directory=config.save_checkpoint_path,
                                  config=config_ck)
        if args_opt.distribute:
            if rank == 0:
                cb += [ckpt_cb]
            model.train(args_opt.epoch_size, dataset, callbacks=cb, dataset_sink_mode=True)
        else:
            cb += [ckpt_cb]
            model.train(args_opt.epoch_size, dataset, callbacks=cb, dataset_sink_mode=True)
def main():
    parser = argparse.ArgumentParser(description="SSD training")
    parser.add_argument("--only_create_dataset", type=ast.literal_eval, default=False,
                        help="If set it true, only create Mindrecord, default is False.")
    parser.add_argument("--distribute", type=ast.literal_eval, default=False,
                        help="Run distribute, default is False.")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--lr", type=float, default=0.05,
                        help="Learning rate, default is 0.05.")
    parser.add_argument("--mode", type=str, default="sink",
                        help="Run sink mode or not, default is sink.")
    parser.add_argument("--dataset", type=str, default="coco",
                        help="Dataset, default is coco.")
    parser.add_argument("--epoch_size", type=int, default=500,
                        help="Epoch size, default is 500.")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size, default is 32.")
    parser.add_argument("--pre_trained", type=str, default=None,
                        help="Pretrained Checkpoint file path.")
    parser.add_argument("--pre_trained_epoch_size", type=int, default=0,
                        help="Pretrained epoch size.")
    parser.add_argument("--save_checkpoint_epochs", type=int, default=10,
                        help="Save checkpoint epochs, default is 10.")
    parser.add_argument("--loss_scale", type=int, default=1024,
                        help="Loss scale, default is 1024.")
    parser.add_argument("--filter_weight", type=ast.literal_eval, default=False,
                        help="Filter weight parameters, default is False.")
    parser.add_argument("--run_platform", type=str, default="Ascend", choices=("Ascend", "GPU"),
                        help="run platform, only support Ascend and GPU.")
    args_opt = parser.parse_args()

    if args_opt.run_platform == "Ascend":
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                            device_id=args_opt.device_id)
        if args_opt.distribute:
            device_num = args_opt.device_num
            context.reset_auto_parallel_context()
            context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True, device_num=device_num)
            init()
            rank = args_opt.device_id % device_num
        else:
            rank = 0
            device_num = 1
    elif args_opt.run_platform == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU",
                            device_id=args_opt.device_id)
        init()
        if args_opt.distribute:
            device_num = args_opt.device_num
            context.reset_auto_parallel_context()
            context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True, device_num=device_num)
            rank = get_rank()
        else:
            rank = 0
            device_num = 1
    else:
        raise ValueError("Unsupported platform.")

    print("Start create dataset!")

    # It will generate mindrecord files in args_opt.mindrecord_dir,
    # named ssd.mindrecord0, 1, ... file_num.
    prefix = "ssd.mindrecord"
    mindrecord_dir = config.mindrecord_dir
    mindrecord_file = os.path.join(mindrecord_dir, prefix + "0")
    if not os.path.exists(mindrecord_file):
        if not os.path.isdir(mindrecord_dir):
            os.makedirs(mindrecord_dir)
        if args_opt.dataset == "coco":
            if os.path.isdir(config.coco_root):
                print("Create Mindrecord.")
                data_to_mindrecord_byte_image("coco", True, prefix)
                print("Create Mindrecord Done, at {}".format(mindrecord_dir))
            else:
                print("coco_root does not exist.")
        elif args_opt.dataset == "voc":
            if os.path.isdir(config.voc_dir):
                print("Create Mindrecord.")
                voc_data_to_mindrecord(mindrecord_dir, True, prefix)
                print("Create Mindrecord Done, at {}".format(mindrecord_dir))
            else:
                print("voc_dir does not exist.")
        else:
            if os.path.isdir(config.image_dir) and os.path.exists(config.anno_path):
                print("Create Mindrecord.")
                data_to_mindrecord_byte_image("other", True, prefix)
                print("Create Mindrecord Done, at {}".format(mindrecord_dir))
            else:
                print("image_dir or anno_path does not exist.")

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)
        # When creating MindDataset, use the first mindrecord file, such as ssd.mindrecord0.
        dataset = create_ssd_dataset(mindrecord_file, repeat_num=1,
                                     batch_size=args_opt.batch_size,
                                     device_num=device_num, rank=rank)
        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        backbone = ssd_mobilenet_v2()
        ssd = SSD300(backbone=backbone, config=config)
        if args_opt.run_platform == "GPU":
            ssd.to_float(dtype.float16)
        net = SSDWithLossCell(ssd, config)
        init_net_param(net)

        # checkpoint
        ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size *
                                       args_opt.save_checkpoint_epochs)
        save_ckpt_path = './ckpt_' + str(rank) + '/'
        ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=save_ckpt_path,
                                     config=ckpt_config)

        if args_opt.pre_trained:
            if args_opt.pre_trained_epoch_size <= 0:
                raise KeyError("pre_trained_epoch_size must be greater than 0.")
            param_dict = load_checkpoint(args_opt.pre_trained)
            if args_opt.filter_weight:
                filter_checkpoint_parameter(param_dict)
            load_param_into_net(net, param_dict)

        lr = Tensor(get_lr(global_step=config.global_step,
                           lr_init=config.lr_init,
                           lr_end=config.lr_end_rate * args_opt.lr,
                           lr_max=args_opt.lr,
                           warmup_epochs=config.warmup_epochs,
                           total_epochs=args_opt.epoch_size,
                           steps_per_epoch=dataset_size))
        opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                          config.momentum, config.weight_decay, loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
        model = Model(net)
        dataset_sink_mode = False
        if args_opt.mode == "sink":
            print("In sink mode, one epoch returns one loss.")
            dataset_sink_mode = True
        print("Start train SSD, the first epoch will be slower because of the graph compilation.")
        model.train(args_opt.epoch_size, dataset, callbacks=callback,
                    dataset_sink_mode=dataset_sink_mode)
def test_two_matmul(): class Net(nn.Cell): def __init__(self): super().__init__() self.matmul1 = P.MatMul() self.matmul2 = P.MatMul() def construct(self, x, y, b): out = self.matmul1(x, y) out = self.matmul2(out, b) return out size = 16 context.set_auto_parallel_context(device_num=size, global_rank=0) cost_model_context.set_cost_model_context(device_memory_capacity= 32.0 * 1024.0 * 1024.0 * 1024.0, costmodel_alpha=1.0, costmodel_beta=60.0, costmodel_gamma=0.1, costmodel_communi_threshold=1024.0, costmodel_communi_const=2222.0, costmodel_communi_bias=1111.0) dev_mem_cap = cost_model_context.get_cost_model_context("device_memory_capacity") assert dev_mem_cap == 32.0 * 1024.0 * 1024.0 * 1024.0 costmodel_alpha = cost_model_context.get_cost_model_context("costmodel_alpha") assert costmodel_alpha == 1.0 costmodel_beta = cost_model_context.get_cost_model_context("costmodel_beta") assert costmodel_beta == 60.0 costmodel_gamma = cost_model_context.get_cost_model_context("costmodel_gamma") assert costmodel_gamma == 0.1 costmodel_communi_threshold = cost_model_context.get_cost_model_context("costmodel_communi_threshold") assert costmodel_communi_threshold == 1024.0 costmodel_communi_const = cost_model_context.get_cost_model_context("costmodel_communi_const") assert costmodel_communi_const == 2222.0 costmodel_communi_bias = cost_model_context.get_cost_model_context("costmodel_communi_bias") assert costmodel_communi_bias == 1111.0 cost_model_context.reset_cost_model_context() dev_mem_cap = cost_model_context.get_cost_model_context("device_memory_capacity") assert dev_mem_cap == 16.0 * 1024.0 * 1024.0 * 1024.0 costmodel_alpha = cost_model_context.get_cost_model_context("costmodel_alpha") assert costmodel_alpha == 1.0 costmodel_beta = cost_model_context.get_cost_model_context("costmodel_beta") assert costmodel_beta == 65.0 costmodel_gamma = cost_model_context.get_cost_model_context("costmodel_gamma") assert costmodel_gamma == 0.02 costmodel_communi_threshold = cost_model_context.get_cost_model_context("costmodel_communi_threshold") assert costmodel_communi_threshold == 2048.0 costmodel_communi_const = cost_model_context.get_cost_model_context("costmodel_communi_const") assert costmodel_communi_const == 3072.0 costmodel_communi_bias = cost_model_context.get_cost_model_context("costmodel_communi_bias") assert costmodel_communi_bias == 1024.0 set_algo_parameters(simplify_cal=True, tensor_slice_align_enable=False, tensor_slice_align_size=32, not_fully_use_devices=True, elementwise_op_strategy_follow=False) para_simplify_cal = get_algo_parameters("simplify_cal") assert para_simplify_cal == True para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable") assert para_slice_align_enable == False para_slice_align_size = get_algo_parameters("tensor_slice_align_size") assert para_slice_align_size == 32 not_fully_use_devices = get_algo_parameters("not_fully_use_devices") assert not_fully_use_devices == True elementwise_op_strategy_follow = get_algo_parameters("elementwise_op_strategy_follow") assert elementwise_op_strategy_follow == False reset_algo_parameters() para_simplify_cal = get_algo_parameters("simplify_cal") assert para_simplify_cal == True para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable") assert para_slice_align_enable == False para_slice_align_size = get_algo_parameters("tensor_slice_align_size") assert para_slice_align_size == 16 not_fully_use_devices = get_algo_parameters("not_fully_use_devices") assert not_fully_use_devices == False elementwise_op_strategy_follow 
= get_algo_parameters("elementwise_op_strategy_follow") assert elementwise_op_strategy_follow == False x = Tensor(np.ones([128, 32]), dtype=ms.float32) y = Tensor(np.ones([32, 64]), dtype=ms.float32) b = Tensor(np.ones([64, 64]), dtype=ms.float32) net = NetWithLoss(Net()) context.set_auto_parallel_context(parallel_mode="auto_parallel") reset_op_id() _executor.compile(net, x, y, b, phase='train') strategies = _executor._get_strategy(net) expected_strategies = {'Default/network-Net/MatMul-op2': [[16, 1], [1, 1]], 'Default/network-Net/MatMul-op3': [[16, 1], [1, 1]]} assert strategies == expected_strategies
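# --- Illustrative sketch (not part of the original test) ---
# NetWithLoss is defined elsewhere in the test suite; it wraps Net so the
# auto-parallel strategy search compiles against a scalar training loss.
# A minimal stand-in using the nn.Cell/P names already in scope (the
# reduce-mean "loss" is an assumption):
class NetWithLossSketch(nn.Cell):
    def __init__(self, network):
        super().__init__()
        self.network = network
        self.mean = P.ReduceMean()

    def construct(self, x, y, b):
        out = self.network(x, y, b)
        return self.mean(out)  # scalar output lets the train graph compile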
type=str, default=None, help='Pretrained checkpoint path') parser.add_argument('--device_target', type=str, default="GPU", help='Run device target.') args_opt = parser.parse_args() if args_opt.device_target == "GPU": context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False) init() context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) else: raise ValueError("Unsupported device_target.") class CrossEntropyWithLabelSmooth(_Loss): """ CrossEntropyWithLabelSmooth. Args: smooth_factor (float): smoothing factor, default is 0. num_classes (int): number of classes. Returns: None.
def train(cloud_args=None): """training process""" args = parse_args(cloud_args) context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, device_target=args.platform, save_graphs=False) if os.getenv('DEVICE_ID', "not_set").isdigit(): context.set_context(device_id=int(os.getenv('DEVICE_ID'))) # init distributed if args.is_distributed: parallel_mode = ParallelMode.DATA_PARALLEL context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size, gradients_mean=True) # dataloader de_dataset = classification_dataset(args.data_dir, args.image_size, args.per_batch_size, 1, args.rank, args.group_size, num_parallel_workers=8) de_dataset.map_model = 4 # !!! important args.steps_per_epoch = de_dataset.get_dataset_size() args.logger.save_args(args) # network args.logger.important_info('start creating network') # get network and init network = get_network(args.backbone, num_classes=args.num_classes, platform=args.platform) if network is None: raise NotImplementedError('backbone {} is not implemented'.format(args.backbone)) load_pretrain_model(args.pretrained, network, args) # lr scheduler lr = get_lr(args) # optimizer opt = Momentum(params=get_param_groups(network), learning_rate=Tensor(lr), momentum=args.momentum, weight_decay=args.weight_decay, loss_scale=args.loss_scale) # loss if not args.label_smooth: args.label_smooth_factor = 0.0 loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes) if args.is_dynamic_loss_scale == 1: loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000) else: loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False) if args.platform == "Ascend": model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager, metrics={'acc'}, amp_level="O3") else: model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager, metrics={'acc'}, amp_level="O2") # checkpoint save progress_cb = ProgressMonitor(args) callbacks = [ progress_cb, ] if args.rank_save_ckpt_flag: ckpt_config = CheckpointConfig( save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch, keep_checkpoint_max=args.ckpt_save_max) save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/') ckpt_cb = ModelCheckpoint(config=ckpt_config, directory=save_ckpt_path, prefix='{}'.format(args.rank)) callbacks.append(ckpt_cb) model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)
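# --- Illustrative sketch (not part of the original script) ---
# get_param_groups above is imported from the model's utility module. A
# common implementation splits parameters so weight decay skips biases and
# BatchNorm gamma/beta; the name-suffix test below is an assumption, not
# taken from this file.
def get_param_groups_sketch(network):
    decay_params, no_decay_params = [], []
    for param in network.trainable_params():
        if param.name.endswith(('.bias', '.gamma', '.beta')):
            no_decay_params.append(param)  # usually excluded from weight decay
        else:
            decay_params.append(param)
    return [{'params': no_decay_params, 'weight_decay': 0.0},
            {'params': decay_params}]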
type=int, default=0, help="Rank id, default is 0.") args_opt = parser.parse_args() context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) if __name__ == '__main__': if not args_opt.do_eval and args_opt.run_distribute: rank = args_opt.rank_id device_num = args_opt.device_num context.set_auto_parallel_context( device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, parameter_broadcast=True) init() else: rank = 0 device_num = 1 print("Start create dataset!") # It will generate mindrecord file in args_opt.mindrecord_dir, # and the file name is FasterRcnn.mindrecord0, 1, ... file_num. prefix = "FasterRcnn.mindrecord" mindrecord_dir = config.mindrecord_dir mindrecord_file = os.path.join(mindrecord_dir, prefix + "0") if not os.path.exists(mindrecord_file):
def run_pretrain(): """pre-train bert_clue""" parser = argparse.ArgumentParser(description='bert pre_training') parser.add_argument( '--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'], help='device where the code will be implemented. (Default: Ascend)') parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.") parser.add_argument("--epoch_size", type=int, default=1, help="Epoch size, default is 1.") parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.") parser.add_argument("--device_num", type=int, default=1, help="Number of devices to use, default is 1.") parser.add_argument("--enable_save_ckpt", type=str, default="true", help="Enable save checkpoint, default is true.") parser.add_argument("--enable_lossscale", type=str, default="false", help="Use loss scale or not, default is false.") parser.add_argument("--do_shuffle", type=str, default="false", help="Enable shuffle for dataset, default is false.") parser.add_argument("--enable_data_sink", type=str, default="true", help="Enable data sink, default is true.") parser.add_argument("--data_sink_steps", type=int, default=100, help="Sink steps for each epoch, default is 100.") parser.add_argument("--save_checkpoint_path", type=str, default="", help="Save checkpoint path") parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path") parser.add_argument("--save_checkpoint_steps", type=int, default=1000, help="Save checkpoint steps, " "default is 1000.") parser.add_argument("--train_steps", type=int, default=-1, help="Training Steps, default is -1, " "meaning run all steps according to epoch number.") parser.add_argument("--save_checkpoint_num", type=int, default=1, help="Save checkpoint numbers, default is 1.") parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path") parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path") args_opt = parser.parse_args() context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args_opt.device_id, save_graphs=False) context.set_context(reserve_class_name_in_scope=False) context.set_context(variable_memory_max_size="30GB") ckpt_save_dir = args_opt.save_checkpoint_path if args_opt.distribute == "true": if args_opt.device_target == 'Ascend': D.init() device_num = args_opt.device_num rank = args_opt.device_id % device_num else: D.init() device_num = D.get_group_size() rank = D.get_rank() ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str( rank) + '/' context.reset_auto_parallel_context() context.set_auto_parallel_context( parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, device_num=device_num) from mindspore.parallel._auto_parallel_context import auto_parallel_context if bert_net_cfg.num_hidden_layers == 12: if bert_net_cfg.use_relative_positions: auto_parallel_context().set_all_reduce_fusion_split_indices( [29, 58, 87, 116, 145, 174, 203, 217], "hccl_world_groupsum1") auto_parallel_context().set_all_reduce_fusion_split_indices( [29, 58, 87, 116, 145, 174, 203, 217], "hccl_world_groupsum3") else: auto_parallel_context().set_all_reduce_fusion_split_indices( [28, 55, 82, 109, 136, 163, 190, 205], "hccl_world_groupsum1") auto_parallel_context().set_all_reduce_fusion_split_indices( [28, 55, 82, 109, 136, 163, 190, 205], "hccl_world_groupsum3") elif bert_net_cfg.num_hidden_layers == 24: if bert_net_cfg.use_relative_positions: 
auto_parallel_context().set_all_reduce_fusion_split_indices( [30, 90, 150, 210, 270, 330, 390, 421], "hccl_world_groupsum1") auto_parallel_context().set_all_reduce_fusion_split_indices( [30, 90, 150, 210, 270, 330, 390, 421], "hccl_world_groupsum3") else: auto_parallel_context().set_all_reduce_fusion_split_indices( [38, 93, 148, 203, 258, 313, 368, 397], "hccl_world_groupsum1") auto_parallel_context().set_all_reduce_fusion_split_indices( [38, 93, 148, 203, 258, 313, 368, 397], "hccl_world_groupsum3") else: rank = 0 device_num = 1 if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32: logger.warning('GPU only supports fp32 for now; running with fp32.') bert_net_cfg.compute_type = mstype.float32 ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir, args_opt.schema_dir) net_with_loss = BertNetworkWithLoss(bert_net_cfg, True) new_repeat_count = args_opt.epoch_size * ds.get_dataset_size( ) // args_opt.data_sink_steps if args_opt.train_steps > 0: new_repeat_count = min( new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps) else: args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size() logger.info("train steps: {}".format(args_opt.train_steps)) if cfg.optimizer == 'Lamb': lr_schedule = BertLearningRate( learning_rate=cfg.Lamb.learning_rate, end_learning_rate=cfg.Lamb.end_learning_rate, warmup_steps=cfg.Lamb.warmup_steps, decay_steps=args_opt.train_steps, power=cfg.Lamb.power) params = net_with_loss.trainable_params() decay_params = list(filter(cfg.Lamb.decay_filter, params)) other_params = list( filter(lambda x: not cfg.Lamb.decay_filter(x), params)) group_params = [{ 'params': decay_params, 'weight_decay': cfg.Lamb.weight_decay }, { 'params': other_params }, { 'order_params': params }] optimizer = Lamb(group_params, learning_rate=lr_schedule, eps=cfg.Lamb.eps) elif cfg.optimizer == 'Momentum': optimizer = Momentum(net_with_loss.trainable_params(), learning_rate=cfg.Momentum.learning_rate, momentum=cfg.Momentum.momentum) elif cfg.optimizer == 'AdamWeightDecay': lr_schedule = BertLearningRate( learning_rate=cfg.AdamWeightDecay.learning_rate, end_learning_rate=cfg.AdamWeightDecay.end_learning_rate, warmup_steps=cfg.AdamWeightDecay.warmup_steps, decay_steps=args_opt.train_steps, power=cfg.AdamWeightDecay.power) params = net_with_loss.trainable_params() decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params)) other_params = list( filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params)) group_params = [{ 'params': decay_params, 'weight_decay': cfg.AdamWeightDecay.weight_decay }, { 'params': other_params, 'weight_decay': 0.0 }, { 'order_params': params }] optimizer = AdamWeightDecay(group_params, learning_rate=lr_schedule, eps=cfg.AdamWeightDecay.eps) elif cfg.optimizer == "Thor": lr = get_bert_lr() damping = get_bert_damping() optimizer = THOR( filter(lambda x: x.requires_grad, net_with_loss.get_parameters()), lr, cfg.Thor.momentum, filter(lambda x: 'matrix_A' in x.name, net_with_loss.get_parameters()), filter(lambda x: 'matrix_G' in x.name, net_with_loss.get_parameters()), cfg.Thor.weight_decay, cfg.Thor.loss_scale, bert_net_cfg.num_hidden_layers, bert_net_cfg.batch_size, damping) else: raise ValueError( "Unsupported optimizer {}; only [Lamb, Momentum, AdamWeightDecay, Thor] are supported." .format(cfg.optimizer)) callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack()] if args_opt.enable_save_ckpt == "true" and rank == 0: config_ck = CheckpointConfig( 
save_checkpoint_steps=args_opt.save_checkpoint_steps, keep_checkpoint_max=args_opt.save_checkpoint_num) ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', directory=ckpt_save_dir, config=config_ck) callback.append(ckpoint_cb) if args_opt.load_checkpoint_path: param_dict = load_checkpoint(args_opt.load_checkpoint_path) load_param_into_net(net_with_loss, param_dict) if args_opt.enable_lossscale == "true": update_cell = DynamicLossScaleUpdateCell( loss_scale_value=cfg.loss_scale_value, scale_factor=cfg.scale_factor, scale_window=cfg.scale_window) net_with_grads = BertTrainOneStepWithLossScaleCell( net_with_loss, optimizer=optimizer, scale_update_cell=update_cell) else: net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer) model = Model(net_with_grads, frequency=cfg.Thor.frequency) model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true"), sink_size=args_opt.data_sink_steps)
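# --- Illustrative sketch (not part of the original script) ---
# BertLearningRate above is imported from the model's utility module. A
# minimal stand-in composed from MindSpore's stock schedules (linear warmup
# followed by polynomial decay); the exact blend in the real helper may
# differ, so treat this as an assumption:
import numpy as np
import mindspore.common.dtype as mstype
from mindspore import Tensor
from mindspore.nn import LearningRateSchedule, PolynomialDecayLR, WarmUpLR
from mindspore.ops import operations as P

class BertLearningRateSketch(LearningRateSchedule):
    def __init__(self, learning_rate, end_learning_rate, warmup_steps,
                 decay_steps, power):
        super().__init__()
        self.warmup_lr = WarmUpLR(learning_rate, warmup_steps)
        self.decay_lr = PolynomialDecayLR(learning_rate, end_learning_rate,
                                          decay_steps, power)
        self.warmup_steps = Tensor(np.array([warmup_steps]).astype(np.float32))
        self.one = Tensor(np.array([1.0]).astype(np.float32))
        self.greater = P.Greater()
        self.cast = P.Cast()

    def construct(self, global_step):
        # blend the two schedules arithmetically so the switch stays
        # graph-mode friendly (no Python branching on tensor values)
        is_warmup = self.cast(self.greater(self.warmup_steps, global_step),
                              mstype.float32)
        return (self.one - is_warmup) * self.decay_lr(global_step) + \
               is_warmup * self.warmup_lr(global_step)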
def val(args): '''eval''' print('=============yolov3 start evaluating==================') # logger args.batch_size = config.batch_size args.input_shape = config.input_shape args.result_path = config.result_path args.conf_thresh = config.conf_thresh args.nms_thresh = config.nms_thresh context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE, device_num=args.world_size, gradients_mean=True) mindrecord_path = args.mindrecord_path print('Loading data from {}'.format(mindrecord_path)) num_classes = config.num_classes if num_classes > 1: raise NotImplementedError( 'num_classes > 1: Yolov3 postprocess not implemented!') anchors = config.anchors anchors_mask = config.anchors_mask num_anchors_list = [len(x) for x in anchors_mask] reduction_0 = 64.0 reduction_1 = 32.0 reduction_2 = 16.0 labels = ['face'] classes = {0: 'face'} # dataloader ds = de.MindDataset( mindrecord_path + "0", columns_list=["image", "annotation", "image_name", "image_size"]) single_scale_trans = SingleScaleTrans(resize=args.input_shape) ds = ds.batch( args.batch_size, per_batch_map=single_scale_trans, input_columns=["image", "annotation", "image_name", "image_size"], num_parallel_workers=8) args.steps_per_epoch = ds.get_dataset_size() # backbone network = backbone_HwYolov3(num_classes, num_anchors_list, args) # load pretrain model if os.path.isfile(args.pretrained): param_dict = load_checkpoint(args.pretrained) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): continue elif key.startswith('network.'): param_dict_new[key[8:]] = values else: param_dict_new[key] = values load_param_into_net(network, param_dict_new) print('load model {} success'.format(args.pretrained)) else: print( 'failed to load model {}, please check the model path; evaluation aborted' .format(args.pretrained)) exit(1) ds = ds.repeat(1) det = {} img_size = {} img_anno = {} model_name = args.pretrained.split('/')[-1].replace('.ckpt', '') result_path = os.path.join(args.result_path, model_name) if not os.path.isdir(result_path): os.makedirs(result_path, exist_ok=True) # result file ret_files_set = { 'face': os.path.join(result_path, 'comp4_det_test_face_rm5050.txt'), } test_net = BuildTestNetwork(network, reduction_0, reduction_1, reduction_2, anchors, anchors_mask, num_classes, args) print('conf_thresh:', args.conf_thresh) eval_times = 0 for data in ds.create_tuple_iterator(output_numpy=True): batch_images = data[0] batch_labels = data[1] batch_image_name = data[2] batch_image_size = data[3] eval_times += 1 img_tensor = Tensor(batch_images, mstype.float32) dets = [] tdets = [] coords_0, cls_scores_0, coords_1, cls_scores_1, coords_2, cls_scores_2 = test_net( img_tensor) boxes_0, boxes_1, boxes_2 = get_bounding_boxes( coords_0, cls_scores_0, coords_1, cls_scores_1, coords_2, cls_scores_2, args.conf_thresh, args.input_shape, num_classes) converted_boxes_0, converted_boxes_1, converted_boxes_2 = tensor_to_brambox( boxes_0, boxes_1, boxes_2, args.input_shape, labels) tdets.append(converted_boxes_0) tdets.append(converted_boxes_1) tdets.append(converted_boxes_2) batch = len(tdets[0]) for b in range(batch): single_dets = [] for op in range(3): single_dets.extend(tdets[op][b]) dets.append(single_dets) det.update({ batch_image_name[k].decode('UTF-8'): v for k, v in enumerate(dets) }) img_size.update({ batch_image_name[k].decode('UTF-8'): v for k, v in enumerate(batch_image_size) }) img_anno.update({ batch_image_name[k].decode('UTF-8'): v for k, v in enumerate(batch_labels) }) 
print('eval times:', eval_times) print('batch size: ', args.batch_size) netw, neth = args.input_shape reorg_dets = voc_wrapper.reorg_detection(det, netw, neth, img_size) voc_wrapper.gen_results(reorg_dets, result_path, img_size, args.nms_thresh) # compute mAP ground_truth = parse_gt_from_anno(img_anno, classes) ret_list = parse_rets(ret_files_set) iou_thr = 0.5 evaluate = calc_recall_presicion_ap(ground_truth, ret_list, iou_thr) aps_str = '' for cls in evaluate: per_line, = plt.plot(evaluate[cls]['recall'], evaluate[cls]['presicion'], 'b-') per_line.set_label('%s:AP=%.3f' % (cls, evaluate[cls]['ap'])) aps_str += '_%s_AP_%.3f' % (cls, evaluate[cls]['ap']) plt.plot([i / 1000.0 for i in range(1, 1001)], [i / 1000.0 for i in range(1, 1001)], 'y--') plt.axis([0, 1.2, 0, 1.2]) plt.xlabel('recall') plt.ylabel('precision') plt.grid() plt.legend() plt.title('PR') # save mAP ap_save_path = os.path.join( result_path, result_path.replace('/', '_') + aps_str + '.png') print('Saving {}'.format(ap_save_path)) plt.savefig(ap_save_path) print('=============yolov3 evaluating finished==================')
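# --- Illustrative sketch (not part of the original script) ---
# calc_recall_presicion_ap above matches detections to ground truth at
# iou_thr=0.5. A minimal IoU helper showing the criterion it applies,
# assuming [x1, y1, x2, y2] corner-format boxes (the box layout is an
# assumption, not taken from this file):
def iou_sketch(box_a, box_b):
    inter_x1, inter_y1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    inter_x2, inter_y2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0.0, inter_x2 - inter_x1) * max(0.0, inter_y2 - inter_y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0
# A detection counts as a true positive when its IoU with an unmatched
# ground-truth box reaches the 0.5 threshold used above.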
raise ValueError("Only supported GPU training.") context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform, save_graphs=False) if os.getenv('DEVICE_ID', "not_set").isdigit(): context.set_context(device_id=int(os.getenv('DEVICE_ID'))) # init distributed if args_opt.is_distributed: init("nccl") cfg.rank = get_rank() cfg.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=cfg.group_size, gradients_mean=True) else: cfg.rank = 0 cfg.group_size = 1 # dataloader dataset = create_dataset(args_opt.dataset_path, True, cfg.rank, cfg.group_size) batches_per_epoch = dataset.get_dataset_size() print("Batches Per Epoch: ", batches_per_epoch) # network net = ShuffleNetV2(n_class=cfg.num_classes, model_size=args_opt.model_size) # loss loss = CrossEntropySmooth(sparse=True,
def train_process(q, device_id, epoch_size, device_num, enable_hccl): os.makedirs(str(device_id), exist_ok=True) os.chdir(str(device_id)) context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False) context.set_context(device_id=device_id) os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH os.environ['RANK_ID'] = str(device_id) os.environ['RANK_SIZE'] = str(device_num) if enable_hccl: context.set_auto_parallel_context( device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[107, 160]) init() # network net = resnet50(class_num=config.class_num) # evaluation network dist_eval_network = ClassifyCorrectCell(net) if not config.use_label_smooth: config.label_smooth_factor = 0.0 # loss loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=config.label_smooth_factor, num_classes=config.class_num) # train dataset dataset = create_dataset(dataset_path=dataset_path, do_train=True, repeat_num=1, batch_size=config.batch_size) step_size = dataset.get_dataset_size() eval_interval = config.eval_interval dataset.__loop_size__ = step_size * eval_interval # evaluation dataset eval_dataset = create_dataset(dataset_path=eval_path, do_train=False, repeat_num=1, batch_size=config.eval_batch_size) # loss scale loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) # learning rate lr = Tensor( get_learning_rate(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode)) # optimizer decayed_params = [] no_decayed_params = [] for param in net.trainable_params(): if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name: decayed_params.append(param) else: no_decayed_params.append(param) group_params = [{ 'params': decayed_params, 'weight_decay': config.weight_decay }, { 'params': no_decayed_params, 'weight_decay': 0.0 }, { 'order_params': net.trainable_params() }] if config.use_lars: momentum = nn.Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale, use_nesterov=config.use_nesterov) opt = nn.LARS(momentum, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient, lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name) else: opt = nn.Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale, use_nesterov=config.use_nesterov) # model model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, amp_level="O2", keep_batchnorm_fp32=False, metrics={ 'acc': DistAccuracy(batch_size=config.eval_batch_size, device_num=device_num) }, eval_network=dist_eval_network) # callbacks loss_cb = LossGet(1, step_size) # train and eval print("run_start", device_id) acc = 0.0 time_cost = 0.0 for epoch_idx in range(0, int(epoch_size / eval_interval)): model.train(1, dataset, callbacks=loss_cb) eval_start = time.time() output = model.eval(eval_dataset) eval_cost = (time.time() - eval_start) * 1000 acc = float(output["acc"]) time_cost = loss_cb.get_per_step_time() loss = loss_cb.get_loss() print( "the {} epoch's resnet result:\n " "device{}, training loss {}, acc {}, " "training per step cost {:.2f} ms, eval cost {:.2f} ms, total_cost {:.2f} ms" .format(epoch_idx, device_id, loss, acc, time_cost, eval_cost, time_cost * step_size + eval_cost)) q.put({'acc': acc, 'cost': time_cost})
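# --- Illustrative usage sketch (not part of the original test) ---
# train_process is written to run once per device in its own process and to
# report its result through the queue. A minimal launcher; the device_num
# and epoch_size values below are assumptions:
from multiprocessing import Process, Queue

def launch_train_sketch(epoch_size=2, device_num=8, enable_hccl=True):
    q = Queue()
    workers = [Process(target=train_process,
                       args=(q, device_id, epoch_size, device_num, enable_hccl))
               for device_id in range(device_num)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    # one {'acc': ..., 'cost': ...} dict per device
    return [q.get() for _ in range(device_num)]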
def run_Readcomprehension(): '''Run the reading comprehension task.''' parser = argparse.ArgumentParser(description="Finetune and evaluate GPT-2 on the reading comprehension task") parser.add_argument("--device_target", type=str, default="Ascend", help="Device type. Default: Ascend.") parser.add_argument("--device_id", type=int, default=0, help="ID of target device.") parser.add_argument("--metric_method", type=str, default="F1", help="The eval method including [F1]. Default: F1.") parser.add_argument("--do_train", type=str, default="false", help="Enable train. Default: false.") parser.add_argument("--do_eval", type=str, default="true", help="Enable evaluation. Default: true.") parser.add_argument("--eval_type", type=str, default="zero-shot", help="The type of evaluation including [zero-shot, finetuned]. Default: zero-shot.") parser.add_argument("--epoch_num", type=int, default=1, help="Epoch number. Default: 1.") parser.add_argument("--train_data_shuffle", type=str, default="true", help="Enable train data shuffle. Default: true.") parser.add_argument("--eval_data_shuffle", type=str, default="false", help="Enable eval data shuffle. Default: false.") parser.add_argument("--save_finetune_ckpt_path", type=str, default="", help="Save the checkpoint path.") parser.add_argument("--load_pretrain_ckpt_path", type=str, default="", help="Load the checkpoint file path.") parser.add_argument("--load_finetune_ckpt_path", type=str, default="", help="Load the checkpoint file path.") parser.add_argument("--train_data_file_path", type=str, default="", help="Data path, it is better to use absolute path") parser.add_argument("--eval_data_file_path", type=str, default="", help="Data path, it is better to use absolute path") parser.add_argument("--tokenizer_file_path", type=str, default="", help="Pretrained vocab and merge file path.") parser.add_argument("--generate_length", type=int, default=55, help="The generation length of the answer sequence.") parser.add_argument("--top_k", type=int, default=1, help="Parameter for Top-K sampling.") parser.add_argument("--top_p", type=str, default="1.0", help="Parameter for Top-P sampling.") parser.add_argument("--temperature", type=str, default="1.0", help="Parameter for generation; larger values make generation more diverse.") args_opt = parser.parse_args() epoch_num = args_opt.epoch_num metric = args_opt.metric_method save_finetune_ckpt_path = args_opt.save_finetune_ckpt_path load_finetune_ckpt_path = args_opt.load_finetune_ckpt_path load_pretrain_ckpt_path = args_opt.load_pretrain_ckpt_path if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower() == "false": raise ValueError("At least one of 'do_train' or 'do_eval' must be true") if args_opt.do_train.lower() == "true" and args_opt.train_data_file_path == "": raise ValueError("'train_data_file_path' must be set when running the finetune task") if args_opt.do_eval.lower() == "true" and args_opt.eval_data_file_path == "": raise ValueError("'eval_data_file_path' must be set when running the evaluation task") device_target = args_opt.device_target if device_target == "Ascend": context.set_context(mode=context.GRAPH_MODE, device_target=device_target, device_id=args_opt.device_id, max_call_depth=3000) context.set_auto_parallel_context(parallel_mode="stand_alone") print(" | Device: {} | Device id: {}".format(device_target, args_opt.device_id)) else: raise ValueError("Unsupported device target; only Ascend is supported.") gpt2_loss = GPT2CoQA(config=gpt2_net_cfg, is_training=True, use_one_hot_embeddings=False) if args_opt.do_train.lower() == "true": get_train_setting(cfg) get_model_setting(cfg, gpt2_net_cfg) print("============== Start Loading Read Comprehension Train Dataset ==============") print(" | Train Dataset: {}".format(args_opt.train_data_file_path)) print(" | Checkpoint: {}".format(args_opt.load_pretrain_ckpt_path)) train_dataset = create_language_model_dataset(do_shuffle=(args_opt.train_data_shuffle.lower() == "true"), dataset_path=args_opt.train_data_file_path) do_train(train_dataset, gpt2_loss, load_pretrain_ckpt_path, save_finetune_ckpt_path, epoch_num) if args_opt.do_eval.lower() == "true": get_model_setting(cfg, gpt2_net_cfg) print("============ Start Loading Read Comprehension Evaluation Dataset ============") print(" | Eval Dataset: {}".format(args_opt.eval_data_file_path)) print(" | Checkpoint: {}".format(args_opt.load_finetune_ckpt_path)) eval_dataset = create_language_model_dataset(do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"), dataset_path=args_opt.eval_data_file_path) do_eval(eval_dataset, GPT2CoQAModel, metric, load_finetune_ckpt_path, args_opt.eval_type, args_opt.tokenizer_file_path, args_opt.generate_length, args_opt.top_k, args_opt.top_p, args_opt.temperature)
type=str, default=None, help='Dataset path, default is None') args_opt = parser.parse_args() device_id = int(os.getenv('DEVICE_ID')) context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=device_id) if __name__ == '__main__': if args_opt.run_distribute: context.reset_auto_parallel_context() context.set_auto_parallel_context( device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) init() max_captcha_digits = cf.max_captcha_digits input_size = m.ceil(cf.captcha_height / 64) * 64 * 3 # create dataset dataset = create_dataset(dataset_path=args_opt.dataset_path, repeat_num=1, batch_size=cf.batch_size) step_size = dataset.get_dataset_size() # define lr lr_init = cf.learning_rate if not args_opt.run_distribute else cf.learning_rate * args_opt.device_num lr = get_lr(cf.epoch_size, step_size, lr_init) # define loss loss = CTCLoss(max_sequence_length=cf.captcha_width, max_label_length=max_captcha_digits,