# These training/eval entry points follow the MindSpore model-zoo script layout.
# `config`, `get_device_id`, `get_device_num`, `get_rank_id` and `get_job_id` are
# assumed to come from the repo's model_utils helpers; the imports below cover
# only the standard-library names used directly here.
import os
import time


def modelarts_pre_process():
    '''modelarts pre process function.'''
    def unzip(zip_file, save_dir):
        import zipfile
        s_time = time.time()
        if not os.path.exists(os.path.join(save_dir, "face_recognition_dataset")):
            if zipfile.is_zipfile(zip_file):
                fz = zipfile.ZipFile(zip_file, 'r')
                data_num = len(fz.namelist())
                print("Extract Start...")
                print("unzip file num: {}".format(data_num))
                # Guard against a zero step when the archive has fewer than 100 entries.
                step = max(data_num // 100, 1)
                for i, file in enumerate(fz.namelist()):
                    if i % step == 0:
                        print("unzip percent: {}%".format(i // step), flush=True)
                    fz.extract(file, save_dir)
                print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
                                                     int((time.time() - s_time) % 60)))
                print("Extract Done.")
            else:
                print("This is not a zip file.")
        else:
            print("Zip has already been extracted.")

    if config.need_modelarts_dataset_unzip:
        zip_file_1 = os.path.join(config.data_path, "face_recognition_dataset.zip")
        save_dir_1 = config.data_path

        # One process per node does the unzip (each server holds at most 8
        # devices); the remaining processes wait on the lock file.
        sync_lock = "/tmp/unzip_sync.lock"
        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
            print("Zip file path: ", zip_file_1)
            print("Unzip file save dir: ", save_dir_1)
            unzip(zip_file_1, save_dir_1)
            print("===Finish extract data synchronization===")
            try:
                os.mknod(sync_lock)
            except IOError:
                pass

        while not os.path.exists(sync_lock):
            time.sleep(1)

        print("Device: {}, Finish sync unzip data from {} to {}.".format(
            get_device_id(), zip_file_1, save_dir_1))

    config.ckpt_path = os.path.join(config.output_path, str(get_rank_id()), config.ckpt_path)
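# In the model-zoo scripts this pre-process hook is normally not called directly:
# it is installed via the moxing_wrapper decorator from model_utils.moxing_adapter,
# which runs it once before the wrapped entry point. A minimal sketch of that
# wiring (assuming the standard model-zoo helper is present in this repo):
#
#     from model_utils.moxing_adapter import moxing_wrapper
#
#     @moxing_wrapper(pre_process=modelarts_pre_process)
#     def run_train():
#         ...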
def eval_alexnet():
    print("============== Starting Testing ==============")
    device_num = get_device_num()
    if device_num > 1:
        # 'Davinci' is only a legacy alias for the Ascend backend, so use the
        # configured target directly.
        context.set_context(mode=context.GRAPH_MODE,
                            device_target=config.device_target,
                            save_graphs=False)
        if config.device_target == "Ascend":
            context.set_context(device_id=get_device_id())
            init()
        elif config.device_target == "GPU":
            init()

    # load_path is expected to be set by the surrounding script (e.g. from
    # config.ckpt_path) before this function runs.
    if config.dataset_name == 'cifar10':
        network = AlexNet(config.num_classes, phase='test')
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        opt = nn.Momentum(network.trainable_params(), config.learning_rate, config.momentum)
        ds_eval = create_dataset_cifar10(config.data_path, config.batch_size,
                                         status="test", target=config.device_target)
        param_dict = load_checkpoint(load_path)
        print("load checkpoint from [{}].".format(load_path))
        load_param_into_net(network, param_dict)
        network.set_train(False)
        model = Model(network, loss, opt, metrics={"Accuracy": Accuracy()})
    elif config.dataset_name == 'imagenet':
        network = AlexNet(config.num_classes, phase='test')
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        ds_eval = create_dataset_imagenet(config.data_path, config.batch_size, training=False)
        param_dict = load_checkpoint(load_path)
        print("load checkpoint from [{}].".format(load_path))
        load_param_into_net(network, param_dict)
        network.set_train(False)
        model = Model(network, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
    else:
        raise ValueError("Unsupported dataset.")

    if ds_eval.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

    result = model.eval(ds_eval, dataset_sink_mode=config.dataset_sink_mode)
    print("result : {}".format(result))
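# model.eval() returns a dict keyed by metric name, e.g. {'top_1_accuracy': ...,
# 'top_5_accuracy': ...} for the imagenet branch. The device/rank helpers used
# throughout are assumed to follow the model-zoo device_adapter pattern, i.e.
# thin wrappers over environment variables set by the launch scripts. A minimal
# sketch under that assumption:
#
#     import os
#
#     def get_device_id():
#         return int(os.getenv('DEVICE_ID', '0'))
#
#     def get_device_num():
#         return int(os.getenv('RANK_SIZE', '1'))
#
#     def get_rank_id():
#         return int(os.getenv('RANK_ID', '0'))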
def run_train():
    '''run train function.'''
    config.local_rank = get_rank_id()
    config.world_size = get_device_num()

    log_path = os.path.join(config.ckpt_path, 'logs')
    config.logger = get_logger(log_path, config.local_rank)

    support_train_stage = ['base', 'beta']
    if config.train_stage.lower() not in support_train_stage:
        config.logger.info('your train stage is not supported.')
        raise ValueError('train stage not supported.')

    if not os.path.exists(config.data_dir):
        config.logger.info('ERROR: data_dir does not exist, please set data_dir in config.py')
        raise ValueError('ERROR: data_dir does not exist, please set data_dir in config.py')

    parallel_mode = ParallelMode.HYBRID_PARALLEL if config.is_distributed else ParallelMode.STAND_ALONE
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      device_num=config.world_size,
                                      gradients_mean=True)
    if config.is_distributed:
        init()

    # Only one process per node creates the checkpoint directory.
    if config.local_rank % 8 == 0:
        if not os.path.exists(config.ckpt_path):
            os.makedirs(config.ckpt_path)

    de_dataset, steps_per_epoch, num_classes = get_de_dataset(config)
    config.logger.info('de_dataset: %d', de_dataset.get_dataset_size())

    config.steps_per_epoch = steps_per_epoch
    config.num_classes = num_classes
    config.lr_epochs = list(map(int, config.lr_epochs.split(',')))
    config.logger.info('config.num_classes: %d', config.num_classes)
    config.logger.info('config.world_size: %d', config.world_size)
    config.logger.info('config.local_rank: %d', config.local_rank)
    config.logger.info('config.lr: %f', config.lr)

    # Pad num_classes up to the next multiple of 16 (or world_size * 16 under
    # model parallelism) when nc_16 is enabled.
    if config.nc_16 == 1:
        if config.model_parallel == 0:
            if config.num_classes % 16 == 0:
                config.logger.info('data parallel already a multiple of 16, nums: %d',
                                   config.num_classes)
            else:
                config.num_classes = (config.num_classes // 16 + 1) * 16
        else:
            if config.num_classes % (config.world_size * 16) == 0:
                config.logger.info('model parallel already a multiple of 16, nums: %d',
                                   config.num_classes)
            else:
                config.num_classes = (config.num_classes // (config.world_size * 16) + 1) * \
                    config.world_size * 16

    config.logger.info('for D, loaded, class nums: %d', config.num_classes)
    config.logger.info('steps_per_epoch: %d', config.steps_per_epoch)
    config.logger.info('img_total_num: %d', config.steps_per_epoch * config.per_batch_size)

    config.logger.info('get_backbone----in----')
    _backbone = get_backbone(config)
    config.logger.info('get_backbone----out----')
    config.logger.info('get_metric_fc----in----')
    margin_fc_1 = get_metric_fc(config)
    config.logger.info('get_metric_fc----out----')
    config.logger.info('DistributedHelper----in----')
    network_1 = DistributedHelper(_backbone, margin_fc_1)
    config.logger.info('DistributedHelper----out----')

    config.logger.info('network fp16----in----')
    if config.fp16 == 1:
        network_1.add_flags_recursive(fp16=True)
    config.logger.info('network fp16----out----')

    criterion_1 = get_loss(config)
    if config.fp16 == 1 and config.model_parallel == 0:
        criterion_1.add_flags_recursive(fp32=True)

    network_1 = load_pretrain(config, network_1)
    train_net = BuildTrainNetwork(network_1, criterion_1, config)

    # warmup_step_list must be called after config.steps_per_epoch is set.
    config.lrs = warmup_step_list(config, gamma=0.1)
    lrs_gen = list_to_gen(config.lrs)
    opt = Momentum(params=train_net.trainable_params(),
                   learning_rate=lrs_gen,
                   momentum=config.momentum,
                   weight_decay=config.weight_decay)
    scale_manager = DynamicLossScaleManager(init_loss_scale=config.dynamic_init_loss_scale,
                                            scale_factor=2,
                                            scale_window=2000)
    model = Model(train_net,
                  optimizer=opt,
                  metrics=None,
                  loss_scale_manager=scale_manager)

    save_checkpoint_steps = config.ckpt_steps
    config.logger.info('save_checkpoint_steps: %d', save_checkpoint_steps)
    if config.max_ckpts == -1:
        keep_checkpoint_max = int(config.steps_per_epoch * config.max_epoch /
                                  save_checkpoint_steps) + 5
    else:
        keep_checkpoint_max = config.max_ckpts
    config.logger.info('keep_checkpoint_max: %d', keep_checkpoint_max)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                   keep_checkpoint_max=keep_checkpoint_max)
    config.logger.info('max_epoch_train: %d', config.max_epoch)
    ckpt_cb = ModelCheckpoint(config=ckpt_config,
                              directory=config.ckpt_path,
                              prefix='{}'.format(config.local_rank))
    config.epoch_cnt = 0
    progress_cb = ProgressMonitor(config)

    new_epoch_train = config.max_epoch * steps_per_epoch // config.log_interval
    model.train(new_epoch_train,
                de_dataset,
                callbacks=[progress_cb, ckpt_cb],
                sink_size=config.log_interval)
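# Note on the sink-mode arithmetic above: model.train() is driven in chunks of
# sink_size = log_interval steps, and new_epoch_train counts those chunks, so
# the total number of optimizer steps is preserved. With illustrative numbers
# (not from any config): max_epoch = 20, steps_per_epoch = 1000 and
# log_interval = 100 give
#     new_epoch_train = 20 * 1000 // 100 = 200 chunks of 100 steps
#                     = 20000 steps, the same as 20 full epochs.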
def train_alexnet():
    print(config)
    print('device id:', get_device_id())
    print('device num:', get_device_num())
    print('rank id:', get_rank_id())
    print('job id:', get_job_id())

    device_target = config.device_target
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
    context.set_context(save_graphs=False)

    device_num = get_device_num()
    if config.dataset_name == "cifar10":
        if device_num > 1:
            # Scale the learning rate with the device count and train longer.
            config.learning_rate = config.learning_rate * device_num
            config.epoch_size = config.epoch_size * 2
    elif config.dataset_name == "imagenet":
        pass
    else:
        raise ValueError("Unsupported dataset.")

    if device_num > 1:
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        if device_target == "Ascend":
            context.set_context(device_id=get_device_id())
            init()
        elif device_target == "GPU":
            init()
    else:
        context.set_context(device_id=get_device_id())

    if config.dataset_name == "cifar10":
        ds_train = create_dataset_cifar10(config.data_path, config.batch_size,
                                          target=config.device_target)
    elif config.dataset_name == "imagenet":
        ds_train = create_dataset_imagenet(config.data_path, config.batch_size)
    else:
        raise ValueError("Unsupported dataset.")

    if ds_train.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

    network = AlexNet(config.num_classes, phase='train')

    loss_scale_manager = None
    metrics = None
    step_per_epoch = ds_train.get_dataset_size() if config.sink_size == -1 else config.sink_size
    if config.dataset_name == 'cifar10':
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        lr = Tensor(get_lr_cifar10(0, config.learning_rate, config.epoch_size, step_per_epoch))
        opt = nn.Momentum(network.trainable_params(), lr, config.momentum)
        metrics = {"Accuracy": Accuracy()}
    elif config.dataset_name == 'imagenet':
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        lr = Tensor(get_lr_imagenet(config.learning_rate, config.epoch_size, step_per_epoch))
        opt = nn.Momentum(params=get_param_groups(network),
                          learning_rate=lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay,
                          loss_scale=config.loss_scale)

        from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
        if config.is_dynamic_loss_scale == 1:
            loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                         scale_factor=2,
                                                         scale_window=2000)
        else:
            loss_scale_manager = FixedLossScaleManager(config.loss_scale,
                                                       drop_overflow_update=False)
    else:
        raise ValueError("Unsupported dataset.")

    if device_target == "Ascend":
        model = Model(network,
                      loss_fn=loss,
                      optimizer=opt,
                      metrics=metrics,
                      amp_level="O2",
                      keep_batchnorm_fp32=False,
                      loss_scale_manager=loss_scale_manager)
    elif device_target == "GPU":
        model = Model(network,
                      loss_fn=loss,
                      optimizer=opt,
                      metrics=metrics,
                      loss_scale_manager=loss_scale_manager)
    else:
        raise ValueError("Unsupported platform.")

    # os.path.join with a single argument is a no-op, so build the suffixed
    # per-rank directory directly.
    if device_num > 1:
        ckpt_save_dir = config.checkpoint_path + "_" + str(get_rank())
    else:
        ckpt_save_dir = config.checkpoint_path

    time_cb = TimeMonitor(data_size=step_per_epoch)
    config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_alexnet",
                                 directory=ckpt_save_dir,
                                 config=config_ck)
    print("============== Starting Training ==============")
    model.train(config.epoch_size,
                ds_train,
                callbacks=[time_cb, ckpoint_cb, LossMonitor()],
                dataset_sink_mode=config.dataset_sink_mode,
                sink_size=config.sink_size)
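# A sketch of the entry points as they would appear at the bottom of the
# corresponding scripts (the real model-zoo files keep train and eval in
# separate scripts, each wrapped with @moxing_wrapper as sketched earlier):
#
#     if __name__ == "__main__":
#         train_alexnet()   # or eval_alexnet() / run_train() in its own script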