def test_deepfm():
    """System test: train DeepFM on criteo_h5 and check loss / per-step-time thresholds."""
    data_config = DataConfig()
    train_config = TrainConfig()

    # Bind the run to the Ascend device chosen by the scheduler.
    device_id = int(os.getenv('DEVICE_ID'))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)

    # Single-device run: no rank sharding.
    rank_size = None
    rank_id = None
    dataset_path = "/home/workspace/mindspore_dataset/criteo_data/criteo_h5/"
    print("dataset_path:", dataset_path)

    ds_train = create_dataset(dataset_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=train_config.batch_size,
                              data_type=DataType(data_config.data_format),
                              rank_size=rank_size,
                              rank_id=rank_id)

    builder = ModelBuilder(ModelConfig, TrainConfig)
    train_net, eval_net = builder.get_train_eval_net()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    # Callbacks: per-step wall clock, loss logging, and AUC evaluation.
    time_callback = TimeMonitor(data_size=ds_train.get_dataset_size())
    loss_callback = LossCallBack(loss_file_path='./loss.log')
    ds_eval = create_dataset(dataset_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=train_config.batch_size,
                             data_type=DataType(data_config.data_format))
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, eval_file_path='./auc.log')
    callback_list = [time_callback, loss_callback, eval_callback]

    print("train_config.train_epochs:", train_config.train_epochs)
    model.train(train_config.train_epochs, ds_train, callbacks=callback_list)

    # Regression thresholds for convergence and throughput.
    export_loss_value = 0.51
    print("loss_callback.loss:", loss_callback.loss)
    assert loss_callback.loss < export_loss_value
    export_per_step_time = 40.0
    print("time_callback:", time_callback.per_step_time)
    assert time_callback.per_step_time < export_per_step_time
    print("*******test case pass!********")
def train_on_gpu():
    """Train MobileNetV2 on GPU, optionally quantization-aware."""
    config = config_gpu_quant if args_opt.quantization_aware else config_gpu
    print("training args: {}".format(args_opt))
    print("training configure: {}".format(config))

    # Network and loss criterion (label smoothing when configured).
    network = mobilenetV2(num_classes=config.num_classes)
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
                                           num_classes=config.num_classes)
    else:
        loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')

    # Training dataset.
    epoch_size = config.epoch_size
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             config=config,
                             device_target=args_opt.device_target,
                             repeat_num=1,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()

    # Optionally resume from a pre-trained checkpoint.
    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_param_into_net(network, param_dict)

    # Convert the fusion network into its quantization-aware counterpart.
    if config.quantization_aware:
        network = quant.convert_quant_network(network,
                                              bn_fold=True,
                                              per_channel=[True, False],
                                              symmetric=[True, True])

    # Loss scaling and learning-rate schedule.
    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    lr = Tensor(get_lr(global_step=config.start_epoch * step_size,
                       lr_init=0,
                       lr_end=0,
                       lr_max=config.lr,
                       warmup_epochs=config.warmup_epochs,
                       total_epochs=epoch_size + config.start_epoch,
                       steps_per_epoch=step_size))

    # Momentum optimizer over trainable parameters only.
    trainable = filter(lambda x: x.requires_grad, network.get_parameters())
    opt = nn.Momentum(trainable, lr, config.momentum, config.weight_decay, config.loss_scale)

    model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale)

    print("============== Starting Training ==============")
    callback = [Monitor(lr_init=lr.asnumpy())]
    ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
    if config.save_checkpoint:
        config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                     keep_checkpoint_max=config.keep_checkpoint_max)
        callback.append(ModelCheckpoint(prefix="mobilenetV2",
                                        directory=ckpt_save_dir,
                                        config=config_ck))
    model.train(epoch_size, dataset, callbacks=callback)
    print("============== End Training ==============")
def val(net, data_dir, filename, num_consumer=4, batch=32):
    """
    Validation function, estimate the performance of trained model

    Input:
        net: the trained neural network
        data_dir: path to the validation dataset
        filename: name of the validation dataset
        num_consumer: split number of validation dataset
        batch: validation batch size
    Outputs
        Float, AUC
    """
    # BUG FIX: the ``batch`` argument was previously ignored in favor of a
    # hard-coded 32; honor the caller-supplied value (the default keeps the
    # old behavior for existing callers).
    data_train = create_dataset(data_dir, filename, batch, ['feature', 'label'], num_consumer)
    data_train = data_train.create_tuple_iterator()
    res_pred = []
    res_true = []
    for data, label in data_train:
        x = net(Tensor(data, dtype=mstype.float32))
        res_pred.append(x.asnumpy())
        res_true.append(label.asnumpy())
    # Stack per-batch predictions/labels into flat arrays before scoring.
    res_pred = np.concatenate(res_pred, axis=0)
    res_true = np.concatenate(res_true, axis=0)
    auc = calculate_auc(res_true, res_pred)
    return auc
def train(model, dataset_direct, filename, columns_list, num_consumer=4, batch=16,
          epoch=50, save_checkpoint_steps=2172, keep_checkpoint_max=50,
          prefix="model", directory='./'):
    """train network"""
    # Checkpointing policy: how often to save, and how many files to retain.
    ckpt_config = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                   keep_checkpoint_max=keep_checkpoint_max)
    ckpt_callback = ModelCheckpoint(prefix=prefix, directory=directory, config=ckpt_config)

    train_data = create_dataset(dataset_direct, filename, batch, columns_list, num_consumer)

    callbacks = [ckpt_callback, LossMonitor(per_print_times=181), TimeMonitor()]
    model.train(epoch, train_data, callbacks=callbacks, dataset_sink_mode=True)
def do_eval_standalone(args_opt):
    """
    do eval standalone

    Evaluates a distilled (TinyBERT-style) student classifier on the task
    named by ``args_opt.task_name`` using the checkpoint found under
    ``args_opt.model_dir``. Prints per-step and final metrics.
    """
    ckpt_file = os.path.join(args_opt.model_dir, args_opt.task_name)
    ckpt_file = get_ckpt(ckpt_file)
    print('ckpt file:', ckpt_file)
    task = task_cfg[args_opt.task_name]
    student_net_cfg.seq_length = task.seq_length
    eval_cfg.batch_size = args_opt.batch_size
    eval_data_dir = os.path.join(args_opt.data_dir, args_opt.task_name, DATA_NAME)

    # BUG FIX: this previously read ``args.device_id`` — ``args`` is not a name
    # in this function's scope; the device id comes from the ``args_opt``
    # namespace passed in.
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)

    eval_dataset = create_dataset(batch_size=eval_cfg.batch_size,
                                  device_num=1, rank=0, do_shuffle='false',
                                  data_dir=eval_data_dir,
                                  data_type=args_opt.dataset_type,
                                  seq_length=task.seq_length,
                                  task_type=task.task_type,
                                  drop_remainder=False)
    print('eval dataset size:', eval_dataset.get_dataset_size())
    print('eval dataset batch size:', eval_dataset.get_batch_size())

    eval_model = BertModelCLS(student_net_cfg, False, task.num_labels, 0.0, phase_type='student')
    param_dict = load_checkpoint(ckpt_file)
    # Rename checkpoint keys so they match the evaluation model's parameter names.
    new_param_dict = {}
    for key, value in param_dict.items():
        new_key = re.sub('tinybert_', 'bert_', key)
        new_key = re.sub('^bert.', '', new_key)
        new_param_dict[new_key] = value
    load_param_into_net(eval_model, new_param_dict)
    eval_model.set_train(False)

    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    callback = task.metrics()
    for step, data in enumerate(eval_dataset.create_dict_iterator()):
        input_data = [data[i] for i in columns_list]
        input_ids, input_mask, token_type_id, label_ids = input_data
        _, _, logits, _ = eval_model(input_ids, token_type_id, input_mask)
        callback.update(logits, label_ids)
        print('eval step: {}, {}: {}'.format(step, callback.name, callback.get_metrics()))
    metrics = callback.get_metrics()
    print('The best {}: {}'.format(callback.name, metrics))
def train_net(data_dir, seg_dir, config=None): train_dataset = create_dataset(data_path=data_dir, seg_path=seg_dir, config=config, is_training=True) # for item in train_dataset: # print(item) # exit(0) train_data_size = len(train_dataset) print("train dataset length is:", train_data_size) network = UNet3d(config=config) criterion = DiceLoss() optimizer = torch.optim.Adam(params=network.parameters(), lr=1) scheduler = dynamic_lr_scheduler(config, train_data_size, optimizer) device = torch.device('cuda:0') network.to(device) print("============== Starting Training ==============") network.train() step_per_epoch = train_data_size for epoch_id in range(cfg.epoch_size): time_epoch = 0.0 torch.cuda.synchronize(0) time_start = time.time() for batch_idx, data in enumerate(train_dataset, 0): inputs, labels = data inputs.squeeze_(0) labels.squeeze_(0) inputs = inputs.to(device) labels = labels.to(device) # zeros the parameter gradients optimizer.zero_grad() outputs = network(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() scheduler.step() # print statistics running_loss = loss.item() torch.cuda.synchronize(0) time_end = time.time() time_step = time_end - time_start time_epoch = time_epoch + time_step print('Epoch: [%3d/%3d], step: [%5d/%5d], loss: [%6.4f], time: [%.4f]' % (epoch_id, cfg.epoch_size, batch_idx + 1, step_per_epoch, running_loss, time_step)) time_start = time.time() print('Epoch time: %10.4f, per step time: %7.4f' % (time_epoch, time_epoch / step_per_epoch)) ckpt_path = "./ckpt_0/Unet3d" ckpt_file = ('%s-%d_%d.ckpt' % (ckpt_path, epoch_id + 1, step_per_epoch)) torch.save(network, ckpt_file) print("============== End Training ==============")
def predict():
    """Predict function.

    Runs both CycleGAN directions (A->B with G_A, B->A with G_B) over their
    test sets and writes the generated images under <outputs_dir>/predict.
    """
    args = get_args("predict")
    G_A = get_generator(args)
    G_B = get_generator(args)
    # Use BatchNorm2d with batchsize=1, affine=False, training=True instead of InstanceNorm2d
    # Use real mean and varance rather than moving_men and moving_varance in BatchNorm2d
    G_A.set_train(True)
    G_B.set_train(True)
    load_ckpt(args, G_A, G_B)

    imgs_out = os.path.join(args.outputs_dir, "predict")
    # exist_ok makes directory creation idempotent; creating the nested dirs
    # implicitly creates imgs_out as well.
    os.makedirs(os.path.join(imgs_out, "fake_A"), exist_ok=True)
    os.makedirs(os.path.join(imgs_out, "fake_B"), exist_ok=True)

    reporter = Reporter(args)

    def run_direction(src_dir, generator, out_name, tag, track_size=False):
        """Translate every image in src_dir with `generator`, saving into out_name/."""
        args.data_dir = src_dir
        ds = create_dataset(args)
        if track_size:
            # create_dataset refreshes args.dataset_size; mirror it on the
            # reporter (same ordering as the original second loop).
            reporter.dataset_size = args.dataset_size
        reporter.start_predict(tag)
        for data in ds.create_dict_iterator(output_numpy=True):
            img = Tensor(data["image"])
            img_name = str(data["image_name"][0], encoding="utf-8")
            fake = generator(img)
            out_path = os.path.join(imgs_out, out_name, img_name)
            save_image(fake, out_path)
            reporter.info('save ' + out_name + ' at %s', out_path)
        reporter.end_predict()

    # The two directions were duplicated inline before; factored into one helper.
    run_direction('testA', G_A, "fake_B", "A to B")
    run_direction('testB', G_B, "fake_A", "B to A", track_size=True)
def run_eval():
    """eval method

    Restores an NCF model from ``config.checkpoint_file_path``, evaluates
    HR/NDCG on the eval split, appends the result to the eval log file and
    prints it.
    """
    # Create the output directory if needed.
    if not os.path.exists(config.output_path):
        os.makedirs(config.output_path)

    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Davinci",
                        save_graphs=False,
                        device_id=get_device_id())

    layers = config.layers
    num_factors = config.num_factors
    topk = rconst.TOP_K
    num_eval_neg = rconst.NUM_EVAL_NEGATIVES

    ds_eval, num_eval_users, num_eval_items = create_dataset(
        test_train=False, data_dir=config.data_path,
        dataset=config.dataset, train_epochs=0,
        eval_batch_size=config.eval_batch_size)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    ncf_net = NCFModel(num_users=num_eval_users,
                       num_items=num_eval_items,
                       num_factors=num_factors,
                       model_layers=layers,
                       mf_regularization=0,
                       mlp_reg_layers=[0.0, 0.0, 0.0, 0.0],
                       mf_dim=16)
    param_dict = load_checkpoint(config.checkpoint_file_path)
    load_param_into_net(ncf_net, param_dict)

    loss_net = NetWithLossClass(ncf_net)
    train_net = TrainStepWrap(loss_net)
    eval_net = PredictWithSigmoid(ncf_net, topk, num_eval_neg)

    ncf_metric = NCFMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"ncf": ncf_metric})

    ncf_metric.clear()
    out = model.eval(ds_eval)

    # FIX: use a context manager so the log file is closed even if a write
    # raises (previously a bare open/close pair).
    eval_file_path = os.path.join(config.output_path, config.eval_file_name)
    with open(eval_file_path, "a+") as eval_file:
        eval_file.write("EvalCallBack: HR = {}, NDCG = {}\n".format(
            out['ncf'][0], out['ncf'][1]))
    print("EvalCallBack: HR = {}, NDCG = {}".format(out['ncf'][0], out['ncf'][1]))
    print("=" * 100 + "Eval Finish!" + "=" * 100)
def resnet50_train(args_opt):
    """Train ResNet-50 on Ascend with data pulled from OBS via MoXing."""
    epoch_size = args_opt.epoch_size
    batch_size = cfg.batch_size
    class_num = cfg.class_num
    loss_scale_num = cfg.loss_scale

    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'

    # Graph mode on Ascend.
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)

    # Pull the dataset from the remote bucket into the local cache.
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    net = resnet50(class_num=class_num)
    # reduction='mean' averages the per-sample losses.
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4,
                   loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2": mixed precision with a fixed loss scale;
    # keep_batchnorm_fp32=False also casts batchnorm to float16.
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False,
                  loss_fn=loss, optimizer=opt,
                  loss_scale_manager=loss_scale, metrics={'acc'})

    # Timing/throughput/loss reporting plus periodic checkpointing.
    callbacks = [TimeMonitor(data_size=train_step_size),
                 PerformanceCallback(batch_size),
                 LossMonitor()]
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    callbacks.append(ModelCheckpoint(prefix="resnet",
                                     directory=local_ckpt_path,
                                     config=config_ck))

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=callbacks)

    # Push the produced checkpoints back to the remote bucket.
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args_opt.train_url)
def train():
    """Train function."""
    args = get_args("train")
    if args.need_profiler:
        from mindspore.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir, is_detail=True, is_show_op_path=True)
    ds = create_dataset(args)

    # Two generators and two discriminators for the CycleGAN pair.
    G_A = get_generator(args)
    G_B = get_generator(args)
    D_A = get_discriminator(args)
    D_B = get_discriminator(args)
    load_ckpt(args, G_A, G_B, D_A, D_B)

    # History pools of generated images fed to the discriminators.
    image_pool_A = ImagePool(args.pool_size)
    image_pool_B = ImagePool(args.pool_size)

    generator = Generator(G_A, G_B, args.lambda_idt > 0)
    loss_D = DiscriminatorLoss(args, D_A, D_B)
    loss_G = GeneratorLoss(args, generator, D_A, D_B)
    optimizer_G = nn.Adam(generator.trainable_params(), get_lr(args), beta1=args.beta1)
    optimizer_D = nn.Adam(loss_D.trainable_params(), get_lr(args), beta1=args.beta1)

    net_G = TrainOneStepG(loss_G, generator, optimizer_G)
    net_D = TrainOneStepD(loss_D, optimizer_D)

    data_loader = ds.create_dict_iterator()
    reporter = Reporter(args)
    reporter.info('==========start training===============')
    for _ in range(args.max_epoch):
        reporter.epoch_start()
        for data in data_loader:
            img_A = data["image_A"]
            img_B = data["image_B"]
            res_G = net_G(img_A, img_B)
            fake_A, fake_B = res_G[0], res_G[1]
            res_D = net_D(img_A, img_B,
                          image_pool_A.query(fake_A),
                          image_pool_B.query(fake_B))
            reporter.step_end(res_G, res_D)
            reporter.visualizer(img_A, img_B, fake_A, fake_B)
        reporter.epoch_end(net_G)
        if args.need_profiler:
            # Profile only the first epoch, then stop.
            profiler.analyse()
            break
    reporter.info('==========end training===============')
def train_net(data_dir, seg_dir, run_distribute, config=None):
    """Train UNet3d with MindSpore, optionally data-parallel across devices."""
    if run_distribute:
        init()
        rank_id = get_rank()
        rank_size = get_group_size()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          device_num=rank_size,
                                          gradients_mean=True)
    else:
        rank_id = 0
        rank_size = 1

    train_dataset = create_dataset(data_path=data_dir, seg_path=seg_dir, config=config,
                                   rank_size=rank_size, rank_id=rank_id, is_training=True)
    train_data_size = train_dataset.get_dataset_size()
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)
    loss = SoftmaxCrossEntropyWithLogits()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    network.set_train()

    model = Model(network, loss_fn=loss, optimizer=optimizer,
                  loss_scale_manager=scale_manager, amp_level='O2')

    # One checkpoint per full dataset pass.
    ckpt_config = CheckpointConfig(save_checkpoint_steps=train_data_size,
                                   keep_checkpoint_max=config.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix='{}'.format(config.model),
                              directory='./ckpt_{}/'.format(rank_size),
                              config=ckpt_config)
    callbacks_list = [LossMonitor(), TimeMonitor(data_size=train_data_size), ckpt_cb]

    print("============== Starting Training ==============")
    model.train(config.epoch_size, train_dataset, callbacks=callbacks_list)
    print("============== End Training ==============")
def resnet50_eval(args_opt):
    """Evaluate a ResNet-50 checkpoint on Ascend; data and ckpt fetched via MoXing."""
    class_num = cfg.class_num
    local_data_path = '/cache/data'
    # Keep only the checkpoint file name for the local cache path.
    ckpt_name = args_opt.checkpoint_path.split('/')[-1]
    local_ckpt_path = '/cache/' + ckpt_name

    # Graph mode on Ascend.
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)

    # Fetch dataset and checkpoint from the remote bucket.
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)
    mox.file.copy_parallel(src_url=args_opt.checkpoint_path, dst_url=local_ckpt_path)

    dataset = create_dataset(dataset_path=local_data_path, do_train=False,
                             batch_size=cfg.batch_size)

    # Restore the network in inference mode.
    net = resnet50(class_num=class_num)
    param_dict = load_checkpoint(local_ckpt_path)
    load_param_into_net(net, param_dict)
    net.set_train(False)

    # Label smoothing only when enabled in the config.
    if not cfg.use_label_smooth:
        cfg.label_smooth_factor = 0.0
    loss = CrossEntropySmooth(sparse=True, reduction='mean',
                              smooth_factor=cfg.label_smooth_factor,
                              num_classes=cfg.class_num)
    model = Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})

    res = model.eval(dataset)
    print("result:", res, "ckpt=", args_opt.checkpoint_path)
def test_net(data_dir, seg_dir, ckpt_path, config=None):
    """Evaluate a trained UNet3d with sliding-window inference and report mean Dice.

    Args:
        data_dir: path to the evaluation images.
        seg_dir: path to the matching segmentation labels.
        ckpt_path: checkpoint file to restore the network from.
        config: project config providing roi_size/overlap/batch_size/num_classes.
    """
    eval_dataset = create_dataset(data_path=data_dir, seg_path=seg_dir,
                                  config=config, is_training=False)
    eval_data_size = eval_dataset.get_dataset_size()
    # FIX: this is the evaluation dataset, not the training one.
    print("eval dataset length is:", eval_data_size)

    network = UNet3d(config=config)
    network.set_train(False)
    param_dict = load_checkpoint(ckpt_path)
    load_param_into_net(network, param_dict)
    model = Model(network)

    index = 0
    total_dice = 0
    for batch in eval_dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        image = batch["image"]
        seg = batch["seg"]
        print("current image shape is {}".format(image.shape), flush=True)
        # Tile the volume into overlapping ROI-sized windows.
        sliding_window_list, slice_list = create_sliding_window(image, config.roi_size, config.overlap)
        image_size = (config.batch_size, config.num_classes) + image.shape[2:]
        output_image = np.zeros(image_size, np.float32)
        count_map = np.zeros(image_size, np.float32)
        importance_map = np.ones(config.roi_size, np.float32)
        # Accumulate window predictions; count_map tracks per-voxel coverage.
        for window, slice_ in zip(sliding_window_list, slice_list):
            window_image = Tensor(window, mstype.float32)
            pred_probs = model.predict(window_image)
            output_image[slice_] += pred_probs.asnumpy()
            count_map[slice_] += importance_map
        # Average overlapping predictions. Assumes every voxel is covered by at
        # least one window — TODO confirm create_sliding_window guarantees this.
        output_image = output_image / count_map
        dice, _ = CalculateDice(output_image, seg)
        print("The {} batch dice is {}".format(index, dice), flush=True)
        total_dice += dice
        index = index + 1
    avg_dice = total_dice / eval_data_size
    print(
        "**********************End Eval***************************************"
    )
    print("eval average dice is {}".format(avg_dice))
def eval_lenet():
    """Evaluate LeNet-5 on the test split and print the resulting metrics."""
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)

    network = LeNet5(config.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), config.lr, config.momentum)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Testing ==============")
    # NOTE(review): ckpt_path is resolved from the enclosing module scope —
    # confirm it is defined before this function runs.
    param_dict = load_checkpoint(ckpt_path)
    load_param_into_net(network, param_dict)

    ds_eval = create_dataset(os.path.join(config.data_path, "test"), config.batch_size, 1)
    if ds_eval.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

    acc = model.eval(ds_eval)
    print("============== {} ==============".format(acc))
def train_lenet():
    """Train LeNet-5 with periodic checkpointing, timing and loss callbacks."""
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)

    ds_train = create_dataset(os.path.join(config.data_path, "train"), config.batch_size)
    if ds_train.get_dataset_size() == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    network = LeNet5(config.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), config.lr, config.momentum)

    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=config.save_checkpoint_steps,
        keep_checkpoint_max=config.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                              directory=config.checkpoint_path,
                              config=ckpt_config)

    # Ascend runs with O2 mixed precision; other targets use defaults.
    amp_kwargs = {"amp_level": "O2"} if config.device_target == "Ascend" else {}
    model = Model(network, net_loss, net_opt,
                  metrics={"Accuracy": Accuracy()}, **amp_kwargs)

    print("============== Starting Training ==============")
    model.train(config.epoch_size, ds_train,
                callbacks=[time_cb, ckpt_cb, LossMonitor()])
def inception_v4_train():
    """
    Train Inceptionv4 in data parallelism
    """
    print('epoch_size: {} batch_size: {} class_num {}'.format(config.epoch_size,
                                                              config.batch_size,
                                                              config.num_classes))
    context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)
    if args.platform == "Ascend":
        context.set_context(device_id=args.device_id)
        context.set_context(enable_graph_kernel=False)

    # Distributed setup: initialise the collective backend and data-parallel context.
    rank = 0
    if device_num > 1:
        if args.platform == "Ascend":
            init(backend_name='hccl')
        elif args.platform == "GPU":
            init()
        else:
            raise ValueError("Unsupported device target.")
        rank = get_rank()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          all_reduce_fusion_config=[200, 400])

    # create dataset
    train_dataset = create_dataset(dataset_path=args.dataset_path, do_train=True,
                                   repeat_num=1, batch_size=config.batch_size, shard_id=rank)
    train_step_size = train_dataset.get_dataset_size()

    # create model
    net = Inceptionv4(classes=config.num_classes)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    lr = Tensor(generate_cosine_lr(steps_per_epoch=train_step_size, total_epochs=config.epoch_size))

    # FIX: a single pass now both partitions the parameters (weight decay only
    # for non-beta/gamma/bias params) and re-initialises the decayed weights
    # with Xavier uniform; the original iterated trainable_params() twice with
    # an identical predicate. set_data mutates parameters in place, so the
    # resulting state is the same.
    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            decayed_params.append(param)
            param.set_data(initializer(XavierUniform(), param.data.shape, param.data.dtype))
        else:
            no_decayed_params.append(param)
    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                    {'params': no_decayed_params},
                    {'order_params': net.trainable_params()}]

    opt = RMSProp(group_params, lr, decay=config.decay, epsilon=config.epsilon,
                  weight_decay=config.weight_decay, momentum=config.momentum,
                  loss_scale=config.loss_scale)

    if args.device_id == 0:
        print(lr)
        print(train_step_size)
    if args.resume:
        ckpt = load_checkpoint(args.resume)
        load_param_into_net(net, ckpt)

    loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    if args.platform == "Ascend":
        model = Model(net, loss_fn=loss, optimizer=opt,
                      metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager,
                      amp_level=config.amp_level)
    elif args.platform == "GPU":
        model = Model(net, loss_fn=loss, optimizer=opt,
                      metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager,
                      amp_level='O0')
    else:
        raise ValueError("Unsupported device target.")

    # define callbacks
    performance_cb = TimeMonitor(data_size=train_step_size)
    loss_cb = LossMonitor(per_print_times=train_step_size)
    ckp_save_step = config.save_checkpoint_epochs * train_step_size
    config_ck = CheckpointConfig(save_checkpoint_steps=ckp_save_step,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionV4-train-rank{rank}",
                                 directory='ckpts_rank_' + str(rank), config=config_ck)
    callbacks = [performance_cb, loss_cb]
    # When is_save_on_master is set in a multi-device run, only device 0 saves.
    if device_num > 1 and config.is_save_on_master:
        if args.device_id == 0:
            callbacks.append(ckpoint_cb)
    else:
        callbacks.append(ckpoint_cb)

    # train model
    model.train(config.epoch_size, train_dataset, callbacks=callbacks, dataset_sink_mode=True)
def train():
    """Train function.

    Trains OpenPose end to end: sets up (optionally distributed) context,
    builds the network/loss, dataset, LR schedules, optimizer and callbacks,
    then runs Model.train. Returns 0 on both success and bad-data-path exit.
    """
    args.outputs_dir = params['save_model_path']

    # Distributed run: data-parallel context, per-rank output directory.
    if args.group_size > 1:
        init()
        context.set_auto_parallel_context(device_num=get_group_size(),
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_{}/".format(str(get_rank())))
        args.rank = get_rank()
    else:
        args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_0/")
        args.rank = 0

    # Multi-device runs use the *_NP variants of the hyper-parameters and a
    # halved loss scale.
    if args.group_size > 1:
        args.max_epoch = params["max_epoch_train_NP"]
        args.loss_scale = params['loss_scale'] / 2
        args.lr_steps = list(map(int, params["lr_steps_NP"].split(',')))
        params['train_type'] = params['train_type_NP']
        params['optimizer'] = params['optimizer_NP']
        params['group_params'] = params['group_params_NP']
    else:
        args.max_epoch = params["max_epoch_train"]
        args.loss_scale = params['loss_scale']
        args.lr_steps = list(map(int, params["lr_steps"].split(',')))

    # create network
    print('start create network')
    criterion = openpose_loss()
    # Loss is kept in fp32 regardless of the network's precision.
    criterion.add_flags_recursive(fp32=True)
    network = OpenPoseNet(vggpath=params['vgg_path'], vgg_with_bn=params['vgg_with_bn'])
    if params["load_pretrain"]:
        print("load pretrain model:", params["pretrained_model_path"])
        load_model(network, params["pretrained_model_path"])
    train_net = BuildTrainNetwork(network, criterion)

    # create dataset — bail out early (returning 0) if any input path is missing.
    if os.path.exists(args.jsonpath_train) and os.path.exists(args.imgpath_train) \
            and os.path.exists(args.maskpath_train):
        print('start create dataset')
    else:
        print('Error: wrong data path')
        return 0

    # Fewer workers per process in the multi-device case.
    num_worker = 20 if args.group_size > 1 else 48
    de_dataset_train = create_dataset(args.jsonpath_train, args.imgpath_train, args.maskpath_train,
                                      batch_size=params['batch_size'],
                                      rank=args.rank,
                                      group_size=args.group_size,
                                      num_worker=num_worker,
                                      multiprocessing=True,
                                      shuffle=True,
                                      repeat_num=1)
    steps_per_epoch = de_dataset_train.get_dataset_size()
    print("steps_per_epoch: ", steps_per_epoch)

    # lr scheduler — separate schedules for the stages, base convs and VGG base.
    lr_stage, lr_base, lr_vgg = get_lr(params['lr'] * args.group_size,
                                       params['lr_gamma'],
                                       steps_per_epoch,
                                       args.max_epoch,
                                       args.lr_steps,
                                       args.group_size,
                                       lr_type=params['lr_type'],
                                       warmup_epoch=params['warmup_epoch'])

    # optimizer — optionally with per-group learning rates keyed off parameter names.
    if params['group_params']:
        vgg19_base_params = list(
            filter(lambda x: 'base.vgg_base' in x.name, train_net.trainable_params()))
        base_params = list(
            filter(lambda x: 'base.conv' in x.name, train_net.trainable_params()))
        stages_params = list(
            filter(lambda x: 'base' not in x.name, train_net.trainable_params()))

        group_params = [{'params': vgg19_base_params, 'lr': lr_vgg},
                        {'params': base_params, 'lr': lr_base},
                        {'params': stages_params, 'lr': lr_stage}]

        if params['optimizer'] == "Momentum":
            opt = Momentum(group_params, learning_rate=lr_stage, momentum=0.9)
        elif params['optimizer'] == "Adam":
            opt = Adam(group_params)
        else:
            raise ValueError("optimizer not support.")
    else:
        if params['optimizer'] == "Momentum":
            opt = Momentum(train_net.trainable_params(), learning_rate=lr_stage, momentum=0.9)
        elif params['optimizer'] == "Adam":
            opt = Adam(train_net.trainable_params(), learning_rate=lr_stage)
        else:
            raise ValueError("optimizer not support.")

    # callback — checkpoints only on rank 0.
    config_ck = CheckpointConfig(
        save_checkpoint_steps=params['ckpt_interval'],
        keep_checkpoint_max=params["keep_checkpoint_max"])
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(args.rank),
                                 directory=args.outputs_dir,
                                 config=config_ck)
    time_cb = TimeMonitor(data_size=de_dataset_train.get_dataset_size())
    if args.rank == 0:
        callback_list = [MyLossMonitor(), time_cb, ckpoint_cb]
    else:
        callback_list = [MyLossMonitor(), time_cb]

    # train — either gradient clipping (optimizer wrapped inside the cell) or
    # fixed loss scaling through the Model.
    if params['train_type'] == 'clip_grad':
        train_net = TrainOneStepWithClipGradientCell(train_net, opt, sens=args.loss_scale)
        train_net.set_train()
        model = Model(train_net)
    elif params['train_type'] == 'fix_loss_scale':
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
        train_net.set_train()
        model = Model(train_net, optimizer=opt, loss_scale_manager=loss_scale_manager)
    else:
        raise ValueError("Type {} is not support.".format(params['train_type']))

    print("============== Starting Training ==============")
    model.train(args.max_epoch, de_dataset_train, callbacks=callback_list, dataset_sink_mode=False)
    return 0
# define net net = mobilenet_v2(num_classes=config_gpu.num_classes, platform="GPU") # define loss if config_gpu.label_smooth > 0: loss = CrossEntropyWithLabelSmooth( smooth_factor=config_gpu.label_smooth, num_classes=config_gpu.num_classes) else: loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean') # define dataset epoch_size = config_gpu.epoch_size dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config_gpu, platform=args_opt.platform, repeat_num=1, batch_size=config_gpu.batch_size) step_size = dataset.get_dataset_size() # resume if args_opt.pre_trained: param_dict = load_checkpoint(args_opt.pre_trained) load_param_into_net(net, param_dict) # define optimizer loss_scale = FixedLossScaleManager(config_gpu.loss_scale, drop_overflow_update=False) lr = Tensor( get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=config_gpu.lr,
def train_on_ascend():
    """Train quantization-aware MobileNetV2 on Ascend.

    Reads its configuration from module-level globals (`config_ascend_quant`,
    `args_opt`, `rank_id`, `device_id`, `rank_size`, `run_distribute`) set up
    by the surrounding script. Side effects: configures auto-parallel context
    when distributed, trains the model, and (on rank 0) writes checkpoints.

    Fix over the original: checkpoint-callback registration was guarded only
    by `config.save_checkpoint`, so on non-zero ranks `callback` was still
    `None` and `callback += [ckpt_cb]` raised TypeError. Checkpointing is now
    restricted to rank 0, where `callback` is guaranteed to be a list.
    """
    config = config_ascend_quant
    print("training args: {}".format(args_opt))
    print("training configure: {}".format(config))
    print("parallel args: rank_id {}, device_id {}, rank_size {}".format(
        rank_id, device_id, rank_size))
    epoch_size = config.epoch_size

    # distribute init
    if run_distribute:
        context.set_auto_parallel_context(device_num=rank_size,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()

    # define network
    network = mobilenetV2(num_classes=config.num_classes)

    # define loss: label smoothing only when a positive smooth factor is configured
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
                                           num_classes=config.num_classes)
    else:
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

    # define dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             config=config,
                             device_target=args_opt.device_target,
                             repeat_num=1,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()

    # load pre-trained (non-quantized) checkpoint into the fusion network
    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_nonquant_param_into_quant_net(network, param_dict)

    # convert fusion network to quantization-aware network
    network = quant.convert_quant_network(network,
                                          bn_fold=True,
                                          per_channel=[True, False],
                                          symmetric=[True, False])

    # get learning rate schedule; resumes from config.start_epoch
    lr = Tensor(get_lr(global_step=config.start_epoch * step_size,
                       lr_init=0,
                       lr_end=0,
                       lr_max=config.lr,
                       warmup_epochs=config.warmup_epochs,
                       total_epochs=epoch_size + config.start_epoch,
                       steps_per_epoch=step_size))

    # define optimizer over trainable parameters only
    opt = nn.Momentum(filter(lambda x: x.requires_grad, network.get_parameters()),
                      lr,
                      config.momentum,
                      config.weight_decay)

    # define model
    model = Model(network, loss_fn=loss, optimizer=opt)

    print("============== Starting Training ==============")
    # Only rank 0 monitors training and saves checkpoints; other ranks pass
    # callbacks=None so they train silently without duplicated checkpoint I/O.
    callback = None
    if rank_id == 0:
        callback = [Monitor(lr_init=lr.asnumpy())]
        if config.save_checkpoint:
            config_ck = CheckpointConfig(
                save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="mobilenetV2",
                                      directory=config.save_checkpoint_path,
                                      config=config_ck)
            callback += [ckpt_cb]
    model.train(epoch_size, dataset, callbacks=callback)
    print("============== End Training ==============")
help='path where the dataset is saved') parser.add_argument('--summary_path', type=str, default="./summary", help='path where the summary to be saved') parser.add_argument('--dataset_sink_mode', type=bool, default=True, help='dataset_sink_mode is False or True') parser.add_argument('--device_id', type=int, default=0, help='device id of GPU. (Default: 0)') args = parser.parse_args() if args.device_target == "CPU": args.dataset_sink_mode = False context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id) network = Inceptionv3(cfg.num_classes) net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean", smooth_factor=cfg.label_smoothing_eps) ds_train = create_dataset(args.data_path, cfg.batch_size, cfg.epoch_size) step_per_epoch = ds_train.get_dataset_size() total_step = step_per_epoch * cfg.epoch_size lr = exponential_decay_lr(learning_rate=cfg.lr_init, decay_rate=cfg.lr_decay_rate, total_step=total_step, step_per_epoch=step_per_epoch, decay_epoch=cfg.lr_decay_epoch) net_opt = nn.RMSProp(network.trainable_params(), learning_rate=lr, decay=cfg.rmsprop_decay, momentum=cfg.rmsprop_momentum, epsilon=cfg.rmsprop_epsilon) time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, keep_checkpoint_max=cfg.keep_checkpoint_max) ckpoint_cb = ModelCheckpoint(prefix="checkpoint_inceptionv3", config=config_ck) # summary_cb = SummaryCollector(args.summary_path, # collect_freq=1, # keep_default_action=False,
init() else: init("nccl") cfg.rank = get_rank() cfg.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=cfg.group_size, parameter_broadcast=True, mirror_mean=True) else: cfg.rank = 0 cfg.group_size = 1 # dataloader dataset = create_dataset(args_opt.dataset_path, True, cfg.rank, cfg.group_size) batches_per_epoch = dataset.get_dataset_size() # network net = InceptionV3(num_classes=cfg.num_classes) # loss loss = CrossEntropy(smooth_factor=cfg.smooth_factor, num_classes=cfg.num_classes, factor=cfg.aux_factor) # learning rate schedule lr = get_lr(lr_init=cfg.lr_init, lr_end=cfg.lr_end, lr_max=cfg.lr_max, warmup_epochs=cfg.warmup_epochs,
device_id=device_id) def add_write(file_path, print_str): with open(file_path, 'a+', encoding='utf-8') as file_out: file_out.write(print_str + '\n') if __name__ == '__main__': data_config = DataConfig() model_config = ModelConfig() train_config = TrainConfig() ds_eval = create_dataset(args_opt.dataset_path, train_mode=False, epochs=1, batch_size=train_config.batch_size, data_type=DataType(data_config.data_format)) model_builder = ModelBuilder(ModelConfig, TrainConfig) train_net, eval_net = model_builder.get_train_eval_net() train_net.set_train() eval_net.set_train(False) auc_metric = AUCMetric() model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric}) param_dict = load_checkpoint(args_opt.checkpoint_path) load_param_into_net(eval_net, param_dict) start = time.time() res = model.eval(ds_eval)
# init distributed if args_opt.is_distributed: init("nccl") cfg.rank = get_rank() cfg.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=cfg.group_size, gradients_mean=True) else: cfg.rank = 0 cfg.group_size = 1 # dataloader dataset = create_dataset(args_opt.dataset_path, cfg, True) batches_per_epoch = dataset.get_dataset_size() # network net_with_loss = NASNetAMobileWithLoss(cfg) if args_opt.resume: ckpt = load_checkpoint(args_opt.resume) load_param_into_net(net_with_loss, ckpt) # learning rate schedule lr = get_lr(lr_init=cfg.lr_init, lr_decay_rate=cfg.lr_decay_rate, num_epoch_per_decay=cfg.num_epoch_per_decay, total_epochs=cfg.epoch_size, steps_per_epoch=batches_per_epoch, is_stair=True)
rank = get_rank() context.reset_auto_parallel_context() context.set_auto_parallel_context( device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) else: device_num = 1 rank = 0 max_captcha_digits = cf.max_captcha_digits input_size = m.ceil(cf.captcha_height / 64) * 64 * 3 # create dataset dataset = create_dataset(dataset_path=args_opt.dataset_path, batch_size=cf.batch_size, num_shards=device_num, shard_id=rank, device_target=args_opt.platform) step_size = dataset.get_dataset_size() # define lr lr_init = cf.learning_rate if not args_opt.run_distribute else cf.learning_rate * device_num * lr_scale lr = get_lr(cf.epoch_size, step_size, lr_init) loss = CTCLoss(max_sequence_length=cf.captcha_width, max_label_length=max_captcha_digits, batch_size=cf.batch_size) if args_opt.platform == 'Ascend': net = StackedRNN(input_size=input_size, batch_size=cf.batch_size, hidden_size=cf.hidden_size) else: net = StackedRNNForGPU(input_size=input_size,
path where the trained ckpt file') parser.add_argument('--dataset_sink_mode', type=bool, default=False, help='dataset_sink_mode is False or True') args = parser.parse_args() context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) network = LeNet5(cfg.num_classes) net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") repeat_size = cfg.epoch_size net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) config_ck = CheckpointConfig( save_checkpoint_steps=cfg.save_checkpoint_steps, keep_checkpoint_max=cfg.keep_checkpoint_max) ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck) model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) print("============== Starting Testing ==============") param_dict = load_checkpoint(args.ckpt_path) load_param_into_net(network, param_dict) ds_eval = create_dataset(os.path.join(args.data_path, "test"), cfg.batch_size, 1) acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode) print("============== {} ==============".format(acc))
default="./ckpt", help='if is test, must provide\ path where the trained ckpt file') parser.add_argument('--dataset_sink_mode', type=bool, default=True, help='dataset_sink_mode is False or True') args = parser.parse_args() if args.device_target == "CPU": args.dataset_sink_mode = False context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) ds_train = create_dataset(os.path.join(args.data_path, "train"), cfg.batch_size) network = LeNet5(cfg.num_classes) net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) config_ck = CheckpointConfig( save_checkpoint_steps=cfg.save_checkpoint_steps, keep_checkpoint_max=cfg.keep_checkpoint_max) ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory=args.ckpt_path, config=config_ck) model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) init() elif device_target == "GPU": init("nccl") if device_num > 1: context.reset_auto_parallel_context() context.set_auto_parallel_context( device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) else: raise ValueError("Unsupport platform.") dataset = create_dataset(cfg.data_path, 1) batch_num = dataset.get_dataset_size() net = GoogleNet(num_classes=cfg.num_classes) # Continue training if set pre_trained to be True if cfg.pre_trained: param_dict = load_checkpoint(cfg.checkpoint_path) load_param_into_net(net, param_dict) lr = lr_steps(0, lr_max=cfg.lr_init, total_epochs=cfg.epoch_size, steps_per_epoch=batch_num) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), Tensor(lr), cfg.momentum, weight_decay=cfg.weight_decay)
save_graphs=False) # define network net = xception(class_num=config.class_num) if args_opt.device_target == "Ascend": net.to_float(mstype.float16) # define loss if not config.use_label_smooth: config.label_smooth_factor = 0.0 loss = CrossEntropySmooth(smooth_factor=config.label_smooth_factor, num_classes=config.class_num) # define dataset dataset = create_dataset(args_opt.dataset_path, do_train=True, batch_size=config.batch_size, device_num=group_size, rank=rank) step_size = dataset.get_dataset_size() # resume if args_opt.resume: ckpt = load_checkpoint(args_opt.resume) load_param_into_net(net, ckpt) # get learning rate loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) lr = Tensor( get_lr(lr_init=config.lr_init, lr_end=config.lr_end,
set_seed(1) if __name__ == '__main__': args_opt = train_parse_args() args_opt.dataset_path = os.path.abspath(args_opt.dataset_path) config = set_config(args_opt) start = time.time() print(f"train args: {args_opt}\ncfg: {config}") #set context and device init context_device_init(config) # define network backbone_net, head_net, net = define_net(config, args_opt.is_training) dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config) step_size = dataset.get_dataset_size() if args_opt.pretrain_ckpt: if args_opt.freeze_layer == "backbone": load_ckpt(backbone_net, args_opt.pretrain_ckpt, trainable=False) step_size = extract_features(backbone_net, args_opt.dataset_path, config) else: load_ckpt(net, args_opt.pretrain_ckpt) if step_size == 0: raise ValueError("The step_size of dataset is zero. Check if the images' count of train dataset is more \ than batch_size in config.py") # Currently, only Ascend support switch precision. switch_precision(net, mstype.float16, config) # define loss
else: init() context.set_auto_parallel_context( device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) if args.net == "resnet50": context.set_auto_parallel_context( all_reduce_fusion_config=[85, 160]) ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str( get_rank()) + "/" # create dataset dataset = create_dataset(dataset_path=os.path.join(args.dataset_path, "train"), do_train=True, repeat_num=1, batch_size=config.batch_size, target=target) eval_dataset = create_dataset(dataset_path=os.path.join( args.dataset_path, "val"), do_train=False, batch_size=config.batch_size, target=target) step_size = dataset.get_dataset_size() # define net net = resnet(class_num=config.class_num) if args.parameter_server: net.set_param_ps() # init weight