def _build_training_pipeline(config: TransformerConfig, pre_training_dataset=None, fine_tune_dataset=None, test_dataset=None): """ Build training pipeline. Args: config (TransformerConfig): Config of mass model. pre_training_dataset (Dataset): Pre-training dataset. fine_tune_dataset (Dataset): Fine-tune dataset. test_dataset (Dataset): Test dataset. """ net_with_loss = TransformerNetworkWithLoss(config, is_training=True) net_with_loss.init_parameters_data() if config.existed_ckpt: if config.existed_ckpt.endswith(".npz"): weights = np.load(config.existed_ckpt) else: weights = load_checkpoint(config.existed_ckpt) for param in net_with_loss.trainable_params(): weights_name = param.name if weights_name not in weights: raise ValueError( f"Param {weights_name} is not found in ckpt file.") if isinstance(weights[weights_name], Parameter): param.default_input = weights[weights_name].default_input elif isinstance(weights[weights_name], Tensor): param.default_input = Tensor(weights[weights_name].asnumpy(), config.dtype) elif isinstance(weights[weights_name], np.ndarray): param.default_input = Tensor(weights[weights_name], config.dtype) else: param.default_input = weights[weights_name] else: for param in net_with_loss.trainable_params(): name = param.name value = param.default_input if isinstance(value, Tensor): if name.endswith(".gamma"): param.default_input = one_weight(value.asnumpy().shape) elif name.endswith(".beta") or name.endswith(".bias"): param.default_input = zero_weight(value.asnumpy().shape) else: param.default_input = weight_variable( value.asnumpy().shape) dataset = pre_training_dataset if pre_training_dataset is not None \ else fine_tune_dataset if dataset is None: raise ValueError( "pre-training dataset or fine-tuning dataset must be provided one." ) update_steps = dataset.get_repeat_count() * dataset.get_dataset_size() if config.lr_scheduler == "isr": lr = Tensor(square_root_schedule( lr=config.lr, update_num=update_steps, decay_start_step=config.decay_start_step, warmup_steps=config.warmup_steps, min_lr=config.min_lr), dtype=mstype.float32) elif config.lr_scheduler == "poly": lr = Tensor(polynomial_decay_scheduler( lr=config.lr, min_lr=config.min_lr, decay_steps=config.decay_steps, total_update_num=update_steps, warmup_steps=config.warmup_steps, power=config.poly_lr_scheduler_power), dtype=mstype.float32) else: lr = config.lr if config.optimizer.lower() == "adam": optimizer = Adam(net_with_loss.trainable_params(), lr, beta1=0.9, beta2=0.98) elif config.optimizer.lower() == "lamb": lr = BertLearningRate(decay_steps=12000, learning_rate=config.lr, end_learning_rate=config.min_lr, power=10.0, warmup_steps=config.warmup_steps) decay_params = list( filter( lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x .name.lower(), net_with_loss.trainable_params())) other_params = list( filter( lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name. lower(), net_with_loss.trainable_params())) group_params = [{ 'params': decay_params, 'weight_decay': 0.01 }, { 'params': other_params }] optimizer = Lamb(group_params, lr, eps=1e-6) elif config.optimizer.lower() == "momentum": optimizer = Momentum(net_with_loss.trainable_params(), lr, momentum=0.9) else: raise ValueError(f"optimizer only support `adam` and `momentum` now.") # Dynamic loss scale. 
scale_manager = DynamicLossScaleManager( init_loss_scale=config.init_loss_scale, scale_factor=config.loss_scale_factor, scale_window=config.scale_window) net_with_grads = TransformerTrainOneStepWithLossScaleCell( network=net_with_loss, optimizer=optimizer, scale_update_cell=scale_manager.get_update_cell()) net_with_grads.set_train(True) model = Model(net_with_grads) loss_monitor = LossCallBack(config) ckpt_config = CheckpointConfig( save_checkpoint_steps=config.save_ckpt_steps, keep_checkpoint_max=config.keep_ckpt_max) rank_size = os.getenv('RANK_SIZE') callbacks = [loss_monitor] if rank_size is not None and int( rank_size) > 1 and MultiAscend.get_rank() % 8 == 0: ckpt_callback = ModelCheckpoint( prefix=config.ckpt_prefix, directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))), config=ckpt_config) callbacks.append(ckpt_callback) if rank_size is None or int(rank_size) == 1: ckpt_callback = ModelCheckpoint( prefix=config.ckpt_prefix, directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))), config=ckpt_config) callbacks.append(ckpt_callback) print(f" | ALL SET, PREPARE TO TRAIN.") _train(model=model, config=config, pre_training_dataset=pre_training_dataset, fine_tune_dataset=fine_tune_dataset, test_dataset=test_dataset, callbacks=callbacks)
parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'],
                    help='device where the code will be implemented (default: Ascend)')
parser.add_argument('--data_path', type=str, default="s3://hithcd/rgb/pic/",
                    help='path where the dataset is saved')
parser.add_argument('--ckpt_path', type=str, default="obs://hithcd/MA-hw_project_resnet18-05-30-19/output/V0408/",
                    help='if mode is test, must provide the path where the trained ckpt file is saved')
parser.add_argument('--dataset_sink_mode', type=bool, default=False,
                    help='dataset_sink_mode is False or True')
args = parser.parse_args()

context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)

local_data_url = '/cache/data'
local_output_url = '/cache/ckpt'
mox.file.copy_parallel(args.data_path, local_data_url)
mox.file.copy_parallel(args.ckpt_path, local_output_url)

net = resnet18(class_num=config.class_num)

print("============== Starting Testing ==============")
param_dict = load_checkpoint(os.path.join(local_output_url, 'resnet-50_1759.ckpt'), net=net)
load_param_into_net(net, param_dict)

im = np.asarray(Image.open(os.path.join(local_data_url, '490.png')).convert('L'))
im = 255 - im
im = im / 255.0
input_data = im.reshape((1, 1, 112, 112))
input_tensor = Tensor(input_data, mindspore.float32)
acc = net(input_tensor)
acc = acc.asnumpy()
preds = np.argmax(acc, axis=1)
mox.file.copy_parallel(local_output_url, args.train_url)
print("Predict label:{0}, acc={1}".format(preds[0], acc[0][preds[0]]))
def yolov3_predict(instance, strategy):
    network = YOLOV3DarkNet53(is_training=False)
    pretrained_ckpt = '/dataset/ckpt-files/shanshui_full/yolov3.ckpt'
    if not os.path.exists(pretrained_ckpt):
        err_msg = "The yolov3.ckpt file does not exist!"
        return {"status": 1, "err_msg": err_msg}

    param_dict = load_checkpoint(pretrained_ckpt)
    param_dict_new = {}
    for key, values in param_dict.items():
        if key.startswith('moments.'):
            continue
        elif key.startswith('yolo_network.'):
            param_dict_new[key[13:]] = values
        else:
            param_dict_new[key] = values
    load_param_into_net(network, param_dict_new)

    config = ConfigYOLOV3DarkNet53()

    # init detection engine
    args = edict()
    args.ignore_threshold = 0.01
    args.nms_thresh = 0.5
    detection = DetectionEngine(args)

    input_shape = Tensor(tuple(config.test_img_shape), ms.float32)
    print('Start inference....')
    network.set_train(False)
    ori_image = np.array(json.loads(instance['data']), dtype=instance['dtype'])
    image, image_shape = data_preprocess(ori_image, config)
    prediction = network(Tensor(image.reshape(1, 3, 416, 416), ms.float32), input_shape)
    output_big, output_me, output_small = prediction
    output_big = output_big.asnumpy()
    output_me = output_me.asnumpy()
    output_small = output_small.asnumpy()

    per_batch_size = 1
    detection.detect([output_small, output_me, output_big], per_batch_size, image_shape, config)
    detection.do_nms_for_results()
    out_img = detection.draw_boxes_in_image(ori_image)

    # for i in range(len(detection.det_boxes)):
    #     print("x: ", detection.det_boxes[i]['bbox'][0])
    #     print("y: ", detection.det_boxes[i]['bbox'][1])
    #     print("h: ", detection.det_boxes[i]['bbox'][2])
    #     print("w: ", detection.det_boxes[i]['bbox'][3])
    #     print("score: ", round(detection.det_boxes[i]['score'], 3))
    #     print("category: ", detection.det_boxes[i]['category_id'])

    det_boxes = detection.det_boxes
    if not len(det_boxes):
        err_msg = "Sorry, no objects of any supported category were detected, so the image cannot be annotated."
        return {"status": 1, "err_msg": err_msg}
    max_det = max(det_boxes, key=lambda k: k['score'])
    max_score = max_det['score']
    category = det_boxes[det_boxes.index(max_det)]['category_id']
    res = {
        "status": 0,
        "instance": {
            "boxes_num": len(det_boxes),
            "max_score": round(max_score, 3),
            "category": category,
            "data": numpy2base64(out_img)
        }
    }
    return res
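# A minimal sketch of how a client could build the payload consumed by yolov3_predict above.
# Only the 'data'/'dtype' fields are taken from the function body itself; the use of OpenCV,
# the helper name, the file name, and passing strategy=None are illustrative assumptions.
import json

import cv2  # assumption: OpenCV is available on the client side
import numpy as np


def build_predict_instance(image_path):
    """Serialize an image into the dict format that yolov3_predict expects."""
    ori_image = cv2.imread(image_path)  # HWC uint8 array
    return {
        "data": json.dumps(ori_image.tolist()),  # yolov3_predict calls json.loads on this field
        "dtype": str(ori_image.dtype),           # e.g. 'uint8', restored via np.array(..., dtype=...)
    }


# Hypothetical usage:
# instance = build_predict_instance("sample.jpg")
# result = yolov3_predict(instance, strategy=None)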
def main():
    parser = argparse.ArgumentParser(description="YOLOv3 train")
    parser.add_argument("--only_create_dataset", type=ast.literal_eval, default=False,
                        help="If set it true, only create Mindrecord, default is False.")
    parser.add_argument("--distribute", type=ast.literal_eval, default=False,
                        help="Run distribute, default is False.")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--lr", type=float, default=0.001, help="Learning rate, default is 0.001.")
    parser.add_argument("--mode", type=str, default="sink", help="Run sink mode or not, default is sink.")
    parser.add_argument("--epoch_size", type=int, default=50, help="Epoch size, default is 50.")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
    parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained checkpoint file path.")
    parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size.")
    parser.add_argument("--save_checkpoint_epochs", type=int, default=5,
                        help="Save checkpoint epochs, default is 5.")
    parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
    parser.add_argument("--mindrecord_dir", type=str, default="./Mindrecord_train",
                        help="Mindrecord directory. If mindrecord_dir is empty, it will generate mindrecord files "
                             "from image_dir and anno_path. Note that if mindrecord_dir is not empty, it will use "
                             "mindrecord_dir rather than image_dir and anno_path. Default is ./Mindrecord_train.")
    parser.add_argument("--image_dir", type=str, default="",
                        help="Dataset directory; the absolute image path is joined by the image_dir "
                             "and the relative path in anno_path.")
    parser.add_argument("--anno_path", type=str, default="", help="Annotation path.")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
    if args_opt.distribute:
        device_num = args_opt.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                          device_num=device_num)
        init()
        rank = args_opt.device_id % device_num
    else:
        rank = 0
        device_num = 1

    print("Start create dataset!")

    # It will generate mindrecord files in args_opt.mindrecord_dir,
    # named yolo.mindrecord0, 1, ... file_num.
    if not os.path.isdir(args_opt.mindrecord_dir):
        os.makedirs(args_opt.mindrecord_dir)

    prefix = "yolo.mindrecord"
    mindrecord_file = os.path.join(args_opt.mindrecord_dir, prefix + "0")
    if not os.path.exists(mindrecord_file):
        if os.path.isdir(args_opt.image_dir) and os.path.exists(args_opt.anno_path):
            print("Create Mindrecord.")
            data_to_mindrecord_byte_image(args_opt.image_dir,
                                          args_opt.anno_path,
                                          args_opt.mindrecord_dir,
                                          prefix,
                                          8)
            print("Create Mindrecord Done, at {}".format(args_opt.mindrecord_dir))
        else:
            raise ValueError('image_dir {} or anno_path {} does not exist'.format(
                args_opt.image_dir, args_opt.anno_path))

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)

        # When creating MindDataset, use the first mindrecord file, such as yolo.mindrecord0.
        dataset = create_yolo_dataset(mindrecord_file, batch_size=args_opt.batch_size,
                                      device_num=device_num, rank=rank)
        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        net = yolov3_resnet18(ConfigYOLOV3ResNet18())
        net = YoloWithLossCell(net, ConfigYOLOV3ResNet18())
        init_net_param(net, "XavierUniform")

        # checkpoint
        ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
        ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory='./ckpt_' + str(rank) + '/', config=ckpt_config)

        if args_opt.pre_trained:
            if args_opt.pre_trained_epoch_size <= 0:
                raise KeyError("pre_trained_epoch_size must be greater than 0.")
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)

        total_epoch_size = 60
        if args_opt.distribute:
            total_epoch_size = 160
        lr = Tensor(get_lr(learning_rate=args_opt.lr,
                           start_step=args_opt.pre_trained_epoch_size * dataset_size,
                           global_step=total_epoch_size * dataset_size,
                           decay_step=1000, decay_rate=0.95, steps=True))
        opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]

        model = Model(net)
        dataset_sink_mode = False
        if args_opt.mode == "sink":
            print("In sink mode, one epoch returns one loss value.")
            dataset_sink_mode = True
        print("Start train YOLOv3, the first epoch will be slower because of graph compilation.")
        model.train(args_opt.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)
def test(cloud_args=None): """test""" args = parse_args(cloud_args) context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, device_target=args.platform, save_graphs=False) if os.getenv('DEVICE_ID', "not_set").isdigit(): context.set_context(device_id=int(os.getenv('DEVICE_ID'))) # init distributed if args.is_distributed: if args.platform == "Ascend": init() elif args.platform == "GPU": init("nccl") args.rank = get_rank() args.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size, parameter_broadcast=True, mirror_mean=True) else: args.rank = 0 args.group_size = 1 args.outputs_dir = os.path.join( args.log_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) args.logger = get_logger(args.outputs_dir, args.rank) args.logger.save_args(args) # network args.logger.important_info('start create network') if os.path.isdir(args.pretrained): models = list(glob.glob(os.path.join(args.pretrained, '*.ckpt'))) print(models) if args.graph_ckpt: f = lambda x: -1 * int( os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split( '_')[0]) else: f = lambda x: -1 * int( os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1]) args.models = sorted(models, key=f) else: args.models = [ args.pretrained, ] for model in args.models: de_dataset = classification_dataset(args.data_dir, image_size=args.image_size, per_batch_size=args.per_batch_size, max_epoch=1, rank=args.rank, group_size=args.group_size, mode='eval') eval_dataloader = de_dataset.create_tuple_iterator() network = get_network(args.backbone, args.num_classes, platform=args.platform) if network is None: raise NotImplementedError('not implement {}'.format(args.backbone)) param_dict = load_checkpoint(model) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): continue elif key.startswith('network.'): param_dict_new[key[8:]] = values else: param_dict_new[key] = values load_param_into_net(network, param_dict_new) args.logger.info('load model {} success'.format(model)) img_tot = 0 top1_correct = 0 top5_correct = 0 if args.platform == "Ascend": network.to_float(mstype.float16) else: auto_mixed_precision(network) network.set_train(False) t_end = time.time() it = 0 for data, gt_classes in eval_dataloader: output = network(Tensor(data, mstype.float32)) output = output.asnumpy() top1_output = np.argmax(output, (-1)) top5_output = np.argsort(output)[:, -5:] t1_correct = np.equal(top1_output, gt_classes).sum() top1_correct += t1_correct top5_correct += get_top5_acc(top5_output, gt_classes) img_tot += args.per_batch_size if args.rank == 0 and it == 0: t_end = time.time() it = 1 if args.rank == 0: time_used = time.time() - t_end fps = (img_tot - args.per_batch_size) * args.group_size / time_used args.logger.info( 'Inference Performance: {:.2f} img/sec'.format(fps)) results = [[top1_correct], [top5_correct], [img_tot]] args.logger.info('before results={}'.format(results)) if args.is_distributed: model_md5 = model.replace('/', '') tmp_dir = '/cache' if not os.path.exists(tmp_dir): os.mkdir(tmp_dir) top1_correct_npy = '/cache/top1_rank_{}_{}.npy'.format( args.rank, model_md5) top5_correct_npy = '/cache/top5_rank_{}_{}.npy'.format( args.rank, model_md5) img_tot_npy = '/cache/img_tot_rank_{}_{}.npy'.format( args.rank, model_md5) np.save(top1_correct_npy, top1_correct) np.save(top5_correct_npy, top5_correct) np.save(img_tot_npy, img_tot) while True: rank_ok = True for other_rank in 
range(args.group_size): top1_correct_npy = '/cache/top1_rank_{}_{}.npy'.format( other_rank, model_md5) top5_correct_npy = '/cache/top5_rank_{}_{}.npy'.format( other_rank, model_md5) img_tot_npy = '/cache/img_tot_rank_{}_{}.npy'.format( other_rank, model_md5) if not os.path.exists(top1_correct_npy) or not os.path.exists(top5_correct_npy) or \ not os.path.exists(img_tot_npy): rank_ok = False if rank_ok: break top1_correct_all = 0 top5_correct_all = 0 img_tot_all = 0 for other_rank in range(args.group_size): top1_correct_npy = '/cache/top1_rank_{}_{}.npy'.format( other_rank, model_md5) top5_correct_npy = '/cache/top5_rank_{}_{}.npy'.format( other_rank, model_md5) img_tot_npy = '/cache/img_tot_rank_{}_{}.npy'.format( other_rank, model_md5) top1_correct_all += np.load(top1_correct_npy) top5_correct_all += np.load(top5_correct_npy) img_tot_all += np.load(img_tot_npy) results = [[top1_correct_all], [top5_correct_all], [img_tot_all]] results = np.array(results) else: results = np.array(results) args.logger.info('after results={}'.format(results)) top1_correct = results[0, 0] top5_correct = results[1, 0] img_tot = results[2, 0] acc1 = 100.0 * top1_correct / img_tot acc5 = 100.0 * top5_correct / img_tot args.logger.info('after allreduce eval: top1_correct={}, tot={},' 'acc={:.2f}%(TOP1)'.format(top1_correct, img_tot, acc1)) args.logger.info('after allreduce eval: top5_correct={}, tot={},' 'acc={:.2f}%(TOP5)'.format(top5_correct, img_tot, acc5)) if args.is_distributed: release()
def test_lenet_mnist_fuzzing(): # upload trained network ckpt_path = '../common/networks/lenet5/trained_ckpt_file/checkpoint_lenet-10_1875.ckpt' net = LeNet5() load_dict = load_checkpoint(ckpt_path) load_param_into_net(net, load_dict) model = Model(net) mutate_config = [{ 'method': 'Blur', 'params': { 'radius': [0.1, 0.2, 0.3], 'auto_param': [True, False] } }, { 'method': 'Contrast', 'params': { 'auto_param': [True] } }, { 'method': 'Translate', 'params': { 'auto_param': [True] } }, { 'method': 'Brightness', 'params': { 'auto_param': [True] } }, { 'method': 'Noise', 'params': { 'auto_param': [True] } }, { 'method': 'Scale', 'params': { 'auto_param': [True] } }, { 'method': 'Shear', 'params': { 'auto_param': [True] } }, { 'method': 'FGSM', 'params': { 'eps': [0.3, 0.2, 0.4], 'alpha': [0.1] } }] # get training data data_list = "../common/dataset/MNIST/train" batch_size = 32 ds = generate_mnist_dataset(data_list, batch_size, sparse=False) train_images = [] for data in ds.create_tuple_iterator(output_numpy=True): images = data[0].astype(np.float32) train_images.append(images) train_images = np.concatenate(train_images, axis=0) # initialize fuzz test with training dataset model_coverage_test = ModelCoverageMetrics(model, 10, 1000, train_images) # fuzz test with original test data # get test data data_list = "../common/dataset/MNIST/test" batch_size = 32 ds = generate_mnist_dataset(data_list, batch_size, sparse=False) test_images = [] test_labels = [] for data in ds.create_tuple_iterator(output_numpy=True): images = data[0].astype(np.float32) labels = data[1] test_images.append(images) test_labels.append(labels) test_images = np.concatenate(test_images, axis=0) test_labels = np.concatenate(test_labels, axis=0) initial_seeds = [] # make initial seeds for img, label in zip(test_images, test_labels): initial_seeds.append([img, label]) initial_seeds = initial_seeds[:100] model_coverage_test.calculate_coverage( np.array(test_images[:100]).astype(np.float32)) LOGGER.info(TAG, 'KMNC of this test is : %s', model_coverage_test.get_kmnc()) model_fuzz_test = Fuzzer(model, train_images, 10, 1000) _, _, _, _, metrics = model_fuzz_test.fuzzing(mutate_config, initial_seeds, eval_metrics='auto') if metrics: for key in metrics: LOGGER.info(TAG, key + ': %s', metrics[key])
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
##############export checkpoint file into air and onnx models#################
python export.py
"""
import numpy as np

import mindspore as ms
from mindspore import Tensor
from mindspore.train.serialization import load_checkpoint, load_param_into_net, export

from src.config import cifar_cfg as cfg
from src.googlenet import GoogleNet

if __name__ == '__main__':
    net = GoogleNet(num_classes=cfg.num_classes)

    param_dict = load_checkpoint(cfg.checkpoint_path)
    load_param_into_net(net, param_dict)

    input_arr = Tensor(np.random.uniform(0.0, 1.0, size=[1, 3, 224, 224]), ms.float32)
    export(net, input_arr, file_name=cfg.onnx_filename, file_format="ONNX")
    export(net, input_arr, file_name=cfg.air_filename, file_format="AIR")
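# An optional sanity check for the exported ONNX model, as a hedged sketch: it assumes the
# onnxruntime package is installed and that the exported file is reachable at cfg.onnx_filename
# (depending on the MindSpore version, a '.onnx' suffix may or may not be appended automatically).
import numpy as np
import onnxruntime as ort


def check_onnx_export(onnx_path):
    """Run one random batch through the exported graph and report the output shape."""
    session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
    input_name = session.get_inputs()[0].name
    dummy = np.random.uniform(0.0, 1.0, size=[1, 3, 224, 224]).astype(np.float32)
    outputs = session.run(None, {input_name: dummy})
    print("ONNX output shape:", outputs[0].shape)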
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1): """ Do train Args: dataset: the train dataset. network: the network with loss load_checkpoint_path: the file path which saved pretrained model checkpoint. save_checkpoint_path: the file path which will save finetuned model checkpoint. epoch_num: the number of epoch. """ if load_checkpoint_path == "": raise ValueError("Pretrain model missed, finetune task must load pretrain model!") steps_per_epoch = dataset.get_dataset_size() # optimizer if cfg.optimizer == 'AdamWeightDecay': lr_schedule = GPT2LearningRate(learning_rate=cfg.AdamWeightDecay.learning_rate, end_learning_rate=cfg.AdamWeightDecay.end_learning_rate, warmup_steps=int(steps_per_epoch * epoch_num * 0.1), decay_steps=steps_per_epoch * epoch_num, power=cfg.AdamWeightDecay.power) params = network.trainable_params() decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params)) other_params = list(filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params)) group_params = [{'params': decay_params, 'weight_decay': cfg.AdamWeightDecay.weight_decay}, {'params': other_params, 'weight_decay': 0.0}] optimizer = AdamWeightDecay(group_params, lr_schedule, eps=cfg.AdamWeightDecay.eps) elif cfg.optimizer == 'Lamb': lr_schedule = GPT2LearningRate(learning_rate=cfg.Lamb.learning_rate, end_learning_rate=cfg.Lamb.end_learning_rate, warmup_steps=int(steps_per_epoch * epoch_num * 0.1), decay_steps=steps_per_epoch * epoch_num, power=cfg.Lamb.power) optimizer = Lamb(network.trainable_params(), lr_schedule) elif cfg.optimizer == 'Momentum': optimizer = Momentum(network.trainable_params(), cfg.Momentum.learning_rate, cfg.Momentum.momentum) else: raise Exception("Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]") # load checkpoint into network ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1) prefix_name = "gpt2_translation_" + str(cfg.gpt2_network) + "_" + str(cfg.optimizer) + "_" \ + str(epoch_num) + "_bs" + str(gpt2_net_cfg.batch_size) ckpoint_cb = ModelCheckpoint(prefix=prefix_name, directory=None if save_checkpoint_path == "" else save_checkpoint_path, config=ckpt_config) param_dict = load_checkpoint(load_checkpoint_path) final_param_dict = {} for name, _ in param_dict.items(): final_param_dict['gpt2.gpt2.' + name] = param_dict[name] final_param_dict['gpt2.dense1.weight'] = param_dict['gpt2_embedding_lookup.embedding_table'] load_param_into_net(network, final_param_dict) print("Load the pretrained parameter successfully! \n") update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2 ** 32, scale_factor=2, scale_window=1000) netwithgrads = GPT2FinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell) netwithgrads.set_train(True) loss_cb = LossMonitor(per_print_times=1) model = Model(netwithgrads) callbacks = [TimeMonitor(dataset.get_dataset_size()), loss_cb, ckpoint_cb] print("=================== Starting Training For Translation Task ====================") model.train(epoch_num, dataset, callbacks=callbacks, dataset_sink_mode=False) print("=================== Translation Training Success ====================")
                    default="./MNIST_Data",
                    help='path where the dataset is saved')
parser.add_argument('--ckpt_path', type=str, default="",
                    help='if mode is test, must provide the path where the trained ckpt file is saved')
args = parser.parse_args()

if __name__ == "__main__":
    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)

    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)
    # convert fusion network to quantization aware network
    network = quant.convert_quant_network(network, quant_delay=0, bn_fold=False, freeze_bn=10000,
                                          per_channel=[True, False], symmetric=[True, False])

    # load quantization aware network checkpoint
    param_dict = load_checkpoint(args.ckpt_path)
    load_param_into_net(network, param_dict)

    # export network
    inputs = Tensor(np.ones([1, 1, cfg.image_height, cfg.image_width]), mindspore.float32)
    quant.export(network, inputs, file_name="lenet_quant", file_format='AIR')
context.set_context(enable_task_sink=True)

print("test lenet predict start")
seed = 0
np.random.seed(seed)
batch = 1
channel = 1
input_h = 32
input_w = 32
origin_data = np.random.uniform(low=0, high=255,
                                size=(batch, channel, input_h, input_w)).astype(np.float32)
origin_data.tofile("lenet_input_data.bin")
input_data = Tensor(origin_data)
print(input_data.asnumpy())

net = LeNet()
ckpt_file_path = "./tests/ut/python/predict/checkpoint_lenet.ckpt"
predict_args = parser.parse_args()
model_path_name = predict_args.path

is_ckpt_exist = os.path.exists(ckpt_file_path)
if is_ckpt_exist:
    param_dict = load_checkpoint(ckpoint_file_name=ckpt_file_path)
    load_param_into_net(net, param_dict)
    export(net, input_data, file_name=model_path_name, file_format='LITE')
    print("test lenet predict success.")
else:
    print("checkpoint file does not exist.")
def do_eval(dataset=None, network=None, metric=None, load_checkpoint_path="", eval_type=None, tokenizer_file_path="", generate_length=1, top_k=1, top_p=1.0, temperature=1.0): """ Do evaluation on Translation Args: dataset: the eval dataset. network: the network with loss. metric: the evaluation method. load_checkpoint_path: the file path which saved finetune model checkpoint. """ if load_checkpoint_path == "": raise ValueError("Finetune model missed, evaluation task must load finetune model!") if metric.lower() == "bleu": print("Prepare to calculate the BLEU score ...") gpt2_translation = network(config=gpt2_net_cfg, is_training=False, use_one_hot_embeddings=False) gpt2_translation.set_train(False) param_dict = load_checkpoint(load_checkpoint_path) if eval_type == "zero-shot": final_param_dict = {} for name, _ in param_dict.items(): final_param_dict['gpt2.' + name] = param_dict[name] final_param_dict['dense1.weight'] = param_dict['gpt2_embedding_lookup.embedding_table'] load_param_into_net(gpt2_translation, final_param_dict) print("load pretrained parameter successfully!\n") elif eval_type == "finetuned": load_param_into_net(gpt2_translation, param_dict) print("load finetuned parameter successfully!\n") else: raise ValueError("Evaluation type missed, eval_type should be [zero-shot, finetuned]") model = Model(gpt2_translation) tokenizer = Tokenizer(vocab_file=tokenizer_file_path + 'gpt2-vocab.json', merge_file=tokenizer_file_path + 'gpt2-merges.txt') callback = BLEU(tokenizer) translation_generator = GenerateForTranslation(decoder=model, config=gpt2_net_cfg, tokenizer=tokenizer, generate_length=1, use_hint=True, select_first_sentence=True, topk_num=top_k, topp_prob=float(top_p), temperature=float(temperature) ) columns_list = ["input_ids", "input_mask", "label_ids"] print("==================== [BLEU] Testing ====================") num_data = 1 for data in dataset.create_dict_iterator(): input_data = [] for i in columns_list: input_data.append(data[i]) input_ids, input_mask, label_ids = input_data print("| Data count: {}".format(num_data * gpt2_net_cfg.batch_size)) print("input_ids shape: {}".format(input_ids.shape)) print("input_mask shape: {}".format(input_mask.shape)) print("label_ids shape: {}".format(label_ids.shape)) ts_predict_list, ref_list = translation_generator.generate_for_translation(input_ids) print("| Batch Reference translation:\n{}\n".format(ref_list)) if ref_list == '' or ref_list is None: print("Sorry ref_list is None, skip it!") continue else: print(" | Batch Predict translation:\n{}\n".format(ts_predict_list)) callback.update(ref_list, ts_predict_list) num_data += 1 print("\n\n") print("**************************************************************") eval_result_print(metric, callback) print("********************** Testing Finished **********************") else: raise ValueError("metric method not supported in translation, support: [BLEU]")
def load_backbone(net, ckpt_path, args):
    """Load darknet53 backbone checkpoint."""
    param_dict = load_checkpoint(ckpt_path)
    yolo_backbone_prefix = 'feature_map.backbone'
    darknet_backbone_prefix = 'network.backbone'
    find_param = []
    not_found_param = []
    net.init_parameters_data()
    for name, cell in net.cells_and_names():
        if name.startswith(yolo_backbone_prefix):
            name = name.replace(yolo_backbone_prefix, darknet_backbone_prefix)
            if isinstance(cell, (nn.Conv2d, nn.Dense)):
                darknet_weight = '{}.weight'.format(name)
                darknet_bias = '{}.bias'.format(name)
                if darknet_weight in param_dict:
                    cell.weight.set_data(param_dict[darknet_weight].data)
                    find_param.append(darknet_weight)
                else:
                    not_found_param.append(darknet_weight)
                if darknet_bias in param_dict:
                    cell.bias.set_data(param_dict[darknet_bias].data)
                    find_param.append(darknet_bias)
                else:
                    not_found_param.append(darknet_bias)
            elif isinstance(cell, (nn.BatchNorm2d, nn.BatchNorm1d)):
                darknet_moving_mean = '{}.moving_mean'.format(name)
                darknet_moving_variance = '{}.moving_variance'.format(name)
                darknet_gamma = '{}.gamma'.format(name)
                darknet_beta = '{}.beta'.format(name)
                if darknet_moving_mean in param_dict:
                    cell.moving_mean.set_data(param_dict[darknet_moving_mean].data)
                    find_param.append(darknet_moving_mean)
                else:
                    not_found_param.append(darknet_moving_mean)
                if darknet_moving_variance in param_dict:
                    cell.moving_variance.set_data(param_dict[darknet_moving_variance].data)
                    find_param.append(darknet_moving_variance)
                else:
                    not_found_param.append(darknet_moving_variance)
                if darknet_gamma in param_dict:
                    cell.gamma.set_data(param_dict[darknet_gamma].data)
                    find_param.append(darknet_gamma)
                else:
                    not_found_param.append(darknet_gamma)
                if darknet_beta in param_dict:
                    cell.beta.set_data(param_dict[darknet_beta].data)
                    find_param.append(darknet_beta)
                else:
                    not_found_param.append(darknet_beta)

    args.logger.info('================found_param {}========='.format(len(find_param)))
    args.logger.info(find_param)
    args.logger.info('================not_found_param {}========='.format(len(not_found_param)))
    args.logger.info(not_found_param)
    args.logger.info('=====load {} successfully ====='.format(ckpt_path))

    return net
def validation(net, model_path, data_dir, filename, num_consumer, batch):
    param_dict = load_checkpoint(model_path)
    load_param_into_net(net, param_dict)
    auc = val(net, data_dir, filename, num_consumer, batch)
    return auc
def test_trains(args): '''test trains''' print('----eval----begin----') model_path = args.pretrained result_file = model_path.replace('.ckpt', '.txt') if os.path.exists(result_file): os.remove(result_file) epoch_result = open(result_file, 'a') epoch_result.write(model_path + '\n') network = FaceQABackbone() ckpt_path = model_path if os.path.isfile(ckpt_path): param_dict = load_checkpoint(ckpt_path) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): continue elif key.startswith('network.'): param_dict_new[key[8:]] = values else: param_dict_new[key] = values load_param_into_net(network, param_dict_new) else: print('wrong model path') return 1 path = args.eval_dir kp_error_all = [[], [], [], [], []] eulers_error_all = [[], [], []] kp_ipn = [] file_list = os.listdir(path) for file_name in tqdm(file_list): if file_name.endswith('jpg'): img_path = os.path.join(path, file_name) img, img_ori = read_img(img_path) txt_path = img_path.replace('jpg', 'txt') if os.path.exists(txt_path): euler_kps_do = True x_length = img_ori.shape[1] y_length = img_ori.shape[0] eulers_gt, kp_list = read_gt(txt_path, x_length, y_length) else: euler_kps_do = False continue out = network(img) _, _, kp_coord_ori, eulers_ori, _ = get_md_output(out) if euler_kps_do: eulgt = list(eulers_gt) for euler_id, _ in enumerate(eulers_ori): eulori = eulers_ori[euler_id] eulers_error_all[euler_id].append(abs(eulori-float(eulgt[euler_id]))) eye01 = kp_list[0] eye02 = kp_list[1] eye_dis = 1 cur_flag = True if eye01[0] < 0 or eye01[1] < 0 or eye02[0] < 0 or eye02[1] < 0: cur_flag = False else: eye_dis = np.sqrt(np.square(abs(eye01[0]-eye02[0]))+np.square(abs(eye01[1]-eye02[1]))) cur_error_list = [] for i in range(5): kp_coord_gt = kp_list[i] kp_coord_model = kp_coord_ori[i] if kp_coord_gt[0] != -1: dis = np.sqrt(np.square( kp_coord_gt[0] - kp_coord_model[0]) + np.square(kp_coord_gt[1] - kp_coord_model[1])) kp_error_all[i].append(dis) cur_error_list.append(dis) if cur_flag: kp_ipn.append(sum(cur_error_list)/len(cur_error_list)/eye_dis) kp_ave_error = [] for kps, _ in enumerate(kp_error_all): kp_ave_error.append("%.3f" % (sum(kp_error_all[kps])/len(kp_error_all[kps]))) euler_ave_error = [] elur_mae = [] for eulers, _ in enumerate(eulers_error_all): euler_ave_error.append("%.3f" % (sum(eulers_error_all[eulers])/len(eulers_error_all[eulers]))) elur_mae.append((sum(eulers_error_all[eulers])/len(eulers_error_all[eulers]))) print(r'5 keypoints average err:'+str(kp_ave_error)) print(r'3 eulers average err:'+str(euler_ave_error)) print('IPN of 5 keypoints:'+str(sum(kp_ipn)/len(kp_ipn)*100)) print('MAE of elur:'+str(sum(elur_mae)/len(elur_mae))) epoch_result.write(str(sum(kp_ipn)/len(kp_ipn)*100)+'\t'+str(sum(elur_mae)/len(elur_mae))+'\t' + str(kp_ave_error)+'\t'+str(euler_ave_error)+'\n') print('----eval----end----') return 0
def test(): """The function of eval.""" start_time = time.time() args = parse_args() devid = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0 context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=True, device_id=devid) # logger args.outputs_dir = os.path.join( args.log_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) rank_id = int( os.environ.get('RANK_ID')) if os.environ.get('RANK_ID') else 0 args.logger = get_logger(args.outputs_dir, rank_id) context.reset_auto_parallel_context() parallel_mode = ParallelMode.STAND_ALONE context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) args.logger.info('Creating Network....') network = YOLOV3DarkNet53(is_training=False) args.logger.info(args.pretrained) if os.path.isfile(args.pretrained): param_dict = load_checkpoint(args.pretrained) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): continue elif key.startswith('yolo_network.'): param_dict_new[key[13:]] = values else: param_dict_new[key] = values load_param_into_net(network, param_dict_new) args.logger.info('load_model {} success'.format(args.pretrained)) else: args.logger.info('{} not exists or not a pre-trained file'.format( args.pretrained)) assert FileNotFoundError( '{} not exists or not a pre-trained file'.format(args.pretrained)) exit(1) data_root = args.data_root ann_file = args.annFile config = ConfigYOLOV3DarkNet53() if args.testing_shape: config.test_img_shape = conver_testing_shape(args) ds, data_size = create_yolo_dataset(data_root, ann_file, is_training=False, batch_size=args.per_batch_size, max_epoch=1, device_num=1, rank=rank_id, shuffle=False, config=config) args.logger.info('testing shape : {}'.format(config.test_img_shape)) args.logger.info('totol {} images to eval'.format(data_size)) network.set_train(False) # init detection engine detection = DetectionEngine(args) input_shape = Tensor(tuple(config.test_img_shape), ms.float32) args.logger.info('Start inference....') for i, data in enumerate(ds.create_dict_iterator(num_epochs=1)): image = data["image"] image_shape = data["image_shape"] image_id = data["img_id"] prediction = network(image, input_shape) output_big, output_me, output_small = prediction output_big = output_big.asnumpy() output_me = output_me.asnumpy() output_small = output_small.asnumpy() image_id = image_id.asnumpy() image_shape = image_shape.asnumpy() detection.detect([output_small, output_me, output_big], args.per_batch_size, image_shape, image_id) if i % 1000 == 0: args.logger.info('Processing... {:.2f}% '.format( i * args.per_batch_size / data_size * 100)) args.logger.info('Calculating mAP...') detection.do_nms_for_results() result_file_path = detection.write_result() args.logger.info('result file path: {}'.format(result_file_path)) eval_result = detection.get_eval_result() cost_time = time.time() - start_time args.logger.info('\n=============coco eval reulst=========\n' + eval_result) args.logger.info('testing cost time {:.2f}h'.format(cost_time / 3600.))
                    type=bool, default=True,
                    help='dataset_sink_mode is False or True')
args = parser.parse_args()

if __name__ == "__main__":
    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
    ds_train = create_dataset(os.path.join(args.data_path, "train"), cfg.batch_size, cfg.epoch_size)
    step_size = ds_train.get_dataset_size()

    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)

    # load quantization aware network checkpoint
    param_dict = load_checkpoint(args.ckpt_path, network.type)
    load_param_into_net(network, param_dict)

    # convert fusion network to quantization aware network
    network = quant.convert_quant_network(network, quant_delay=0, bn_fold=False, freeze_bn=10000)

    # define network loss
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
    # define network optimization
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)

    # call back and monitor
def test_fast_gradient_sign_method(): """ FGSM-Attack test """ context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") # upload trained network ckpt_name = './trained_ckpt_file/checkpoint_lenet-10_1875.ckpt' net = LeNet5() load_dict = load_checkpoint(ckpt_name) load_param_into_net(net, load_dict) # get test data data_list = "./MNIST_unzip/test" batch_size = 32 ds = generate_mnist_dataset(data_list, batch_size, sparse=False) # prediction accuracy before attack model = Model(net) batch_num = 3 # the number of batches of attacking samples test_images = [] test_labels = [] predict_labels = [] i = 0 for data in ds.create_tuple_iterator(): i += 1 images = data[0].astype(np.float32) labels = data[1] test_images.append(images) test_labels.append(labels) pred_labels = np.argmax(model.predict(Tensor(images)).asnumpy(), axis=1) predict_labels.append(pred_labels) if i >= batch_num: break predict_labels = np.concatenate(predict_labels) true_labels = np.argmax(np.concatenate(test_labels), axis=1) accuracy = np.mean(np.equal(predict_labels, true_labels)) LOGGER.info(TAG, "prediction accuracy before attacking is : %s", accuracy) # attacking attack = FastGradientSignMethod(net, eps=0.3) start_time = time.clock() adv_data = attack.batch_generate(np.concatenate(test_images), np.concatenate(test_labels), batch_size=32) stop_time = time.clock() np.save('./adv_data', adv_data) pred_logits_adv = model.predict(Tensor(adv_data)).asnumpy() # rescale predict confidences into (0, 1). pred_logits_adv = softmax(pred_logits_adv, axis=1) pred_labels_adv = np.argmax(pred_logits_adv, axis=1) accuracy_adv = np.mean(np.equal(pred_labels_adv, true_labels)) LOGGER.info(TAG, "prediction accuracy after attacking is : %s", accuracy_adv) attack_evaluate = AttackEvaluate(np.concatenate(test_images).transpose(0, 2, 3, 1), np.concatenate(test_labels), adv_data.transpose(0, 2, 3, 1), pred_logits_adv) LOGGER.info(TAG, 'mis-classification rate of adversaries is : %s', attack_evaluate.mis_classification_rate()) LOGGER.info(TAG, 'The average confidence of adversarial class is : %s', attack_evaluate.avg_conf_adv_class()) LOGGER.info(TAG, 'The average confidence of true class is : %s', attack_evaluate.avg_conf_true_class()) LOGGER.info(TAG, 'The average distance (l0, l2, linf) between original ' 'samples and adversarial samples are: %s', attack_evaluate.avg_lp_distance()) LOGGER.info(TAG, 'The average structural similarity between original ' 'samples and adversarial samples are: %s', attack_evaluate.avg_ssim()) LOGGER.info(TAG, 'The average costing time is %s', (stop_time - start_time)/(batch_num*batch_size))
def run_gru_eval():
    """
    GRU evaluation.
    """
    parser = argparse.ArgumentParser(description='GRU eval')
    parser.add_argument("--device_target", type=str, default="Ascend",
                        help="device where the code will be implemented, default is Ascend")
    parser.add_argument('--device_id', type=int, default=0,
                        help='device id of GPU or Ascend, default is 0')
    parser.add_argument('--device_num', type=int, default=1,
                        help='Use device nums, default is 1')
    parser.add_argument('--ckpt_file', type=str, default="",
                        help='ckpt file path')
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Dataset path, default is empty.")
    args = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target,
                        reserve_class_name_in_scope=False,
                        device_id=args.device_id, save_graphs=False)
    dataset = create_gru_dataset(epoch_count=config.num_epochs, batch_size=config.eval_batch_size,
                                 dataset_path=args.dataset_path, rank_size=args.device_num,
                                 rank_id=0, do_shuffle=False, is_training=False)
    dataset_size = dataset.get_dataset_size()
    print("dataset size is {}".format(dataset_size))
    network = Seq2Seq(config, is_training=False)
    network = GRUInferCell(network)
    network.set_train(False)
    if args.ckpt_file != "":
        parameter_dict = load_checkpoint(args.ckpt_file)
        load_param_into_net(network, parameter_dict)
    model = Model(network)

    predictions = []
    source_sents = []
    target_sents = []
    eval_text_len = 0
    for batch in dataset.create_dict_iterator(output_numpy=True, num_epochs=1):
        source_sents.append(batch["source_ids"])
        target_sents.append(batch["target_ids"])
        source_ids = Tensor(batch["source_ids"], mstype.int32)
        target_ids = Tensor(batch["target_ids"], mstype.int32)
        predicted_ids = model.predict(source_ids, target_ids)
        print("predicts is ", predicted_ids.asnumpy())
        print("target_ids is ", target_ids)
        predictions.append(predicted_ids.asnumpy())
        eval_text_len = eval_text_len + 1

    f_output = open(config.output_file, 'w')
    f_target = open(config.target_file, "w")
    for batch_out, true_sentence in zip(predictions, target_sents):
        for i in range(config.eval_batch_size):
            target_ids = [str(x) for x in true_sentence[i].tolist()]
            f_target.write(" ".join(target_ids) + "\n")
            token_ids = [str(x) for x in batch_out[i].tolist()]
            f_output.write(" ".join(token_ids) + "\n")
    f_output.close()
    f_target.close()
parser.add_argument('--train_url', type=str, default=None, help='Train output path')
args_opt = parser.parse_args()

device_id = int(os.getenv('DEVICE_ID'))
context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
context.set_context(device_id=device_id)
context.set_context(enable_mem_reuse=True)

local_data_url = '/cache/data'
local_ckpt_url = '/cache/ckpt'
if args_opt.checkpoint_path:
    checkpoint_file = os.path.join(local_ckpt_url, os.path.split(args_opt.checkpoint_path)[1])
    mox.file.copy_parallel(args_opt.data_url, local_data_url)
    mox.file.copy_parallel(args_opt.checkpoint_path, checkpoint_file)

net = vgg16(num_classes=cfg.num_classes)
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, cfg.momentum,
               weight_decay=cfg.weight_decay)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False)
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

param_dict = load_checkpoint(checkpoint_file)
load_param_into_net(net, param_dict)
net.set_train(False)
dataset = dataset.create_dataset(local_data_url, 1, False)
res = model.eval(dataset)
print("result: ", res)
def test_load_checkpoint_error_filename():
    ckpoint_file_name = 1
    with pytest.raises(ValueError):
        load_checkpoint(ckpoint_file_name)
    loss_scale = float(config.loss_scale)

    # When creating MindDataset, use the first mindrecord file, such as MaskRcnn.mindrecord0.
    dataset = create_maskrcnn_dataset(mindrecord_file, batch_size=config.batch_size,
                                      device_num=device_num, rank_id=rank)

    dataset_size = dataset.get_dataset_size()
    print("total images num: ", dataset_size)
    print("Create dataset done!")

    net = Mask_Rcnn_Resnet50(config=config)
    net = net.set_train()

    load_path = args_opt.pre_trained
    if load_path != "":
        param_dict = load_checkpoint(load_path)
        if config.pretrain_epoch_size == 0:
            for item in list(param_dict.keys()):
                if not (item.startswith('backbone') or item.startswith('rcnn_mask')):
                    param_dict.pop(item)
        load_param_into_net(net, param_dict)

    loss = LossNet()
    lr = Tensor(dynamic_lr(config, rank_size=device_num,
                           start_steps=config.pretrain_epoch_size * dataset_size),
                mstype.float32)
    opt = Momentum(params=net.trainable_params(), learning_rate=lr, momentum=config.momentum,
                   weight_decay=config.weight_decay, loss_scale=config.loss_scale)

    net_with_loss = WithLossCell(net, loss)
    if args_opt.run_distribute:
        net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale, reduce_flag=True,
def test_load_checkpoint_empty_file():
    os.mknod("empty.ckpt")
    with pytest.raises(ValueError):
        load_checkpoint("empty.ckpt")
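# A hedged companion test sketch for the two error cases above: save a tiny network and check
# that load_checkpoint can read its parameters back. It assumes mindspore.nn and save_checkpoint
# are importable in this test module; the cell shape and temporary file name are illustrative.
def test_load_checkpoint_round_trip():
    import mindspore.nn as nn
    from mindspore.train.serialization import save_checkpoint

    net = nn.Dense(3, 2)
    ckpt_file = "round_trip.ckpt"
    save_checkpoint(net, ckpt_file)
    try:
        param_dict = load_checkpoint(ckpt_file)
        # the Dense cell registers 'weight' and 'bias' parameters
        assert "weight" in param_dict
        assert "bias" in param_dict
    finally:
        os.remove(ckpt_file)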
    single_scale_trans = SingleScaleTrans(resize=args.input_shape)

    ds = ds.batch(args.batch_size, per_batch_map=single_scale_trans,
                  input_columns=["image", "annotation", "image_name", "image_size"],
                  num_parallel_workers=8)

    args.steps_per_epoch = ds.get_dataset_size()

    # backbone
    network = backbone_HwYolov3(num_classes, num_anchors_list, args)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        print('load model {} success'.format(args.pretrained))
    else:
        print('load model {} failed, please check the path of model, evaluating end'.format(args.pretrained))
        exit(0)
    def _load_checkpoint(self):
        from mindspore.train.serialization import load_checkpoint
        param_dict = load_checkpoint(self.checkpoint_path)
        return {k: v.asnumpy() for k, v in param_dict.items()}
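# A minimal sketch showing one way the numpy dict returned by _load_checkpoint above could be
# dumped to a .npz archive, the format accepted by the np.load branch of the training pipeline
# shown earlier in this collection. The helper name, the `loader` object, and the output path
# are illustrative assumptions, not part of the original class.
import numpy as np


def export_checkpoint_to_npz(loader, npz_path="weights.npz"):
    """Save every checkpoint parameter as a named array in a single .npz file."""
    weights = loader._load_checkpoint()  # {param_name: np.ndarray}
    # dotted parameter names (e.g. 'conv1.weight') are kept as array names inside the archive
    np.savez(npz_path, **weights)
    return npz_path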
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1): """ do train """ if load_checkpoint_path == "": raise ValueError( "Pretrain model missed, finetune task must load pretrain model!") steps_per_epoch = dataset.get_dataset_size() # optimizer if optimizer_cfg.optimizer == 'AdamWeightDecay': lr_schedule = BertLearningRate( learning_rate=optimizer_cfg.AdamWeightDecay.learning_rate, end_learning_rate=optimizer_cfg.AdamWeightDecay.end_learning_rate, warmup_steps=int(steps_per_epoch * epoch_num * 0.1), decay_steps=steps_per_epoch * epoch_num, power=optimizer_cfg.AdamWeightDecay.power) params = network.trainable_params() decay_params = list( filter(optimizer_cfg.AdamWeightDecay.decay_filter, params)) other_params = list( filter(lambda x: not optimizer_cfg.AdamWeightDecay.decay_filter(x), params)) group_params = [{ 'params': decay_params, 'weight_decay': optimizer_cfg.AdamWeightDecay.weight_decay }, { 'params': other_params, 'weight_decay': 0.0 }] optimizer = AdamWeightDecay(group_params, lr_schedule, eps=optimizer_cfg.AdamWeightDecay.eps) elif optimizer_cfg.optimizer == 'Lamb': lr_schedule = BertLearningRate( learning_rate=optimizer_cfg.Lamb.learning_rate, end_learning_rate=optimizer_cfg.Lamb.end_learning_rate, warmup_steps=int(steps_per_epoch * epoch_num * 0.1), decay_steps=steps_per_epoch * epoch_num, power=optimizer_cfg.Lamb.power) optimizer = Lamb(network.trainable_params(), learning_rate=lr_schedule) elif optimizer_cfg.optimizer == 'Momentum': optimizer = Momentum( network.trainable_params(), learning_rate=optimizer_cfg.Momentum.learning_rate, momentum=optimizer_cfg.Momentum.momentum) else: raise Exception( "Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]" ) # load checkpoint into network ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1) ckpoint_cb = ModelCheckpoint( prefix="squad", directory=None if save_checkpoint_path == "" else save_checkpoint_path, config=ckpt_config) param_dict = load_checkpoint(load_checkpoint_path) load_param_into_net(network, param_dict) update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000) netwithgrads = BertSquadCell(network, optimizer=optimizer, scale_update_cell=update_cell) model = Model(netwithgrads) callbacks = [ TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb ] model.train(epoch_num, dataset, callbacks=callbacks)
def do_eval(dataset=None, network=None, metric=None, load_checkpoint_path="", translate_direction="en-fr"): """ Do evaluation on summarization Args: dataset: the eval dataset. network: the network with loss. metric: the evaluation method. load_checkpoint_path: the file path which saved finetune model checkpoint. """ if load_checkpoint_path == "": raise ValueError( "Finetune model missed, evaluation task must load finetune model!") if metric.lower() == "bleu": print("Prepare to calculate the BLEU score ...") gpt2_loss = network(config=gpt2_net_cfg, is_training=False, use_one_hot_embeddings=False) gpt2_loss.set_train(False) param_dict = load_checkpoint(load_checkpoint_path) reorganized_param_dict = dict() for netName in param_dict: reorganized_param_dict['gpt2.' + netName] = param_dict[netName] reorganized_param_dict['lm_head.weight'] = param_dict[ 'gpt2_embedding_lookup.embedding_table'] load_param_into_net(gpt2_loss, reorganized_param_dict) # for item in gpt2_loss.get_parameters(): # print('name: ',item.data.name) model = Model(gpt2_loss) tokenizer = Tokenizer( vocab_file='./src/utils/pretrain-data/gpt2-vocab.json', merge_file='./src/utils/pretrain-data/gpt2-merges.txt') callback = BLEU(tokenizer) sample = Sample(model, tokenizer=tokenizer, model_config=gpt2_net_cfg, topk_num=0, topp_prob=0.92, min_tokens_to_keep=1, demo_mode=False, early_stop=True) columns_list = ["input_ids", "input_mask", "label_ids"] for data in dataset.create_dict_iterator(): input_data = [] for i in columns_list: input_data.append(data[i]) input_ids, input_mask, label_ids = input_data print("input_ids shape: {}".format(input_ids.shape)) print("label_ids shape: {}".format(label_ids.shape)) print("============= Translation Testing =============") #input_str,ref_str = sample.extract_string_from_tensor(input_ids,mode="pair") hypo, ref = sample.generate_for_Translation( input_ids, max_generate_length=150) print("REF str:\n ", ref, "\nHYPO str:\n", hypo, "\n") #print("LENGTH: ",len(ref[1])," and ",len(hypo[1]),"\n") callback.update(ref, hypo) print("==============================================") eval_result_print(metric, callback) print("==============================================") print("************** Translation Testing Finished **************") else: raise ValueError( "metric method not supported in summarization, support: [Rouge]")
init()

epoch_size = args_opt.epoch_size
net = resnet50(args_opt.batch_size, args_opt.num_classes)
ls = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)

model = Model(net, loss_fn=ls, optimizer=opt, metrics={'acc'})

# as for train, users could use model.train
if args_opt.do_train:
    dataset = create_dataset()
    batch_num = dataset.get_dataset_size()
    config_ck = CheckpointConfig(save_checkpoint_steps=batch_num, keep_checkpoint_max=35)
    ckpoint_cb = ModelCheckpoint(prefix="train_resnet_cifar10", directory="./", config=config_ck)
    loss_cb = LossMonitor()
    model.train(epoch_size, dataset, callbacks=[ckpoint_cb, loss_cb])

# as for evaluation, users could use model.eval
if args_opt.do_eval:
    if args_opt.checkpoint_path:
        param_dict = load_checkpoint(args_opt.checkpoint_path)
        load_param_into_net(net, param_dict)
    eval_dataset = create_dataset(training=False)
    res = model.eval(eval_dataset)
    print("result: ", res)
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1): """ Do train Args: dataset: the train dataset. network: the network with loss load_checkpoint_path: the file path which saved pretrain model checkpoint. save_checkpoint_path: the file path which will save finetune model checkpoint. epoch_num: the number of epoch """ if load_checkpoint_path == "": raise ValueError( "Pretrain model missed, finetune task must load pretrain model!") steps_per_epoch = dataset.get_dataset_size( ) # samples / batch_size doing#### #Select Optimizer if cfg.optimizer == 'AdamWeightDecay': lr_schedule = GPT2LearningRate( learning_rate=cfg.AdamWeightDecay.learning_rate, end_learning_rate=cfg.AdamWeightDecay.end_learning_rate, warmup_steps=int(steps_per_epoch * epoch_num * 0.1), decay_steps=steps_per_epoch * epoch_num, power=cfg.AdamWeightDecay.power) params = network.trainable_params( ) # return a list of all trainable parmeters of the network # Use parameter groups and set different values decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params)) # without layernorm and bias other_params = list( filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params)) # with layernorm and bias group_params = [{ 'params': decay_params, 'weight_decay': cfg.AdamWeightDecay.weight_decay }, { 'params': other_params, 'weight_decay': 0.0 }] optimizer = AdamWeightDecay(group_params, lr_schedule, eps=cfg.AdamWeightDecay.eps) elif cfg.optimizer == 'Lamb': lr_schedule = GPT2LearningRate( learning_rate=cfg.Lamb.learning_rate, end_learning_rate=cfg.Lamb.end_learning_rate, warmup_steps=int(steps_per_epoch * epoch_num * 0.1), decay_steps=steps_per_epoch * epoch_num, power=cfg.Lamb.power) optimizer = Lamb(network.trainable_params(), lr_schedule) elif cfg.optimizer == 'Momentum': optimizer = Momentum(network.trainable_params(), cfg.Momentum.learning_rate, cfg.Momentum.momentum) else: raise Exception( "Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]" ) # load checkpoint into network ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1) ckpoint_cb = ModelCheckpoint( prefix="gpt2_translation_en_fr_", directory=None if save_checkpoint_path == "" else save_checkpoint_path, config=ckpt_config) param_dict = load_checkpoint(load_checkpoint_path) final_param_dict = {} for k, v in param_dict.items(): final_param_dict['gpt2_loss.gpt2.gpt2.' + k] = param_dict[k] # set the weights of final linear weights to weights of gpt2 token embedding final_param_dict['gpt2_loss.gpt2.dense1.weight'] = param_dict[ 'gpt2_embedding_lookup.embedding_table'] load_param_into_net(network, final_param_dict) print("| loading the pretrained weights | \n") update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000) netwithgrads = GPT2FinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell) netwithgrads.set_train(True) loss_cb = LossMonitor() model = Model(netwithgrads, amp_level='O2') callbacks = [TimeMonitor(dataset.get_dataset_size()), loss_cb, ckpoint_cb] print( "============== Starting Training For Translation Task ==============") model.train(epoch_num, dataset, callbacks=callbacks) print( "============== Translation Training Success ==============")
    auto_parallel_context().set_all_reduce_fusion_split_indices([85, 160])
    ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"

    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1,
                             batch_size=config.batch_size, target=target)
    step_size = dataset.get_dataset_size()

    # define net
    net = resnet(class_num=config.class_num)
    if args_opt.parameter_server:
        net.set_param_ps()

    # init weight
    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_param_into_net(net, param_dict)
    else:
        for _, cell in net.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(),
                                                                    cell.weight.shape,
                                                                    cell.weight.dtype)
            if isinstance(cell, nn.Dense):
                cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(),
                                                                    cell.weight.shape,
                                                                    cell.weight.dtype)

    # init lr
    if args_opt.net == "resnet50":
        if args_opt.dataset == "cifar10":
    if args_opt.platform != 'GPU':
        raise ValueError("Only the GPU platform is supported.")

    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.platform)

    net = efficientnet_b0(num_classes=cfg.num_classes,
                          drop_rate=cfg.drop,
                          drop_connect_rate=cfg.drop_connect,
                          global_pool=cfg.gp,
                          bn_tf=cfg.bn_tf,
                          )

    ckpt = load_checkpoint(args_opt.checkpoint)
    load_param_into_net(net, ckpt)
    net.set_train(False)

    val_data_url = args_opt.data_path
    dataset = create_dataset_val(cfg.batch_size, val_data_url, workers=cfg.workers, distributed=False)
    loss = LabelSmoothingCrossEntropy(smooth_factor=cfg.smoothing)
    eval_metrics = {'Loss': nn.Loss(),
                    'Top1-Acc': nn.Top1CategoricalAccuracy(),
                    'Top5-Acc': nn.Top5CategoricalAccuracy()}
    model = Model(net, loss, optimizer=None, metrics=eval_metrics)