def test_resnet50_cifar10_ascend():
    """Launch distributed ResNet50 / CIFAR-10 training and check speed and loss.

    The ``resnet`` model directory is copied next to this test file, ``train.py``
    is patched so the epoch-related expressions become the literal ``10``, and
    ``run_distribute_train.sh`` is started from ``resnet/scripts`` (relative to
    the current working directory).  After ``utils.process_check`` reports
    success, the logs ``scripts/train_parallel0`` .. ``train_parallel7`` are
    inspected: every per-step time must be below 20.0 and the mean of the last
    loss values must be below 0.70.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    zoo_cv_dir = "{}/../../../../model_zoo/official/cv".format(here)
    model_name = "resnet"
    utils.copy_files(zoo_cv_dir, here, model_name)
    cur_model_path = os.path.join(here, "resnet")

    # Replace the config-driven epoch expressions in train.py with 10.
    utils.exec_sed_command(
        ["total_epochs=config.epoch_size",
         "config.epoch_size - config.pretrain_epoch_size"],
        ["total_epochs=10", "10"],
        os.path.join(cur_model_path, "train.py"),
    )

    dataset_path = os.path.join(utils.data_root, "cifar-10-batches-bin")
    launch_cmd = (
        "cd resnet/scripts; bash run_distribute_train.sh resnet50 cifar10 {} {}"
        .format(utils.rank_table_path, dataset_path)
    )
    # The launcher's exit status is intentionally not inspected here; the
    # process check below is what gates the test.
    os.system(launch_cmd)

    # NOTE(review): presumably process_check polls this `ps` pipeline until the
    # training processes are gone -- confirm against utils.
    ps_cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    finished = utils.process_check(100, ps_cmd)
    assert finished

    log_template = os.path.join(cur_model_path, "scripts/train_parallel{}/log")
    device_logs = [log_template.format(rank) for rank in range(8)]

    # Performance gate first, for every log, before any loss is read.
    for log_path in device_logs:
        step_time = utils.get_perf_data(log_path)
        assert step_time < 20.0

    # Accuracy gate: average of the final loss entry from each log.
    final_losses = [utils.get_loss_data_list(log_path)[-1] for log_path in device_logs]
    assert sum(final_losses) / len(final_losses) < 0.70
def test_lenet_MNIST():
    """Train and evaluate LeNet on MNIST, then check step time and accuracy.

    The ``lenet`` model directory is copied next to this test file.  ``train.py``
    and ``eval.py`` are run in sequence from the ``lenet`` directory (relative to
    the current working directory), each redirecting its output to a log file;
    both must exit with status 0.  The per-step time read from the training log
    must be below 1.3 and the first ``'Accuracy'`` value parsed from the
    evaluation log must exceed 0.98.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    zoo_cv_dir = "{}/../../../../model_zoo/official/cv".format(here)
    model_name = "lenet"
    utils.copy_files(zoo_cv_dir, here, model_name)

    # All artefacts live inside the copied model directory.
    cur_model_path = os.path.join(here, model_name)
    train_log = os.path.join(cur_model_path, "train_ascend.log")
    ckpt_file = os.path.join(cur_model_path, "ckpt/checkpoint_lenet-10_1875.ckpt")
    infer_log = os.path.join(cur_model_path, "infer_ascend.log")
    dataset_path = os.path.join(utils.data_root, "mnist")

    # Training run.
    train_cmd = "cd {0}; python train.py --data_path={1} > {2} 2>&1".format(
        model_name, dataset_path, train_log)
    train_status = os.system(train_cmd)
    assert train_status == 0

    # Evaluation run against the checkpoint expected from training.
    eval_cmd = "cd {0}; python eval.py --data_path={1} --ckpt_path={2} > {3} 2>&1".format(
        model_name, dataset_path, ckpt_file, infer_log)
    eval_status = os.system(eval_cmd)
    assert eval_status == 0

    per_step_time = utils.get_perf_data(train_log)
    print("per_step_time is", per_step_time)
    assert per_step_time < 1.3

    accuracy_pattern = r"'Accuracy': ([\d\.]+)}"
    acc = utils.parse_log_file(accuracy_pattern, infer_log)
    print("acc is", acc)
    assert acc[0] > 0.98
def test_DeeplabV3_voc2007():
    """Launch distributed DeepLabV3 training and check speed and loss.

    The ``deeplabv3`` model directory is copied next to this test file.  The
    placeholder paths in ``scripts/run_distribute_train_s16_r1.sh`` are replaced
    with concrete locations, and ``train.py`` is patched so that
    ``model.train(args.train_epochs`` becomes ``model.train(70`` and
    ``callbacks=cbs`` gains ``sink_size=2``.  The launcher must exit with status
    0 and ``utils.process_check`` must report success.  Logs ``train/device0``
    .. ``train/device7`` are then inspected: every per-step time must be below
    530.0 and the mean of the last loss values must be below 2.5.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    zoo_cv_dir = "{}/../../../../model_zoo/official/cv".format(here)
    model_name = "deeplabv3"
    utils.copy_files(zoo_cv_dir, here, model_name)
    cur_model_path = os.path.join(here, model_name)

    # (placeholder in the launcher script, concrete replacement) pairs.
    script_substitutions = [
        ('/PATH/TO/EXPERIMENTS_DIR', cur_model_path + '/train'),
        ('/PATH/TO/MODEL_ZOO_CODE', cur_model_path),
        ('/PATH/TO/MINDRECORD_NAME',
         os.path.join(utils.data_root, "voc/voc2012/mindrecord_train/vocaug_mindrecord0")),
        ('/PATH/TO/PRETRAIN_MODEL',
         os.path.join(utils.ckpt_root, "deeplabv3/resnet101_ascend.ckpt")),
        ("\\${train_code_path}/src/tools/rank_table_8p.json", utils.rank_table_path),
    ]
    utils.exec_sed_command(
        [placeholder for placeholder, _ in script_substitutions],
        [replacement for _, replacement in script_substitutions],
        os.path.join(cur_model_path, "scripts/run_distribute_train_s16_r1.sh"),
    )

    # Patch the training entry point.
    utils.exec_sed_command(
        ['model.train(args.train_epochs', 'callbacks=cbs'],
        ['model.train(70', 'callbacks=cbs, sink_size=2'],
        os.path.join(cur_model_path, "train.py"),
    )

    launch_cmd = "cd {}; sh scripts/run_distribute_train_s16_r1.sh".format(model_name)
    launch_status = os.system(launch_cmd)
    assert launch_status == 0

    # NOTE(review): presumably process_check polls this `ps` pipeline until the
    # training processes are gone -- confirm against utils.
    ps_cmd = "ps -ef | grep python | grep train.py | grep -v grep"
    finished = utils.process_check(100, ps_cmd)
    assert finished

    log_template = os.path.join(cur_model_path, "train/device{}/log")
    device_logs = [log_template.format(rank) for rank in range(8)]

    # Performance gate for every log before any loss is read.
    for log_path in device_logs:
        per_step_time = utils.get_perf_data(log_path)
        print("per_step_time is", per_step_time)
        assert per_step_time < 530.0

    # Accuracy gate: average of the final loss entry from each log.
    final_losses = []
    for log_path in device_logs:
        losses = utils.get_loss_data_list(log_path)
        print("loss is", losses[-1])
        final_losses.append(losses[-1])
    assert sum(final_losses) / len(final_losses) < 2.5
def test_SSD_mobilenet_v1_fpn_coco2017():
    """Launch distributed SSD (MobileNetV1-FPN) training and check speed and loss.

    The ``ssd`` model directory is copied next to this test file.  Dataset and
    checkpoint paths in ``src/config_ssd_mobilenet_v1_fpn.py`` are replaced with
    concrete locations, ``ssd300`` in ``src/config.py`` is replaced with
    ``ssd_mobilenet_v1_fpn``, and ``train.py`` is patched so that
    ``args_opt.epoch_size`` becomes ``5`` and the ``dataset_sink_mode`` argument
    gains ``sink_size=100``.  After ``utils.process_check`` reports success,
    logs ``LOG0/log.txt`` .. ``LOG7/log.txt`` are inspected: every per-step time
    must be below 545 and the mean of the last loss values must be below 2.72.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    zoo_cv_dir = "{}/../../../../model_zoo/official/cv".format(here)
    model_name = "ssd"
    utils.copy_files(zoo_cv_dir, here, model_name)
    cur_model_path = os.path.join(here, model_name)

    # (default path in the network config, concrete replacement) pairs.
    config_substitutions = [
        ("/data/MindRecord_COCO",
         os.path.join(utils.data_root, "coco/coco2017/mindrecord_train/ssd_mindrecord")),
        ("/ckpt/mobilenet_v1.ckpt",
         os.path.join(utils.ckpt_root, "ssd_mobilenet_v1/mobilenet-v1.ckpt")),
        ("/data/coco2017", os.path.join(utils.data_root, "coco/coco2017")),
    ]
    utils.exec_sed_command(
        [default for default, _ in config_substitutions],
        [replacement for _, replacement in config_substitutions],
        os.path.join(cur_model_path, "src/config_ssd_mobilenet_v1_fpn.py"),
    )

    # Switch the selected network name in the top-level config.
    utils.exec_sed_command(
        ["ssd300"],
        ["ssd_mobilenet_v1_fpn"],
        os.path.join(cur_model_path, "src/config.py"),
    )

    # Patch the training entry point.
    utils.exec_sed_command(
        ["args_opt.epoch_size", "dataset_sink_mode=dataset_sink_mode"],
        ["5", "dataset_sink_mode=dataset_sink_mode, sink_size=100"],
        os.path.join(cur_model_path, "train.py"),
    )

    launch_cmd = "cd {0}; sh -x scripts/run_distribute_train.sh 8 {1} 0.2 coco {2}".format(
        model_name, 60, utils.rank_table_path)
    # The launcher's exit status is intentionally not inspected here; the
    # process check below is what gates the test.
    os.system(launch_cmd)

    # NOTE(review): presumably process_check polls this `ps` pipeline until the
    # training processes are gone -- confirm against utils.
    ps_cmd = "ps -ef | grep train.py | grep coco | grep device_num | grep device_id | grep -v grep"
    finished = utils.process_check(120, ps_cmd)
    assert finished

    log_template = os.path.join(cur_model_path, "LOG{}/log.txt")
    device_logs = [log_template.format(rank) for rank in range(8)]

    # Performance gate for every log before any loss is read.
    for log_path in device_logs:
        per_step_time = utils.get_perf_data(log_path)
        print("per_step_time is", per_step_time)
        assert per_step_time < 545

    # Accuracy gate: average of the final loss entry from each log.
    final_losses = []
    for log_path in device_logs:
        losses = utils.get_loss_data_list(log_path)
        print("loss is", losses[-1])
        final_losses.append(losses[-1])
    assert sum(final_losses) / len(final_losses) < 2.72