def run_benchmark(model, args):
    """Train `model` on MNIST and record accuracy/cost/duration KPIs.

    Builds the classification network on the fluid default main program,
    trains for ``args.pass_num`` passes, evaluates on the test set once per
    pass via the module-level ``eval_test`` helper, and appends one record
    per pass to the module-level ``train_acc_kpi`` / ``train_cost_kpi`` /
    ``test_acc_kpi`` / ``train_duration_kpi`` trackers.

    Args:
        model: callable that maps the image input Variable to a prediction
            Variable (softmax output expected by ``cross_entropy``).
        args: parsed CLI namespace; reads ``use_cprof``, ``device``,
            ``batch_size`` and ``pass_num``.
    """
    if args.use_cprof:
        # NOTE(review): the profiler is enabled here but never disabled or
        # dumped anywhere in this function -- confirm results are consumed
        # elsewhere (e.g. at interpreter exit).
        pr = cProfile.Profile()
        pr.enable()
    start_time = time.time()  # NOTE(review): never read afterwards
    # Input data
    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # Train program
    predict = model(images)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    # Evaluator: batch_size_tensor lets WeightedAverage weight each batch
    # by its actual size (the last batch may be smaller).
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)
    # inference program -- cloned before the optimizer adds backward ops
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])
    # Optimization
    opt = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001, beta1=0.9, beta2=0.999)
    opt.minimize(avg_cost)
    fluid.memory_optimize(fluid.default_main_program())
    # Initialize executor
    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    # Parameter initialization
    exe.run(fluid.default_startup_program())
    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)
    accuracy = fluid.average.WeightedAverage()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        pass_start = time.time()
        every_pass_loss = []
        for batch_id, data in enumerate(train_reader()):
            # NOTE(review): np.array(map(...)) only works on Python 2, where
            # map returns a list.
            img_data = np.array(
                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([len(y_data), 1])
            start = time.time()  # NOTE(review): start/end are never read
            loss, acc, weight = exe.run(
                fluid.default_main_program(),
                feed={
                    "pixel": img_data,
                    "label": y_data
                },
                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
            # The accuracy is the accumulation of batches, but not the
            # current batch.
            end = time.time()
            accuracy.add(value=acc, weight=weight)
            every_pass_loss.append(loss)
            print("Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                  (pass_id, batch_id, loss, acc))
        pass_end = time.time()
        train_avg_acc = accuracy.eval()
        train_avg_loss = np.mean(every_pass_loss)
        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
                                 inference_program)
        print(
            "pass=%d, train_avg_acc=%f,train_avg_loss=%f, test_avg_acc=%f, elapse=%f"
            % (pass_id, train_avg_acc, train_avg_loss, test_avg_acc,
               (pass_end - pass_start)))
        # One KPI record per pass.
        train_acc_kpi.add_record(np.array(train_avg_acc, dtype='float32'))
        train_cost_kpi.add_record(np.array(train_avg_loss, dtype='float32'))
        test_acc_kpi.add_record(np.array(test_avg_acc, dtype='float32'))
        train_duration_kpi.add_record(pass_end - pass_start)
def train_parallel_exe(args,
                       learning_rate,
                       batch_size,
                       num_passes,
                       init_model=None,
                       pretrained_model=None,
                       model_save_dir='model',
                       parallel=True,
                       use_nccl=True,
                       lr_strategy=None,
                       layers=50):
    """Train SE-ResNeXt or MobileNet on the flowers dataset with ParallelExecutor.

    Builds the model and Momentum optimizer on the fluid default program,
    optionally restores ``init_model``/``pretrained_model`` weights, then for
    each pass runs a full training sweep followed by a test sweep.  On the
    final pass it records accuracy/cost/speed into the module-level
    ``train_*_kpi`` trackers, and persists them after all passes.

    Args:
        args: parsed CLI namespace; reads ``args.model`` and
            ``args.with_mem_opt``.
        learning_rate: base learning rate (used directly when ``lr_strategy``
            selects neither decay scheme).
        batch_size: samples per minibatch.
        num_passes: number of training passes (epochs).
        init_model: optional directory of persistables to restore.
        pretrained_model: optional directory of pretrained variables; only
            variables whose files exist there are loaded.
        model_save_dir, parallel, use_nccl: unused here; kept for interface
            compatibility with callers.
        lr_strategy: dict selecting "piecewise_decay" or "cosine_decay".
        layers: SE-ResNeXt depth (e.g. 50).
    """
    class_dim = 1000
    image_shape = [3, 224, 224]
    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # BUG FIX: was `args.model is 'se_resnext'` -- `is` compares object
    # identity, which is implementation-dependent for string literals; the
    # intended test is value equality.
    if args.model == 'se_resnext':
        out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
    else:
        out = mobile_net(img=image, class_dim=class_dim)

    cost = fluid.layers.cross_entropy(input=out, label=label)
    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
    avg_cost = fluid.layers.mean(x=cost)

    # Clone for evaluation before the optimizer adds backward ops.
    test_program = fluid.default_main_program().clone(for_test=True)

    if "piecewise_decay" in lr_strategy:
        bd = lr_strategy["piecewise_decay"]["bd"]
        lr = lr_strategy["piecewise_decay"]["lr"]
        optimizer = fluid.optimizer.Momentum(
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=bd, values=lr),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    elif "cosine_decay" in lr_strategy:
        step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"]
        epochs = lr_strategy["cosine_decay"]["epochs"]
        optimizer = fluid.optimizer.Momentum(
            learning_rate=cosine_decay(
                learning_rate=learning_rate,
                step_each_epoch=step_each_epoch,
                epochs=epochs),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    else:
        optimizer = fluid.optimizer.Momentum(
            learning_rate=learning_rate,
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))

    opts = optimizer.minimize(avg_cost)
    if args.with_mem_opt:
        fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    # BUG FIX: was `fluid.default_startup_program.random_seed = 1000`, which
    # set an attribute on the *function object* instead of the Program; the
    # seed must be set on the Program instance for reproducible init.
    fluid.default_startup_program().random_seed = 1000
    exe.run(fluid.default_startup_program())

    if init_model is not None:
        fluid.io.load_persistables(exe, init_model)

    if pretrained_model:

        def if_exist(var):
            # Only load variables that have a file in the pretrained dir.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    train_reader = paddle.batch(flowers.train(), batch_size=batch_size)
    test_reader = paddle.batch(flowers.test(), batch_size=batch_size)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
    test_exe = fluid.ParallelExecutor(
        use_cuda=True, main_program=test_program, share_vars_from=train_exe)

    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]

    train_speed = []
    for pass_id in range(num_passes):
        train_info = [[], [], []]
        test_info = [[], [], []]
        pass_time = 0
        pass_num = 0
        pass_speed = 0.0
        for batch_id, data in enumerate(train_reader()):
            t1 = time.time()
            loss, acc1, acc5 = train_exe.run(fetch_list,
                                             feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            pass_time += period
            pass_num += len(data)
            # ParallelExecutor returns one value per device; average them.
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            if batch_id % 10 == 0:
                print("Pass {0}, trainbatch {1}, loss {2}, "
                      "acc1 {3}, acc5 {4} time {5}".format(
                          pass_id, batch_id, loss, acc1, acc5,
                          "%2.2f sec" % period))
                sys.stdout.flush()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        pass_speed = pass_num / pass_time
        train_speed.append(pass_speed)
        if pass_id == num_passes - 1:
            train_acc_top1_kpi.add_record(train_acc1)
            train_acc_top5_kpi.add_record(train_acc5)
            train_cost_kpi.add_record(train_loss)
            # NOTE(review): this averages only the final pass's speed even
            # though `train_speed` accumulates every pass; possibly meant to
            # be np.array(train_speed).mean() -- left as-is.
            mean_pass_speed = np.array(pass_speed).mean()
            train_speed_kpi.add_record(mean_pass_speed)

        # BUG FIX: the every-10th-batch print gate reused the stale
        # `batch_id` left over from the training loop; enumerate the test
        # batches instead.
        for test_batch_id, data in enumerate(test_reader()):
            t1 = time.time()
            loss, acc1, acc5 = test_exe.run(fetch_list, feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            test_info[0].append(loss)
            test_info[1].append(acc1)
            test_info[2].append(acc5)
            if test_batch_id % 10 == 0:
                print("Pass {0},testbatch {1},loss {2}, "
                      "acc1 {3},acc5 {4},time {5}".format(
                          pass_id, test_batch_id, loss, acc1, acc5,
                          "%2.2f sec" % period))
                sys.stdout.flush()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}, pass_time {7}, "
              "train_speed {8}".format(pass_id, train_loss, train_acc1,
                                       train_acc5, test_loss, test_acc1,
                                       test_acc5, pass_time,
                                       pass_num / pass_time))
        sys.stdout.flush()

    train_acc_top1_kpi.persist()
    train_acc_top5_kpi.persist()
    train_cost_kpi.persist()
    train_speed_kpi.persist()
def parallel_exe(args,
                 train_file_list,
                 val_file_list,
                 data_args,
                 learning_rate,
                 batch_size,
                 num_passes,
                 model_save_dir='model',
                 pretrained_model=None):
    """Train the MobileNet-SSD detector and record cost/speed KPIs.

    Builds the SSD network and RMSProp optimizer with piecewise LR decay on
    the fluid default program, optionally loads pretrained weights, trains
    for ``num_passes`` (capped at ``args.iterations`` batches per pass), and
    on the final pass records cost and images/sec into the module-level KPI
    trackers, persisting the set that matches ``args.gpu_card_num``.

    Args:
        args: parsed CLI namespace; reads ``ap_version``, ``use_gpu``,
            ``parallel``, ``iterations``, ``skip_batch_num``, ``gpu_card_num``.
        train_file_list / val_file_list: dataset file lists for the reader.
        data_args: reader settings; reads ``resize_h``, ``resize_w``,
            ``dataset`` ('coco' or 'pascalvoc').
        learning_rate: base LR; decayed stepwise at the epoch boundaries.
        batch_size: samples per minibatch.
        num_passes: number of training passes.
        model_save_dir: directory that ``save_model`` snapshots under.
        pretrained_model: optional directory of pretrained variables.
    """
    image_shape = [3, data_args.resize_h, data_args.resize_w]
    if data_args.dataset == 'coco':
        num_classes = 81
    elif data_args.dataset == 'pascalvoc':
        num_classes = 21

    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    gt_box = fluid.layers.data(
        name='gt_box', shape=[4], dtype='float32', lod_level=1)
    gt_label = fluid.layers.data(
        name='gt_label', shape=[1], dtype='int32', lod_level=1)
    difficult = fluid.layers.data(
        name='gt_difficult', shape=[1], dtype='int32', lod_level=1)

    locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
    nmsed_out = fluid.layers.detection_output(
        locs, confs, box, box_var, nms_threshold=0.45)
    loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, box_var)
    loss = fluid.layers.reduce_sum(loss)

    # Clone for evaluation before the optimizer adds backward ops.
    test_program = fluid.default_main_program().clone(for_test=True)
    with fluid.program_guard(test_program):
        map_eval = fluid.evaluator.DetectionMAP(
            nmsed_out,
            gt_label,
            gt_box,
            difficult,
            num_classes,
            overlap_threshold=0.5,
            evaluate_difficult=False,
            ap_version=args.ap_version)

    if data_args.dataset == 'coco':
        # learning rate decay in 12, 19 pass, respectively
        if '2014' in train_file_list:
            # `//` keeps the batches-per-epoch count an integer on both
            # Python 2 and 3 (the old `/` relied on Py2 integer division).
            epocs = 82783 // batch_size
            boundaries = [epocs * 12, epocs * 19]
        elif '2017' in train_file_list:
            epocs = 118287 // batch_size
            # BUG FIX: was `epcos * 12` (typo) -- a NameError at runtime on
            # the coco-2017 path.
            boundaries = [epocs * 12, epocs * 19]
    elif data_args.dataset == 'pascalvoc':
        epocs = 19200 // batch_size
        boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
    values = [
        learning_rate, learning_rate * 0.5, learning_rate * 0.25,
        learning_rate * 0.1, learning_rate * 0.01
    ]
    optimizer = fluid.optimizer.RMSProp(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005), )
    optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    # BUG FIX: was `fluid.default_startup_program.random_seed = 1000`, which
    # set an attribute on the *function object* instead of the Program; the
    # seed must be set on the Program instance for reproducible init.
    fluid.default_startup_program().random_seed = 1000
    exe.run(fluid.default_startup_program())

    if pretrained_model:

        def if_exist(var):
            # Only load variables that have a file in the pretrained dir.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_gpu, loss_name=loss.name)

    train_reader = paddle.batch(
        reader.train(data_args, train_file_list), batch_size=batch_size)
    test_reader = paddle.batch(
        reader.test(data_args, val_file_list), batch_size=batch_size)
    feeder = fluid.DataFeeder(
        place=place, feed_list=[image, gt_box, gt_label, difficult])

    def save_model(postfix):
        # Overwrite any previous snapshot with the same name.
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        # Consistency fix: use the print *function* like the rest of the
        # file; the old Python 2 print statement made the file Py2-only.
        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path)

    best_map = 0.

    def test(pass_id, best_map):
        _, accum_map = map_eval.get_map_var()
        map_eval.reset(exe)
        test_map = None
        for data in test_reader():
            test_map = exe.run(test_program,
                               feed=feeder.feed(data),
                               fetch_list=[accum_map])
        if test_map[0] > best_map:
            # NOTE(review): rebinding the `best_map` parameter only updates
            # this function's local; the caller's `best_map` is unchanged
            # (the call site below is commented out anyway).
            best_map = test_map[0]
            save_model('best_model')
        print("Test {0}, map {1}".format(pass_id, test_map[0]))

    train_num = 0
    total_train_time = 0.0
    total_iters = 0
    for pass_id in range(num_passes):
        every_pass_loss = []
        iter = 0
        pass_duration = 0.0
        for batch_id, data in enumerate(train_reader()):
            batch_start = time.time()
            if iter == args.iterations:
                break
            # Skip ragged final batches that can't be split across devices.
            if len(data) < devices_num:
                continue
            if args.parallel:
                loss_v, = train_exe.run(fetch_list=[loss.name],
                                        feed=feeder.feed(data))
            else:
                loss_v, = exe.run(fluid.default_main_program(),
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
            loss_v = np.mean(np.array(loss_v))
            if batch_id % 20 == 0:
                print("Pass {0}, batch {1}, loss {2}, time {3}".format(
                    pass_id, batch_id, loss_v, time.time() - batch_start))
            # Warm-up batches of pass 0 are excluded from the speed KPI.
            if iter >= args.skip_batch_num or pass_id != 0:
                batch_duration = time.time() - batch_start
                pass_duration += batch_duration
                train_num += len(data)
            every_pass_loss.append(loss_v)
            iter += 1
            total_iters += 1
        #test(pass_id, best_map)
        total_train_time += pass_duration
        print("Pass:%d, Loss:%f, Handle Images Duration: %f\n" %
              (pass_id, np.mean(every_pass_loss), pass_duration))
        if pass_id == num_passes - 1:
            examples_per_sec = train_num / total_train_time
            train_cost_kpi.add_record(np.mean(every_pass_loss))
            train_speed_kpi.add_record(
                np.array(examples_per_sec, dtype='float'))
            four_card_speed_kpi.add_record(
                np.array(examples_per_sec, dtype='float'))
    if args.gpu_card_num == 1:
        train_cost_kpi.persist()
        train_speed_kpi.persist()
    else:
        four_card_speed_kpi.persist()
    print("Best test map {0}".format(best_map))
def train(batch_size, device, pass_num, iterations):
    """Train ResNet on CIFAR-10 and record per-batch cost/duration KPIs.

    Builds the network on the fluid default program and runs a single
    training pass (capped at ``iterations`` batches), appending each batch's
    cost and wall-clock duration to the module-level ``train_cost_kpi`` and
    ``train_duration_kpi`` trackers.

    Args:
        batch_size: samples per minibatch.
        device: 'CPU' for CPUPlace, anything else for CUDA device 0.
        pass_num: unused; exactly one pass is run (kept for interface
            compatibility with callers).
        iterations: maximum number of training batches to run.
    """
    print('iterations', iterations)
    class_dim = 10
    # NCHW
    dshape = [3, 32, 32]
    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    predict = resnet_cifar10(input, class_dim)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
    opts = optimizer.minimize(avg_cost)
    # accuracy = fluid.evaluator.Evaluator(input=predict, label=label)

    # inference program -- cloned before backward ops are appended
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        # test_target = accuracy.metrics + accuracy.states
        test_target = [predict, avg_cost]
        inference_program = fluid.io.get_inference_program(test_target)

    fluid.memory_optimize(fluid.default_main_program())

    train_reader = paddle.batch(
        paddle.dataset.cifar.train10(), batch_size=batch_size)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10(), batch_size=batch_size)

    def test(exe):
        # accuracy.reset(exe)
        avg_cost_ = None
        for batch_id, data in enumerate(test_reader()):
            # list(...) keeps np.array() working on Python 3, where `map`
            # returns an iterator (identical behavior on Python 2).
            img_data = np.array(
                list(map(lambda x: x[0].reshape(dshape),
                         data))).astype("float32")
            y_data = np.array(list(map(lambda x: x[1], data))).astype("int64")
            y_data = y_data.reshape([-1, 1])
            predict_, avg_cost_ = exe.run(
                inference_program,
                feed={
                    "data": img_data,
                    "label": y_data
                },
                fetch_list=[predict, avg_cost])
        # BUG FIX: previously returned the `avg_cost` graph Variable instead
        # of the numeric result fetched from the executor.
        return avg_cost_
        # return accuracy.eval(exe)

    place = core.CPUPlace() if device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    for pass_id in range(1):
        logger.warning('Pass {}'.format(pass_id))
        # accuracy.reset(exe)
        iter = 0
        for batch_id, data in enumerate(train_reader()):
            logger.warning('Batch {}'.format(batch_id))
            batch_start = time.time()
            if iter == iterations:
                break
            # list(...) for Python 3 compatibility, as in test() above.
            image = np.array(
                list(map(lambda x: x[0].reshape(dshape),
                         data))).astype('float32')
            label = np.array(list(map(lambda x: x[1], data))).astype('int64')
            label = label.reshape([-1, 1])
            avg_cost_ = exe.run(
                fluid.default_main_program(),
                feed={
                    'data': image,
                    'label': label
                },
                fetch_list=[avg_cost])
            batch_end = time.time()
            print('avg_cost', np.array(avg_cost_, dtype='float32'))
            train_cost_kpi.add_record(np.array(avg_cost_, dtype='float32'))
            train_duration_kpi.add_record(batch_end - batch_start)
            iter += 1